File: WordDoc.php

package info (click to toggle)
php-xml-htmlsax3 3.0.0%2Bcvs01112007-2
  • links: PTS
  • area: main
  • in suites: lenny, squeeze, wheezy
  • size: 160 kB
  • ctags: 327
  • sloc: php: 1,060; xml: 152; makefile: 2
file content (42 lines) | stat: -rw-r--r-- 1,204 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
<?php
/***
 * $Id: WordDoc.php,v 1.4 2004/06/02 14:33:38 hfuecks Exp $
 * Shows HTMLSax parsing Word generated HTML
 */
require_once('XML/HTMLSax3.php');

/**
 * Handler object handed to the parser via set_object(); it prints every
 * escape sequence it receives so the page can show what was found.
 */
class MyHandler {
    /**
     * Escape callback (its name is registered with set_escape_handler('escape')).
     *
     * Prints the escape content inside a <pre> element followed by blank lines.
     *
     * @param mixed  $parser the object making the callback; unused here
     *                       (presumably the XML_HTMLSax3 instance - confirm)
     * @param string $data   content of the escape sequence
     * @return void
     */
    function escape($parser,$data) {
        // The escape content comes from the parsed document and may itself
        // contain markup; encode it so the browser displays it literally
        // instead of interpreting it. A single-byte charset is named so the
        // call never rejects the input: only the ASCII characters & < > " '
        // are rewritten and every other byte passes through unchanged,
        // whatever the document's real encoding is.
        echo('<pre>'.htmlspecialchars($data, ENT_QUOTES, 'ISO-8859-1')."\n\n\n</pre>");
    }
}

// Handler object whose escape() method is registered below.
// Plain assignment: "=& new" is deprecated since PHP 5.3 and is a parse
// error from PHP 7.0 on.
$h = new MyHandler();

// Instantiate the parser
$parser = new XML_HTMLSax3();

// Register the handler object and the name of its escape callback method
$parser->set_object($h);
$parser->set_escape_handler('escape');

// The option is switched on whenever strip_escapes is present in the query
// string, whatever its value (isset() only tests for presence).
if ( isset($_GET['strip_escapes']) ) {
    $parser->set_option('XML_OPTION_STRIP_ESCAPES');
}
?>
<h1>Parsing Word Documents</h1>
<p>Shows HTMLSax parsing a simple Word generated HTML document and the impact of the option 'XML_OPTION_STRIP_ESCAPES', which can be set like this:</p>
<pre>
$parser-&gt;set_option('XML_OPTION_STRIP_ESCAPES');
</pre>
<p>Word generates some strange XML / HTML escape sequences like &lt;![endif]&gt; - now (3.0.0+) handled by HTMLSax correctly.</p>
<p>
    <?php /* PHP_SELF includes any extra path info from the request URL, so it is request-controlled: encode it before placing it inside an attribute value. */ ?>
    <a href="<?php echo htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES); ?>">XML_OPTION_STRIP_ESCAPES = 0</a> :
    <a href="<?php echo htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES); ?>?strip_escapes=1">XML_OPTION_STRIP_ESCAPES = 1</a>
</p>
<p>Starting to parse...</p>
<?php
// Parse the document
$wordHtml = file_get_contents('worddoc.htm');
if ($wordHtml === false) {
    // file_get_contents() returns false when the file cannot be read;
    // report that instead of handing a non-string to the parser and then
    // claiming success.
    echo "<p>Unable to read worddoc.htm</p>\n";
} else {
    $parser->parse($wordHtml);
    echo "<p>Parsing completed</p>\n";
}
?>