1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
|
<?php
/***
* $Id: WordDoc.php,v 1.4 2004/06/02 14:33:38 hfuecks Exp $
* Shows HTMLSax parsing Word generated HTML
*/
require_once('XML/HTMLSax3.php');
class MyHandler {
function escape($parser,$data) {
echo('<pre>'.$data."\n\n\n</pre>");
}
}
$h = & new MyHandler();
// Instantiate the parser
$parser=& new XML_HTMLSax3();
$parser->set_object($h);
$parser->set_escape_handler('escape');
if ( isset($_GET['strip_escapes']) ) {
$parser->set_option('XML_OPTION_STRIP_ESCAPES');
}
?>
<h1>Parsing Word Documents</h1>
<p>Shows HTMLSax parsing a simple Word generated HTML document and the impact of the option 'XML_OPTION_STRIP_ESCAPES' which can be set like;
<pre>
$parser->set_option('XML_OPTION_STRIP_ESCAPES');
</pre>
</p>
<p>Word generates some strange XML / HTML escape sequences like <![endif]> - now (3.0.0+) handled by HTMLSax correctly.</p>
<p>
<a href="<?php echo $_SERVER['PHP_SELF']; ?>">XML_OPTION_STRIP_ESCAPES = 0</a> :
<a href="<?php echo $_SERVER['PHP_SELF']; ?>?strip_escapes=1">XML_OPTION_STRIP_ESCAPES = 1</a>
</p>
<p>Starting to parse...</p>
<?php
// Parse the document
$parser->parse(file_get_contents('worddoc.htm'));
?>
<p>Parsing completed</p>
|