1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328
|
<?php
/**
* Takes a well formed list of tokens and fixes their nesting.
*
* HTML elements dictate which elements are allowed to be their children,
* for example, you can't have a p tag in a span tag. Other elements have
* much more rigorous definitions: tables, for instance, require a specific
* order for their elements. There are also constraints not expressible by
* document type definitions, such as the chameleon nature of ins/del
* tags and global child exclusions.
*
* The first major objective of this strategy is to iterate through all the
* nodes (not tokens) of the list of tokens and determine whether or not
* their children conform to the element's definition. If they do not, the
* child definition may optionally supply an amended list of elements that
* is valid or require that the entire node be deleted (and the previous
* node rescanned).
*
* The second objective is to ensure that explicitly excluded elements of
* an element do not appear in its children. Code that accomplishes this
* task is pervasive through the strategy, though the two are distinct tasks
* and could, theoretically, be seperated (although it's not recommended).
*
* @note Whether or not unrecognized children are silently dropped or
* translated into text depends on the child definitions.
*
* @todo Enable nodes to be bubbled out of the structure.
*/
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{
public function execute($tokens, $config, $context) {
//####################################################################//
// Pre-processing
// get a copy of the HTML definition
$definition = $config->getHTMLDefinition();
// insert implicit "parent" node, will be removed at end.
// DEFINITION CALL
$parent_name = $definition->info_parent;
array_unshift($tokens, new HTMLPurifier_Token_Start($parent_name));
$tokens[] = new HTMLPurifier_Token_End($parent_name);
// setup the context variable 'IsInline', for chameleon processing
// is 'false' when we are not inline, 'true' when it must always
// be inline, and an integer when it is inline for a certain
// branch of the document tree
$is_inline = $definition->info_parent_def->descendants_are_inline;
$context->register('IsInline', $is_inline);
// setup error collector
$e =& $context->get('ErrorCollector', true);
//####################################################################//
// Loop initialization
// stack that contains the indexes of all parents,
// $stack[count($stack)-1] being the current parent
$stack = array();
// stack that contains all elements that are excluded
// it is organized by parent elements, similar to $stack,
// but it is only populated when an element with exclusions is
// processed, i.e. there won't be empty exclusions.
$exclude_stack = array();
// variable that contains the start token while we are processing
// nodes. This enables error reporting to do its job
$start_token = false;
$context->register('CurrentToken', $start_token);
//####################################################################//
// Loop
// iterate through all start nodes. Determining the start node
// is complicated so it has been omitted from the loop construct
for ($i = 0, $size = count($tokens) ; $i < $size; ) {
//################################################################//
// Gather information on children
// child token accumulator
$child_tokens = array();
// scroll to the end of this node, report number, and collect
// all children
for ($j = $i, $depth = 0; ; $j++) {
if ($tokens[$j] instanceof HTMLPurifier_Token_Start) {
$depth++;
// skip token assignment on first iteration, this is the
// token we currently are on
if ($depth == 1) continue;
} elseif ($tokens[$j] instanceof HTMLPurifier_Token_End) {
$depth--;
// skip token assignment on last iteration, this is the
// end token of the token we're currently on
if ($depth == 0) break;
}
$child_tokens[] = $tokens[$j];
}
// $i is index of start token
// $j is index of end token
$start_token = $tokens[$i]; // to make token available via CurrentToken
//################################################################//
// Gather information on parent
// calculate parent information
if ($count = count($stack)) {
$parent_index = $stack[$count-1];
$parent_name = $tokens[$parent_index]->name;
if ($parent_index == 0) {
$parent_def = $definition->info_parent_def;
} else {
$parent_def = $definition->info[$parent_name];
}
} else {
// processing as if the parent were the "root" node
// unknown info, it won't be used anyway, in the future,
// we may want to enforce one element only (this is
// necessary for HTML Purifier to clean entire documents
$parent_index = $parent_name = $parent_def = null;
}
// calculate context
if ($is_inline === false) {
// check if conditions make it inline
if (!empty($parent_def) && $parent_def->descendants_are_inline) {
$is_inline = $count - 1;
}
} else {
// check if we're out of inline
if ($count === $is_inline) {
$is_inline = false;
}
}
//################################################################//
// Determine whether element is explicitly excluded SGML-style
// determine whether or not element is excluded by checking all
// parent exclusions. The array should not be very large, two
// elements at most.
$excluded = false;
if (!empty($exclude_stack)) {
foreach ($exclude_stack as $lookup) {
if (isset($lookup[$tokens[$i]->name])) {
$excluded = true;
// no need to continue processing
break;
}
}
}
//################################################################//
// Perform child validation
if ($excluded) {
// there is an exclusion, remove the entire node
$result = false;
$excludes = array(); // not used, but good to initialize anyway
} else {
// DEFINITION CALL
if ($i === 0) {
// special processing for the first node
$def = $definition->info_parent_def;
} else {
$def = $definition->info[$tokens[$i]->name];
}
if (!empty($def->child)) {
// have DTD child def validate children
$result = $def->child->validateChildren(
$child_tokens, $config, $context);
} else {
// weird, no child definition, get rid of everything
$result = false;
}
// determine whether or not this element has any exclusions
$excludes = $def->excludes;
}
// $result is now a bool or array
//################################################################//
// Process result by interpreting $result
if ($result === true || $child_tokens === $result) {
// leave the node as is
// register start token as a parental node start
$stack[] = $i;
// register exclusions if there are any
if (!empty($excludes)) $exclude_stack[] = $excludes;
// move cursor to next possible start node
$i++;
} elseif($result === false) {
// remove entire node
if ($e) {
if ($excluded) {
$e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
} else {
$e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
}
}
// calculate length of inner tokens and current tokens
$length = $j - $i + 1;
// perform removal
array_splice($tokens, $i, $length);
// update size
$size -= $length;
// there is no start token to register,
// current node is now the next possible start node
// unless it turns out that we need to do a double-check
// this is a rought heuristic that covers 100% of HTML's
// cases and 99% of all other cases. A child definition
// that would be tricked by this would be something like:
// ( | a b c) where it's all or nothing. Fortunately,
// our current implementation claims that that case would
// not allow empty, even if it did
if (!$parent_def->child->allow_empty) {
// we need to do a double-check
$i = $parent_index;
array_pop($stack);
}
// PROJECTED OPTIMIZATION: Process all children elements before
// reprocessing parent node.
} else {
// replace node with $result
// calculate length of inner tokens
$length = $j - $i - 1;
if ($e) {
if (empty($result) && $length) {
$e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
} else {
$e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
}
}
// perform replacement
array_splice($tokens, $i + 1, $length, $result);
// update size
$size -= $length;
$size += count($result);
// register start token as a parental node start
$stack[] = $i;
// register exclusions if there are any
if (!empty($excludes)) $exclude_stack[] = $excludes;
// move cursor to next possible start node
$i++;
}
//################################################################//
// Scroll to next start node
// We assume, at this point, that $i is the index of the token
// that is the first possible new start point for a node.
// Test if the token indeed is a start tag, if not, move forward
// and test again.
$size = count($tokens);
while ($i < $size and !$tokens[$i] instanceof HTMLPurifier_Token_Start) {
if ($tokens[$i] instanceof HTMLPurifier_Token_End) {
// pop a token index off the stack if we ended a node
array_pop($stack);
// pop an exclusion lookup off exclusion stack if
// we ended node and that node had exclusions
if ($i == 0 || $i == $size - 1) {
// use specialized var if it's the super-parent
$s_excludes = $definition->info_parent_def->excludes;
} else {
$s_excludes = $definition->info[$tokens[$i]->name]->excludes;
}
if ($s_excludes) {
array_pop($exclude_stack);
}
}
$i++;
}
}
//####################################################################//
// Post-processing
// remove implicit parent tokens at the beginning and end
array_shift($tokens);
array_pop($tokens);
// remove context variables
$context->destroy('IsInline');
$context->destroy('CurrentToken');
//####################################################################//
// Return
return $tokens;
}
}
// vim: et sw=4 sts=4
|