1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
|
#include "ParserDom.h"
#include "wincstring.h"
#include <iostream>
#include <vector>
//#define DEBUG
#include "debug.h"
#define TAG_NAME_MAX 10
using namespace std;
using namespace htmlcxx;
using namespace HTML;
using namespace kp;
// Convenience entry point: runs the parser over `html` and hands back the
// tree exposed by getTree().
//
// html: the markup to parse.
// Returns a const reference to the tree returned by getTree().
const tree<HTML::Node>& ParserDom::parseTree(const std::string &html)
{
    parse(html);
    return getTree();
}
void ParserDom::beginParsing()
{
mHtmlTree.clear();
tree<HTML::Node>::iterator top = mHtmlTree.begin();
HTML::Node lambda_node;
lambda_node.offset(0);
lambda_node.length(0);
lambda_node.isTag(true);
lambda_node.isComment(false);
mCurrentState = mHtmlTree.insert(top,lambda_node);
}
void ParserDom::endParsing()
{
tree<HTML::Node>::iterator top = mHtmlTree.begin();
top->length(mCurrentOffset);
}
// Callback for a comment node: attaches `node` as a child of the node
// referenced by mCurrentState.
void ParserDom::foundComment(Node node)
{
    // mCurrentState is deliberately left unchanged here.
    mHtmlTree.append_child(mCurrentState, node);
}
// Callback for a text node: attaches `node` as a child of the node
// referenced by mCurrentState.
void ParserDom::foundText(Node node)
{
    // mCurrentState is deliberately left unchanged here.
    mHtmlTree.append_child(mCurrentState, node);
}
// Callback for a tag.
//
// node:  the tag that was just read.
// isEnd: false for an opening tag, true for a closing tag.
//
// Opening tag: the node is appended under mCurrentState and becomes the new
// mCurrentState.
//
// Closing tag: the chain of ancestors starting at mCurrentState is searched
// (case-insensitively) for a node with the same tag name, stopping before
// the first node of the tree.
//   - On a match, the matching node's length is extended to the end of the
//     closing tag, its closing text is recorded, mCurrentState moves to its
//     parent, and flatten() is called on every node that was skipped on the
//     way up.
//   - With no match, the closing tag is re-flagged as a comment and appended
//     under the unchanged mCurrentState.
void ParserDom::foundTag(Node node, bool isEnd)
{
    if (!isEnd)
    {
        // Descend: the freshly appended node is where children go next.
        mCurrentState = mHtmlTree.append_child(mCurrentState, node);
        return;
    }

    typedef tree<HTML::Node>::iterator NodeIter;

    // Nodes visited during the upward walk whose name did not match,
    // ordered from the innermost outwards.
    vector<NodeIter> skipped;
    bool matched = false;

    for (NodeIter cur = mCurrentState;
         cur != mHtmlTree.begin();
         cur = mHtmlTree.parent(cur))
    {
#ifdef DEBUG
        cerr << "comparing " << node.tagName() << " with " << cur->tagName()<<endl<<":";
        if (!cur->tagName().length()) cerr << "Tag with no name at" << cur->offset()<<";"<<cur->offset()+cur->length();
#endif
        assert(cur->isTag());
        assert(cur->tagName().length());

        if (strcasecmp(cur->tagName().c_str(), node.tagName().c_str()) != 0)
        {
            // Different name: remember it and keep climbing.
            skipped.push_back(cur);
            continue;
        }

        DEBUGP("Found matching tag %s\n", cur->tagName().c_str());
        // The element now spans from the start of its opening tag to the
        // end of this closing tag.
        cur->length(node.offset() + node.length() - cur->offset());
        cur->closingText(node.text());
        mCurrentState = mHtmlTree.parent(cur);
        matched = true;
        break;
    }

    if (matched)
    {
        // The match was further up, so the nodes passed over never got a
        // closing tag of their own; flatten each one. Their lengths are left
        // as they were.
        for (vector<NodeIter>::iterator pending = skipped.begin();
             pending != skipped.end();
             ++pending)
        {
            mHtmlTree.flatten(*pending);
        }
        return;
    }

    DEBUGP("Unmatched tag %s\n", node.text().c_str());
    // No opening tag to pair with: keep the text in the tree as a comment.
    node.isTag(false);
    node.isComment(true);
    mHtmlTree.append_child(mCurrentState, node);
}
// Writes a textual dump of `tr` to `stream`, one node per line, framed by
// "-----" lines.
//
// Nodes are visited with a pre_order_iterator. Each line is indented by one
// space per level of depth relative to the first node, followed by
//   <running index>@[<offset>;<offset + length>) <node converted to string>
//
// Returns `stream` to allow chaining.
ostream &HTML::operator<<(ostream &stream, const tree<HTML::Node> &tr)
{
    typedef tree<HTML::Node>::pre_order_iterator NodeIter;

    NodeIter cur = tr.begin();
    NodeIter last = tr.end();
    // Depth of the first node; indentation is measured relative to it.
    int base_depth = tr.depth(cur);

    stream << "-----" << endl;
    for (unsigned int index = 0; cur != last; ++cur, ++index)
    {
        int indent = tr.depth(cur) - base_depth;
        for (int level = 0; level < indent; ++level)
        {
            stream << " ";
        }

        stream << index << "@"
               << "[" << cur->offset() << ";"
               << cur->offset() + cur->length() << ") "
               << static_cast<string>(*cur) << endl;
    }
    stream << "-----" << endl;

    return stream;
}
|