-----------------------------------------------------------------------------------------
-- Lua-only HtmlParser by Christian Liesch, based on Alexander Makeev's XmlParser
-- Dec. 2011
-----------------------------------------------------------------------------------------
HtmlParser = {}

-- Set implementation
-- @param list IN list of Set elements
-- @return a set of keys with value true
function Set(list)
  local set = {}
  for _, element in ipairs(list) do
    set[element] = true
  end
  return set
end

-- Named character references understood by FromHtmlString.
local HTML_NAMED_ENTITIES = {
  quot = "\"",
  apos = "'",
  gt = ">",
  lt = "<",
  amp = "&",
}

-- Elements that never have content or a closing tag (HTML "void elements").
local HTML_VOID_ELEMENTS = Set{
  "area", "base", "br", "col", "embed", "hr", "img", "input",
  "link", "meta", "param", "source", "track", "wbr",
}

-- Do a subset of html conversions
-- Decodes &quot; &apos; &gt; &lt; &amp; as well as decimal (&#65;) and
-- hexadecimal (&#x41;) character references. Every reference is decoded
-- exactly once in a single pass, so "&amp;lt;" yields "&lt;" and not "<".
-- Code points below 256 become a single byte. Larger code points are
-- emitted as UTF-8 when the utf8 library is available and are otherwise left
-- untouched, as are unknown or malformed references.
-- @param value IN HTML string
-- @return resolved HTML string
function HtmlParser:FromHtmlString(value)
  local decoded = string.gsub(value, "&(#?%w+);", function(entity)
    if string.sub(entity, 1, 1) ~= "#" then
      -- A nil result makes gsub keep the original text of the reference.
      return HTML_NAMED_ENTITIES[entity]
    end
    local code
    local digits = string.match(entity, "^#[xX](%x+)$")
    if digits then
      code = tonumber(digits, 16)
    else
      digits = string.match(entity, "^#(%d+)$")
      code = digits and tonumber(digits, 10)
    end
    -- Negative values can appear when an absurdly long digit run overflows.
    if not code or code < 0 then
      return nil
    end
    if code < 256 then
      return string.char(code)
    end
    if utf8 and code <= 0x10FFFF then
      return utf8.char(code)
    end
    return nil
  end)
  return decoded
end

-- Parse args in a tag
-- Accepts name="value", name='value' and unquoted name=value, with optional
-- whitespace around "=". Attribute names may contain letters, digits, "_",
-- ":" and "-". Attributes without a value are ignored. When a name occurs
-- more than once the last occurrence wins.
-- @param args IN args string
-- @return table of arguments (attribute name -> decoded value)
function HtmlParser:ParseArgs(args)
  local attributes = {}
  local pos = 1
  while true do
    local _, nameEnd, name = string.find(args, "([%w_:%-]+)%s*=%s*", pos)
    if not nameEnd then
      break
    end
    local valueStart = nameEnd + 1
    local quote = string.sub(args, valueStart, valueStart)
    local rawValue, valueEnd
    if quote == "\"" or quote == "'" then
      -- Plain search: the quote character must not be treated as a pattern.
      local closing = string.find(args, quote, valueStart + 1, true)
      if not closing then
        break -- unterminated quote, nothing reliable follows
      end
      rawValue = string.sub(args, valueStart + 1, closing - 1)
      valueEnd = closing
    else
      local _, unquotedEnd = string.find(args, "^[^%s>]+", valueStart)
      if unquotedEnd then
        rawValue = string.sub(args, valueStart, unquotedEnd)
        valueEnd = unquotedEnd
      else
        valueEnd = nameEnd -- "name=" with no value at all
      end
    end
    if rawValue then
      attributes[name] = self:FromHtmlString(rawValue)
    end
    pos = valueEnd + 1
  end
  return attributes
end

-- Append decoded text to a node's Value unless the text is only whitespace.
local function appendHtmlText(parser, node, text)
  if not string.find(text, "^%s*$") then
    node.Value = (node.Value or "") .. parser:FromHtmlString(text)
  end
end

-- HTML parser itself
-- Every node is a table { Name, Value, Attributes, ChildNodes }. Text found
-- directly inside an element is concatenated into that element's Value.
-- Tag names are matched case-insensitively but stored as written.
-- Recovery rules for sloppy markup:
--   * void elements (br, img, meta, ...) and <x/> never take children, and a
--     closing tag for a void element is ignored;
--   * when a closing tag skips over elements that were never closed, those
--     elements become children of the element being closed, in document order;
--   * a closing tag with no matching open element is ignored;
--   * elements still open at the end of the input are closed implicitly.
-- @param htmlText IN the html text to parse
-- @return a tree of html nodes (the first top-level element), or nil if none
function HtmlParser:ParseHtmlText(htmlText)
  local root = { Name = nil, Value = nil, Attributes = {}, ChildNodes = {} }
  local stack = { root }
  local top = root
  local pos = 1
  while true do
    local tagStart, tagEnd, close, label, xarg, empty =
      string.find(htmlText, "<(%/?)([%w:]+)(.-)(%/?)>", pos)
    if not tagStart then
      break
    end
    appendHtmlText(self, top, string.sub(htmlText, pos, tagStart - 1))

    local key = string.lower(label)
    if close == "/" then
      if not HTML_VOID_ELEMENTS[key] then
        -- Look for the nearest open element with this name; index 1 is the
        -- nameless root sentinel and can never be closed.
        local match
        for k = #stack, 2, -1 do
          if string.lower(stack[k].Name) == key then
            match = k
            break
          end
        end
        if match then
          local element = stack[match]
          -- Everything above the match was left unclosed: adopt it.
          for k = match + 1, #stack do
            table.insert(element.ChildNodes, stack[k])
          end
          for k = #stack, match, -1 do
            stack[k] = nil
          end
          top = stack[#stack]
          table.insert(top.ChildNodes, element)
        end
      end
    elseif HTML_VOID_ELEMENTS[key] or empty == "/" then
      table.insert(top.ChildNodes, {
        Name = label,
        Value = nil,
        Attributes = self:ParseArgs(xarg),
        ChildNodes = {},
      })
    else
      top = {
        Name = label,
        Value = nil,
        Attributes = self:ParseArgs(xarg),
        ChildNodes = {},
      }
      table.insert(stack, top)
    end
    pos = tagEnd + 1
  end
  appendHtmlText(self, top, string.sub(htmlText, pos))

  -- Implicitly close whatever is still open so it is not lost.
  for k = #stack, 2, -1 do
    table.insert(stack[k - 1].ChildNodes, stack[k])
  end
  return root.ChildNodes[1]
end
------------------------------------------------------------------------------------------
-- Logging hook used by dump(): forwards a single value to print.
-- @param str IN value to write to standard output
log = function(str)
  print(str)
end
-- Recursively write a human-readable listing of a value through log().
-- Tables are expanded field by field with one extra tab per nesting level;
-- any non-table value (including nil and false) is logged via tostring.
-- A table that is reached again while it is still being dumped is reported
-- as "<cycle>" instead of recursing forever.
-- @param _class IN value to dump
-- @param no_func IN when true, fields that are neither table, number, string
--                   nor boolean (functions, userdata, ...) are omitted
-- @param depth IN nesting level, defaults to 0
-- @param seen IN internal: tables on the current recursion path
function dump(_class, no_func, depth, seen)
  if type(_class) ~= "table" then
    log(tostring(_class))
    return
  end
  if depth == nil then
    depth = 0
  end
  seen = seen or {}

  local indent = string.rep("\t", depth + 1)
  if seen[_class] then
    log(indent .. "<cycle>")
    return
  end
  seen[_class] = true

  log(indent .. "[" .. type(_class) .. "]")
  log(indent .. "{")
  local inner = indent .. "\t"
  for key, field in pairs(_class) do
    local kind = type(field)
    local label = inner .. tostring(key)
    if kind == "table" then
      log(label .. " =")
      dump(field, no_func, depth + 1, seen)
    elseif kind == "number" then
      log(label .. "=" .. field)
    elseif kind == "string" then
      log(label .. "=\"" .. field .. "\"")
    elseif kind == "boolean" then
      log(label .. "=\"" .. tostring(field) .. "\"")
    elseif not no_func then
      if kind == "function" then
        log(label .. "()")
      else
        log(label .. "<userdata=[" .. kind .. "]>")
      end
    end
  end
  log(indent .. "}")

  -- Only tables on the current path count as cycles; a table shared by two
  -- siblings is still dumped in both places.
  seen[_class] = nil
end
-- Demo: parse a small sample document and log the resulting node tree.
local sampleHtml = [[
<html>
<head>
<meta foo="bar1">
<meta foo="bar2">
<meta foo="bar3">
</head>
<body>
<table>
<tr>
bla
</tr>
</table>
<img href=foo.bar.ch/bla alt=blabla>
</body>
</html>
]]
local htmlTree = HtmlParser:ParseHtmlText(sampleHtml)
dump(htmlTree)
-- Print every dot-separated segment of a tag path, one per line.
-- This only tokenizes the path: htmlTree is not consulted and nothing is
-- returned.
-- @param htmlTree IN parsed html tree (currently unused)
-- @param path IN dot-separated path, e.g. "html.head.meta[1].foo"
function HtmlParser:GetTag(htmlTree, path)
  -- "%." is the pattern escape for a literal dot; the string escape "\." is
  -- rejected by the lexer from Lua 5.2 on.
  for tag in string.gmatch(path, "([^%.]+)%.?") do
    print(tag)
  end
end
-- Demo call: prints the segments of the path below.
-- Much better would be to address the node directly as:
-- html.head.meta[1].foo
HtmlParser:GetTag(htmlTree, "html.head.meta[1].foo")