1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
|
import logging
import os
import os.path
import tempfile
import traceback
import sys
import shutil
from html.parser import HTMLParser
from dataclasses import Tag
from utilfunctions import update_attribute, change_filename
class OutputParser(HTMLParser):
"""The class is designed to be used as a base class. It will output the same html structure as the input file into a file like object (only needs write)."""
def __init__(self, file, *args, **kwargs):
HTMLParser.__init__(self, *args, **kwargs)
if hasattr(file, 'write'):
self.outputfile = file
else:
raise Exception("file is not a file like object with a write function")
self.tagstack = list()
def handle_starttag(self, tag, attrs):
logging.debug(" Handle starttag: %s", tag)
logging.debug(" %s", attrs)
self.tagstack.append(Tag(tag, attrs))
def handle_startendtag(self, tag, attrs):
logging.debug(" Handle start and end tag: %s", tag)
logging.debug(" %s", attrs)
self.write_tag(Tag(tag, attrs))
def handle_endtag(self, tagname):
"""Finds the matching tag. If the tags are miss matched, function will go down the stack until it finds the match"""
logging.debug(" End tag: %s", tagname)
tag = self.find_match(tagname)
self.write_tag(tag)
def handle_data(self, data):
logging.debug(" Data: %s", data)
if len(self.tagstack) > 0:
tag = self.tagstack.pop()
if tag.data == None:
tag.data = data
else:
tag.data += data
self.tagstack.append(tag)
else:
self.outputfile.write(data)
def find_match(self, tagtofind, indent=0):
"""Recursive function to match the tag"""
tag = self.tagstack.pop()
if not tag.name == tagtofind:
logging.warning(" %smismatched tag found (expected %s, found %s)", " "*indent, tagtofind, tag.name)
self.write_tag(tag)
tag = self.find_match(tagtofind)
return tag
def write_tag(self, tag):
"""When tag stack is empty, writes tag to file passed in the constructor. When tag stack is not empty formats the tag and sets (or if the next tag.data is not None, appends) the formated string."""
if tag.data == None:
str = "<%s%s />" %(tag.name, self.format_attributes(tag))
else:
str = "<%s%s>%s</%s>" %(
tag.name, self.format_attributes(tag), tag.data, tag.name)
if len(self.tagstack) > 0:
tag = self.tagstack.pop()
if tag.data == None:
tag.data = str
else:
tag.data += str
self.tagstack.append(tag)
else:
self.outputfile.write(str)
def format_attributes(self, tag):
"""Returns the attributes formatted to be placed in a tag."""
ret = ""
if len(tag.attributes) > 0:
for name, value in tag.attributes:
ret = "%s %s=\"%s\"" % (ret, name, value)
return ret
class Stage2Parser(OutputParser):
def __init__(self, *args, **kwargs):
OutputParser.__init__(self, *args, **kwargs)
self.control = []
def handle_starttag(self, tag, attrs):
if tag == "meta":
if attrs[0][0] == "name" and attrs[0][1] == "control" and attrs[1][0] == "content":
self.control.append(attrs[1][1])
else:
OutputParser.handle_starttag(self, tag, attrs)
def handle_startendtag(self, tag, attrs):
if tag == "meta":
if attrs[0][0] == "name" and attrs[0][1] == "control" and attrs[1][0] == "content":
self.control.append(attrs[1][1])
else:
OutputParser.handle_startendtag(self, tag, attrs)
class Stage3Parser(OutputParser):
def __init__(self, options, files, extrafiles, subdir, *args, **kwargs):
OutputParser.__init__(self, *args, **kwargs)
self.options = options
self.files = files
self.extrafiles = extrafiles
self.subdir = subdir
logging.debug(" Subdirectory is '%s'", subdir)
def handle_startendtag(self, tag, attrs):
"""Find the image and copy it to the stage3 folder where it should
be in the file output."""
ALT_INDEX = None
if tag == "img":
# figure out which attribute is src
for x in range(0, len(attrs)):
if attrs[x][0] == "src":
SRC_INDEX = x
elif attrs[x][0] == "alt":
ALT_INDEX = x
if attrs[SRC_INDEX][1].startswith("/"):
# manual wants an absolute path, the help manual does not support
# absolute path, so make sure that the image exists where the
# absolute path indicates, then make the path into a relative path
# with the approriate number of updirs
test = os.path.join(self.options.indir, attrs[SRC_INDEX][1][1:])
if not os.path.exists(test):
raise IOError("Cannot find %s in base path"%(attrs[SRC_INDEX][1]))
# try find a valid relative path
subdirdepth = len(self.subdir.split(os.path.sep))
prefix = "../"*subdirdepth
relpath = os.path.join(prefix, attrs[SRC_INDEX][1][1:])
if not os.path.exists(os.path.join(self.options.indir, self.subdir,relpath)):
raise Exception("Cannot relativize path: %s"%(attrs[SRC_INDEX][1]))
else:
attrs = update_attribute(attrs, 'src', relpath)
location1 = os.path.join(self.options.indir, self.subdir, attrs[SRC_INDEX][1])
location = os.path.normpath(location1)
# check to make sure that the image I am including was in the onlinehelp
# folder, if not change the dst name so that it will be located
# correctly in the stage3 directory
logging.debug("%s - %s", location, self.options.indir)
if location.startswith(self.options.indir):
dst1 = os.path.join(self.files['stage3'], self.subdir, attrs[SRC_INDEX][1])
dst = os.path.normpath(dst1)
else:
# get extention
basename = os.path.basename(attrs[SRC_INDEX][1])
(name, ext) = os.path.splitext(basename)
(file, outname) = tempfile.mkstemp(ext, name, self.files['stage3'])
dst1 = outname.replace(os.getcwd(), ".") # make into a relative path
# fix up attrs
dst = os.path.normpath(dst1)
attrs = update_attribute(attrs, 'src', change_filename(dst, os.path.splitext(dst)[1], self.files['stage3'], ".", makedirs=False))
if ALT_INDEX is None:
ALT_INDEX = SRC_INDEX
logging.debug(" Image (%s) should be in %s and copying to %s", attrs[ALT_INDEX][1], location, dst)
try:
if not os.path.exists(os.path.dirname(dst)):
os.mkdir(os.path.dirname(dst))
shutil.copy2(location, dst)
except:
traceback.print_exc()
logging.error(" '%s' does not exist", location )
sys.exit(3)
self.extrafiles.append(dst)
OutputParser.handle_startendtag(self, tag, attrs)
class Stage4Parser(OutputParser):
def __init__(self, files, options, subdir, *args, **kwargs):
OutputParser.__init__(self, *args, **kwargs)
self.files = files
self.options = options
self.subdir = subdir
def handle_starttag(self, tag, attrs):
if tag == "a":
for name,value in attrs:
if name == "href" and value.startswith("/"):
if value.startswith("/"):
# manual wants an absolute path, the help manual does not support
# absolute path, so make sure that the image exists where the
# absolute path indicates, then make the path into a relative path
# with the approriate number of updirs
test = os.path.join(self.options.indir, value[1:])
if not os.path.exists(test):
raise IOError("Cannot find %s in base path" % (value))
# try find a valid relative path
subdirdepth = len(self.subdir.split(os.path.sep))
prefix = "../"*subdirdepth
relpath = os.path.join(prefix, value[1:])
if not os.path.exists(os.path.join(self.options.indir, self.subdir,relpath)):
raise Exception("Cannot relativize path: %s" % (value))
else:
attrs = update_attribute(attrs, 'src', relpath)
value = relpath
if name == "href" and not value.startswith("http://"):
# make sure the file being refered to exists in stage3
check_ref = change_filename(value, ".stage3", ".", self.files['stage3'], makedirs=False)
if not os.path.exists(check_ref):
logging.debug(" File (%s) does not exist to be bundled into archive!", check_ref)
fixed_ref = change_filename(value, ".htm", ".", ".", makedirs=False)
attrs = update_attribute(attrs, "href", fixed_ref)
OutputParser.handle_starttag(self, tag, attrs)
class Stage5Parser(OutputParser):
def __init__(self, *args, **kwargs):
OutputParser.__init__(self, *args, **kwargs)
self.title = "Untitled"
def handle_endtag(self, tag):
if tag == "title":
t = self.tagstack.pop()
self.title = t.data
self.tagstack.append(t)
elif tag == "h1" and self.title == "Untitled":
# make first h1 tag found into the title, will be overwritten by a title tag if it exists.
t = self.tagstack.pop()
self.title = t.data
self.tagstack.append(t)
OutputParser.handle_endtag(self, tag)
|