1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
|
"""
PC-BASIC - docs.usage
Usage textfile builder
(c) 2013--2023 Rob Hagemans
This file is released under the GNU GPL version 3 or later.
"""
import os
import re
import textwrap
from io import StringIO
from lxml import etree
# file locations
# 'source' directory located beside this module (symlinks resolved)
SOURCE_PATH = os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    'source',
)
# HTML file that make_usage() reads and converts to plain text
INPUT_HTML = os.path.join(SOURCE_PATH, 'options.html')
def make_usage(output_path, output_name):
    """Build USAGE.txt file.

    Reads INPUT_HTML, converts it to plain text with _html_to_text() and
    writes the result, UTF-8 encoded, to output_name inside output_path.
    """
    target = os.path.join(output_path, output_name)
    # both files are opened (input first) before anything is read or written
    with open(INPUT_HTML, mode='r', encoding='utf-8') as html_file, \
            open(target, 'w', encoding='utf-8') as textfile:
        html = html_file.read()
        textfile.write(_html_to_text(html))
class TextBlock:
    """Block of text with minimal formatting."""

    def __init__(self, indent=0, content='', break_after=0):
        """Create block of text.

        indent: number of tab characters put in front of every output line.
        content: raw text of the block; whitespace is normalised on output.
        break_after: number of newlines appended after the block.
        """
        self.indent = indent
        self.content = content
        self.break_after = break_after

    def __str__(self):
        """Convert to str: wrapped, tab-indented text plus trailing breaks."""
        # newlines become spaces, then runs of spaces collapse to a single one
        flattened = self.content.replace('\n', ' ')
        normalised = re.sub(' +', ' ', flattened).strip()
        prefix = '\t' * self.indent
        wrapped = textwrap.wrap(normalised, replace_whitespace=False)
        # join() only puts the prefix between lines, so the first line gets
        # it explicitly; a block without text still yields the bare prefix
        body = prefix + ('\n' + prefix).join(wrapped)
        return body + '\n' * self.break_after
# html tags to plaintext formatting
# (tag names are upper case; _parse_element upper-cases tags before lookup)
# tags whose text goes into a new block indented one level deeper
INDENT_TAGS = (u'DD',)
# tags whose text goes into a new block at the current indent level
BLOCK_TAGS = (u'P', u'H1', u'H2', u'H3', u'DT')
# tags whose block is created with a trailing line break
BREAK_AFTER_TAGS = (u'DD', u'P', u'H1', u'H2', u'H3')
# tags whose own leading text is converted to upper case
UPPER_TAGS = (u'H1', u'H2', u'H3')
def _parse_element(element, blocklist=None):
    """Recursively parse an element of the document tree.

    element: tree node providing tag, text, tail, get() and iterchildren().
    blocklist: list of TextBlock objects collected so far; if missing or
        empty, a new list holding one empty TextBlock is started.

    Returns the block list with the text of the element, of its child
    elements and of its tail added.
    """
    if not blocklist:
        blocklist = [TextBlock()]
    # indent level in effect when this element is entered
    base_indent = blocklist[-1].indent
    tag = element.tag.upper()
    text = element.text or ''
    tail = element.tail or ''
    if tag in UPPER_TAGS:
        # only the element's own leading text is upper-cased here
        text = text.upper()
    is_block_class = element.get('class') == 'block'
    break_after = tag in BREAK_AFTER_TAGS or is_block_class
    # block-level elements start a new block; indent tags start a deeper one
    if tag in BLOCK_TAGS or is_block_class:
        blocklist.append(TextBlock(base_indent, '', break_after))
    elif tag in INDENT_TAGS:
        blocklist.append(TextBlock(base_indent + 1, '', break_after))
    blocklist[-1].content += text
    for child in element.iterchildren(tag=etree.Element):
        blocklist = _parse_element(child, blocklist)
    has_own_block_tag = tag in INDENT_TAGS + BLOCK_TAGS
    if not has_own_block_tag and blocklist[-1].indent == base_indent:
        # inline element: its tail continues the current block
        blocklist[-1].content += tail
        return blocklist
    # the tail goes into a new block at the entry indent, carrying over the
    # break setting of the block that was last in the list
    trailing_break = blocklist[-1].break_after
    blocklist.append(TextBlock(base_indent, tail, trailing_break))
    return blocklist
def _html_to_text(html):
    """Extract plain text from HTML.

    html: the HTML document as a str.

    Returns the rendered text blocks joined by newlines; blocks that
    render to whitespace only are left out.
    """
    parser = etree.HTMLParser(encoding='utf-8')
    doc = etree.parse(StringIO(html), parser)
    blocklist = _parse_element(doc.getroot())
    # the first entry is the initial block the parser starts from; skip it
    rendered = (str(block) for block in blocklist[1:])
    return u'\n'.join(text for text in rendered if text.strip())
|