1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194
|
# -*- coding: utf-8 -*-
#***********************************************************************
# This file is part of OpenMolcas. *
# *
# OpenMolcas is free software; you can redistribute it and/or modify *
# it under the terms of the GNU Lesser General Public License, v. 2.1. *
# OpenMolcas is distributed in the hope that it will be useful, but it *
# is provided "as is" and without any express or implied warranties. *
# For more details see the full text of the license in the file *
# LICENSE or in <http://www.gnu.org/licenses/>. *
# *
# Copyright (C) 2020, Ignacio Fdez. Galván *
#***********************************************************************
import re
import textwrap
from xml.etree import ElementTree as ET
# Return the number of leading spaces and the "item label" format, if any
# The "item label" is the first word if it is an asterisk or a single
# letter or number followed by . or )
#
def _indentlabel(line):
match = re.match(r'(\s*)(\S*)?', line)
indent = len(match.group(1))
label = match.group(2)
match = re.match(r'[a-zA-Z0-9][.)]$', label)
if label == '*':
pass
elif match:
if re.match(r'[a-z]', label):
label = 'a' + label[1]
elif re.match(r'[A-Z]', label):
label = 'A' + label[1]
elif re.match(r'[0-9]', label):
label = '1' + label[1]
else:
match = re.match(r'(\s*)(\S+\s+-+)\s', line)
if match:
label = len(match.group(2))*'_'
else:
label = ''
return indent, label
# Return the number of spaces that should be used for "hanging indent",
# i.e., the number of leading spaces (plus the length of the item label plus one)
#
def _hangindent(label):
if label[2]:
return label[1] + len(label[2]) + 1
else:
return label[1]
# Finish a paragraph-level block, by assigning the current buffer text
#
def _finish_paragraph(element, paragraph):
if element is not None:
element.text = '\n'.join(paragraph).rstrip()
elif paragraph:
raise Exception
return None, []
# Append an element updating the parent map
#
def _append_element(element, parent, parent_map):
parent.append(element)
parent_map[element] = parent
# Parse a help text, returning an HTML-like tree
#
def parse_help_text(text):
    """Parse a plain-text help string, returning an HTML-like element tree.

    The text is split on newlines and processed line by line.  Blank lines
    and changes of indentation delimit paragraph-level blocks, which become
    ``<p>`` or ``<pre>`` elements; lines starting with an item label (see
    ``_indentlabel``) open ``<ul>`` lists, whose ``label`` attribute holds the
    normalized label, with one ``<li>`` per item.

    Returns the ``<root>`` element of the tree.  The text of every ``<p>``
    and ``<pre>`` element is the original lines of the block joined with
    newlines (item labels and indentation included).
    """
    # Element currently being filled (None when no block is open) and its lines
    el = None
    par = []
    # Stack of open levels as (element, indent, label) tuples.  The first entry
    # is a sentinel: its indent of -1 is lower than that of any line, so the
    # search loop below always stops before removing it
    level = [(None, -1, '')]
    # Set when the previous block was closed and the next non-blank line
    # has to start a new one
    newpar = False
    doc = ET.Element('root')
    # Child -> parent mapping, needed because elements do not know their parent
    parent_map = {doc: None}
    for line in text.split('\n'):
        ind, lab = _indentlabel(line)
        blank = not line.strip()
        # A pre-formatted block continues until a line with less indent is found
        # (blank lines are kept inside the block)
        if el is not None and el.tag == 'pre' and (ind >= level[-1][1] or blank):
            par.append(line)
            continue
        # A paragraph-block is finished by a blank line
        # or a line that does not match the current hanging indent level
        if blank or ind != _hangindent(level[-1]):
            el, par = _finish_paragraph(el, par)
            newpar = True
        # If a new paragraph starts, find out which type and level
        if not blank and newpar:
            newpar = False
            # Start from innermost level and go outwards, until a match is found
            # (each iteration either breaks or discards the innermost level)
            for i in range(len(level)):
                parent = level[-1][0]
                if parent is None:
                    parent = doc
                # Normal paragraph at the current level
                if not lab and ind == _hangindent(level[-1]):
                    break
                # List item at the current level
                elif (ind, lab) == level[-1][1:3]:
                    # Create a new <li> element in the parent list, and update the current parent
                    # (here "parent" is the previous <li>, so its parent is the list itself)
                    li = ET.Element('li')
                    _append_element(li, parent_map[parent], parent_map)
                    parent = li
                    level[-1] = (parent,) + level[-1][1:]
                    break
                # New list, or more indent
                elif (lab and ind == level[-1][1]) or ind > level[-1][1]:
                    # New list: create new <ul> and <li> elements
                    if lab:
                        ul = ET.Element('ul')
                        ul.attrib['label'] = lab
                        _append_element(ul, parent, parent_map)
                        parent = ul
                        li = ET.Element('li')
                        _append_element(li, parent, parent_map)
                        parent = li
                    # Indented paragraph: child of the last added element
                    elif len(parent) > 0:
                        parent = parent[-1]
                    # In both cases a new innermost level is opened
                    level.append((parent, ind, lab))
                    break
                # No match: this level is closed, try the enclosing one
                del level[-1]
            # A child of <p> is actually a sibling <pre>
            if parent.tag == 'p':
                parent = parent_map[parent]
                el = ET.Element('pre')
            # Everything else is a <p> (possibly inside <li>)
            else:
                el = ET.Element('p')
            _append_element(el, parent, parent_map)
        # Whatever the case, add the current line to the buffer
        if not blank:
            par.append(line)
    # Finish the last element
    el, par = _finish_paragraph(el, par)
    return doc
# Format an HTML element as plain text
#
def _format_element(element, parent_map, textwidth, indent):
out = ''
# Increase indent in lists by multiples of 2
if element.tag in ['ol', 'ul']:
indent += 2
# Increase indent of preformatted texts by 4
elif element.tag == 'pre':
indent += 4
# If there are child elements, call recursively
if len(element) > 0:
for el in element:
out += _format_element(el, parent_map, textwidth, indent)
# If there are no child elements, format the inner text
else:
# Pre-formatted text: Verbatim lines (replacing common indentation)
if element.tag == 'pre':
for line in textwrap.dedent(element.text).split('\n'):
if line:
out += indent*' ' + line + '\n'
else:
out += '\n'
# Other elements, use textwrap
elif element.text:
tw = textwrap.TextWrapper(break_long_words=False, break_on_hyphens=False, width=textwidth)
tw.initial_indent = indent*' '
tw.subsequent_indent = tw.initial_indent
# List items: Hanging indent in first child, extra indent in the rest
parent = parent_map[element]
text = element.text.lstrip()
lab = ''
if parent.tag == 'li':
hang = len(parent_map[parent].attrib['label'])
tw.subsequent_indent += (hang+1)*' '
if element is parent[0]:
# Split the label, so the spaces here are not altered
lab = text[0:hang] + ' '
text = text[hang:]
else:
tw.initial_indent += hang*' '
text = lab + ' '.join(text.split())
out += tw.fill(text) + '\n'
out += '\n'
return out
# Return a plain-text-formatted text from an HTML-like tree
#
def format_help_text(doc, textwidth=80):
    """Return the plain-text rendering of an HTML-like tree.

    ``doc`` is the root element of the tree and ``textwidth`` the maximum
    line width for wrapped paragraphs.  Leading and trailing whitespace is
    removed from the result.
    """
    # Elements do not know their parent, so build a child -> parent lookup
    child_to_parent = {}
    for node in doc.iter():
        for child in node:
            child_to_parent[child] = node
    rendered = _format_element(doc, child_to_parent, textwidth, 0)
    return rendered.strip()
|