1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
|
# coding: utf-8
"""
Deals with generating the per-page table of contents.
For the sake of simplicity we use an existing markdown extension to generate
an HTML table of contents, and then parse that into the underlying data.
"""
from __future__ import unicode_literals
try: # pragma: no cover
from html.parser import HTMLParser # noqa
except ImportError: # pragma: no cover
from HTMLParser import HTMLParser # noqa
def get_toc(toc_html):
    """
    Build the table of contents for a page from its generated TOC HTML.

    The HTML string is parsed into a list of top-level AnchorLink entries,
    which is then wrapped in a TableOfContents instance and returned.
    """
    return TableOfContents(_parse_html_table_of_contents(toc_html))
class TableOfContents(object):
    """
    The table of contents of a single page.

    A thin container around the list of top-level entries; it supports
    iteration, `len()` and conversion to text.
    """

    def __init__(self, items):
        # The top-level entries, stored exactly as they were given.
        self.items = items

    def __len__(self):
        return len(self.items)

    def __iter__(self):
        return iter(self.items)

    def __str__(self):
        # Concatenate the text form of every top-level entry, in order.
        return ''.join(map(str, self))
class AnchorLink(object):
    """
    A single entry in the table of contents.

    Attributes:
        title: the text of the entry.
        url: the link target of the entry.
        level: the nesting level supplied by the caller.
        children: list of nested AnchorLink entries (initially empty).
        active: boolean flag, False until something sets it to True.
    """

    def __init__(self, title, url, level):
        self.title, self.url, self.level = title, url, level
        self.children = []
        # Always define `active`, so that every entry can be queried for it
        # and not only the one that gets flagged after construction.
        self.active = False

    def __str__(self):
        return self.indent_print()

    def indent_print(self, depth=0):
        """
        Return this entry and all its descendants as indented text.

        Each entry takes one line of the form "<title> - <url>", prefixed
        with one indent unit per nesting `depth`; children are rendered
        one level deeper than their parent.
        """
        indent = ' ' * depth
        parts = ['%s%s - %s\n' % (indent, self.title, self.url)]
        parts.extend(child.indent_print(depth + 1) for child in self.children)
        return ''.join(parts)
class _TOCParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.links = []
self.in_anchor = False
self.attrs = None
self.title = ''
# Prior to Python3.4 no convert_charrefs keyword existed.
# However, in Python3.5 the default was changed to True.
# We need the False behavior in all versions but can only
# set it if it exists.
if hasattr(self, 'convert_charrefs'): # pragma: no cover
self.convert_charrefs = False
def handle_starttag(self, tag, attrs):
if not self.in_anchor:
if tag == 'a':
self.in_anchor = True
self.attrs = dict(attrs)
def handle_endtag(self, tag):
if tag == 'a':
self.in_anchor = False
def handle_data(self, data):
if self.in_anchor:
self.title += data
def handle_charref(self, ref):
self.handle_entityref("#" + ref)
def handle_entityref(self, ref):
self.handle_data("&%s;" % ref)
def _parse_html_table_of_contents(html):
    """
    Turn a table of contents HTML string, as generated automatically by the
    markdown library, into a tree of AnchorLink instances.

    The HTML is processed one line at a time. A line containing an anchor
    with an `href` becomes an entry, a line ending in `<ul>` opens a nested
    level below that entry, and a line starting with `</ul>` closes the
    current level. The first top-level entry is marked as active.

    Returns the list of top-level AnchorLink instances.
    """
    top_level = []
    ancestors = []
    depth = 0

    # The first two and the last two lines are skipped; presumably they are
    # the wrapper markup around the actual list items.
    for markup in html.splitlines()[2:-2]:
        line_parser = _TOCParser()
        line_parser.feed(markup)

        if not line_parser.title:
            # No anchor text on this line: it can only close a nested list.
            if markup.startswith('</ul>'):
                depth -= 1
                if ancestors:
                    ancestors.pop()
            continue

        # An anchor without a link target is not an entry.
        if 'href' not in line_parser.attrs:
            continue

        entry = AnchorLink(line_parser.title, line_parser.attrs['href'], depth)

        # Attach the entry to the current parent if there is one, otherwise
        # it is a topmost entry and belongs in the returned list.
        siblings = ancestors[-1].children if ancestors else top_level
        siblings.append(entry)

        # An entry with children becomes the current parent.
        if markup.endswith('<ul>'):
            depth += 1
            ancestors.append(entry)

    # For the table of contents, always mark the first element as active
    if top_level:
        top_level[0].active = True

    return top_level
|