1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
|
"""
Simple HTML -> Telegram entity parser.
"""
from collections import deque
from html import escape
from html.parser import HTMLParser
from typing import Iterable, Tuple, List
from ..helpers import add_surrogate, del_surrogate, within_surrogate, strip_text
from ..tl import TLObject
from ..tl.types import (
MessageEntityBold, MessageEntityItalic, MessageEntityCode,
MessageEntityPre, MessageEntityEmail, MessageEntityUrl,
MessageEntityTextUrl, MessageEntityMentionName,
MessageEntityUnderline, MessageEntityStrike, MessageEntityBlockquote,
MessageEntityCustomEmoji, TypeMessageEntity
)
class HTMLToTelegramParser(HTMLParser):
def __init__(self):
super().__init__()
self.text = ''
self.entities = []
self._building_entities = {}
self._open_tags = deque()
self._open_tags_meta = deque()
def handle_starttag(self, tag, attrs):
self._open_tags.appendleft(tag)
self._open_tags_meta.appendleft(None)
attrs = dict(attrs)
EntityType = None
args = {}
if tag == 'strong' or tag == 'b':
EntityType = MessageEntityBold
elif tag == 'em' or tag == 'i':
EntityType = MessageEntityItalic
elif tag == 'u':
EntityType = MessageEntityUnderline
elif tag == 'del' or tag == 's':
EntityType = MessageEntityStrike
elif tag == 'blockquote':
EntityType = MessageEntityBlockquote
elif tag == 'code':
try:
# If we're in the middle of a <pre> tag, this <code> tag is
# probably intended for syntax highlighting.
#
# Syntax highlighting is set with
# <code class='language-...'>codeblock</code>
# inside <pre> tags
pre = self._building_entities['pre']
try:
pre.language = attrs['class'][len('language-'):]
except KeyError:
pass
except KeyError:
EntityType = MessageEntityCode
elif tag == 'pre':
EntityType = MessageEntityPre
args['language'] = ''
elif tag == 'a':
try:
url = attrs['href']
except KeyError:
return
if url.startswith('mailto:'):
url = url[len('mailto:'):]
EntityType = MessageEntityEmail
else:
if self.get_starttag_text() == url:
EntityType = MessageEntityUrl
else:
EntityType = MessageEntityTextUrl
args['url'] = del_surrogate(url)
url = None
self._open_tags_meta.popleft()
self._open_tags_meta.appendleft(url)
elif tag == 'tg-emoji':
try:
emoji_id = int(attrs['emoji-id'])
except (KeyError, ValueError):
return
EntityType = MessageEntityCustomEmoji
args['document_id'] = emoji_id
if EntityType and tag not in self._building_entities:
self._building_entities[tag] = EntityType(
offset=len(self.text),
# The length will be determined when closing the tag.
length=0,
**args)
def handle_data(self, text):
previous_tag = self._open_tags[0] if len(self._open_tags) > 0 else ''
if previous_tag == 'a':
url = self._open_tags_meta[0]
if url:
text = url
for tag, entity in self._building_entities.items():
entity.length += len(text)
self.text += text
def handle_endtag(self, tag):
try:
self._open_tags.popleft()
self._open_tags_meta.popleft()
except IndexError:
pass
entity = self._building_entities.pop(tag, None)
if entity:
self.entities.append(entity)
def parse(html: str) -> Tuple[str, List[TypeMessageEntity]]:
"""
Parses the given HTML message and returns its stripped representation
plus a list of the MessageEntity's that were found.
:param html: the message with HTML to be parsed.
:return: a tuple consisting of (clean message, [message entities]).
"""
if not html:
return html, []
parser = HTMLToTelegramParser()
parser.feed(add_surrogate(html))
text = strip_text(parser.text, parser.entities)
parser.entities.reverse()
parser.entities.sort(key=lambda entity: entity.offset)
return del_surrogate(text), parser.entities
ENTITY_TO_FORMATTER = {
MessageEntityBold: ('<strong>', '</strong>'),
MessageEntityItalic: ('<em>', '</em>'),
MessageEntityCode: ('<code>', '</code>'),
MessageEntityUnderline: ('<u>', '</u>'),
MessageEntityStrike: ('<del>', '</del>'),
MessageEntityBlockquote: ('<blockquote>', '</blockquote>'),
MessageEntityPre: lambda e, _: (
"<pre>\n"
" <code class='language-{}'>\n"
" ".format(e.language), "{}\n"
" </code>\n"
"</pre>"
),
MessageEntityEmail: lambda _, t: ('<a href="mailto:{}">'.format(t), '</a>'),
MessageEntityUrl: lambda _, t: ('<a href="{}">'.format(t), '</a>'),
MessageEntityTextUrl: lambda e, _: ('<a href="{}">'.format(escape(e.url)), '</a>'),
MessageEntityMentionName: lambda e, _: ('<a href="tg://user?id={}">'.format(e.user_id), '</a>'),
MessageEntityCustomEmoji: lambda e, _: ('<tg-emoji emoji-id="{}">'.format(e.document_id), '</tg-emoji>'),
}
def unparse(text: str, entities: Iterable[TypeMessageEntity]) -> str:
"""
Performs the reverse operation to .parse(), effectively returning HTML
given a normal text and its MessageEntity's.
:param text: the text to be reconverted into HTML.
:param entities: the MessageEntity's applied to the text.
:return: a HTML representation of the combination of both inputs.
"""
if not text:
return text
elif not entities:
return escape(text)
if isinstance(entities, TLObject):
entities = (entities,)
text = add_surrogate(text)
insert_at = []
for i, entity in enumerate(entities):
s = entity.offset
e = entity.offset + entity.length
delimiter = ENTITY_TO_FORMATTER.get(type(entity), None)
if delimiter:
if callable(delimiter):
delimiter = delimiter(entity, text[s:e])
insert_at.append((s, i, delimiter[0]))
insert_at.append((e, -i, delimiter[1]))
insert_at.sort(key=lambda t: (t[0], t[1]))
next_escape_bound = len(text)
while insert_at:
# Same logic as markdown.py
at, _, what = insert_at.pop()
while within_surrogate(text, at):
at += 1
text = text[:at] + what + escape(text[at:next_escape_bound]) + text[next_escape_bound:]
next_escape_bound = at
text = escape(text[:next_escape_bound]) + text[next_escape_bound:]
return del_surrogate(text)
|