File: whitespace.py

package info (click to toggle)
python-html5rdf 1.2.1-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,448 kB
  • sloc: python: 12,794; makefile: 3
file content (37 lines) | stat: -rw-r--r-- 1,147 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

import re

from . import base
from ..constants import rcdataElements, spaceCharacters
spaceCharacters = "".join(spaceCharacters)

SPACES_REGEX = re.compile("[%s]+" % spaceCharacters)


class Filter(base.Filter):
    """Collapses whitespace except in pre, textarea, and script elements"""
    spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))

    def __iter__(self):
        preserve = 0
        for token in base.Filter.__iter__(self):
            type = token["type"]
            if type == "StartTag" \
                    and (preserve or token["name"] in self.spacePreserveElements):
                preserve += 1

            elif type == "EndTag" and preserve:
                preserve -= 1

            elif not preserve and type == "SpaceCharacters" and token["data"]:
                # Test on token["data"] above to not introduce spaces where there were not
                token["data"] = " "

            elif not preserve and type == "Characters":
                token["data"] = collapse_spaces(token["data"])

            yield token


def collapse_spaces(text):
    return SPACES_REGEX.sub(' ', text)