1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
|
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import codecs
import encodings.idna
import re
import sys
from make_dafsa import words_to_bin, words_to_cxx
"""
Processes a file containing effective TLD data. See the following URL for a
description of effective TLDs and of the file format that this script
processes (although for the latter you're better off just reading this file's
short source code).
http://wiki.mozilla.org/Gecko:Effective_TLD_Service
"""
def getEffectiveTLDs(path):
    """
    Generate an EffectiveTLDEntry for every rule in the file at path.

    The file is read as UTF-8. Lines that are empty (after trailing
    whitespace is removed) or that start with "//" are skipped. For every
    other line, only the text before the first space, tab or newline is used
    to build the entry.

    Raises AssertionError if two rules normalize to the same domain.
    """
    # The context manager closes the file once the generator is exhausted
    # or closed by its consumer.
    with codecs.open(path, "r", "UTF-8") as tld_file:
        domains = set()
        for line in tld_file:
            line = line.rstrip()
            # comment, empty, or superfluous line for explicitness purposes
            if not line or line.startswith("//"):
                continue
            # Anything after the first whitespace character is ignored.
            rule = re.split(r"[ \t\n]", line, maxsplit=1)[0]
            entry = EffectiveTLDEntry(rule)
            domain = entry.domain()
            assert domain not in domains, "repeating domain %s makes no sense" % domain
            domains.add(domain)
            yield entry
def _normalizeHostname(domain):
    """
    Return domain with each dot-separated label normalized.

    Labels made up solely of ASCII characters are lowercased. Any other
    label is run through the IDNA ToASCII algorithm and the resulting bytes
    are decoded back to text.
    """
    normalized = []
    for part in domain.split("."):
        if not _isASCII(part):
            part = encodings.idna.ToASCII(part).decode("utf-8")
        else:
            part = part.lower()
        normalized.append(part)
    return ".".join(normalized)
def _isASCII(s):
"True if s consists entirely of ASCII characters, false otherwise."
for c in s:
if ord(c) > 127:
return False
return True
class EffectiveTLDEntry:
    """
    One rule parsed from an effective-TLD name file.
    """

    # Class-level defaults; __init__ overrides them on the instance only
    # when the rule carries the corresponding prefix.
    _exception = False
    _wild = False

    def __init__(self, line):
        """
        Builds an entry from one line of data. The line must already have
        had its line ending removed.

        A leading "!" marks the entry as an exception and a leading "*."
        marks it as a wildcard. The prefix, if any, is dropped and the
        remainder is normalized and stored as the domain.
        """
        prefix_length = 0
        if line.startswith("!"):
            self._exception = True
            prefix_length = 1
        elif line.startswith("*."):
            self._wild = True
            prefix_length = 2
        self._domain = _normalizeHostname(line[prefix_length:])

    def domain(self):
        "The normalized domain of this entry, without any rule prefix."
        return self._domain

    def exception(self):
        "True if this entry's domain does not denote an effective TLD."
        return self._exception

    def wild(self):
        "True if this entry stands for a whole class of effective TLDs."
        return self._wild
#################
# DO EVERYTHING #
#################
def main(output, effective_tld_filename, output_format="cxx"):
    """
    Builds a DAFSA from an effective TLD file and writes it to output.

    output is the object whose write() method receives the result.
    effective_tld_filename is the effective TLD file to parse.
    output_format selects the serialization: "bin" writes the result of
    words_to_bin(), any other value writes the result of words_to_cxx().
    """

    def typeEnum(etld):
        """
        Translates an entry's flags into the DAFSA's enum value: 1 for an
        exception, 2 for a wildcard and 0 for anything else.
        """
        if etld.exception():
            return 1
        return 2 if etld.wild() else 0

    def dafsa_words():
        """
        Yields one "<domain_name><enum_value>" string per entry, which is
        the form make_dafsa expects.
        """
        for entry in getEffectiveTLDs(effective_tld_filename):
            kind = typeEnum(entry)
            yield "%s%d" % (entry.domain(), kind)

    # words_to_bin() returns bytes while words_to_cxx() returns a string.
    serialize = words_to_bin if output_format == "bin" else words_to_cxx
    output.write(serialize(dafsa_words()))
if __name__ == "__main__":
    # This program can output the DAFSA in two formats:
    # as C++ code that will be included and compiled at build time
    # or as a binary file that will be published in Remote Settings.
    #
    # Flags for format options:
    #   (none)  -> C++ array [default]
    #   "--bin" -> Binary file
    #
    # Usage: prog [--bin] <effective_tld_file>
    output_format = "bin" if "--bin" in sys.argv else "cxx"

    # Drop the flag from the positional arguments so it is never mistaken
    # for the input path, wherever it appears on the command line.
    positional = [arg for arg in sys.argv[1:] if arg != "--bin"]
    if not positional:
        sys.exit("usage: %s [--bin] <effective_tld_file>" % sys.argv[0])

    # The binary format is produced as bytes, which the text-mode
    # sys.stdout refuses to write; use its underlying binary buffer.
    if output_format == "bin":
        output = getattr(sys.stdout, "buffer", sys.stdout)
    else:
        output = sys.stdout

    main(output, positional[0], output_format=output_format)
|