1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
|
"""Public Suffix List module for Python.
"""
import codecs
import os.path
from pkg_resources import resource_stream, get_distribution
import warnings
try:
from urllib.request import urlopen, Request
except ImportError:
from urllib2 import urlopen, Request
# Canonical download location of the list.  HTTPS is used so the rules, which
# callers typically rely on for security decisions, cannot be altered in
# transit.
PUBLIC_SUFFIX_LIST_URL = 'https://publicsuffix.org/list/public_suffix_list.dat'
def fetch():
    """Download the latest public suffix list from publicsuffix.org.

    Sends a GET request to PUBLIC_SUFFIX_LIST_URL with a User-Agent that
    carries this distribution's version.

    Returns a file-like object (a codecs stream reader wrapped around the
    HTTP response) that yields decoded text.  The caller is responsible for
    closing it.

    Any error raised by urlopen() propagates to the caller.
    """
    ua = 'Python-publicsuffix/%s' % (get_distribution(__name__).version)
    req = Request(PUBLIC_SUFFIX_LIST_URL, headers={'User-Agent': ua})
    res = urlopen(req)

    try:
        # Message API available on Python 3 responses.
        encoding = res.headers.get_content_charset()
    except AttributeError:
        # Older header objects (the urllib2 import path) expose getparam().
        encoding = res.headers.getparam('charset')

    # Both lookups yield None when the server sends no charset parameter;
    # codecs.getreader(None) would raise TypeError.  The list is published
    # as UTF-8, so use that as the fallback.
    if not encoding:
        encoding = 'utf-8'

    return codecs.getreader(encoding)(res)
class PublicSuffixList(object):
    """In-memory lookup tree built from the rules of a public suffix list.

    Rules are stored in a tree keyed by DNS label, starting from the
    right-most label.  While the tree is being built every node is a list,
    either ``[negate]`` (no children yet) or ``[negate, children_dict]``.
    After construction the tree is frozen by ``_simplify`` into a compact
    form: a leaf becomes the bare integer ``negate`` (0 or 1) and an inner
    node becomes the tuple ``(negate, children_dict)``.  ``negate`` is 1 for
    exception rules (those written with a leading ``!``) and 0 otherwise.
    """

    def __init__(self, input_file=None):
        """Reads and parses public suffix list.

        input_file is a file object or another iterable that returns
        lines of a public suffix list file.  It is only iterated, never
        closed, by this constructor.

        If input_file is None (deprecated), the list is read from
        /usr/share/publicsuffix/effective_tld_names.dat and a
        DeprecationWarning is issued.

        The file format is described at https://publicsuffix.org/list/
        """
        if input_file is None:
            warnings.warn("Using the built-in public suffix list is deprecated. Please use input_file.",
                          DeprecationWarning, 2)
            input_path = '/usr/share/publicsuffix/effective_tld_names.dat'
            input_file = codecs.open(input_path, "r", "utf8")
            do_close = True
        else:
            do_close = False

        try:
            root = self._build_structure(input_file)
        finally:
            # Close only the file opened here, and do so even when parsing
            # fails so the handle is not leaked.
            if do_close:
                input_file.close()

        self.root = self._simplify(root)

    def _find_node(self, parent, parts):
        """Return the build-time node for a rule, creating nodes as needed.

        parent is a build-time node (a list).  parts is the list of the
        rule's labels in written order; it is consumed from the right and
        is left empty on return.
        """
        node = parent
        while parts:
            # A node without children is just [negate]; grow the child map
            # lazily the first time a descendant is required.
            if len(node) == 1:
                node.append({})
            assert len(node) == 2
            children = node[1]

            label = parts.pop()
            child = children.get(label, None)
            if not child:
                children[label] = child = [0]
            node = child
        return node

    def _add_rule(self, root, rule):
        """Insert one rule (e.g. 'co.uk', '*.ck', '!www.ck') into the tree.

        A leading '!' marks an exception rule: it is stripped and the
        rule's node is flagged with negate = 1.
        """
        if rule.startswith('!'):
            negate = 1
            rule = rule[1:]
        else:
            negate = 0

        parts = rule.split('.')
        self._find_node(root, parts)[0] = negate

    def _simplify(self, node):
        """Convert a build-time list node into its compact, read-only form.

        Leaves collapse to the integer negate flag; inner nodes become
        (negate, {label: simplified_child}) tuples.
        """
        if len(node) == 1:
            return node[0]

        negate, children = node[0], node[1]
        return (negate, dict((label, self._simplify(child))
                             for (label, child) in children.items()))

    def _build_structure(self, fp):
        """Parse the lines of fp and return the build-time root node.

        Blank lines and lines starting with '//' are skipped.  For every
        other line only the text up to the first whitespace is used as the
        rule, and leading dots are removed from it.
        """
        root = [0]

        for line in fp:
            line = line.strip()
            if line.startswith('//') or not line:
                continue

            self._add_rule(root, line.split()[0].lstrip('.'))

        return root

    def _lookup_node(self, matches, depth, parent, parts):
        """Walk the simplified tree along parts, recording flags in matches.

        matches[-depth] receives the negate flag of the node reached after
        matching depth - 1 trailing labels, i.e. the slot of the label just
        left of the matched rule.  Slots that are never reached stay None.
        """
        if parent in (0, 1):
            # Simplified leaf: the node is the negate flag itself.
            negate = parent
            children = None
        else:
            negate, children = parent

        matches[-depth] = negate

        if depth < len(parts) and children:
            # The wildcard is tried first so that an exact label match,
            # processed second, overwrites the slot the wildcard wrote.
            for name in ('*', parts[-depth]):
                child = children.get(name, None)
                if child is not None:
                    self._lookup_node(matches, depth + 1, child, parts)

    def get_public_suffix(self, domain):
        """get_public_suffix("www.example.com") -> "example.com"

        Calling this function with a DNS name will return the longest
        matching rule of the list together with one more label to its
        left (the example above assumes "com" is a rule).  A name that
        has no label beyond its matching rule is returned whole, a name
        matching no rule yields its last label, and a name covered by an
        exception ('!') rule yields the exception rule's own name.

        The name is lower-cased and leading/trailing dots are removed
        before the lookup.

        Note that for internationalized domains the list at
        https://publicsuffix.org uses decoded names, so it is
        up to the caller to decode any Punycode-encoded names.
        """
        parts = domain.lower().strip('.').split('.')
        hits = [None] * len(parts)

        self._lookup_node(hits, 1, self.root, parts)

        # The left-most slot flagged 0 gives the longest answer.  The last
        # slot is always 0 because the root node is never negated, so the
        # loop always returns.
        for i, what in enumerate(hits):
            if what == 0:
                return '.'.join(parts[i:])
|