File: publicsuffix.py

package info (click to toggle)
python-publicsuffix 1.0.5-1
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 232 kB
  • ctags: 43
  • sloc: python: 264; makefile: 5
file content (106 lines) | stat: -rw-r--r-- 2,570 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""Public Suffix List module for Python.
"""

import codecs
import os.path

class PublicSuffixList(object):
	def __init__(self, input_file=None):
		"""Reads and parses public suffix list.
		
		input_file is a file object or another iterable that returns
		lines of a public suffix list file. If input_file is None, an
		UTF-8 encoded file named "publicsuffix.txt" in the same
		directory as this Python module is used.
		
		The file format is described at http://publicsuffix.org/list/
		"""

		if input_file is None:
			input_path = '/usr/share/publicsuffix/effective_tld_names.dat'
			input_file = codecs.open(input_path, "r", "utf8")

		root = self._build_structure(input_file)
		self.root = self._simplify(root)

	def _find_node(self, parent, parts):
		if not parts:
			return parent

		if len(parent) == 1:
			parent.append({})

		assert len(parent) == 2
		negate, children = parent

		child = parts.pop()

		child_node = children.get(child, None)

		if not child_node:
			children[child] = child_node = [0]

		return self._find_node(child_node, parts)

	def _add_rule(self, root, rule):
		if rule.startswith('!'):
			negate = 1
			rule = rule[1:]
		else:
			negate = 0

		parts = rule.split('.')
		self._find_node(root, parts)[0] = negate

	def _simplify(self, node):
		if len(node) == 1:
			return node[0]

		return (node[0], dict((k, self._simplify(v)) for (k, v) in node[1].items()))

	def _build_structure(self, fp):
		root = [0]

		for line in fp:
			line = line.strip()
			if line.startswith('//') or not line:
				continue

			self._add_rule(root, line.split()[0].lstrip('.'))

		return root

	def _lookup_node(self, matches, depth, parent, parts):
		if parent in (0, 1):
			negate = parent
			children = None
		else:
			negate, children = parent

		matches[-depth] = negate

		if depth < len(parts) and children:
			for name in ('*', parts[-depth]):
				child = children.get(name, None)
				if child is not None:
					self._lookup_node(matches, depth+1, child, parts)

	def get_public_suffix(self, domain):
		"""get_public_suffix("www.example.com") -> "example.com"

		Calling this function with a DNS name will return the
		public suffix for that name.

		Note that for internationalized domains the list at
		http://publicsuffix.org uses decoded names, so it is
		up to the caller to decode any Punycode-encoded names.
		"""

		parts = domain.lower().strip('.').split('.')
		hits = [None] * len(parts)

		self._lookup_node(hits, 1, self.root, parts)

		for i, what in enumerate(hits):
			if what is not None and what == 0:
				return '.'.join(parts[i:])