File: matcher.py

package info (click to toggle)
wig 0.6-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 7,028 kB
  • sloc: python: 1,520; sh: 37; makefile: 5
file content (142 lines) | stat: -rw-r--r-- 3,469 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import re

class Match(object):
	"""Match fingerprints against fetched responses.

	A fingerprint is a dict. The keys read by this class are:

	* ``type``   -- ``'md5'``, ``'string'`` or ``'regex'`` (required, otherwise
	  the fingerprint never matches)
	* ``match``  -- the md5 sum, substring or regular expression to look for
	* ``url``    -- an empty string is replaced by the URL of the response
	  that matched
	* ``code``   -- optional status code the fingerprint applies to; defaults
	  to 200, ``'any'`` disables the status check
	* ``header`` -- optional header name; when present the string/regex is
	  applied to that header's value instead of the body
	* ``output`` -- optional ``%`` format string filled from the first regex
	  match

	A response is any object providing ``status['code']``, ``md5_404``,
	``md5``, ``body``, ``headers`` (looked up by lower-case header name) and
	``get_url()``.
	"""

	class _HeaderValue(object):
		"""Minimal response stand-in that only carries a ``body`` attribute.

		It lets ``string`` and ``regex`` be reused on a header value.
		"""

		def __init__(self, body=''):
			self.body = body

	def __init__(self):
		# values compared against ``response.md5_404``; a response whose
		# md5_404 is in this set is treated as a 404 page
		self.error_pages = set()

	def _check_page(self, response, fingerprint):
		"""Return True if the fingerprint's status code fits the response.

		A response counts as a 404 when its status code is 404 or when its
		``md5_404`` is one of the known error pages.
		"""
		is_404 = response.status['code'] == 404 or response.md5_404 in self.error_pages

		# fingerprints that do not have a 'code' set, default to 200
		fp_code = fingerprint.get('code', 200)

		if fp_code == 'any':
			return True

		# a 404 fingerprint only matches a 404 page, and any other
		# fingerprint only matches a page that is not a 404
		return is_404 == (fp_code == 404)

	def _match_fingerprint(self, fingerprint, response, is_image):
		"""Dispatch one fingerprint to the matching method for its type.

		Returns the matched fingerprint dict, or None when it does not match
		or its type is not supported.
		"""
		# only check the page if the status codes match
		if not self._check_page(response, fingerprint):
			return None

		if 'type' not in fingerprint:
			return None

		# header fingerprints are checked even for images, since they do not
		# search the body
		if 'header' in fingerprint:
			return self.header(fingerprint, response)

		fp_type = fingerprint['type']
		if fp_type == 'md5':
			return self.md5(fingerprint, response)

		# string and regex searches are skipped for image content
		if fp_type == 'string' and not is_image:
			return self.string(fingerprint, response)

		if fp_type == 'regex' and not is_image:
			return self.regex(fingerprint, response)

		# fingerprint type is not supported yet
		return None

	def get_result(self, fingerprints, response):
		"""Return the list of fingerprints that match ``response``.

		``fingerprints`` is an iterable of fingerprint dicts. ``response`` may
		be None, in which case an empty list is returned.

		A match whose ``url`` is the empty string is returned as a copy with
		``url`` set to ``response.get_url()``; the fingerprint passed in is
		never modified, so it can be reused for other responses.
		"""
		matches = []

		if response is None:
			return matches

		# find out if the response is an image
		# this is used to avoid the crawler using string and regex
		# searching for matches in these files
		if 'content-type' in response.headers:
			is_image = 'image' in response.headers['content-type']

		# default to the content being an image, since if the content-type
		# isn't set, the content is unknown
		else:
			is_image = True

		for fingerprint in fingerprints:
			match = self._match_fingerprint(fingerprint, response, is_image)
			if match is None:
				continue

			if match['url'] == '':
				# md5/string matches hand back the caller's own dict; copy it
				# so the fingerprint's empty url is preserved
				match = dict(match)
				match['url'] = response.get_url()

			matches.append(match)

		return matches

	def md5(self, fingerprint, response):
		"""Return ``fingerprint`` if its ``match`` equals ``response.md5``, else None."""
		if fingerprint["match"] == response.md5:
			return fingerprint
		return None

	def string(self, fingerprint, response):
		"""Return ``fingerprint`` if its ``match`` occurs in ``response.body``, else None."""
		if fingerprint["match"] in response.body:
			return fingerprint
		return None

	def regex(self, fingerprint, response):
		"""Search ``response.body`` with the fingerprint's regular expression.

		Returns None when nothing matches. Otherwise returns a copy of the
		fingerprint whose ``output`` is:

		* None if the fingerprint has no ``output`` (or it is None),
		* ``output % first_match`` if ``output`` contains ``%``, where
		  ``first_match`` is the first element returned by ``re.findall``,
		* the unchanged ``output`` otherwise.
		"""
		found = re.findall(fingerprint["match"], response.body)
		if not found:
			return None

		# work on a copy so the caller's fingerprint keeps its format string
		result = dict(fingerprint)
		output = result.get('output')

		if output is None:
			result['output'] = None
		elif "%" in output:
			result['output'] = output % found[0]

		return result

	def header(self, fingerprint, response):
		"""Match a fingerprint against the value of one response header.

		The header named by ``fingerprint['header']`` is compared in lower
		case against the names in ``response.headers``. Its value is then
		checked with ``string`` or ``regex`` depending on
		``fingerprint['type']``. Returns the matched fingerprint, or None when
		the header is absent, the value does not match, or the type is neither
		``'string'`` nor ``'regex'``.
		"""
		wanted = fingerprint['header'].lower()
		match_type = fingerprint['type']

		# pick the Match method used for string or regex matching
		if match_type == 'string':
			matcher = self.string
		elif match_type == 'regex':
			matcher = self.regex
		else:
			return None

		# parse the headers searching for a match
		for header in response.headers:
			if header == wanted:
				# wrap the header value so it can be searched like a body
				value = self._HeaderValue(response.headers[header])
				return matcher(fingerprint, value)

		return None