File: scanner.py

package info (click to toggle)
debiancontributors 0.7.8-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 404 kB
  • sloc: python: 1,894; makefile: 18
file content (351 lines) | stat: -rw-r--r-- 11,628 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
# coding: utf8
# Debian Contributors data source core data structure
#
# Copyright (C) 2014  Enrico Zini <enrico@debian.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
import sys
import six

class ValidationError(Exception):
    """
    Raised when a configuration value cannot be validated or converted.
    """


class ConfigField(object):
    # Base building block for declarative scanner configuration: it holds the
    # metadata of one option and converts its raw value with to_python().
    #
    # This description is a comment rather than a docstring on purpose:
    # Scanner.print_documentation prints the __doc__ of field classes, so the
    # class __doc__ is left unset here.

    def __init__(self, name=None, blank=True, default=None, help_text="", **kw):
        """
        Store the settings of this field.

        name: value name in the configuration
        blank: True if the value may be empty or missing, False if an empty or
               missing value must be reported as an error
        default: value returned by to_python when the value is missing from
                 the configuration and blank is True
        help_text: documentation for this field

        Any other keyword argument raises ValueError.
        """
        self.name, self.blank = name, blank
        self.default, self.help_text = default, help_text
        if kw:
            # Report the first unexpected keyword argument
            unknown = next(iter(kw))
            raise ValueError("Unknown ConfigField argument: '{}'".format(unknown))

    def to_python(self, val):
        """
        Convert the raw configuration value (None or a string) to Python.

        A missing value (None) yields self.default when blank is True, and
        raises ValidationError otherwise.

        A string is stripped of leading and trailing whitespace. If the result
        is empty and blank is False, ValidationError is raised; otherwise the
        stripped string is returned.
        """
        if val is None:
            # The value was not found in the configuration at all
            if not self.blank:
                raise ValidationError("missing value")
            return self.default

        stripped = val.strip()
        if stripped or self.blank:
            return stripped
        raise ValidationError("empty value")

    def print_documentation(self, file=sys.stdout):
        """
        Print a description of this field to file: a summary line with name,
        type, optional/required and default, followed by the help text.
        """
        from .scanners.utils.doc import docstring_trim, print_indented

        requirement = "required"
        if self.blank:
            requirement = "optional"

        summary = "``{name}`` : {type}, {blank}, default: {default}.".format(
            name=self.name,
            type=self.type_name(),
            blank=requirement,
            default=repr(self.default),
        )
        print(summary, file=file)

        if not self.help_text:
            print("  currently undocumented.", file=file)
        else:
            print_indented(docstring_trim(self.help_text), indent=2, file=file)
        print(file=file)

    @classmethod
    def type_name(cls):
        """
        Return the class name, without its trailing "Field" if it has one.
        """
        suffix = "Field"
        cls_name = cls.__name__
        if not cls_name.endswith(suffix):
            return cls_name
        return cls_name[:-len(suffix)]


class CharField(ConfigField):
    """
    A string value. Can be any UTF-8 string.
    """
    # Nothing to override: the stripping and blank checks inherited from
    # ConfigField.to_python are all that a plain string needs.


class IntegerField(ConfigField):
    """
    An integer value.
    """
    def to_python(self, val):
        """
        Apply the inherited validation, then convert the result to int.

        Returns None when the inherited validation returns None, which happens
        when the value is missing, blank is True and the default is None.

        Raises ValidationError if the value cannot be converted to an integer.
        """
        val = super(IntegerField, self).to_python(val)
        if val is None:
            # Optional field, missing from the configuration and without a
            # default: int(None) would raise an uncaught TypeError, so the
            # "no value" marker is passed through unchanged.
            return None
        try:
            return int(val)
        except (TypeError, ValueError):
            # TypeError covers defaults of a type that int() cannot convert
            raise ValidationError("invalid integer value: {}".format(val))


class IdentifierTypeField(CharField):
    """
    An identifier type. Can be one of:

    ``auto``
      autodetect. "ident" or "Name <ident>" are accepted, and ident can be any
      email, login or OpenPGP fingerprint
    ``login``
      debian.org or Alioth login name.
    ``email``
      email address.
    ``fpr``
      OpenPGP key fingerprint.
    """
    def to_python(self, val):
        """
        Apply the inherited validation, then check that the result names an
        identifier type.

        Returns "auto", or a value found in Identifier.TYPE_VALIDATORS.
        Raises ValidationError for anything else.
        """
        from .types import Identifier
        res = super(IdentifierTypeField, self).to_python(val)
        if res == "auto":
            return res
        if res not in Identifier.TYPE_VALIDATORS:
            # Iterating the mapping yields its keys on both Python 2 and 3,
            # whereas dict.iterkeys() only exists on Python 2.
            raise ValidationError("invalid identifier type. Use one of auto, {}".format(
                ", ".join(sorted(Identifier.TYPE_VALIDATORS))))
        return res


class GlobField(CharField):
    """
    A string with one or more filenames. Globbing is supported. Arguments can
    be quoted to deal with whitespace, but glob characters will always be
    expanded.
    """
    def to_python(self, val):
        """
        Splits with shlex, expands with glob, returns a list of pathnames.

        Returns an empty list when the inherited validation returns None
        (value missing, blank is True and no default).

        Raises ValidationError if the value cannot be shell-split, or if blank
        is False and no pathname matched.
        """
        import shlex
        import glob
        val = super(GlobField, self).to_python(val)
        if val is None:
            return []
        try:
            patterns = shlex.split(val)
        except ValueError as e:
            # shlex reports malformed quoting (e.g. a missing closing quote)
            # with ValueError: turn it into a validation error so that it is
            # reported like any other invalid configuration value.
            raise ValidationError("cannot parse file names: {}".format(e))
        res = []
        for pattern in patterns:
            res.extend(glob.glob(pattern))
        if not self.blank and not res:
            raise ValidationError("no such file or directory")
        return res


class EmailsField(CharField):
    """
    A list of email addresses, like in email To: or Cc: headers.
    """
    def to_python(self, val):
        """
        Return the list of addresses that email.utils.getaddresses extracts
        from the value, dropping the real-name part of each entry.

        Returns an empty list when the inherited validation returns None.
        Raises ValidationError if blank is False and no address was found.
        """
        from email.utils import getaddresses
        val = super(EmailsField, self).to_python(val)
        if val is None:
            return []
        parsed = getaddresses((val,))
        addresses = [address for _realname, address in parsed]
        if self.blank or addresses:
            return addresses
        raise ValidationError("no email addresses found")


class IdentMapField(CharField):
    """
    A string with one or more identifier mapping expressions.

    Each expression is on a line by its own. Leading and trailing spaces do not
    matter.

    Lines can be in one of two forms:

        regexp replace
        regexp replace flags

    If regexp, replace or flags contain spaces, they can be shell-quoted.

    Regexp and replace use the syntax as found in re.sub. Flags are as found in
    re.X.

    For each mapping line, re.sub if called on each value found.
    """
    def to_python(self, val):
        """
        Parse the mapping expressions, one per non-empty line.

        Each line is split with shlex into 2 or 3 fields: regexp, replacement
        and optional flags. Every character of the flags field is looked up,
        uppercased, as an attribute of the re module.

        Returns a list of (compiled regexp, replacement string) tuples in the
        order in which the lines appear. The list is empty when the inherited
        validation returns None.

        Raises ValidationError if a line cannot be shell-split, does not have
        2 or 3 fields, uses an unsupported flag or has a regexp that does not
        compile, and if blank is False and no expression was provided.
        """
        import shlex
        import re

        val = super(IdentMapField, self).to_python(val)
        res = []
        if val is None:
            return res
        for line in val.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                vals = shlex.split(line)
            except ValueError as e:
                # Malformed quoting: report it as an invalid configuration
                # value instead of letting a bare ValueError escape
                raise ValidationError("cannot parse mapping line '{}': {}".format(line, e))
            if len(vals) not in (2, 3):
                raise ValidationError("mapping line has {} fields, but only 2 or 3 are supported".format(len(vals)))

            # Combine the optional one-letter flags (e.g. "i" -> re.I)
            flags = 0
            if len(vals) == 3:
                for flag in vals[2]:
                    flag_value = getattr(re, flag.upper(), None)
                    if flag_value is None:
                        raise ValidationError("unsupported flag {}".format(flag))
                    flags |= flag_value

            try:
                match = re.compile(vals[0], flags)
            except (re.error, ValueError) as e:
                # re.error: invalid pattern; ValueError: the re module rejects
                # some flag combinations
                raise ValidationError("invalid regular expression '{}': {}".format(vals[0], e))
            res.append((match, vals[1]))
        if not self.blank and not res:
            raise ValidationError("no mapping expressions provided")
        return res


class ScannerFields(type):
    """
    Collects all class members that are instances of ConfigField, merges them
    with the fields inherited from the class parents, and sets the result as
    the FIELDS class member.
    """
    def __new__(meta, name, parents, attrs):
        """
        Create the class `name`, adding to its attributes a FIELDS dict that
        maps member names to their ConfigField instances.

        Fields whose name is None get the name of the class member they are
        assigned to. Fields defined in the class override inherited fields
        with the same member name.
        """
        # Harvest config fields
        config_fields = {}

        # Collect fields from parents (later parents override earlier ones)
        for parent in parents:
            inherited = getattr(parent, "FIELDS", None)
            if inherited is None:
                continue
            config_fields.update(inherited)

        # Add fields from ourselves. The loop variable must not be called
        # `name`: that would overwrite the class name that is passed to
        # type.__new__ below, and the class would be created with the name of
        # its last attribute.
        for attr_name, member in attrs.items():
            if not isinstance(member, ConfigField):
                continue
            # Set the default for field names
            if member.name is None:
                member.name = attr_name
            config_fields[attr_name] = member

        # Add a FIELDS dict with all the fields
        attrs["FIELDS"] = config_fields

        return super(ScannerFields, meta).__new__(meta, name, parents, attrs)

# https://wiki.python.org/moin/PortingToPy3k/BilingualQuickRef#metaclasses
class ScannerBase(object):
    """
    Base class for Scanner to have the ScannerFields as metaclass
    """
    # Deliberately empty: it is only used as the plain base class handed to
    # six.with_metaclass() in the definition of Scanner.

class Scanner(six.with_metaclass(ScannerFields, ScannerBase)):
    """
    Base class for all data mining scanners

    Declarative definition of scanner configuration goes here.

    Any class members that are instances of ConfigField will be used to parse
    and validate the configuration. Their validated results will be set as
    object members.

    Example:

      # When instantiated, self.dirs will be a list of pathnames
      dirs = GlobField(blank=False, help_text="Directories to scan")

    All ConfigField instances found as class members, will be stored in the
    class FIELDS dict. For example, you can crudely document all the config
    options of a scanner like this:

      for name, field in MyScanner.FIELDS.items():
          print("Config key {}, accessible as self.{}: {}".format(
                    field.name, name, field.__doc__))
    """

    # Scanner name, used to refer to the scanner in the mining configuration.
    # Defaults to the class name.
    NAME = None

    def __init__(self, cfg):
        """
        Initialize the scanner with the given configuration dictionary.

        Every field in FIELDS is looked up in cfg by its configuration name
        (field.name), validated with field.to_python, and the result is stored
        on the instance under the member name used in the class definition.

        Raises ValidationError, prefixed with the member name and the raw
        value, if a field does not validate.
        """
        # Parse configuration using our field definition
        for name, field in self.FIELDS.items():
            val = cfg.get(field.name, None)
            try:
                validated_val = field.to_python(val)
            except ValidationError as e:
                raise ValidationError("{} = {}: {}".format(name, val, str(e)))

            # Set the validated name=value pair as an object member
            setattr(self, name, validated_val)

    def scan(self):
        """
        Perform scan and generate 4-tuples of
        (identifier, begin, end, url)

        Only identifier cannot be None, everything else can be.
        """
        # The unreachable yield makes this method a generator, so that the
        # base implementation produces an empty sequence of results.
        if False: yield None, None, None, None

    @classmethod
    def print_documentation(cls, file=sys.stdout):
        """
        Print to file the documentation of this scanner: its name, its
        docstring, its configuration options and the option types they use.
        """
        from .scanners.utils.doc import docstring_trim, print_indented

        # NAME is None unless something sets it: fall back to the class name
        # instead of failing on len(None) when underlining the title.
        title = cls.NAME if cls.NAME is not None else cls.__name__
        print(title, file=file)
        print("-" * len(title), file=file)
        print(docstring_trim(cls.__doc__), file=file)
        print(file=file)

        if not cls.FIELDS:
            print("This scanning method has no specific configuration options", file=file)
            return

        print("Configuration options", file=file)
        print("`````````````````````", file=file)
        print(file=file)

        # Field classes in use, each remembered once
        types_used = {}
        for name, field in sorted(cls.FIELDS.items()):
            field.print_documentation(file=file)
            types_used.setdefault(field.__class__, name)

        print("Option types", file=file)
        print("````````````", file=file)
        print(file=file)

        # Use a dedicated loop variable rather than rebinding `cls`
        for field_cls in sorted(types_used, key=lambda c: c.type_name()):
            print("``{}``".format(field_cls.type_name()), file=file)
            ht = docstring_trim(field_cls.__doc__)
            print_indented(ht, indent=2, file=file)
            print(file=file)