File: datamine.py

package info (click to toggle)
debiancontributors 0.7.8-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 404 kB
  • sloc: python: 1,894; makefile: 18
file content (240 lines) | stat: -rw-r--r-- 8,255 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# coding: utf8
# Debian Contributors data source data mining tools
#
# Copyright (C) 2013--2014  Enrico Zini <enrico@debian.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from .submission import DEFAULT_BASE_URL, Submission
from debian import deb822
import os.path
import io
import re
import six
import sys

__all__ = ["Fail", "DataMine"]

class Fail(BaseException):
    """
    Error raised when data mining cannot go ahead, for example because the
    data miner configuration is empty or incomplete.

    Note that this derives from BaseException, so a plain
    ``except Exception`` handler will not catch it.
    """

def read_config(fname):
    """
    Generate the deb822 paragraphs found in the file ``fname``.

    The file is opened as UTF-8 text, and it stays open for as long as
    paragraphs are being generated.
    """
    with io.open(fname, encoding="utf8") as infd:
        paragraphs = deb822.Deb822.iter_paragraphs(infd)
        for paragraph in paragraphs:
            yield paragraph

def read_configstr(s):
    """
    Generate the deb822 paragraphs found in the unicode string ``s``.

    Raises TypeError if ``s`` is not a unicode string. Since this is a
    generator, the check only runs when iteration starts.
    """
    if isinstance(s, six.text_type):
        with io.StringIO(s) as infd:
            paragraphs = deb822.Deb822.iter_paragraphs(infd)
            for paragraph in paragraphs:
                yield paragraph
    else:
        raise TypeError("configuration contents must be a unicode string")


def load_scanners():
    """
    Generate all the scanner classes found in the ``scanners`` module.

    Every class in ``scanners`` that is a subclass of Scanner is generated.
    A class whose NAME is None gets NAME set to the lowercased name it has in
    the module before being generated.
    """
    import inspect
    from . import scanners
    from .scanner import Scanner

    for member_name, candidate in inspect.getmembers(scanners, inspect.isclass):
        if issubclass(candidate, Scanner):
            # Scanners are looked up by NAME, so make sure it is always set
            if candidate.NAME is None:
                candidate.NAME = member_name.lower()

            yield candidate


class DataMine(object):
    """
    Data miner for one data source, driven by a deb822-style configuration.

    After construction, ``submission`` holds the Submission being built and
    ``scanners`` is a list of dicts with keys "ctype", "method" and "scanner",
    one for each data mining section of the configuration.
    """

    def __init__(self, configfname=None, configstr=None, source_name=None):
        """
        Create a data miner for a data source reading a configuration file.

        The configuration is read from the file ``configfname`` if given,
        else from the unicode string ``configstr``.

        If the first paragraph does not have a "contribution:" field, it is
        used for general data source configuration, like auth key, source name
        (if not the same as the file name), and base url (if the default is not
        ok)

        The source name is the value of source_name, if given. Else it is the
        value of the "source" field in the general configuration. Else it is
        the basename of configfname, with .ini, .conf or .cfg extension
        stripped, if present.

        Raises TypeError if neither configfname nor configstr is provided.
        Raises Fail if the configuration is empty, if a source name cannot be
        determined, or if a data mining section lacks a "contribution" or
        "method" field or requests an unsupported method.
        """
        # Read all the configuration as a list of paragraphs
        if configfname is not None:
            config = list(read_config(configfname))
        elif configstr is not None:
            config = list(read_configstr(configstr))
        else:
            raise TypeError("one of configfname or configstr should be provided")

        if not config:
            raise Fail("the configuration is empty")

        # Extract the general configuration
        name = source_name
        auth_token = None
        # Start from the default base url, so that it is also used when the
        # configuration has no general section at all
        baseurl = DEFAULT_BASE_URL

        general = config[0]
        if "contribution" not in general:
            # The first paragraph is the general section: the data mining
            # sections are all the others
            config = config[1:]

            if not name:
                name = general.get("source", None)
            auth_token = general.get("auth_token", None)
            baseurl = general.get("baseurl", DEFAULT_BASE_URL)

        # Default source with the config file name, without config-like
        # extensions
        if not name:
            if configfname is None:
                # There is no file name to fall back to when the
                # configuration comes from a string
                raise Fail("source name not found: pass source_name or set"
                           " 'source' in the general configuration")
            name = self._source_name_from_path(configfname)

        # Instantiate the submission that we are going to build
        self.submission = Submission(name, auth_token=auth_token, baseurl=baseurl)

        # Instantiate scanners
        self.scanners = []
        scanner_factories = {x.NAME: x for x in load_scanners()}
        for cfg in config:
            # Contribution type
            ctype = cfg.get("contribution", None)
            if ctype is None:
                raise Fail("'contribution' field not found in data miner configuration")

            # Get scanner class 'method' configuration
            method = cfg.get("method", None)
            if method is None:
                raise Fail("'method' field not found in data miner configuration")
            scanner_cls = scanner_factories.get(method, None)
            if scanner_cls is None:
                raise Fail("'{}' configuration requests unsupported method: '{}'".format(ctype, method))

            # Instantiate scanner
            self.scanners.append({
                "ctype": ctype,
                "method": method,
                "scanner": scanner_cls(cfg),
            })

    @staticmethod
    def _source_name_from_path(configfname):
        """
        Return the default source name for a configuration file name: its
        basename with a trailing .ini, .conf or .cfg extension removed.
        """
        return re.sub(r"\.(?:ini|conf|cfg)$", "", os.path.basename(configfname))

    def scan(self):
        """
        Run all data miners and add their output to the submission
        """
        for s in self.scanners:
            ctype = s["ctype"]
            # Each scanner generates (ident, begin, until, url) tuples, which
            # are recorded under the contribution type of its section
            for ident, begin, until, url in s["scanner"].scan():
                self.submission.add_contribution_data(
                    ident, ctype, begin, until, url)

    @classmethod
    def print_documentation(cls, file=sys.stdout):
        """
        Print the data mining documentation to ``file``: first the general
        description of the configuration file, then the documentation of each
        available scanner, sorted by scanner NAME.
        """
        print("""
===================
dc-tool data mining
===================

dc-tool has several methods of data mining that can be controlled via a
configuration file.

It works like this:

1. Read this documentation and create a configuration file to test.
2. Run ``dc-tool --mine=mysource.conf`` to perform data mining and print
   results to standard output.
3. When you are satisfied of the results, run ``dc-tool --mine=mysource.conf --post``
   to post data to contributors.debian.org. Run that via cron and you have a
   full working data source.

-------------------------
Configuration file syntax
-------------------------

The configuration file follows the usual Debian RFC822/Yaml-like syntax.

If the first group of options does not have a "contribution:" field, it is used
for general configuration of the data source. All other sections define methods
of mining the data you want.


The data source configuration section
=====================================

Example::

  # You don't need this option if you call this file nm.debian.org.conf
  #source: nm.debian.org
  # Auhentication token used to post data. Use a leading '@' as in '@filename'
  # to use the contents of another file as auth token. Do not make this file
  # world-readable!
  auth_token: @secrets/auth_token.txt

The general configuration section has three configurable keywords:

``source``
  Data source name, as configured in contributors.debian.org. If omitted,
  dc-tool will use the configuration file name. If the file name ends in ``.ini``,
  ``.conf`` or ``.cfg``, the extension will be removed.

``auth_token``
  The authentication token used for posting data to the site.

  Anyone with this authentication token can post data for this data source, so
  be careful not to give this file world-readable permissions.

``baseurl``
  You never need this unless you want to test a local copy of the
  contributors.debian.org codebase: it defaults to ``{DEFAULT_BASE_URL}``
  but you can change it to submit data to your local development version.


Data mining sections
====================

Example::

  contribution: committer
  # Data mining method
  method: gitdirs
  # Configuration specific to this method
  dirs: /srv/git.debian.org/git/collab-maint/*.git
  url: https://alioth.debian.org/users/{{user}}/

Each data mining section has at least two configurable keywords:

``contribution``
  Contribution type for this data source, as configured in contributors.debian.org.
  You can have many sections with the same contribution types, and the results
  of their data mining will all be merged.

``method``
  The mining method. There are several mining method available, each with its
  own configuration options, documented below.

The rest of the options are specific to each data mining method. Below is a
full documentation of them.


Data mining methods
===================

""".format(DEFAULT_BASE_URL=DEFAULT_BASE_URL), file=file)

        for scanner in sorted(load_scanners(), key=lambda x:x.NAME):
            scanner.print_documentation(file=file)