File: fetchdictdata.py

package info (click to toggle)
freedict 2016.12.12-1
links: PTS, VCS
area: main
in suites: stretch
size: 556,676 kB
ctags: 484
sloc: xml: 466,758; perl: 3,392; python: 936; makefile: 550; sh: 268; yacc: 207
file content (436 lines) | stat: -rwxr-xr-x 17,591 bytes
#!/usr/bin/env python3
"""This script fetches all available databases from
http://www.freedict.org/freedict-database.xml
It accepts the following options:

    -dc    generate debian/control and debian/copyright
           This may take some time, since the copyright information is parsed
           out of the TEI XML source file(s) (if licensecheck fails, which it
                   does often).
    -f    fetch new source from SourceForge and save it into the file
          ../freedict_VER.orig.tar.gz (where VER is date in format yyyy.mm.dd)
          -x when -f is given, -x can be used to exclude the import of some
          dictionaries. The supplied string must be space-separated and
          therefore quoted or escaped. For instance -x 'xxx-xxx yyy-yyy'
    -x    fetch new xml freedict database
    -na <database>   output name and author of <database> (e. g. lat-deu)
    -a    perform all actions (except fetching new xml database)

IMPORTANT: you must run this script from the root of the freedict source,
otherwise the operations will fail.

Dependencies:
    tar, unzip, git, licensecheck

NOTE: this program works with python 3, legacy code for support will be removed.
"""

#pylint: disable=multiple-imports
import collections, re
import os, shutil, subprocess, sys
import datetime
import xml.etree.ElementTree as ET
import urllib.request

XML_URL = 'http://www.freedict.org/freedict-database.xml'
LANGCODE_TABLE = "debian/iso-639-3_20130123.tab"

def dictionarycode2longdescription(string):
    """Translate a dictionary code into its long, human-readable name.

    Take an ISO 639 dictionary string (like lat-deu) and translate it into the
    long version (like Latin-German). The language names are read from the
    tab-separated table stored in a file whose name starts with ``iso-`` in
    the ``debian`` directory (relative to the current working directory).
    Column 0 and, if non-empty, column 1 are used as language codes, column 6
    as the language name; the first line of the table is skipped.

    Raises FileNotFoundError if no such table exists and KeyError if one of
    the language codes is not listed in the table.
    """
    rows = None
    for item in os.listdir('debian'):
        if item.startswith('iso-'):
            table_path = os.path.join('debian', item)
            with open(table_path, 'r', encoding='utf-8') as table:
                rows = table.read().split('\n')
    if rows is None:
        raise FileNotFoundError("no 'iso-*' language table found in 'debian'")

    langtbl = {}
    for row in rows[1:]:
        fields = row.split('\t')
        if len(fields) < 7:
            # blank line (e.g. after the final newline) or malformed entry
            continue
        langtbl[fields[0]] = fields[6]
        if fields[1] != '': # some have two language codes, add the second too
            langtbl[fields[1]] = fields[6]
    codes = string.split('-')
    return langtbl[codes[0]] + '-' + langtbl[codes[1]]

def get_xml_content(fetch_new=False):
    """Return the contents of the FreeDict database XML as a string.

    With ``fetch_new`` set, the document is downloaded from XML_URL, decoded
    as UTF-8 and saved to debian/freedict-database.xml before it is returned.
    Otherwise the previously saved copy is read from that file. The path is
    relative to the current working directory.
    """
    local_copy = 'debian/freedict-database.xml'
    if fetch_new:
        with urllib.request.urlopen(XML_URL) as response:
            data = response.read().decode('utf-8')
        with open(local_copy, 'w', encoding='utf-8') as f:
            f.write(data)
    else: # read from file
        with open(local_copy, 'r', encoding='utf-8') as f:
            data = f.read()
    return data

def find_license(dict):
    """Find out the license of the dictionary in directory ``dict``.

    First licensecheck is run on <dict>/<dict>.tei and then on
    <dict>/COPYING.tei. If neither yields a GPL variant, the header of the TEI
    file (everything before ``<body``) is scanned with a self-brewed license
    checker. The result is something like GPL, GPL-2, GPL-3, GPL-2+, GPL-3+
    or CC-BY-SA-<version>; 'FIXME' is returned if nothing could be detected.
    """
    def run_licensecheck_on(file_path):
        """Try to extract a GPL variant with licensecheck.

        Return e.g. 'GPL-2' or 'GPL-2+', or None if licensecheck is not
        installed, gave no usable output or reported something else.
        """
        try:
            proc = subprocess.Popen(['licensecheck', '-m', file_path],
                                    stdout=subprocess.PIPE)
        except FileNotFoundError:
            # licensecheck fails often anyway, so treat a missing binary
            # like a failed detection and use the TEI header instead
            print('W: licensecheck is not installed, guessing the license '
                  'from the TEI header.', file=sys.stderr)
            return None
        try:
            text = proc.communicate()[0].decode('utf-8').split('\t')[1]
        except IndexError:
            return None
        # match "GPL (v2)" as well as "GPL (v2 or later)"; the word boundary
        # keeps "LGPL" and "AGPL" from being reported as GPL
        sre = re.search(r'\bgpl \(v(\d+)(?:.*(or later))?', text.lower())
        if not sre:
            return None
        version, or_later = sre.groups()
        return 'GPL-%s%s' % (version, '+' if or_later else '')

    tei_fn = os.path.join(dict, dict + '.tei')
    license = run_licensecheck_on(tei_fn)
    if not license:
        license = run_licensecheck_on(os.path.join(dict, 'COPYING.tei'))
    if license: # found a license, return
        return license

    license = 'FIXME' # backup solution
    # try guessing the license from the TEI file
    with open(tei_fn, 'r', encoding='utf-8') as f:
        in_header = True
        line = 'start'
        lastline = ''
        while in_header and line != '':
            line = f.readline().lower()
            if '<body' in line:
                line = line[:line.find('<body')] # parse everything before
                in_header = False
            if 'gpl' in line or 'gnu general public lic' in line:
                license = 'GPL'
                # try to extract version number; the previous line is included
                # because the statement may be wrapped
                res = re.search(r"(?:version|ver.|license|licence)\s+(\d+)",
                                lastline + line)
                if res:
                    license += '-%s' % res.group(1)
            elif 'attribution-sharealike' in line:
                license = 'CC-BY-SA'
                version = re.search(r'sharealike \(?v?(\d+\.?\d*)', line)
                if version:
                    license += '-%s' % version.group(1)
            lastlines = lastline + line
            if license.startswith('GPL') and not license.endswith('+'):
                if re.search('.*later.*version', lastlines) or \
                        'or later' in lastlines or 'and any later' in lastlines:
                    license += '+'
            lastline = line
    return license

def recursive_text(node):
    """Return the text of ``node`` and of all its descendants.

    The text of ``node`` itself comes first (it is dropped if it is missing or
    consists of whitespace only), followed by a newline plus the recursively
    collected text for each child element. Tail text is not included.
    """
    own_text = node.text or '' # .text is None for elements without text
    text = own_text if own_text.strip() != '' else ''
    for child in node:
        text += '\n' + recursive_text(child)
    return text

class generate_control_copyright():
    """Generate debian/control and debian/copyright.

    ``root`` is the root element of the parsed FreeDict database; each child
    with at least one sub-element (release) is treated as a dictionary. All
    files are accessed relative to the current working directory.
    """
    def __init__(self, root):
        self.__dictionaries = {}
        self.root = root
        self.parse_data()

    def parse_data(self):
        """Iterate over XML tree to collect dictionary data.

        Raises KeyError if a dictionary lacks one of the attributes name,
        headwords, edition or maintainerName; status is optional.
        """
        for child in self.root:
            if len(child) == 0:
                continue # skip dictionaries without releases
            name = child.attrib['name']
            entry = {}
            self.__dictionaries[name] = entry # initialize new dictionary
            for key in ('headwords', 'edition', 'status', 'maintainerName'):
                if key in child.attrib:
                    entry[key] = child.attrib[key]
                elif key != 'status': # status is optional
                    raise KeyError('missing attribute for %s: %s' % (name, key))

    def write_all(self):
        """Write both control as well as the copyright file."""
        self.sort_dictionaries()
        self.write_control()
        self.write_copyright()

    def write_control(self):
        """Generate debian/control from debian/control.HEAD and the gathered
        dictionary data."""
        with open('debian/control.HEAD', 'r', encoding='utf-8') as f:
            head = f.read() + '\n'
        string = [head]
        # meta package depending on every single dictionary package
        string.append('Package: dict-freedict-all\n'
                      'Architecture: all\n'
                      'Depends: ${misc:Depends}, dict-freedict-'
                      + ', dict-freedict-'.join(self.__dictionaries))
        string += ["\nDescription: meta-package to install dictionary databases from the FreeDict project\n",
                " This package can be used to install all available bilingual dictionaries from\n"
                " the FreeDict project at once.",
                "\n\n"]

        for name, content in self.__dictionaries.items():
            # look the long name up only once, it reads the language table
            long_name = dictionarycode2longdescription(name)
            string.append('Package: dict-freedict-%s\n' % name)
            string.append('Architecture: all\n'
                          'Depends: ${misc:Depends}\n'
                          'Suggests: dictd | dicod, dict | kdict | gnome-dictionary | goldendict\n'
                          'Provides: dictd-dictionary\n')
            status = ''
            if 'status' in content:
                status = ' (FreeDict status: %s)' % content['status']
            string.append('Description: %s dictionary for the dict server/client'
                    % long_name)
            longstr = ('\n This is the %s dictionary from the FreeDict project'
                       ' in version %s%s. It contains %s headwords. It can be'
                       ' used for the dict server in conjunction with a dict'
                       ' client.') % (long_name, content['edition'], status,
                                      content['headwords'])
            # format description to 80 characters per line
            tmp = ' '
            for piece in longstr.split(' '):
                if len(tmp + piece) <= 80:
                    tmp += piece + ' '
                else:
                    string.append(tmp[:-1] + '\n')
                    tmp = ' ' + piece + ' '
            string.append(tmp + '\n\n')
        with open('debian/control', 'w', encoding='utf-8') as f:
            f.write(''.join(string))

    def write_copyright(self):
        """Generate debian/copyright from the snippets in
        debian/copyright.snippets (HEAD, TAIL and optional per-dictionary
        files) and the gathered dictionary data."""
        cprght_snippets = '{1}{0}{2}{0}'.format(os.sep, 'debian',
                'copyright.snippets')
        with open(cprght_snippets + 'HEAD', encoding='utf-8') as f:
            string = [f.read(), '\n']
        for name in self.__dictionaries:
            # is there a manual copyright snippet?
            if os.path.exists(cprght_snippets + name):
                with open(cprght_snippets + name, encoding='utf-8') as f:
                    string.append('\n' + f.read())
            else:
                string.append('\nFiles: %s/*\n' % name)
                string.append('Copyright: 2000-2016 FreeDict contributors\n')
                string.append('License: ' + find_license(name) + '\n')
        with open(cprght_snippets + 'TAIL', encoding='UTF-8') as f:
            string += ['\n\n', f.read()]

        document = ''.join(string)
        with open('debian/copyright', 'w', encoding='utf-8') as f:
            f.write(document)
        if 'FIXME' in document:
            print('NOTE: some licenses could not be extracted, search for "FIXME" in debian/copyright.')

    def sort_dictionaries(self):
        """
        Overwrite the self.__dictionaries-dictionary with a sorted
        collections.OrderedDict. We cannot expect to find ordered data in the
        XML, so we should sort on our own, afterwards.
        """
        self.__dictionaries = collections.OrderedDict(
            (key, self.__dictionaries[key])
            for key in sorted(self.__dictionaries))


class fetch_source():
    """Fetch the sources of all dictionaries and the tools directory.

    ``root`` is the root element of the parsed FreeDict database. If the
    script was called with exactly three arguments and the second one is -x,
    the third one is taken as whitespace-separated list of dictionaries which
    are not imported.
    """
    def __init__(self, root):
        self.date = self.gen_date()
        self.dirname = 'freedict-%s.orig' % self.date
        self.root = root
        self.exclude_dictionaries = []
        if len(sys.argv) == 4 and sys.argv[2] == '-x': # -x option given
            self.exclude_dictionaries = sys.argv[3].split()

    def gen_date(self):
        """Return date in format "yyyy.mm.dd"."""
        return datetime.datetime.now().strftime('%Y.%m.%d')

    def prepare_environment(self):
        """
        Perform all actions which are needed before downloading the
        source: create a fresh download directory and change into it.
        """
        if os.path.exists(self.dirname):
            print("Removing %s; possibly left over from an interrupted run." %
                    self.dirname)
            shutil.rmtree(self.dirname)
        os.mkdir(self.dirname)
        os.chdir(self.dirname)

    def clean_up(self):
        """
        Compress the original source, move it to the right destination and
        remove download directory. Exits with code 9 if tar fails.
        """
        tarname = self.dirname.replace('-', '_') + '.tar.gz'
        os.chdir('..')
        if subprocess.call(['tar', 'czf', tarname, self.dirname]):
            sys.exit(9)
        print('Moving tar archive upward to "..".')
        os.rename(tarname, os.path.join('..', tarname))
        shutil.rmtree(self.dirname)

    @staticmethod
    def _run_or_exit(cmd):
        """Run cmd (an argument list); exit with its return code on failure."""
        ret = subprocess.call(cmd)
        if ret:
            print(repr(' '.join(cmd)) + ' exited with error code ' + str(ret))
            # negative values mean "killed by signal", map them to 1
            sys.exit(ret if ret > 0 else 1)

    @staticmethod
    def _download(url):
        """Return the bytes found at url; a 404 is re-raised as URLError
        which mentions the URL."""
        try:
            with urllib.request.urlopen(url) as u:
                return u.read()
        except urllib.error.HTTPError as h:
            if int(h.code) == 404:
                reason = '%s; url: %s' % (str(h), url)
                raise urllib.error.URLError(reason) from None
            raise

    def _extract(self, fn):
        """Unpack the archive fn into the current directory. Exits with code 1
        if the archive format is unknown."""
        if fn.endswith('.zip'):
            cmd = ['unzip', '-qq', fn]
        elif fn.endswith(('tar.gz', 'tar.bz2')):
            cmd = ['tar', 'xf', fn]
        elif fn.endswith('.bz2'):
            os.mkdir(fn[:7])
            cmd = ['tar', 'xf', fn, '-C', fn[:7]]
        else:
            print('E: unknown format of "%s".' % fn)
            sys.exit(1)
        # argument list instead of a shell string: fn stems from a remote URL
        ret = subprocess.call(cmd)
        if ret:
            print('W: %s exited with error code %d' % (' '.join(cmd), ret),
                    file=sys.stderr)

    def _fetch_tools(self):
        """Check out only the tools directory of the fd-dictionaries git
        repository into ./tools."""
        print("Fetching tools directory")
        run = self._run_or_exit
        run(['git', 'init', 'tools.tmp'])
        os.chdir('tools.tmp')
        run(['git', 'remote', 'add', '-f', 'origin',
             'https://github.com/freedict/fd-dictionaries.git'])
        run(['git', 'fetch'])
        run(['git', 'config', 'core.sparseCheckout', 'true']) # enable sparse checkout
        with open(".git/info/sparse-checkout", "a") as f:
            f.write('tools\n') # configure which directory to actually check out
        run(['git', 'pull', 'origin', 'master'])
        os.rename('tools', '../tools')
        os.chdir('..')
        shutil.rmtree('tools.tmp')

    def write_all(self):
        """Download all upstream source packages and the tools directory and
        pack them into the orig tarball."""
        self.prepare_environment()
        imported = 0
        for dictionary in self.root:
            name = dictionary.attrib['name']
            if name in self.exclude_dictionaries:
                print("Skip %s (specified via commmand line)" % name)
                continue
            # iterate over source releases
            for release in dictionary:
                if release.attrib.get('platform') != 'src':
                    continue
                src_url = release.attrib['URL']
                fn = self.get_dict_fn_from_url(src_url)
                print("Fetching %s from %s" % (name, src_url))
                data = self._download(src_url)
                with open(fn, 'wb') as f:
                    f.write(data)
                print("Extracting", fn)
                self._extract(fn)
                os.remove(fn)
                imported += 1
                break # do not search for further source releases, might be multiple archive formats
        print("Imported %d dictionaries." % imported)
        self._fetch_tools()
        self.clean_up()

    def get_dict_fn_from_url(self, url):
        """The SF URLs end on '?download' or '/download'; the file name is
        located just before those strings, so strip them and return the file
        name. Exits with code 127 for any other URL."""
        if url.endswith('?download'):
            return url.split('/')[-1][:-len('?download')]
        elif url.endswith('/download'):
            return url.split('/')[-2]
        else:
            print("Unknown URL Format:", url)
            sys.exit(127)

class name_and_author():
    """Print long name and maintainer of the dictionary given on the command
    line."""
    def __init__(self, tree):
        self.tree = tree

    def write_all(self):
        """Search the XML tree for the dictionary named in sys.argv[2] and
        print() "<long name>;<maintainer>" for it. A dictionary which is up
        for grabs is reported as 'up for adoption'. Exits with code 127 if no
        dictionary was specified or if it is not part of the tree."""
        if len(sys.argv) < 3:
            print("Error: you must specify a dictionary.")
            sys.exit(127)
        wanted = sys.argv[2]

        found = None
        for node in self.tree:
            attrs = node.attrib
            # only the first matching node is used, the others are just visited
            if attrs['name'] != wanted or found is not None:
                continue
            maintainer = attrs['maintainerName']
            if maintainer == '[up for grabs]':
                maintainer = 'up for adoption'
            found = (dictionarycode2longdescription(attrs['name']), maintainer)

        if found is None:
            print("Error: database not found.")
            sys.exit(127)
        long_name, maintainer = found
        print(long_name + ';' + maintainer)

def clean_up_tree(root):
    """Iterate over XML tree and delete those <dictionary/>-nodes which have no
    release (i.e. no child element). ``root`` is modified in place and a
    message is printed for each removed node."""
    # iterate over a copy: removing from root while iterating over it would
    # skip the node following each removed one
    for child in list(root):
        if len(child) == 0:
            print('Removing '+child.attrib['name']+' (is empty).')
            root.remove(child)

def main():
    """Parse the command line and perform the requested actions.

    The module documentation is printed and the script exits with code 127 if
    no or an unknown option was given. The script also exits with code 127 if
    the current working directory neither ends on 'debian' nor contains
    'freedict'.
    """
    # are we in the correct directory?
    cwd = os.getcwd()
    if cwd.endswith('debian'):
        os.chdir('..')
    elif 'freedict' not in cwd:
        print("You must run this script from the FreeDict packaging root.")
        sys.exit(127)

    # cmd args
    if len(sys.argv) == 1:
        print(__doc__)
        sys.exit(127)

    option = sys.argv[1]
    if option == '-x':
        # only refresh the local copy of the FreeDict database
        get_xml_content(fetch_new=True)
        sys.exit(0)
    elif option == '-f':
        fetch_new = True # fetch latest FreeDict API file
        objects = [fetch_source]
    elif option == '-dc':
        fetch_new = False
        objects = [generate_control_copyright]
    elif option == '-a':
        fetch_new = True
        objects = [generate_control_copyright, fetch_source]
    elif option == '-na':
        fetch_new = False
        objects = [name_and_author]
    else:
        print(__doc__)
        sys.exit(127)

    # the local copy is only read when it is really needed, so that options
    # which download the database work without an existing copy
    xmlsrc = get_xml_content(fetch_new=fetch_new)

    # usual operation
    root = ET.fromstring(xmlsrc)
    clean_up_tree(root)
    for obj in objects:
        inst = obj(root)
        inst.write_all()

# only run when executed as a script, not when imported as a module
if __name__ == '__main__':
    main()