File: fetcher.py

package info (click to toggle)
python-trx-python 0.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 408 kB
  • sloc: python: 3,394; makefile: 66
file content (93 lines) | stat: -rw-r--r-- 2,982 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# -*- coding: utf-8 -*-

import hashlib
import logging
import os
import shutil
import urllib.request


def get_home():
    """ Set a user-writeable file-system location to put files """
    if 'TRX_HOME' in os.environ:
        trx_home = os.environ['TRX_HOME']
    else:
        trx_home = os.path.join(os.path.expanduser('~'), '.tee_ar_ex')
    return trx_home


def get_testing_files_dict():
    """ Get dictionary linking zip file to their Figshare URL & MD5SUM """
    return {
        'DSI.zip':
        ('https://figshare.com/ndownloader/files/37624154',
         'b847f053fc694d55d935c0be0e5268f7'),  # V1 (27.09.2022)
        'memmap_test_data.zip':
        ('https://figshare.com/ndownloader/files/37624148',
         '03f7651a0f9e3eeabee9aed0ad5f69e1'),  # V2 (27.09.2022)
        'trx_from_scratch.zip':
        ('https://figshare.com/ndownloader/files/37624151',
         'd9f220a095ce7f027772fcd9451a2ee5'),  # V2 (27.09.2022)
        'gold_standard.zip':
        ('https://figshare.com/ndownloader/files/38146098',
         '57e3f9951fe77245684ede8688af3ae8')  # V1 (8.11.2022)
    }


def md5sum(filename):
    """ Compute one md5 checksum for a file """
    h = hashlib.md5()
    with open(filename, 'rb') as f:
        for chunk in iter(lambda: f.read(128 * h.block_size), b''):
            h.update(chunk)
    return h.hexdigest()


def fetch_data(files_dict, keys=None):
    """ Downloads files to folder and checks their md5 checksums

    Parameters
    ----------
    files_dict : dictionary
        For each file in `files_dict` the value should be (url, md5).
        The file will be downloaded from url, if the file does not already
        exist or if the file exists but the md5 checksum does not match.
        Zip files are automatically unzipped and its content* are md5 checked.

    Raises
    ------
    ValueError
        Raises if the md5 checksum of the file does not match the expected
        value. The downloaded file is not deleted when this error is raised.
    """
    trx_home = get_home()

    if not os.path.exists(trx_home):
        os.makedirs(trx_home)

    if keys is None:
        keys = files_dict.keys()
    elif isinstance(keys, str):
        keys = [keys]

    for f in keys:
        url, expected_md5 = files_dict[f]
        full_path = os.path.join(trx_home, f)

        logging.info('Downloading {} to {}'.format(f, trx_home))
        if not os.path.exists(full_path):
            urllib.request.urlretrieve(url, full_path)

        actual_md5 = md5sum(full_path)
        if expected_md5 != actual_md5:
            raise ValueError(
                f'Md5sum for {f} does not match. '
                'Please remove the file to download it again: ' +
                full_path
            )

        if f.endswith('.zip'):
            dst_dir = os.path.join(trx_home, f[:-4])
            shutil.unpack_archive(full_path,
                                  extract_dir=dst_dir,
                                  format='zip')