File: fetching.py

# -*- coding: utf-8 -*-
"""File downloading functions."""
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#
# License: BSD (3-clause)

import os
import shutil
import sys
import time
from urllib import parse, request

from .progressbar import ProgressBar
from .numerics import hashfunc
from .misc import sizeof_fmt
from ._logging import warn, logger, verbose


# Adapted from nilearn


def _get_http(url, temp_file_name, initial_size, file_size, timeout,
              verbose_bool):
    """Safely (resume a) download to a file from http(s)."""
    # Actually do the reading
    req = request.Request(url)
    if initial_size > 0:
        logger.debug('  Resuming at %s' % (initial_size,))
        req.add_header('Range', "bytes=%s-" % (initial_size,))
        try:
            response = request.urlopen(req, timeout=timeout)
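            # A server that honors the Range request replies with a
            # Content-Range header starting at the requested offset;
            # anything else means resuming is not supported.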
            content_range = response.info().get('Content-Range')
            if (content_range is None or not content_range.startswith(
                    'bytes %s-' % (initial_size,))):
                raise IOError('Server does not support resuming')
        except Exception:
            # A wide range of errors can be raised here (HTTPError,
            # URLError, ...), so catch them all and retry the download
            # without resuming.
            return _get_http(
                url, temp_file_name, 0, file_size, timeout, verbose_bool)
    else:
        response = request.urlopen(req, timeout=timeout)
    total_size = int(response.headers.get('Content-Length', '1').strip())
    if initial_size > 0 and file_size == total_size:
        logger.info('Resuming download failed (resume file size '
                    'mismatch). Attempting to restart downloading the '
                    'entire file.')
        initial_size = 0
    total_size += initial_size
    if total_size != file_size:
        raise RuntimeError('URL could not be parsed properly '
                           '(total size %s != file size %s)'
                           % (total_size, file_size))
    mode = 'ab' if initial_size > 0 else 'wb'
    progress = ProgressBar(total_size, initial_value=initial_size,
                           spinner=True, mesg='file_sizes',
                           verbose_bool=verbose_bool)
    chunk_size = 8192  # 2 ** 13
    with open(temp_file_name, mode) as local_file:
        while True:
            t0 = time.time()
            chunk = response.read(chunk_size)
            dt = time.time() - t0
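            # Adapt the chunk size to the observed read time: double it when
            # a read finishes quickly, halve it (but not below 8 kB) when a
            # read is slow.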
            if dt < 0.005:
                chunk_size *= 2
            elif dt > 0.1 and chunk_size > 8192:
                chunk_size = chunk_size // 2
            if not chunk:
                if verbose_bool:
                    sys.stdout.write('\n')
                    sys.stdout.flush()
                break
            local_file.write(chunk)
            progress.update_with_increment_value(len(chunk),
                                                 mesg='file_sizes')


def _chunk_write(chunk, local_file, progress):
    """Write a chunk to file and update the progress bar."""
    local_file.write(chunk)
    progress.update_with_increment_value(len(chunk))


@verbose
def _fetch_file(url, file_name, print_destination=True, resume=True,
                hash_=None, timeout=30., hash_type='md5', verbose=None):
    """Load requested file, downloading it if needed or requested.

    Parameters
    ----------
    url : str
        The URL of the file to be downloaded.
    file_name : str
        Name (including the path) under which the downloaded file will be
        saved.
    print_destination : bool, optional
        If True, the destination where the file was saved is printed after
        the download finishes.
    resume : bool, optional
        If True, try to resume partially downloaded files.
    hash_ : str | None
        The hash of the file to check. If None, no checking is
        performed.
    timeout : float
        The URL open timeout.
    hash_type : str
        The type of hashing to use, such as "md5" or "sha1".
    %(verbose)s
    """
    # Adapted from NISL:
    # https://github.com/nisl/tutorial/blob/master/nisl/datasets.py
    if hash_ is not None and (not isinstance(hash_, str) or
                              len(hash_) != 32) and hash_type == 'md5':
        raise ValueError('Bad hash value given, should be a 32-character '
                         'string:\n%s' % (hash_,))
    temp_file_name = file_name + ".part"
    verbose_bool = (logger.level <= 20)  # 20 is info
    try:
        # Check the file size and display it alongside the download URL;
        # this loop is necessary to follow any redirects
        for _ in range(10):  # 10 really should be sufficient...
            u = request.urlopen(url, timeout=timeout)
            try:
                last_url, url = url, u.geturl()
                if url == last_url:
                    file_size = int(
                        u.headers.get('Content-Length', '1').strip())
                    break
            finally:
                u.close()
                del u
        else:
            raise RuntimeError('Too many redirects')
        logger.info('Downloading %s (%s)' % (url, sizeof_fmt(file_size)))

        # Triage resume
        if not os.path.exists(temp_file_name):
            resume = False
        if resume:
            with open(temp_file_name, 'rb', buffering=0) as local_file:
                local_file.seek(0, 2)
                initial_size = local_file.tell()
            del local_file
        else:
            initial_size = 0
        # This should never happen if our functions work properly
        if initial_size > file_size:
            raise RuntimeError('Local file (%s) is larger than remote '
                               'file (%s), cannot resume download'
                               % (sizeof_fmt(initial_size),
                                  sizeof_fmt(file_size)))
        elif initial_size == file_size:
            # This should really only happen when a hash is wrong
            # during development updates
            warn('Local file appears to be complete (file_size == '
                 'initial_size == %s)' % (file_size,))
        else:
            # Need to resume or start over
            scheme = parse.urlparse(url).scheme
            if scheme not in ('http', 'https'):
                raise NotImplementedError('Cannot use %s' % (scheme,))
            _get_http(url, temp_file_name, initial_size, file_size, timeout,
                      verbose_bool)

        # Check the hash sum, e.g., md5sum
        if hash_ is not None:
            logger.info('Verifying hash %s.' % (hash_,))
            hashsum = hashfunc(temp_file_name, hash_type=hash_type)
            if hash_ != hashsum:
                raise RuntimeError('Hash mismatch for downloaded file %s, '
                                   'expected %s but got %s'
                                   % (temp_file_name, hash_, hashsum))
        shutil.move(temp_file_name, file_name)
        if print_destination is True:
            logger.info('File saved as %s.\n' % file_name)
    except Exception:
        logger.error('Error while fetching file %s.'
                     ' Dataset fetching aborted.' % url)
        raise
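
# A minimal usage sketch (hypothetical URL, destination path, and hash; the
# import path is an assumption, not taken from this module):
#
#     from mne.utils import _fetch_file
#     _fetch_file('https://example.com/sample_raw.fif', '/tmp/sample_raw.fif',
#                 hash_='d41d8cd98f00b204e9800998ecf8427e', hash_type='md5')
#
# On success the temporary ".part" file is moved to the target path; on
# failure it is left in place so a later call with resume=True can continue
# the download.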


def _url_to_local_path(url, path):
    """Mirror a url path in a local destination (keeping folder structure)."""
    destination = parse.urlparse(url).path
    # First char should be '/', and it needs to be discarded
    if len(destination) < 2 or destination[0] != '/':
        raise ValueError('Invalid URL')
    destination = os.path.join(path, request.url2pathname(destination)[1:])
    return destination
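

# For example (hypothetical values), on a POSIX system
#     _url_to_local_path('https://example.com/sub01/raw.fif', '/data/mne')
# would return '/data/mne/sub01/raw.fif', mirroring the URL path under the
# local ``path``.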