File: _download.py

package info (click to toggle)
python-wn 1.0.0-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,100 kB
  • sloc: python: 8,429; xml: 566; sql: 238; makefile: 12
file content (132 lines) | stat: -rw-r--r-- 4,819 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import logging
from collections.abc import Sequence
from pathlib import Path

import httpx

from wn._add import add as add_to_db
from wn._config import config
from wn._exceptions import Error
from wn._util import is_url
from wn.util import ProgressBar, ProgressHandler

CHUNK_SIZE = 8 * 1024  # number of bytes to read at a time (an 8 KiB chunk)
TIMEOUT = 10  # number of seconds to wait for a server response


# Shared package logger (deliberately named "wn", not __name__).
logger = logging.getLogger("wn")


def download(
    project_or_url: str,
    add: bool = True,
    progress_handler: type[ProgressHandler] | None = ProgressBar,
) -> Path:
    """Download the resource specified by *project_or_url*.

    The resource's URL is resolved first; the resource is then
    downloaded (or served from the cache) and, optionally, added to
    the database.  The path of the cached file is returned.

    When *project_or_url* begins with ``'http://'`` or ``'https://'``
    it is used directly as the resource URL.  Any other value is
    interpreted as a :ref:`project specifier <lexicon-specifiers>`
    and resolved against Wn's project index; :exc:`wn.Error` is
    raised when no project matches.

    A URL that was downloaded previously is served from the cache;
    otherwise it is fetched and the result is cached.

    With *add* set to ``True`` (the default) the downloaded resource
    is also added to the database.

    >>> wn.download("ewn:2020")
    Added ewn:2020 (English WordNet)

    The *progress_handler* parameter takes a subclass of
    :class:`wn.util.ProgressHandler`; this function creates, uses,
    and closes an instance of it.

    """
    # Fall back to the no-op base handler when none was requested.
    handler_cls = ProgressHandler if progress_handler is None else progress_handler
    progress = handler_cls(message="Download", unit=" bytes")

    cache_path, urls = _get_cache_path_and_urls(project_or_url)

    try:
        if cache_path and cache_path.exists():
            # Cache hit: nothing to fetch.
            progress.flash(f"Cached file found: {cache_path!s}")
            path = cache_path
        else:
            if not urls:
                raise Error("no urls to download")
            path = _download(urls, progress)
    finally:
        # Always release the progress handler, even on failure.
        progress.close()

    if add:
        try:
            add_to_db(path, progress_handler=handler_cls)
        except Error as exc:
            raise Error(
                f"could not add downloaded file: {path}\n  You might try "
                "deleting the cached file and trying the download again."
            ) from exc

    return path


def _get_cache_path_and_urls(project_or_url: str) -> tuple[Path | None, list[str]]:
    """Resolve *project_or_url* to a cache path and candidate URLs.

    A literal URL maps to its own cache path plus a one-element URL
    list; anything else is treated as a project specifier and looked
    up in Wn's project index.
    """
    if not is_url(project_or_url):
        info = config.get_project_info(project_or_url)
        return info.get("cache"), info["resource_urls"]
    return config.get_cache_path(project_or_url), [project_or_url]


def _download(urls: Sequence[str], progress: ProgressHandler) -> Path:
    """Download the first retrievable URL in *urls* to its cache path.

    URLs are tried in order: a network-level failure
    (:class:`httpx.RequestError`) on one URL moves on to the next, and
    only a failure on the last URL is raised as :exc:`wn.Error`.  On
    cancellation or any other error the partially written file is
    removed before the error propagates.

    Args:
        urls: non-empty sequence of candidate URLs for one resource.
        progress: handler updated with byte counts as chunks arrive.

    Returns:
        The path of the downloaded (cached) file.

    Raises:
        wn.Error: when *urls* is empty, every URL fails with a network
            error, or the download is cancelled.
    """
    if not urls:
        # Callers currently guard against this, but fail loudly here
        # rather than hit an unbound `path` at the return below.
        raise Error("no urls to download")
    # The context manager closes the client on every exit path
    # (replaces a manual try/finally with client.close()).
    with httpx.Client(timeout=TIMEOUT, follow_redirects=True) as client:
        try:
            for i, url in enumerate(urls, 1):
                path = config.get_cache_path(url)
                logger.info("download url: %s", url)
                logger.info("download cache path: %s", path)
                try:
                    with open(path, "wb") as f:
                        progress.set(status="Requesting", count=0)
                        with client.stream("GET", url) as response:
                            # NOTE(review): raise_for_status() raises
                            # httpx.HTTPStatusError, which is NOT a
                            # RequestError, so an HTTP error status is
                            # handled by the generic handler below and
                            # does not fall through to the next URL --
                            # confirm this is intended.
                            response.raise_for_status()
                            # Content-Length may be absent; 0 means
                            # "total unknown" to the progress handler.
                            total = int(response.headers.get("Content-Length", 0))
                            count = response.num_bytes_downloaded
                            progress.set(count=count, total=total, status="Receiving")
                            for chunk in response.iter_bytes(chunk_size=CHUNK_SIZE):
                                if chunk:
                                    f.write(chunk)
                                # num_bytes_downloaded counts wire bytes,
                                # so deltas stay accurate with redirects.
                                progress.update(response.num_bytes_downloaded - count)
                                count = response.num_bytes_downloaded
                            progress.set(status="Complete")
                except httpx.RequestError as exc:
                    # Network failure: discard the partial file, then
                    # try the next URL or give up on the last one.
                    path.unlink(missing_ok=True)
                    last_count = progress.kwargs["count"]
                    if i == len(urls):
                        raise Error(f"download failed at {last_count} bytes") from exc
                    else:
                        logger.info(
                            "download failed at %d bytes; trying next url", last_count
                        )
                else:
                    break  # success
        except KeyboardInterrupt as exc:
            # Ctrl-C: remove the partial file and report progress so far.
            path.unlink(missing_ok=True)
            last_count = progress.kwargs["count"]
            raise Error(f"download cancelled at {last_count} bytes") from exc
        except Exception:
            # Any other failure: never leave a partial file in the cache.
            path.unlink(missing_ok=True)
            raise
    return path