File: pull_dois.py

package info (click to toggle)
python-librosa 0.11.0-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 166,732 kB
  • sloc: python: 21,731; makefile: 141; sh: 2
file content (93 lines) | stat: -rw-r--r-- 3,275 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Pull the DOI of every version of the librosa Zenodo record and save the
resulting version-to-DOI index as a msgpack file.

Uses urllib (instead of requests) for Python 3.13.
Validates URL schemes and follows pagination to retrieve all versions.
"""

import urllib.request
import urllib.error  # For handling HTTP and URL errors
import urllib.parse  # For parsing URLs to validate the scheme
import json  # For JSON parsing
import msgpack  # For saving data in msgpack format
import pyzenodo3  # Zenodo API wrapper


def validate_url(url):
    """
    Check that ``url`` uses the http or https scheme and return it unchanged.

    Any other scheme (e.g., file://) is rejected with a ValueError.
    """
    scheme = urllib.parse.urlparse(url).scheme
    if scheme in ("http", "https"):
        return url
    raise ValueError(f"Invalid URL scheme: {scheme}")


def safe_urlopen(req, timeout=30):
    """
    Open ``req`` with urllib.request.urlopen after checking its URL scheme.

    The request's full URL is passed through validate_url first, which raises
    ValueError for anything other than http/https, so such a request is never
    opened. The "# nosec" marker tells Bandit that the urlopen call is safe.
    """
    target_url = req.full_url
    validate_url(target_url)
    response = urllib.request.urlopen(req, timeout=timeout)  # nosec
    return response


def get_zenodo_record_versions(doi):
    """
    Collect every version of the Zenodo record identified by ``doi``.

    The record is looked up through pyzenodo3, and its
    data['links']['versions'] URL is used as the first page of version
    records. Each page is fetched as JSON; as long as the response carries a
    "next" entry under "links", that URL is fetched as well.

    Returns a dict mapping each record's metadata version to its DOI.
    """
    client = pyzenodo3.Zenodo()
    concept_record = client.find_record_by_doi(doi)

    # One header set is reused for every page request.
    request_headers = {"User-Agent": "Mozilla/5.0"}

    records = []
    page_url = concept_record.data["links"]["versions"]

    while page_url:
        # Reject non-http(s) URLs before building the request.
        validate_url(page_url)
        request = urllib.request.Request(page_url, headers=request_headers)
        with safe_urlopen(request, timeout=30) as response:
            raw_text = response.read().decode("utf-8")
            payload = json.loads(raw_text)

        # Version records live under hits -> hits in the response.
        page_hits = payload.get("hits", {}).get("hits", [])
        records.extend(page_hits)

        # A missing "next" link yields None, which ends the loop.
        page_url = payload.get("links", {}).get("next")

    # Build the mapping only after every page has been retrieved.
    versions_to_dois = {}
    for record in records:
        version = record["metadata"]["version"]
        versions_to_dois[version] = record["doi"]
    return versions_to_dois


def save_as_msgpack(data, filename="version_index.msgpack"):
    """
    Serialize ``data`` with msgpack and write it to ``filename``.

    The payload is packed in memory before the output file is opened. Opening
    with "wb" truncates the file, so packing first ensures that a
    serialization failure (e.g., a value msgpack cannot encode) leaves an
    existing file untouched instead of replacing it with an empty one.

    ``data`` is the object to serialize; ``filename`` is the destination path
    and defaults to "version_index.msgpack". Prints a confirmation message
    once the file has been written.
    """
    # Pack first: any encoding error is raised before the file is touched.
    packed = msgpack.packb(data)
    with open(filename, "wb") as f:
        f.write(packed)
    print(f"Data saved to {filename}")


# Script entry point: build the version index and write it to disk.
if __name__ == "__main__":
    # Main concept DOI covering all librosa versions.
    librosa_concept_doi = "10.5281/zenodo.591533"
    save_as_msgpack(get_zenodo_record_versions(librosa_concept_doi))