File: download-docs

package info: python-a38 0.1.8-1
  • area: main
  • in suites: sid, trixie
  • size: 440 kB
  • sloc: python: 4,065; xml: 174; makefile: 80; sh: 14
#!/usr/bin/python3
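"""Download documentation and example files for the FatturaPA invoice format
from www.fatturapa.gov.it."""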
import argparse
import logging
import os
import re
import sys
import urllib.parse

import requests
from lxml import etree as ET

log = logging.getLogger("download_docs")

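# Patterns (matched against link hrefs) for documentation files on the index page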
DOCS = [
    r"/Schema_del_file_xml.+\.xsd$",
    r"/Specifiche_tecniche.+\.pdf$",
    r"/Rappresentazione_tabellare_del_tracciato.+\.(?:pdf|xls)$",
    r"/fatturaPA.+\.xsl$",
    r"/fatturaordinaria.+\.xsl$"
    r"/changelog_formato\.pdf$",
    r"/Suggerimenti_Compilazione.+\.pdf$",
    r"/fatturapa.+\.xsl$",
    r"/fatturaordinaria.+\.xsl$",
    r"/Elenco_Controlli.+\.pdf$",
]

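# Patterns for example invoice files published on the same page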
EXAMPLES = [
    r"/IT01234567890_FP.+\.xml",
]


def get_urls(index_url):
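    """Scrape the index page and yield {"type": ..., "href": ...} dicts for
    each documentation or example link matching the patterns above."""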
    index = requests.get(index_url)
    index.raise_for_status()
    # The index page is HTML, not well-formed XML: parse it leniently
    parser = ET.XMLParser(recover=True)
    root = ET.fromstring(index.text, parser)
    re_docs = [re.compile(r) for r in DOCS]
    re_examples = [re.compile(r) for r in EXAMPLES]
    links = []
    for a in root.iter("a"):
        href = a.attrib.get("href")
        if href is None:
            continue
        # There seem to be various wrong links to this file, so we ignore
        # them
        if "IT01234567890_11111" in href:
            continue
        links.append(href)
    for link in links:
        # Classify each link at most once: docs take precedence over examples
        if any(r.search(link) for r in re_docs):
            yield {"type": "doc", "href": link}
        elif any(r.search(link) for r in re_examples):
            yield {"type": "example", "href": link}


def download(index_url):
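    """Download all matched files into the doc/ directory, skipping files
    that already exist locally."""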
    os.makedirs("doc", exist_ok=True)
    for el in get_urls(index_url):
        url = urllib.parse.urljoin(index_url, el["href"])
        parsed = urllib.parse.urlparse(url)
        filename = os.path.basename(parsed.path)
        # Documentation and example files currently share the same destination
        dest = os.path.join("doc", filename)
        if os.path.exists(dest):
            log.info("%s: already downloaded", dest)
            continue
        log.info("%s: downloading", dest)
        res = requests.get(url, stream=True)
        res.raise_for_status()
        # Stream the response to disk in small chunks to keep memory use low
        with open(dest, 'wb') as fd:
            for chunk in res.iter_content(chunk_size=128):
                fd.write(chunk)


class Fail(Exception):
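    """Raised for fatal errors that should be reported without a traceback."""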
    pass


def main():
    parser = argparse.ArgumentParser(description="download documents and examples from www.fatturapa.gov.it")
    parser.add_argument("--verbose", "-v", action="store_true", help="verbose output")
    parser.add_argument("--debug", action="store_true", help="debug output")

    args = parser.parse_args()

    log_format = "%(asctime)-15s %(levelname)s %(message)s"
    level = logging.WARN
    if args.debug:
        level = logging.DEBUG
    elif args.verbose:
        level = logging.INFO
    logging.basicConfig(level=level, stream=sys.stderr, format=log_format)

    download("https://www.fatturapa.gov.it/it/norme-e-regole/documentazione-fattura-elettronica/formato-fatturapa/")


if __name__ == "__main__":
    try:
        main()
    except Fail as e:
        print(e, file=sys.stderr)
        sys.exit(1)
    except Exception:
        log.exception("uncaught exception")
        sys.exit(1)