1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
|
#!/usr/bin/python3
import argparse
import logging
import sys
import requests
import re
import urllib.parse
import os
from lxml import etree as ET
log = logging.getLogger("download_docs")

# Regular expressions matched (with re.search) against the links found in the
# index page: a link matching any of these is downloaded as a document.
#
# Every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated by Python into a single pattern that never matches.
DOCS = [
    r"/Schema_del_file_xml.+\.xsd$",
    r"/Specifiche_tecniche.+\.pdf$",
    r"/Rappresentazione_tabellare_del_tracciato.+\.(?:pdf|xls)$",
    r"/fatturaPA.+\.xsl$",
    r"/fatturaordinaria.+\.xsl$",
    r"/changelog_formato\.pdf$",
    r"/Suggerimenti_Compilazione.+\.pdf$",
    r"/fatturapa.+\.xsl$",
    r"/Elenco_Controlli.+\.pdf$",
]

# Regular expressions matched (with re.search) against the links found in the
# index page: a link matching any of these is downloaded as an example.
EXAMPLES = [
    r"/IT01234567890_FP.+\.xml",
]
def get_urls(index_url):
    """
    Scan the index page for links to known documents and examples.

    The page at ``index_url`` is fetched and parsed leniently, the ``href``
    of every ``<a>`` element is collected, and each link is classified by
    searching it with the DOCS and EXAMPLES patterns.

    :param index_url: URL of the page that links to the downloadable files
    :return: generator of ``{"type": "doc" | "example", "href": href}``
        dicts, where ``href`` is copied verbatim from the page
    :raises requests.HTTPError: if the server answers with an error status
    """
    index = requests.get(index_url, timeout=60)
    # Fail loudly instead of parsing an error page and finding no links
    index.raise_for_status()

    # recover=True makes the parser carry on over markup that is not
    # well-formed XML
    parser = ET.XMLParser(recover=True)
    root = ET.fromstring(index.text, parser)
    if root is None:
        # Nothing usable could be recovered from the page
        return

    re_docs = [re.compile(r) for r in DOCS]
    re_examples = [re.compile(r) for r in EXAMPLES]

    links = []
    for a in root.iter("a"):
        href = a.attrib.get("href")
        if href is None:
            continue
        # There seem to be various wrong links to this file, so we ignore
        # them
        if "IT01234567890_11111" in href:
            continue
        links.append(href)

    for link in links:
        # any() reports a link once per kind, even when several patterns of
        # that kind match it
        if any(r.search(link) for r in re_docs):
            yield {"type": "doc", "href": link}
        if any(r.search(link) for r in re_examples):
            yield {"type": "example", "href": link}
def download(index_url):
    """
    Download every document and example linked from the index page.

    Files are named after the last path component of their URL. A file that
    already exists is skipped. A download that fails is logged as a warning
    and skipped, so one broken link does not stop the remaining downloads.

    :param index_url: URL of the page that links to the downloadable files
    """
    # Destination directory for each kind of link returned by get_urls.
    # NOTE(review): examples are stored in "doc" together with the documents,
    # as the code always did; confirm a separate directory is not wanted.
    dest_dirs = {"doc": "doc", "example": "doc"}

    for el in get_urls(index_url):
        url = urllib.parse.urljoin(index_url, el["href"])
        filename = os.path.basename(urllib.parse.urlparse(url).path)
        dest = os.path.join(dest_dirs[el["type"]], filename)
        if os.path.exists(dest):
            log.info("%s: already downloaded", dest)
            continue

        log.info("%s: downloading", dest)
        os.makedirs(os.path.dirname(dest), exist_ok=True)

        # Download to a temporary name and rename at the end: a truncated
        # file must never be mistaken for an already downloaded one
        partial = dest + ".part"
        try:
            with requests.get(url, stream=True, timeout=60) as res:
                # Do not save the body of an error page as if it were the file
                res.raise_for_status()
                with open(partial, "wb") as fd:
                    for chunk in res.iter_content(chunk_size=64 * 1024):
                        fd.write(chunk)
            os.replace(partial, dest)
        except requests.RequestException as e:
            log.warning("%s: download failed: %s", url, e)
        finally:
            # After a successful rename there is nothing left to clean up
            if os.path.exists(partial):
                os.remove(partial)
class Fail(Exception):
    """
    Error whose message is printed to standard error by the script entry
    point, which then exits with status 1.
    """
def main():
    """
    Command line entry point: parse the options, configure logging to
    standard error and download everything linked from the FatturaPA
    documentation page.
    """
    cli = argparse.ArgumentParser(description="download documents and examples from www.fatturapa.gov.it")
    cli.add_argument("--verbose", "-v", action="store_true", help="verbose output")
    cli.add_argument("--debug", action="store_true", help="debug output")
    opts = cli.parse_args()

    # --debug wins over --verbose; with neither, only warnings are shown
    if opts.debug:
        verbosity = logging.DEBUG
    elif opts.verbose:
        verbosity = logging.INFO
    else:
        verbosity = logging.WARN

    logging.basicConfig(
        level=verbosity,
        stream=sys.stderr,
        format="%(asctime)-15s %(levelname)s %(message)s",
    )

    download("https://www.fatturapa.gov.it/it/norme-e-regole/documentazione-fattura-elettronica/formato-fatturapa/")
if __name__ == "__main__":
    try:
        main()
    except Fail as e:
        # Fail carries a message for the user: print it without a traceback
        print(e, file=sys.stderr)
        sys.exit(1)
    except Exception:
        log.exception("uncaught exception")
        # Also report the failure through the exit status, so that calling
        # scripts do not mistake a crash for a successful run
        sys.exit(1)
|