File: fetch_and_build.py

package info: python-fingerprints 1.3.1-1
  • area: main
  • in suites: forky, sid
  • size: 920 kB
  • sloc: python: 1,290; makefile: 17
"""Fetch entity legal-form names and abbreviations from the GLEIF ISO 20275
code list and an OCCRP spreadsheet, merge them, and write the result to
TYPES_PATH as YAML."""
import io
import csv
import yaml
from typing import Dict, Set
from urllib.request import urlopen
from normality import slugify, stringify

from fingerprints.types.common import TYPES_PATH

ELF_URL = "https://www.gleif.org/about-lei/code-lists/iso-20275-entity-legal-forms-code-list/2021-10-21-elf-code-list-v1.4.1.csv"
OCCRP_URL = "https://docs.google.com/spreadsheets/d/1Cw2xQ3hcZOAgnnzejlY5Sv3OeMxKePTqcRhXQU8rCAw/pub?gid=0&single=true&output=csv"

# Maps a company-type abbreviation to the set of alternative labels seen for it.
TypesData = Dict[str, Set[str]]


def load_elf(types: TypesData) -> None:
    """Merge abbreviations and names from the GLEIF ISO 20275 ELF code list."""
    fh = urlopen(ELF_URL)
    fh = io.TextIOWrapper(fh, encoding="utf-8")
    for row in csv.DictReader(fh):
        data = {}
        for label, value in row.items():
            # Drop any parenthesised suffix from the CSV header and slugify it,
            # so keys look like "abbreviations_local_language".
            label = label.split("(")[0]
            data[slugify(label, sep="_")] = value
        abb_translit = data["abbreviations_transliterated"].split(";")
        name_translit = data["entity_legal_form_name_transliterated_name"]
        abb_local = data["abbreviations_local_language"].split(";")
        name_local = data["entity_legal_form_name_local_name"]
        # Collect the transliterated abbreviations; each becomes a key below.
        abb_all = []
        for abb in abb_translit:
            abb = abb.strip()
            if len(abb):
                abb_all.append(abb)
        labels = set(abb_all)
        for abb in abb_local:
            abb = abb.strip()
            if len(abb):
                labels.add(abb)
        if len(name_translit):
            labels.add(name_translit)
        if len(name_local):
            labels.add(name_local)
        # Map each abbreviation to every other label seen for this legal form.
        for abb in abb_all:
            other = set(labels)
            other.remove(abb)
            if abb not in types:
                types[abb] = other
            else:
                types[abb].update(other)


def load_occrp(types: TypesData) -> None:
    """Merge abbreviation/name pairs from the OCCRP company types spreadsheet."""
    fh = urlopen(OCCRP_URL)
    fh = io.TextIOWrapper(fh, encoding="utf-8")
    for row in csv.DictReader(fh):
        name = stringify(row.get("Name"))
        abb = stringify(row.get("Abbreviation"))
        if name is None or abb is None:
            continue
        types.setdefault(abb, set()).add(name)


def build() -> None:
    """Combine both sources and write the merged types table to TYPES_PATH."""
    types: TypesData = {}
    load_elf(types)
    load_occrp(types)

    # Sort keys and forms so rebuilds produce a stable, diff-friendly file.
    out = []
    for type_, forms in sorted(types.items()):
        out.append({"main": type_, "forms": sorted(forms)})

    with open(TYPES_PATH, "wb") as fh:
        fh.write(
            yaml.dump(
                {"types": out},
                allow_unicode=True,
                encoding="utf-8",
                sort_keys=False,
            )
        )


if __name__ == "__main__":
    build()
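
# A minimal sketch of the shape of the document written to TYPES_PATH; the
# entry shown is a hypothetical illustration, not copied from either source:
#
#   types:
#     - main: GmbH
#       forms:
#         - Gesellschaft mit beschränkter Haftung
#
# The file can be read back with yaml.safe_load(), yielding a dict with a
# single "types" key holding the list built above.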