File: fetch_alternative_names.py

package info (click to toggle)
geonames 0.3.1-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 629,780 kB
  • sloc: ansic: 971; python: 41; makefile: 30; sh: 27; xml: 6
file content (65 lines) | stat: -rw-r--r-- 2,214 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/python3
#
# Copyright 2022 UBports Foundation.
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 3, as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranties of
# MERCHANTABILITY, SATISFACTORY QUALITY, or FITNESS FOR A PARTICULAR
# PURPOSE.  See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program.  If not, see <http://www.gnu.org/licenses/>.

import os
import queue
import sys
import threading
from urllib import request
from zipfile import ZipFile

import requests
from bs4 import BeautifulSoup

# Directory index that lists the alternate-name archives to download.
url = "http://download.geonames.org/export/dump/alternatenames/"
# Bound the request so a stalled server cannot hang the script, and fail
# loudly on an HTTP error instead of parsing an error page as the index.
res = requests.get(url, timeout=60)
res.raise_for_status()

so = BeautifulSoup(res.text, "html.parser")
# The archive links are looked up inside the page's first <pre> element.
pre = so.pre
if pre is None:
    raise RuntimeError("No <pre> listing found at {}".format(url))
all_a = pre.find_all("a")

# exist_ok avoids the check-then-create race of exists() followed by mkdir().
os.makedirs("alternatenames", exist_ok=True)
os.makedirs("alternatenames_tmp", exist_ok=True)

# Queue of archive file names for the worker threads to consume.
q = queue.Queue()
for a in all_a:
    name = a.string
    # .string is None when the anchor wraps nested markup; skip such links
    # rather than crashing on None.endswith().
    if name is None or not name.endswith(".zip"):
        continue
    # The link text becomes a local file name below "alternatenames_tmp/",
    # so refuse anything that carries a directory component.
    if os.path.basename(name) != name:
        continue
    q.put_nowait(name)

class Worker(threading.Thread):
    """Thread that downloads and unpacks the archives named in a queue.

    Each queue item is an archive file name. The archive is fetched from
    ``base_url`` into ``alternatenames_tmp/``, extracted into
    ``alternatenames/``, and the temporary copy is then removed.

    Args:
        q: ``queue.Queue`` holding the archive file names to process.
        *args: Forwarded to ``threading.Thread``.
        base_url: Location the archives are fetched from. Defaults to the
            module-level ``url``.
        timeout: Seconds to wait for a queue item before the thread
            considers the queue drained and exits.
        **kwargs: Forwarded to ``threading.Thread``.
    """

    def __init__(self, q, *args, base_url=None, timeout=3, **kwargs):
        self.q = q
        # Resolve the module-level default lazily so Worker(q) keeps working.
        self.base_url = url if base_url is None else base_url
        self.timeout = timeout
        super().__init__(*args, **kwargs)

    def run(self):
        """Process queue items until none arrives within ``self.timeout``."""
        while True:
            try:
                lang = self.q.get(timeout=self.timeout)
            except queue.Empty:
                return
            try:
                self._fetch_and_extract(lang)
            except Exception as err:
                # Thread boundary: one bad archive must not kill the worker,
                # otherwise the remaining items may never be consumed.
                print("Failed to process {}: {!r}".format(lang, err),
                      file=sys.stderr)
            finally:
                # Always acknowledge the item; a missing task_done() would
                # make Queue.join() block forever.
                self.q.task_done()

    def _fetch_and_extract(self, lang):
        """Download one archive, extract it, and drop the temporary file."""
        # base_url may already end in "/"; avoid producing a double slash.
        source = "{}/{}".format(self.base_url.rstrip("/"), lang)
        target = "alternatenames_tmp/{}".format(lang)
        print("Downloading {} from {} to {}".format(lang, source, target))
        try:
            request.urlretrieve(source, target)
            print("Done downloading {}".format(lang))

            print("Extracting {}".format(lang))
            with ZipFile(target, 'r') as zipObj:
                zipObj.extractall('alternatenames')
            print("Done extracting {}".format(lang))
        finally:
            # Remove the temporary archive on failure too, so the temporary
            # directory can still be deleted once all work is finished.
            try:
                os.remove(target)
            except FileNotFoundError:
                pass

# Build a pool of 20 download threads and set them all running.
pool = [Worker(q) for _ in range(20)]
for worker in pool:
    worker.start()

# Block until every queued archive has been marked done, then drop the
# (expected to be empty) temporary download directory.
q.join()
os.rmdir("alternatenames_tmp")