#!/usr/bin/env python3
# update/cn_loc.py - script to fetch data from the CN Open Data community
#
# Copyright (C) 2014-2015 Jiangge Zhang
# Copyright (C) 2015-2026 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA
"""This downloads the birth place codes from from Wikipedia."""
import re
import unicodedata
from collections import defaultdict

import requests

# The Wikipedia pages to download: the list of administrative division
# codes of the People's Republic of China is split over eight pages
# (1区 to 8区), one per leading digit of the code
wikipedia_pages = [f'中华人民共和国行政区划代码 ({i}区)' for i in range(1, 9)]

# The user agent that will be passed in requests
user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'


def get_wikipedia_url(page):
    """Return the URL for the raw wikitext of the Simplified Chinese Wikipedia page."""
    return f'https://zh.wikipedia.org/w/index.php?title={page.replace(" ", "_")}&action=raw'  # noqa: E231
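
# For example, the URL for the first page would look like (illustrative,
# URL encoding aside):
# https://zh.wikipedia.org/w/index.php?title=中华人民共和国行政区划代码_(1区)&action=raw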

# Regular expression for matching a province heading
province_re = re.compile(r'^== *(?P<province>.*) +\((?P<prefix>[0-9]+)\) +==')
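# A province heading looks like this (illustrative example; 11 is the
# prefix for Beijing):
#   == 北京市 (11) ==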

# Regular expression for matching a table row
entry_re = re.compile(
    r'^\| *(?P<number>[0-9]{6}) *' +
    r'\|\| *(?P<activation>.*) *' +
    r'\|\| *(?P<revocation>.*) *' +
    r'\|\| *(?P<county>.*) *' +
    r'\|\| *(?P<code>.*)')
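# A table row holds the numeric code, activation year, revocation year,
# county name(s) and a final code column, e.g. (illustrative values only):
#   | 110101 || 1980 || || 东城区 || DCQ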


def clean(value):
    """Clean up unicode characters and Wikipedia markup in the string."""
    # Replace fullwidth parentheses with normal ones surrounded by spaces
    value = value.replace(
        unicodedata.lookup('FULLWIDTH LEFT PARENTHESIS'), ' (',
    ).replace(
        unicodedata.lookup('FULLWIDTH RIGHT PARENTHESIS'), ') ',
    )
    # Replace Wikipedia links ([[target|text]] or [[text]]) with their text
    return re.sub(r'\[\[([^]|]*\|)?([^]|]+)\]\]', r'\2', value)
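
# For example, clean('[[北京市|北京]]') returns '北京' and fullwidth
# parentheses （…） become ASCII parentheses with surrounding spaces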


def parse_county(county, activation, revocation):
    """Parse the county string and yield county names with optional year ranges."""
    # This parses the various formats as seen on Wikipedia
    for value in county.split('<br>'):
        m = re.match(r'(?P<county>.*) +\((?P<year>[0-9]{4})年至今\) +', value)
        if m:  # in use since the given year (年至今: "from year until now")
            yield f'[{m.group("year")}-{revocation}]{m.group("county")}'
            continue
        m = re.match(r'(?P<county>.*) +\((?P<year>[0-9]{4})年前\) +', value)
        if m:  # in use before the given year (年前: "before year")
            yield f'[{activation}-{int(m.group("year")) - 1}]{m.group("county")}'
            continue
        m = re.match(r'(?P<county>.*) +\((?P<years>[0-9]{4}-[0-9]{4})年曾撤销\) +', value)
        if m:  # abolished between the given years (曾撤销: "was abolished")
            if activation or revocation:
                yield f'[{activation}-{revocation}]{m.group("county")}'
            else:
                yield m.group('county')
            continue
        m = re.match(r'(?P<county>.*) +\((?P<start>[0-9]{4})年?-(?P<end>[0-9]{4})年\) +', value)
        if m:  # in use between the given years
            yield f'[{m.group("start")}-{int(m.group("end")) - 1}]{m.group("county")}'
            continue
        if activation or revocation:
            yield f'[{activation}-{revocation}]{value}'
        else:
            yield value
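
# For example (illustrative values), with activation '1981' and no revocation,
# parse_county('甲县 (1990年至今) <br>乙县 (1990年前) ', '1981', '')
# yields '[1990-]甲县' and '[1981-1989]乙县'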


def parse_page(content):
    """Parse the contents of the Wikipedia page and yield (prefix, province, number, county) tuples."""
    province = None
    prefix = None
    for line in content.splitlines():
        line = clean(line)
        m = province_re.match(line)
        if m:
            province = m.group('province')
            prefix = m.group('prefix')
            continue
        m = entry_re.match(line)
        if m:
            number = m.group('number')
            assert number.startswith(prefix)
            counties = m.group('county')
            # The activation and revocation columns should contain years
            try:
                activation = str(int(m.group('activation')))
            except ValueError:
                activation = ''
            try:
                revocation = str(int(m.group('revocation')))
            except ValueError:
                revocation = ''
            for county in parse_county(counties, activation, revocation):
                yield prefix, province, number, county.strip()
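
# Each yielded tuple looks like (illustrative values):
#   ('11', '北京市', '110101', '[1980-]东城区')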


if __name__ == '__main__':
    """Output a data file in the right format."""
    print('# Downloaded from')
    for page in wikipedia_pages:
        print(f'# {get_wikipedia_url(page)}')
    # Download all data
    provinces = {}
    numbers = defaultdict(lambda: defaultdict(list))
    for page in wikipedia_pages:
        response = requests.get(get_wikipedia_url(page), timeout=30, headers={'User-Agent': user_agent})
        response.raise_for_status()
        for prefix, province, number, county in parse_page(response.text):
            provinces[prefix] = province
            numbers[prefix][number].append(county)
    # Print data
    for prefix, province in sorted(provinces.items()):
        print(f'{prefix} province="{province}"')
        for number, counties in sorted(numbers[prefix].items()):
            county = ','.join(sorted(counties))
            print(f' {number[2:]} county="{county}"')
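
# The output is normally redirected to the loc.dat data file that ships
# with python-stdnum (assumed invocation and output path):
#   python update/cn_loc.py > stdnum/cn/loc.dat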