# numdb.py - module for handling hierarchically organised numbers
#
# Copyright (C) 2010-2023 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA
"""Query structured number format files with number properties.

This module contains functions for reading and querying a database that
stores numbers that use a hierarchical format (e.g. ISBN, IBAN, phone
numbers, etc).

To read a database from a file:

>>> with open('tests/numdb-test.dat', 'r') as f:
...     dbfile = read(f)

To split a number:

>>> dbfile.split('01006')
['0', '100', '6']
>>> dbfile.split('902006')
['90', '20', '06']
>>> dbfile.split('909856')
['90', '985', '6']

To split the number and get properties for each part:

>>> import pprint
>>> pprint.pprint(dbfile.info('01006'))
[('0', {'prop1': 'foo'}), ('100', {'prop2': 'bar'}), ('6', {})]
>>> pprint.pprint(dbfile.info('02006'))
[('0', {'prop1': 'foo'}), ('200', {'prop2': 'bar', 'prop3': 'baz'}), ('6', {})]
>>> pprint.pprint(dbfile.info('03456'))
[('0', {'prop1': 'foo'}), ('345', {'prop2': 'bar', 'prop3': 'baz'}), ('6', {})]
>>> pprint.pprint(dbfile.info('902006'))
[('90', {'prop1': 'booz'}), ('20', {'prop2': 'foo'}), ('06', {})]
>>> pprint.pprint(dbfile.info('909856'))
[('90', {'prop1': 'booz'}), ('985', {'prop2': 'fooz'}), ('6', {})]
>>> pprint.pprint(dbfile.info('9889'))
[('98', {'prop1': 'booz'}), ('89', {'prop2': 'foo'})]
>>> pprint.pprint(dbfile.info('633322'))
[('6', {'prop1': 'boo'}), ('333', {'prop2': 'bar', 'prop3': 'baz', 'prop4': 'bla'}), ('22', {})]
>>> pprint.pprint(dbfile.info('1200333'))
[('1', {'prop1': 'foo'}), ('200', {'prop2': 'bar', 'prop3': 'baz'}), ('333', {'prop4': 'bax'})]

"""
from __future__ import annotations
import re
# The flag is False at runtime, so the guarded names below exist only for
# static analysis and are never imported or evaluated when the module runs.
TYPE_CHECKING = False
if TYPE_CHECKING:  # pragma: no cover (only used when type checking)
    from collections.abc import Generator, Iterable
    from typing import IO, Any

    # one prefix entry: (length, low, high, props, children)
    PrefixInfo = tuple[int, str, str, dict[str, str], list['PrefixInfo']]


# matches one data line: leading spaces (nesting depth), a comma-separated
# list of values or low-high ranges, and the remainder holding properties
_line_re = re.compile(''.join((
    r'^(?P<indent> *)',
    r'(?P<ranges>([^-,\s]+(-[^-,\s]+)?)(,[^-,\s]+(-[^-,\s]+)?)*)\s*',
    r'(?P<props>.*)$',
)))

# matches a single name="value" property assignment
_prop_re = re.compile(r'(?P<prop>[0-9a-zA-Z-_]+)="(?P<value>[^"]*)"')

# this is a cache of open databases, keyed by database name
_open_databases = {}
# the prefixes attribute of NumDB is structured as follows:
# prefixes = [
# [ length, low, high, props, children ]
# ...
# ]
# where children is a prefixes structure in its own right
# (there is no expected ordering within the list)
class NumDB:
    """Number database.

    The ``prefixes`` attribute holds a list of
    ``(length, low, high, props, children)`` entries, where ``children`` is
    itself a list of such entries describing the part that follows.
    """

    prefixes: list[PrefixInfo]

    def __init__(self) -> None:
        """Create a database that does not contain any prefixes yet."""
        self.prefixes = []

    @staticmethod
    def _find(number: str, prefixes: list[PrefixInfo]) -> list[tuple[str, dict[str, str]]]:
        """Split number into parts using prefixes and collect properties.

        This produces what info() returns. At every level the leading part
        of the remaining digits is matched against the candidate entries,
        after which the children of the matching entries become the
        candidates for the rest of the number.
        """
        found: list[tuple[str, dict[str, str]]] = []
        remaining = number
        candidates = prefixes
        while remaining:
            # without any match the whole remainder becomes a single part
            part = remaining
            properties: dict[str, Any] = {}
            next_prefixes: list[PrefixInfo] = []
            for length, low, high, props, children in candidates:
                if length > len(part) or not low <= part[:length] <= high:
                    continue
                if length < len(part):
                    # a shorter match replaces everything gathered so far
                    part = part[:length]
                    properties = {}
                    next_prefixes = []
                # matches of the same (shortest) length are merged
                properties.update(props)
                next_prefixes.extend(children)
            found.append((part, properties))
            remaining = remaining[len(part):]
            candidates = next_prefixes
        return found

    def info(self, number: str) -> list[tuple[str, dict[str, str]]]:
        """Split the number in components and look up their properties.

        This returns a list with a tuple for every component. Each tuple
        consists of a string (a part of the number) and a dict with the
        properties found for that part.
        """
        return NumDB._find(number, self.prefixes)

    def split(self, number: str) -> list[str]:
        """Split the number in components and return the list of parts."""
        return [component for component, _ in self.info(number)]
def _parse(
    fp: Iterable[str],
) -> Generator[tuple[int, int, str, str, dict[str, str], list[PrefixInfo]]]:
    """Parse lines of text into prefix entries.

    Every line that is not blank and does not start with ``#`` is expected
    to consist of optional leading spaces, a comma-separated list of values
    or ``low-high`` ranges and optionally ``name="value"`` properties.

    For each value or range an ``(indent, length, low, high, props,
    children)`` tuple is generated, where indent is the number of leading
    spaces and length is the length of low. All tuples generated for one
    line share the same props dict and the same (initially empty) children
    list.
    """
    for line in fp:
        # ignore comments and blank lines; startswith() is used instead of
        # line[0] so that empty strings (e.g. from str.splitlines()) are
        # skipped instead of raising IndexError
        if line.startswith('#') or not line.strip():
            continue  # pragma: no cover (optimisation takes it out)
        # any other line should parse
        match = _line_re.search(line)
        assert match is not None
        indent = len(match.group('indent'))
        props = dict(_prop_re.findall(match.group('props')))
        # created once per line so that all ranges on it share the children
        children: list[PrefixInfo] = []
        for rnge in match.group('ranges').split(','):
            # the line pattern allows at most one dash within a range
            low, dash, high = rnge.partition('-')
            if not dash:
                # a single value is a range of exactly that value
                high = low
            yield indent, len(low), low, high, props, children
def read(fp: Iterable[str]) -> NumDB:
    """Build a new database from the lines provided by fp.

    The indentation of each parsed line determines nesting: an entry that
    is indented further than the line before it is stored as a child of the
    entry that was added last at the previous indentation.
    """
    database = NumDB()
    # maps an indentation width to the list that receives entries found at
    # that width
    levels = {0: database.prefixes}
    previous_indent = 0
    for indent, length, low, high, props, children in _parse(fp):
        if indent > previous_indent:
            # going one level deeper: entries at this indentation are
            # collected in the children list of the last parent entry
            parent = levels[previous_indent][-1]
            levels[indent] = parent[4]
        entry = (length, low, high, props, children)
        levels[indent].append(entry)
        previous_indent = indent
    return database
def _get_resource_stream(name: str) -> IO[bytes]:
    """Open the named resource of this package for reading in binary mode.

    importlib.resources.files() is tried first; when that raises
    ImportError or AttributeError the resource is opened through
    pkg_resources instead.
    """
    try:  # pragma: no cover (Python 3.9 and newer)
        import importlib.resources

        package_files = importlib.resources.files(__package__)
        return package_files.joinpath(name).open('rb')
    except (ImportError, AttributeError):  # pragma: no cover (older Python versions)
        import pkg_resources  # type: ignore[import-untyped]

        return pkg_resources.resource_stream(__name__, name)  # type: ignore[no-any-return]
def get(name: str) -> NumDB:
    """Return the database with the specified name, loading it if needed.

    The database is read from the UTF-8 encoded resource called
    ``<name>.dat`` the first time it is requested. The result is kept in
    the module-level cache and returned directly on later calls.
    """
    try:
        return _open_databases[name]
    except KeyError:
        pass  # not loaded before, read it below
    import codecs

    utf8_reader = codecs.getreader('utf-8')
    with utf8_reader(_get_resource_stream(name + '.dat')) as fp:
        database = _open_databases[name] = read(fp)
    return database
|