#!/usr/bin/env python

from __future__ import print_function

try:
    from urllib.request import urlopen
except ImportError:
    from urllib2 import urlopen
import xml.etree.ElementTree as etree

from intranges import intranges_from_list

# Version string substituted for "{version}" in the URL templates below.
UNICODE_VERSION = "6.3.0"

# Download locations of the source data; "{version}" is a str.format() field.
SCRIPTS_URL = "http://www.unicode.org/Public/{version}/ucd/Scripts.txt"
JOININGTYPES_URL = "http://www.unicode.org/Public/{version}/ucd/ArabicShaping.txt"
IDNATABLES_URL = "http://www.iana.org/assignments/idna-tables-{version}/idna-tables-{version}.xml"

# XML namespace used to qualify element names in the IANA tables document.
IDNATABLES_NS = "http://www.iana.org/assignments"

# Only these scripts are emitted: they are the ones needed to compute the
# IDNA contextual rules, see
# https://www.iana.org/assignments/idna-tables-6.3.0#idna-tables-context
SCRIPT_WHITELIST = [
    "Greek",
    "Han",
    "Hebrew",
    "Hiragana",
    "Katakana",
]
SCRIPT_WHITELIST.sort()


def print_optimised_list(d):
    """Print the code points in *d* as a parenthesised tuple of hex ranges.

    *d* is handed to ``intranges_from_list``; every ``(start, end)`` pair it
    yields is written on its own line as ``(0x.., 0x..),``.  The opening
    parenthesis is printed bare so that it can follow a dictionary key the
    caller has already written on the same line, and the closing line carries
    a trailing comma so the result can sit inside a dict literal.
    """
    print("(")
    for pair in intranges_from_list(d):
        (lower, upper) = pair
        bounds = ", ".join(hex(int(value)) for value in (lower, upper))
        print("        (" + bounds + "),")
    print("    ),")


def _read_url(url, as_lines=False):
    """Fetch *url* and return its payload, always closing the connection.

    Returns the raw bytes from ``read()``, or, when *as_lines* is true, the
    list of raw byte lines from ``readlines()``.
    """
    # Function-scope import keeps this block self-contained.  closing() is
    # used so the response is released whether or not the object returned by
    # urlopen() is itself a context manager.
    from contextlib import closing

    with closing(urlopen(url)) as response:
        return response.readlines() if as_lines else response.read()


def _data_lines(url):
    """Yield the stripped text lines at *url*, skipping blanks and '#' lines."""
    for raw in _read_url(url, as_lines=True):
        line = raw.decode('utf-8').strip()
        if line and not line.startswith('#'):
            yield line


def _codepoints(spec, separator):
    """Return the range of code points described by the hexadecimal *spec*.

    *spec* is either a single code point (``"0041"``) or an inclusive span
    whose two ends are joined by *separator* (``"0041..005A"``).

    Raises:
        ValueError: if a part is not valid hexadecimal or there are more
            than two parts.
    """
    bounds = [int(part, 16) for part in spec.split(separator)]
    if len(bounds) == 1:
        begin = end = bounds[0]
    else:
        begin, end = bounds
    return range(begin, end + 1)


def build_idnadata(version):
    """Print the generated IDNA data module for *version* to standard output.

    Three tables are downloaded and emitted, in this order:

    * ``scripts`` -- code point ranges for each script in SCRIPT_WHITELIST,
      parsed from SCRIPTS_URL.
    * ``joining_types`` -- code point to joining type, parsed from
      JOININGTYPES_URL.
    * ``codepoint_classes`` -- code point ranges per property, parsed from the
      XML at IDNATABLES_URL; UNASSIGNED and DISALLOWED records are omitted.

    Args:
        version: version string substituted into the three URL templates.

    Raises:
        KeyError: if a whitelisted script does not occur in the scripts data.
        ValueError: if a data line does not have the expected fields.
        Whatever ``urlopen`` or the XML parser raise on failure is propagated.
    """
    print("# This file is automatically generated by build-idnadata.py\n")

    #
    # Script classifications are used by some CONTEXTO rules in RFC 5891
    #
    print("scripts = {")
    scripts = {}
    for line in _data_lines(SCRIPTS_URL.format(version=version)):
        # Discard any trailing comment.  (Testing the truthiness of
        # str.find() is wrong here: it returns -1, which is true, when the
        # character is absent -- so simply split unconditionally.)
        line = line.split('#', 1)[0]
        codepoints, scriptname = [x.strip() for x in line.split(';')]
        scripts.setdefault(scriptname, set()).update(
            _codepoints(codepoints, '..'))

    for script in SCRIPT_WHITELIST:
        print("    '{0}':".format(script), end=' ')
        print_optimised_list(scripts[script])

    print("}")

    #
    # Joining types are used by CONTEXTJ rule A.1
    #
    print("joining_types = {")
    for line in _data_lines(JOININGTYPES_URL.format(version=version)):
        # Each record must have exactly four ';'-separated fields; only the
        # code point and the joining type are emitted.
        codepoint, _name, joiningtype, _group = [
            x.strip() for x in line.split(';')]
        print("    {0}: '{1}',".format(hex(int(codepoint, 16)), joiningtype))
    print("}")

    #
    # These are the classification of codepoints into PVALID, CONTEXTO, CONTEXTJ, etc.
    #
    print("codepoint_classes = {")
    classes = {}

    # ElementTree addresses namespaced elements as "{namespace-uri}tag".
    namespace = "{{{0}}}".format(IDNATABLES_NS)
    root = etree.fromstring(_read_url(IDNATABLES_URL.format(version=version)))

    record_path = '{0}registry[@id="idna-tables-properties"]/{0}record'.format(
        namespace)
    for record in root.findall(record_path):
        codepoint = record.find("{0}codepoint".format(namespace)).text
        prop = record.find("{0}property".format(namespace)).text
        if prop in ('UNASSIGNED', 'DISALLOWED'):
            continue
        classes.setdefault(prop, set()).update(_codepoints(codepoint, '-'))

    # Sort the property names so the generated file is identical from run to
    # run regardless of the interpreter's dict ordering.
    for prop in sorted(classes):
        print("    '{0}':".format(prop), end=' ')
        print_optimised_list(classes[prop])

    print("}")

# Script entry point: emit the tables for the pinned Unicode version.
if __name__ == "__main__":
    build_idnadata(version=UNICODE_VERSION)

