File: common.py

package info (click to toggle)
opencc 1.2.0%2Bds1-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 4,260 kB
sloc: cpp: 5,181; python: 470; makefile: 126; javascript: 106; sh: 70
file content (226 lines) | stat: -rw-r--r-- 7,210 bytes
# -*- coding: utf-8 -*-

import codecs
import sys


def _classify_line(line):
    """Return "empty", "comment" or "entry" for one dictionary line.

    Raises:
        ValueError: if the line is not blank, does not start with "#" and
            contains no tab character.
    """
    if line.strip() == "":
        return "empty"
    if line.startswith("#"):
        return "comment"
    if "\t" in line:
        return "entry"
    raise ValueError("Invalid dictionary line: " + line)


def _group_comment_blocks(parsed):
    """Collect comment lines into blocks, starting a new block at blank lines."""
    blocks = []
    current = []
    for kind, content in parsed:
        if kind == "comment":
            current.append(content)
        elif kind == "empty" and current:
            blocks.append(current)
            current = []
    if current:
        blocks.append(current)
    return blocks


def _write_lines(output_file, lines):
    """Write each text line to a binary file as UTF-8 followed by "\\n"."""
    for line in lines:
        output_file.write((line + "\n").encode("utf-8"))


def _write_blocks(output_file, blocks):
    """Write comment blocks with exactly one blank line between blocks."""
    for idx, block in enumerate(blocks):
        if idx:
            output_file.write(b"\n")
        _write_lines(output_file, block)


def sort_items(input_filename, output_filename):
    """Sort the entries of a dictionary file by key, keeping its comments.

    The input is read as UTF-8. Every line is either blank, a comment
    (starts with "#") or an entry ("key<TAB>value"). Entries are sorted by
    key with a stable sort, so entries with equal keys keep their input
    order. Comments are handled as follows:

    * Header: comment blocks up to the nearest blank line above the first
      entry. Without such a blank line there is no header.
    * Attached: a run of comments directly above an entry, where the run
      itself directly follows another entry or the start of the file.
      It is written immediately above its entry.
    * Floating: any other comment block between the header and the last
      entry. It is written before the entry that followed it in the
      input, with a blank line before and after the block.
    * Footer: comment blocks after the last entry, written after a blank
      line.

    Blank lines of the input are not preserved beyond the separators
    described above. If the file has no entries, its comment blocks are
    written separated by blank lines and followed by one blank line.

    Args:
        input_filename: path of the UTF-8 dictionary file to read.
        output_filename: path of the file to write (UTF-8, "\\n" endings).

    Raises:
        ValueError: if a line is not blank, not a comment and has no tab.
            The output file is not opened in that case.
    """
    # "with" guarantees the handle is released even if decoding fails.
    with codecs.open(input_filename, "r", encoding="utf-8") as input_file:
        lines = [line.rstrip("\r\n") for line in input_file]

    # Classify everything up front so invalid input fails before the
    # output file is created or truncated.
    parsed = [(_classify_line(line), line) for line in lines]
    entry_positions = [
        i for i, (kind, _) in enumerate(parsed) if kind == "entry"
    ]

    if not entry_positions:
        comment_blocks = _group_comment_blocks(parsed)
        with open(output_filename, "wb") as output_file:
            _write_blocks(output_file, comment_blocks)
            if comment_blocks:
                output_file.write(b"\n")
        return

    first_entry = entry_positions[0]
    last_entry = entry_positions[-1]

    # The header ends at the blank line closest to the first entry.
    header_end = -1
    for i in range(first_entry - 1, -1, -1):
        if parsed[i][0] == "empty":
            header_end = i
            break

    header_blocks = _group_comment_blocks(parsed[:header_end + 1])
    footer_blocks = _group_comment_blocks(parsed[last_entry + 1:])

    # Each entry is (key, value, attached_comments, floating_blocks).
    entries = []
    pending = []           # comment run currently being collected
    floating_pending = []  # floating blocks waiting for the next entry
    # True when the nearest non-comment line above is blank; the body
    # starts right after the header's closing blank line, if there is one.
    after_blank = header_end >= 0
    for kind, content in parsed[header_end + 1:last_entry + 1]:
        if kind == "comment":
            pending.append(content)
        elif kind == "empty":
            if pending:
                floating_pending.append(pending)
                pending = []
            after_blank = True
        else:
            attached = None
            if pending:
                if after_blank:
                    floating_pending.append(pending)
                else:
                    attached = pending
                pending = []
            key, value = content.split("\t", 1)
            entries.append((key, value, attached, floating_pending))
            floating_pending = []
            after_blank = False

    # list.sort is stable, so equal keys keep their input order.
    entries.sort(key=lambda entry: entry[0])

    with open(output_filename, "wb") as output_file:
        _write_blocks(output_file, header_blocks)
        if header_blocks:
            output_file.write(b"\n")

        for key, value, attached, floating in entries:
            for block in floating:
                output_file.write(b"\n")
                _write_lines(output_file, block)
                output_file.write(b"\n")
            if attached:
                _write_lines(output_file, attached)
            output_file.write((key + "\t" + value + "\n").encode("utf-8"))

        if footer_blocks:
            output_file.write(b"\n")
            _write_blocks(output_file, footer_blocks)


def reverse_items(input_filename, output_filename):
    """Write the reverse mapping of a dictionary file.

    Each input line has the form "key<TAB>value1 value2 ...". Blank lines
    and lines whose first non-whitespace character is "#" are skipped.
    For every space-separated value token, the output gets one line
    "token<TAB>key1 key2 ...", listing the keys in input order. Output
    lines are sorted by token and written as UTF-8 with "\\n" endings.

    The value is split on single spaces, so consecutive spaces (or an
    empty value) produce an empty token.

    Args:
        input_filename: path of the UTF-8 dictionary file to read.
        output_filename: path of the file to write.

    Raises:
        ValueError: if a data line does not consist of exactly two
            tab-separated fields. The output file is not opened then.
    """
    reversed_map = {}

    with codecs.open(input_filename, "r", encoding="utf-8") as input_file:
        for line in input_file:
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                continue
            # rstrip instead of peeling characters off one by one: the
            # manual loop indexed value[-1] on an empty string and raised
            # IndexError for a line such as "key\t\n".
            content = line.rstrip("\r\n")
            fields = content.split("\t")
            if len(fields) != 2:
                raise ValueError("Invalid dictionary line: " + content)
            key, value = fields

            for token in value.split(" "):
                reversed_map.setdefault(token, []).append(key)

    with open(output_filename, "wb") as output_file:
        for token in sorted(reversed_map):
            out_line = token + "\t" + " ".join(reversed_map[token]) + "\n"
            output_file.write(out_line.encode("utf-8"))


def find_target_items(input_filename, keyword):
    """Print dictionary lines that have a value token containing keyword.

    Each input line has the form "key<TAB>value1 value2 ...". Blank lines
    and lines whose first non-whitespace character is "#" are skipped.
    The value is split on single spaces and the original line (including
    its line ending) is written to sys.stdout once for every token that
    contains ``keyword``, so a line with several matching tokens is
    written several times.

    Args:
        input_filename: path of the UTF-8 dictionary file to read.
        keyword: substring to look for in each value token.

    Raises:
        ValueError: if a data line does not consist of exactly two
            tab-separated fields.
    """
    with codecs.open(input_filename, "r", encoding="utf-8") as input_file:
        for line in input_file:
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                continue
            # rstrip instead of peeling characters off one by one: the
            # manual loop indexed value[-1] on an empty string and raised
            # IndexError for a line such as "key\t\n".
            content = line.rstrip("\r\n")
            fields = content.split("\t")
            if len(fields) != 2:
                raise ValueError("Invalid dictionary line: " + content)

            for token in fields[1].split(" "):
                if keyword in token:
                    sys.stdout.write(line)