File: schema.py

package info (click to toggle)
python-fastparquet 2024.2.0-2
links: PTS, VCS
area: main
in suites: sid
size: 120,180 kB
sloc: python: 8,181; makefile: 187
file content (203 lines) | stat: -rwxr-xr-x 7,073 bytes
"""Utils for working with the parquet thrift models."""
from collections import OrderedDict

from fastparquet import parquet_thrift


def schema_tree(schema, i=0):
    root = schema[i]
    root["children"] = OrderedDict()
    while len(root["children"]) < root.num_children:
        i += 1
        s = schema[i]
        root["children"][s.name] = s
        if s.num_children not in [None, 0]:
            i = schema_tree(schema, i)
    if root.num_children:
        return i
    else:
        return i + 1


def schema_to_text(root, indent=[]):
    text = "".join(indent) + '- ' + root.name + ": "
    parts = []
    if root.type is not None:
        parts.append(parquet_thrift.Type._VALUES_TO_NAMES[root.type])
    if root.logicalType is not None:
        for key in dir(root.logicalType):
            if getattr(root.logicalType, key) is not None:
                if key == "TIMESTAMP":
                    unit = [k for k in dir(root.logicalType.TIMESTAMP.unit) if getattr(
                        root.logicalType.TIMESTAMP.unit, k) is not None][0]
                    parts.append(f"TIMESTAMP[{unit}]")
                else:
                    # extra parameters possible here
                    parts.append(key)
                break

    if root.converted_type is not None:
        parts.append(parquet_thrift.ConvertedType._VALUES_TO_NAMES[
                         root.converted_type])
    if root.repetition_type is not None:
        parts.append(parquet_thrift.FieldRepetitionType._VALUES_TO_NAMES[
                         root.repetition_type])
    text += ', '.join(parts)
    indent.append('|')
    if hasattr(root, 'children'):
        indent[-1] = '| '
        for i, child in enumerate(root["children"].values()):
            if i == len(root["children"]) - 1:
                indent[-1] = '  '
            text += '\n' + schema_to_text(child, indent)
    indent.pop()
    return text


def flatten(schema, root, name_parts=[]):
    if not hasattr(schema, 'children'):
        return
    if schema is not root:
        name_parts = name_parts + [schema.name]
    # root["children"].pop('.'.join(name_parts), None)
    for name, item in schema["children"].copy().items():
        if schema.repetition_type == parquet_thrift.FieldRepetitionType.REPEATED:
            continue
        if len(getattr(item, 'children', [])) == 0:
            root["children"]['.'.join(name_parts + [name])] = item
        elif item.converted_type in [parquet_thrift.ConvertedType.LIST,
                                     parquet_thrift.ConvertedType.MAP]:
            root["children"]['.'.join(name_parts + [name])] = item
        else:
            flatten(item, root, name_parts)
            item["isflat"] = True


class SchemaHelper(object):
    """Utility providing convenience methods for schema_elements."""

    def __init__(self, schema_elements):
        """Initialize with the specified schema_elements."""
        self.schema_elements = schema_elements
        for se in schema_elements:
            try:
                se.name = se.name.decode()
            except AttributeError:
                pass  # already a str
        self.root = schema_elements[0]
        self.schema_elements_by_name = dict(
            [(se.name, se) for se in schema_elements])
        schema_tree(schema_elements)
        self._text = None
        flatten(self.root, self.root)

    @property
    def text(self):
        if self._text is None:
            self._text = schema_to_text(self.schema_elements[0])
        return self._text

    def __eq__(self, other):
        return self.schema_elements == other.schema_elements

    def __ne__(self, other):
        return not self.__eq__(other)

    def __str__(self):
        return self.text

    def __repr__(self):
        return "<Parquet Schema with {} entries>".format(
            len(self.schema_elements))

    def schema_element(self, name):
        """Get the schema element with the given name or path"""
        root = self.root
        if isinstance(name, str):
            name = name.split('.')
        for part in name:
            root = root["children"][part]
        return root

    def is_required(self, name):
        """Return true if the schema element with the given name is required."""
        required = True
        if isinstance(name, str):
            name = name.split('.')
        parts = []
        for part in name:
            parts.append(part)
            s = self.schema_element(parts)
            if s.repetition_type != parquet_thrift.FieldRepetitionType.REQUIRED:
                required = False
                break
        return required

    def max_repetition_level(self, parts):
        """Get the max repetition level for the given schema path."""
        max_level = 0
        if isinstance(parts, str):
            parts = parts.split('.')
        for i in range(len(parts)):
            element = self.schema_element(parts[:i+1])
            if element.repetition_type == parquet_thrift.FieldRepetitionType.REPEATED:
                max_level += 1
        return max_level

    def max_definition_level(self, parts):
        """Get the max definition level for the given schema path."""
        max_level = 0
        if isinstance(parts, str):
            parts = parts.split('.')
        for i in range(len(parts)):
            element = self.schema_element(parts[:i+1])
            if element.repetition_type != parquet_thrift.FieldRepetitionType.REQUIRED:
                max_level += 1
        return max_level


def _is_list_like(helper, column):
    if len(column.meta_data.path_in_schema) < 3:
        return False
    se = helper.schema_element(
        column.meta_data.path_in_schema[:-2])
    ct = se.converted_type
    if ct != parquet_thrift.ConvertedType.LIST:
        return False
    if len(se["children"]) > 1:
        return False
    se2 = list(se["children"].values())[0]
    if len(se2["children"]) > 1:
        return False
    if se2.repetition_type != parquet_thrift.FieldRepetitionType.REPEATED:
        return False
    se3 = list(se2["children"].values())[0]
    if se3.repetition_type == parquet_thrift.FieldRepetitionType.REPEATED:
        return False
    return True


def _is_map_like(helper, column):
    if len(column.meta_data.path_in_schema) < 3:
        return False
    se = helper.schema_element(
        column.meta_data.path_in_schema[:-2])
    ct = se.converted_type
    if ct != parquet_thrift.ConvertedType.MAP:
        return False
    if len(se["children"]) > 1:
        return False
    se2 = list(se["children"].values())[0]
    if len(se2["children"]) != 2:
        return False
    if se2.repetition_type != parquet_thrift.FieldRepetitionType.REPEATED:
        return False
    if set(se2["children"]) != {'key', 'value'}:
        return False
    se3 = se2["children"]['key']
    if se3.repetition_type != parquet_thrift.FieldRepetitionType.REQUIRED:
        return False
    se3 = se2["children"]['value']
    if se3.repetition_type == parquet_thrift.FieldRepetitionType.REPEATED:
        return False
    return True