File: schemanormalization.py

package info (click to toggle)
python-avro 1.10.1%2Bdfsg-1
links: PTS, VCS
area: main
in suites: bullseye
size: 1,624 kB
sloc: python: 11,437; xml: 4,061; sh: 752; java: 386; makefile: 21
file content (224 lines) | stat: -rw-r--r-- 6,815 bytes
#!/usr/bin/env python
# -*- mode: python -*-
# -*- coding: utf-8 -*-

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import hashlib
from io import StringIO

from avro.schema import ARRAY, ENUM, ERROR, FIXED, MAP, RECORD, UNION


def ToParsingCanonicalForm(schema):
  """Renders a schema in its "Parsing Canonical Form".

  The Parsing Canonical Form is the normalized JSON representation
  defined by the Avro specification.

  Args:
      schema: The Schema object to normalize.
  Returns:
      A string holding the canonical JSON text of the schema.
  """
  # Tracks the named types already emitted during this traversal.
  named_types_seen = {}
  buffer = StringIO()
  try:
    written = _BuildCanonicalForm(named_types_seen, schema, buffer)
    return written.getvalue()
  finally:
    buffer.close()


def _BuildCanonicalForm(env, s, o):
  """Appends the canonical JSON text of a schema to an output stream.

  Args:
    env: A dict keyed by the full names of the named schemas (enum,
      fixed, record) already written, mapping each to its quoted name.
      It is updated in place; a named schema met again is written as
      its quoted name only.
    s: The schema, or nested schema, to write.
    o: A text stream with a write() method that receives the output.
  Returns:
    The stream o.
  """
  kind = s.type

  # The Java Avro implementation represents error records as records with
  # a separate error flag. For canonicalization to be consistent across
  # implementations we must normalize errors to have record type here.
  if kind == ERROR:
    kind = RECORD

  if kind == UNION:
    o.write('[')
    for position, branch in enumerate(s.schemas):
      if position:
        o.write(',')
      _BuildCanonicalForm(env, branch, o)
    o.write(']')
    return o

  if kind in {ARRAY, MAP}:
    o.write('{"type":"')
    o.write(kind)
    if kind == ARRAY:
      o.write('","items":')
      _BuildCanonicalForm(env, s.items, o)
    else:
      o.write('","values":')
      _BuildCanonicalForm(env, s.values, o)
    o.write('}')
    return o

  if kind in {ENUM, FIXED, RECORD}:
    fullname = s.fullname
    if fullname in env:
      # Already defined earlier in the output: refer to it by name.
      o.write(env[fullname])
      return o
    quoted = '"%s"' % fullname
    # Register before descending so nested self-references resolve by name.
    env[fullname] = quoted
    o.write('{"name":')
    o.write(quoted)
    o.write(',"type":"')
    o.write(kind)
    if kind == ENUM:
      o.write('","symbols":[')
      for position, symbol in enumerate(s.symbols):
        if position:
          o.write(',')
        o.write('"')
        o.write(symbol)
        o.write('"')
      o.write(']}')
    elif kind == FIXED:
      o.write('","size":')
      o.write(str(s.size))
      o.write('}')
    else:  # record (errors were normalized to record above)
      o.write('","fields":[')
      for position, field in enumerate(s.fields):
        if position:
          o.write(',')
        o.write('{"name":"')
        o.write(field.name)
        o.write('","type":')
        _BuildCanonicalForm(env, field.type, o)
        o.write('}')
      o.write(']}')
    return o

  # boolean, bytes, double, float, int, long, null, string
  o.write('"')
  o.write(kind)
  o.write('"')
  return o


def Fingerprint(parsing_normal_form_schema, fingerprint_algorithm_name):
  """Computes the fingerprint of a schema in parsing normal form.

  The schema text is encoded as UTF-8 before it is hashed.

  Args:
    parsing_normal_form_schema: A string containing an Avro
      schema in parsing normal form, such as one obtained
      by passing a schema object to ToParsingCanonicalForm()
    fingerprint_algorithm_name: One of the algorithm names
      returned by FingerprintAlgorithmNames(), typically
      'CRC-64-AVRO', 'md5' or 'sha256'. See the Avro
      specification for guidance on selecting a
      fingerprinting algorithm.
  Returns:
      A bytes object containing the schema fingerprint.
  Raises:
      ValueError: If fingerprint_algorithm_name is not one of the
        names returned by FingerprintAlgorithmNames().
  """
  if fingerprint_algorithm_name not in FingerprintAlgorithmNames():
    raise ValueError("Unknown schema fingerprint algorithm {!r}"
                     .format(fingerprint_algorithm_name))
  # Resolve aliases (e.g. Java-style names) to the canonical name.
  canonical_name = _FINGERPRINT_ALIASES_TO_NAMES[fingerprint_algorithm_name]
  encoded = parsing_normal_form_schema.encode('utf-8')
  if canonical_name not in _CRC_64_AVRO:
    # NOTE(review): digests that require an explicit length (the SHAKE
    # family, if hashlib lists it as guaranteed) would presumably fail
    # in digest() here -- confirm whether they should be accepted.
    return hashlib.new(canonical_name, encoded).digest()
  return _Crc64AvroFingerprint(encoded)


# The one fingerprint algorithm name that is not a hashlib digest name.
_CRC_64_AVRO = frozenset(['CRC-64-AVRO'])
# Every canonical algorithm name: hashlib's guaranteed digests plus the above.
_PYTHON_DIGEST_NAMES = frozenset(hashlib.algorithms_guaranteed).union(_CRC_64_AVRO)

# These are the only three algorithms which Java implementations are
# *required* to support. Aliases for them are provided when the Python
# implementation supports them as well, so that fingerprint algorithm
# names can interoperate between Java and Python.
_JAVA_TO_PYTHON_DIGEST_NAMES = {
  'MD5': 'md5',
  'SHA-1': 'sha1',
  'SHA-256': 'sha256',
}

# Only the Java aliases whose Python counterpart hashlib guarantees.
_AVAILABLE_JAVA_TO_PYTHON_DIGEST_NAMES = {
  java_name: python_name
  for java_name, python_name in _JAVA_TO_PYTHON_DIGEST_NAMES.items()
  if python_name in hashlib.algorithms_guaranteed
}

# Maps every accepted name to its canonical name: canonical names map to
# themselves, then the available Java aliases are layered on top.
_FINGERPRINT_ALIASES_TO_NAMES = dict(
  (canonical, canonical) for canonical in _PYTHON_DIGEST_NAMES)
_FINGERPRINT_ALIASES_TO_NAMES.update(_AVAILABLE_JAVA_TO_PYTHON_DIGEST_NAMES)


def FingerprintAlgorithmNames():
  """Lists every name accepted as a fingerprint algorithm.

  One algorithm may appear under several names. For example,
  'SHA-1' and 'sha1' might both be present, so that algorithm names
  can be shared between Python and, say, Java Avro implementations.

  Returns:
    A set-like view (the keys of the alias table) of algorithm name
    strings, any one of which can be used as the
    fingerprint_algorithm_name argument of Fingerprint()
  """
  accepted_names = _FINGERPRINT_ALIASES_TO_NAMES.keys()
  return accepted_names


# Starting value of the 64-bit fingerprint (and therefore the fingerprint
# of empty input); the same constant is XORed into the lookup-table
# entries while the table is generated.
_EMPTY64 = 0xc15d213aa4d7a795

# Lookup table with one entry per possible byte value. It stays None
# until the first fingerprint is requested, then holds a list of 256 ints.
_FP_TABLE = None


def _Crc64AvroFingerprint(data):
  """The 64-bit Rabin Fingerprint.

  As described in the Avro specification.

  Args:
    data: A bytes object containing the UTF-8 encoded parsing canonical
      form of an Avro schema.
  Returns:
    A bytes object with a length of eight.
  """
  if _FP_TABLE is None:
    _PopulateFpTable()
  table = _FP_TABLE
  result = _EMPTY64
  for b in data:
    # Consume one input byte: drop the low byte of the running value and
    # fold in the table entry selected by that byte XOR the input byte.
    result = (result >> 8) ^ table[(result ^ b) & 0xff]
  # Although not mentioned in the Avro specification, the Java
  # implementation gives fingerprint bytes in little-endian order
  return result.to_bytes(length=8, byteorder='little', signed=False)


def _PopulateFpTable():
  """Builds the 256-entry lookup table used by _Crc64AvroFingerprint().

  The table is assembled in a local list and bound to the module-level
  _FP_TABLE only once it is complete. Publishing it in a single
  assignment means a concurrent or interrupted caller can never observe
  a non-None table that is still partially filled.
  """
  global _FP_TABLE
  table = []
  for i in range(256):
    fp = i
    for _ in range(8):
      # -(fp & 1) is -1 (all bits set) when the low bit is 1, else 0, so
      # _EMPTY64 is XORed in only when a 1 bit is shifted out.
      mask = -(fp & 1)
      fp = (fp >> 1) ^ (_EMPTY64 & mask)
    table.append(fp)
  _FP_TABLE = table