File: tokenizer.py

package info (click to toggle)

python-ldap 3.1.0-2

links: PTS, VCS
area: main
in suites: buster
size: 2,248 kB
sloc: python: 9,465; ansic: 2,828; makefile: 132; sh: 68

file content (80 lines) | stat: -rw-r--r-- 2,306 bytes

parent folder | download | duplicates (2)

"""
ldap.schema.tokenizer - Low-level parsing functions for schema element strings

See https://www.python-ldap.org/ for details.
"""

import re

# Tokenizer for schema element strings: the bound ``findall`` of a single
# compiled alternation.  Each match is reported as a 5-tuple of groups
# (opening paren, closing paren, unquoted, quoted, residue) in which only
# the group that matched is non-empty.
TOKENS_FINDALL = re.compile(
    "|".join((
        r"(\()",            # group 1: opening parenthesis
        r"(\))",            # group 2: closing parenthesis
        r"([^'$()\s]+)",    # group 3: run of one or more characters that are
                            #          neither ', $, ( or ) nor whitespace
        r"('.*?'(?!\w))",   # group 4: single-quoted string (possibly empty);
                            #          the closing quote must not be directly
                            #          followed by a word character
        r"([^\s]+?)",       # group 5: residue, one leftover non-whitespace
                            #          character at a time (lazy match)
    ))
).findall


def split_tokens(s):
    """
    Returns list of syntax elements with quotes and spaces stripped.

    The string is scanned with TOKENS_FINDALL.  Unquoted words are returned
    as they are, single-quoted strings are returned without their
    surrounding quotes, and parentheses are returned as "(" and ")" items.
    "$" separators are validated but do not appear in the result.

    Raises ValueError if

    * a closing parenthesis has no matching opening parenthesis,
    * an opening parenthesis is never closed,
    * a "$" occurs outside of any parenthesis, or
    * any other character cannot be tokenized (for example a quote that is
      never closed); in that case the exception arguments are the offending
      character and the whole input string.
    """
    parts = []
    depth = 0  # number of currently open parentheses
    for opar, cpar, unquoted, quoted, residue in TOKENS_FINDALL(s):
        if unquoted:
            parts.append(unquoted)
        elif quoted:
            # Drop the surrounding single quotes
            parts.append(quoted[1:-1])
        elif opar:
            depth += 1
            parts.append(opar)
        elif cpar:
            # Reject an unmatched ')' right away.  Only checking the final
            # count would accept input such as ") (" and would also let a
            # negative count pass the '$' check below.
            if not depth:
                raise ValueError("Unbalanced parenthesis in %r" % (s,))
            depth -= 1
            parts.append(cpar)
        elif residue == '$':
            # '$' only separates items of a parenthesized list
            if not depth:
                raise ValueError("'$' outside parenthesis in %r" % (s,))
        else:
            raise ValueError(residue, s)
    if depth:
        raise ValueError("Unbalanced parenthesis in %r" % (s,))
    return parts

def extract_tokens(l, known_tokens):
    """
    Returns dictionary of known tokens with all values

    l
        List of syntax elements of one schema element.  The first item must
        be "(" and the last item must be ")".
    known_tokens
        Dictionary mapping each recognized token keyword to its default
        value.  It is copied, not modified.

    The returned dictionary has the same keys as known_tokens.  A keyword
    that does not occur in l keeps its default value.  A keyword that does
    occur is mapped to a tuple:

    * ``()`` if it is directly followed by another known keyword or by the
      closing parenthesis of the element (non-valued),
    * the items between "(" and the next ")" with "$" separators removed if
      it is followed by a parenthesized list (multi-valued),
    * a 1-tuple holding the next item otherwise (single-valued).

    Items that are not known keywords are skipped.
    """
    # Kept as an assert (AssertionError) so existing callers see the same
    # exception type as before; note that it is skipped under ``python -O``.
    assert l[0].strip()=="(" and l[-1].strip()==")",ValueError(l)
    result = dict(known_tokens)
    l_len = len(l)
    last = l_len - 1  # index of the element's closing parenthesis
    i = 0
    while i < l_len:
        if l[i] not in result:
            i += 1  # Consume unrecognized item
            continue
        token = l[i]
        i += 1  # Consume token
        if i >= l_len:
            break
        if l[i] in result or (i == last and l[i] == ")"):
            # non-valued: the next item is another keyword or the end of
            # the element.  Without the end-of-element check the closing
            # parenthesis itself would be stored as the token's value.
            # The item is not consumed here; the main loop handles it.
            result[token] = ()
        elif l[i] == "(":
            # multi-valued
            i += 1  # Consume left parentheses
            start = i
            while i < l_len and l[i] != ")":
                i += 1
            result[token] = tuple(v for v in l[start:i] if v != '$')
            i += 1  # Consume right parentheses
        else:
            # single-valued
            result[token] = (l[i],)
            i += 1  # Consume single value
    return result