# File: optimized_al.py
#
# package info (click to toggle)
# sqlalchemy 1.0.15%2Bds1-1
# links: PTS, VCS
# area: main
# in suites: stretch
# size: 13,056 kB
# ctags: 26,600
# sloc: python: 169,901; ansic: 1,346; makefile: 260; xml: 17
# file content (215 lines) | stat: -rw-r--r-- 8,157 bytes
# parent folder | download | duplicates (4)
"""Uses the same strategy as
  ``adjacency_list.py``, but associates each DOM row with its owning
  document row, so that a full document of DOM nodes can be loaded
  using O(1) queries.  The construction of the "hierarchy" is performed
  after the load in a non-recursive fashion and is more
  efficient.

"""

##################### PART I - Imports/Configuration #########################
from sqlalchemy import (MetaData, Table, Column, Integer, String, ForeignKey,
    Unicode, and_, create_engine)
from sqlalchemy.orm import mapper, relationship, Session, lazyload

import sys, os, io, re

from xml.etree import ElementTree

# Engine for the example ('sqlite://' URL, statement echo enabled) and the
# MetaData collection that all tables below are registered with.
e = create_engine('sqlite://', echo=True)
meta = MetaData()

####################### PART II - Table Metadata #############################

# One row per stored XML document, identified by a unique filename.
documents = Table(
    'documents',
    meta,
    Column('document_id', Integer, primary_key=True),
    Column('filename', String(30), unique=True),
)

# XML nodes kept as an adjacency list (parent_id refers back to this table).
# Each row corresponds to an Element / SubElement object and also carries
# the id of the document it belongs to.
elements = Table(
    'elements',
    meta,
    Column('element_id', Integer, primary_key=True),
    Column('parent_id', Integer, ForeignKey('elements.element_id')),
    Column('document_id', Integer, ForeignKey('documents.document_id')),
    Column('tag', Unicode(30), nullable=False),
    Column('text', Unicode),
    Column('tail', Unicode),
)

# Name/value attribute pairs, i.e. the attribute dictionary stored by an
# Element or SubElement.  The primary key is (element_id, name).
attributes = Table(
    'attributes',
    meta,
    Column(
        'element_id',
        Integer,
        ForeignKey('elements.element_id'),
        primary_key=True,
    ),
    Column('name', Unicode(100), nullable=False, primary_key=True),
    Column('value', Unicode(255)),
)

# Emit CREATE TABLE for everything registered on ``meta``.
meta.create_all(e)

########################### PART III - Model #################################

# our document class.  contains a string name,
# and the ElementTree root element.
class Document(object):
    """A named XML document.

    ``filename`` holds the document's name and ``element`` holds the
    ``ElementTree`` object passed to the constructor.
    """

    def __init__(self, name, element):
        """Store *name* as ``filename`` and *element* as ``element``."""
        self.filename = name
        self.element = element

    def __str__(self):
        """Return the document serialized as XML text."""
        buf = io.StringIO()
        # ElementTree.write() defaults to a byte encoding ("us-ascii"),
        # which cannot be written into a text buffer; encoding="unicode"
        # asks it to emit str instead.
        self.element.write(buf, encoding="unicode")
        return buf.getvalue()

########################## PART IV - Persistence Mapping #####################

# Node class.  Non-public adapter standing in for the DB-persisted
# Element/SubElement object.  ElementTree elements cannot be mapped directly:
# at the very least they are not new-style classes, and they may also be
# backed by native implementations.
class _Node(object):
    """Plain attribute holder for one persisted XML element."""

# Attribute class.  Also internal; one instance is a single key/value
# attribute stored for a particular Node.
class _Attribute(object):
    """A single name/value attribute pair."""

    def __init__(self, name, value):
        self.name, self.value = name, value

# setup mappers.  Document will eagerly load a list of _Node objects.
# The collection is explicitly ordered by primary key so that the
# ElementTree structure can be reconstructed from the flat list.
mapper(Document, documents, properties={
    '_nodes': relationship(
        _Node,
        lazy='joined',
        cascade="all, delete-orphan",
        # ElementTreeMarshal.__get__ looks each parent up before attaching
        # a child, so parents must come first; request that ordering
        # explicitly rather than relying on whatever order rows come back in.
        order_by=elements.c.element_id,
    )
})

# the _Node objects change the way they load so that a list of _Nodes will organize
# themselves hierarchically using the ElementTreeMarshal.  this depends on the ordering of
# nodes being hierarchical as well, which is why Document._nodes above is ordered by
# element_id.
mapper(_Node, elements, properties={
    'children':relationship(_Node, lazy=None),  # doesnt load; used only for the save relationship
    'attributes':relationship(_Attribute, lazy='joined', cascade="all, delete-orphan"), # eagerly load attributes
})

mapper(_Attribute, attributes)

# define marshalling functions that convert from _Node/_Attribute to/from ElementTree objects.
# this will set the ElementTree element as "document._element", and append the root _Node
# object to the "_nodes" mapped collection.
class ElementTreeMarshal(object):
    """Descriptor converting between a document's ``_nodes`` and an ElementTree.

    Reading builds an ``ElementTree`` from ``document._nodes`` and caches it
    as ``document._element``.  Writing walks the given tree, appends one
    ``_Node`` per element to ``document._nodes`` and caches the tree.
    Deleting drops the cached tree and resets ``_nodes`` to an empty list.
    """

    @staticmethod
    def _as_text(value):
        """Return ``str(value)``, or ``None`` when *value* is ``None``.

        ElementTree uses ``None`` for missing text/tail; a plain ``str()``
        would turn that into the literal string ``"None"``.
        """
        return None if value is None else str(value)

    def __get__(self, document, owner):
        """Return the ElementTree for *document*, building it on first use.

        Accessed on the class (*document* is None), returns the descriptor.
        """
        if document is None:
            return self

        # reuse the tree cached by a previous __get__ or __set__
        if hasattr(document, '_element'):
            return document._element

        # element_id -> Element built so far.  Each parent is looked up here
        # when a child is attached, so it must precede its children in
        # document._nodes.
        nodes = {}
        root = None
        for node in document._nodes:
            if node.parent_id is not None:
                elem = ElementTree.SubElement(nodes[node.parent_id], node.tag)
            else:
                elem = root = ElementTree.Element(node.tag)
            nodes[node.element_id] = elem
            for attr in node.attributes:
                elem.attrib[attr.name] = attr.value
            elem.text = node.text
            elem.tail = node.tail

        document._element = ElementTree.ElementTree(root)
        return document._element

    def __set__(self, document, element):
        """Populate ``document._nodes`` from the ElementTree *element*."""
        as_text = self._as_text

        def traverse(node):
            n = _Node()
            n.tag = str(node.tag)
            # keep missing text/tail as None instead of the string "None"
            n.text = as_text(node.text)
            n.tail = as_text(node.tail)
            # append before recursing so a parent precedes its children
            document._nodes.append(n)
            n.children = [traverse(n2) for n2 in node]
            n.attributes = [_Attribute(str(k), str(v)) for k, v in node.attrib.items()]
            return n

        traverse(element.getroot())
        document._element = element

    def __delete__(self, document):
        """Drop the cached tree and reset the document's node collection."""
        del document._element
        document._nodes = []

# override Document's "element" attribute with the marshaller, so that
# reading, assigning or deleting ``document.element`` goes through
# ElementTreeMarshal's __get__ / __set__ / __delete__.
Document.element = ElementTreeMarshal()

###################### PART V - Basic Persistence Example ####################

# separator printed after each heading
line = "\n--------------------------------------------------------"

# save to DB
session = Session(e)

# parse each sample file (looked up next to this script) into an
# ElementTree and stage a Document for it
for file in ('test.xml', 'test2.xml', 'test3.xml'):
    filename = os.path.join(os.path.dirname(__file__), file)
    doc = ElementTree.parse(filename)
    session.add(Document(file, doc))

print("\nSaving three documents...", line)
session.commit()
print("Done.")

# the heading names the same file that the query below loads
print("\nFull text of document 'test.xml':", line)
document = session.query(Document).filter_by(filename="test.xml").first()

print(document)

######################## PART VI - Searching for Paths #######################

# Hand-written search for the one document containing
# "/somefile/header/field1" with text "hi": join from the document to its
# root node, then down through two levels of children, filtering each level.
print("\nManual search for /somefile/header/field1=='hi':", line)
d = (
    session.query(Document)
    .join('_nodes', aliased=True)
    .filter(and_(_Node.parent_id==None, _Node.tag=='somefile'))
    .join('children', aliased=True, from_joinpoint=True)
    .filter(_Node.tag=='header')
    .join('children', aliased=True, from_joinpoint=True)
    .filter(and_(_Node.tag=='field1', _Node.text=='hi'))
    .one()
)
print(d)

# generalize the above approach into an extremely impoverished xpath function:

# one path step: "/tag", optionally followed by "[@attr]" or "[@attr=value]"
_PATH_STEP = re.compile(r'/([\w_]+)(?:\[@([\w_]+)(?:=(.*))?\])?')


def find_document(path, compareto):
    """Return the Documents containing *path* whose final node text equals *compareto*.

    *path* is a sequence of ``/tag`` steps, each optionally followed by an
    ``[@attr]`` or ``[@attr=value]`` predicate.  The first step is matched
    against nodes with no parent; each later step is matched against the
    children of the previous one.

    NOTE(review): an attribute predicate joins to 'attributes' from the
    current join point, so it is presumably only usable on the last step of
    the path - confirm before relying on predicates in earlier steps.
    """
    query = session.query(Document)
    for position, match in enumerate(_PATH_STEP.finditer(path)):
        token, attrname, attrvalue = match.group(1, 2, 3)
        if position == 0:
            # first step: start from the document's nodes that have no parent
            query = query.join('_nodes', aliased=True).filter(_Node.parent_id==None)
        else:
            query = query.join('children', aliased=True, from_joinpoint=True)
        query = query.filter(_Node.tag==token)
        if attrname:
            query = query.join('attributes', aliased=True, from_joinpoint=True)
            # "is not None" so that "[@attr=]" compares against the empty
            # string instead of silently dropping the value test
            if attrvalue is not None:
                query = query.filter(and_(_Attribute.name==attrname, _Attribute.value==attrvalue))
            else:
                query = query.filter(_Attribute.name==attrname)
    # the caller only needs the matching documents, so skip the eager load
    # of each document's nodes
    return query.options(lazyload('_nodes')).filter(_Node.text==compareto).all()

# Run a handful of sample searches and print the filenames of the matches.
for path, compareto in (
    ('/somefile/header/field1', 'hi'),
    ('/somefile/field1', 'hi'),
    ('/somefile/header/field2', 'there'),
    ('/somefile/header/field2[@attr=foo]', 'there'),
):
    heading = "\nDocuments containing '%s=%s':" % (path, compareto)
    print(heading, line)
    matches = find_document(path, compareto)
    print([found.filename for found in matches])