1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292
|
# -*- coding: utf-8 -*-
# Copyright © 2007-2008 Stockholm TreeAligner Project
# Author: Torsten Marek <shlomme@gmx.net>
# Licensed under the GNU GPLv2
from collections import defaultdict
from itertools import count
import logging
def get_version_string():
    """Return a string naming the installed nltk release, e.g. ``"nltk 2.0"``.

    nltk is imported inside the function, so the import cost (and any import
    failure) only occurs when the version string is actually requested.
    """
    from nltk import __version__ as nltk_version

    return "nltk %s" % (nltk_version,)
from nltk_contrib.tiger.index import IndexNodeId, CONTINUOUS, DISCONTINUOUS, gorn2db
from nltk_contrib.tiger.graph import NodeType, veeroot_graph, DEFAULT_VROOT_EDGE_LABEL
__all__ = ("TigerCorpusIndexer",)
# TODO: think about splitting the feature_values table into several smaller tables.
# TODO: add NOT NULL constraints to all table columns.
# TODO: create a proper progress reporter interface and hand it in.
INDEX_VERSION = 3
class _Tables(object):
FEATURES = """CREATE TABLE features
(id INTEGER PRIMARY KEY, order_id INTEGER, name TEXT, domain TEXT)"""
FEATURE_VALUES = """CREATE TABLE feature_values
(feature_id INTEGER, value_id INTEGER, value TEXT, description TEXT,
UNIQUE(feature_id, value_id))"""
EDGE_LABELS = """CREATE TABLE edge_labels
(id INTEGER PRIMARY KEY, label TEXT UNIQUE, description TEXT)"""
SECEDGE_LABELS = """CREATE TABLE secedge_labels
(id INTEGER PRIMARY KEY, label TEXT UNIQUE, description TEXT)"""
GRAPHS = """CREATE TABLE graphs
(id INTEGER PRIMARY KEY, xml_graph_id TEXT, data BLOB)"""
METADATA = """CREATE TABLE corpus_metadata
(key TEXT UNIQUE, value TEXT)"""
INDEX_METADATA = """CREATE TABLE index_metadata
(key TEXT UNIQUE, value NONE)"""
NODE_DATA = """CREATE TABLE node_data
(id INTEGER PRIMARY KEY, xml_node_id TEXT, edge_label INTEGER,
gorn_address BLOB, continuity INTEGER, arity INTEGER,
tokenarity INTEGER, left_corner INTEGER, right_corner INTEGER, token_order INTEGER)"""
FEATURE_IIDX_TEMPLATE = """CREATE TABLE feature_iidx_%s
(node_id INTEGER PRIMARY KEY NOT NULL, value_id INTEGER NOT NULL)"""
SECEDGES = """CREATE TABLE secedges
(origin_id INT, target_id INT, label_id INT)"""
class TigerCorpusIndexer(object):
    """Fill a SQL database with the index tables of a TIGER corpus.

    Typical use: create the indexer (this creates the tables), declare
    features and (secondary) edge labels, feed graphs through `add_graph`,
    then call `finalize` once to write the remaining data and build the SQL
    indices.

    Parameters of the constructor:
        db: an open DB-API connection (``cursor()``, ``commit()`` and
            ``execute()`` are used).
        graph_serializer: object that receives the label/feature maps and
            turns graphs into the byte data stored in the ``graphs`` table.
        progress: if true, progress messages are printed to stdout.
        always_veeroot: if true, every graph is passed through
            ``veeroot_graph``; otherwise each graph must have exactly one root.
    """

    # Wrapper that marks serialized graph data as a BLOB for the database
    # driver. ``buffer`` only exists on Python 2; ``memoryview`` is used where
    # it is missing.
    try:
        _blob_type = buffer
    except NameError:
        _blob_type = memoryview

    def __init__(self, db, graph_serializer, progress = False, always_veeroot = True):
        self._db = db
        self._cursor = db.cursor()
        self._progress = progress
        self._always_veeroot = always_veeroot
        # Number of graphs indexed so far; doubles as the id of the next graph.
        self._graphs = 0

        for ddl in (_Tables.FEATURES,
                    _Tables.FEATURE_VALUES,
                    _Tables.EDGE_LABELS,
                    _Tables.SECEDGE_LABELS,
                    _Tables.GRAPHS,
                    _Tables.METADATA,
                    _Tables.INDEX_METADATA,
                    _Tables.NODE_DATA,
                    _Tables.SECEDGES):
            self._cursor.execute(ddl)

        self._serializer = graph_serializer
        # (feature_id, value_map) pairs of features without a declared value
        # list; their values are only written to the database in finalize().
        self._open_list_features = []
        self._feature_count = {
            NodeType.TERMINAL: 0,
            NodeType.NONTERMINAL: 0}
        # feature name -> INSERT statement for its feature_iidx_<name> table
        self._feature_iidx_stmts = {}
        # feature name -> (value -> value id map, domain)
        self._feature_value_maps = {}
        # feature name -> buffered (node id, value id) rows awaiting a flush
        self._insert_lists = defaultdict(list)

        self._store_creator_metadata()

    def _report(self, message):
        """Print `message` to stdout if progress reporting is enabled."""
        if self._progress:
            print(message)

    def _store_creator_metadata(self):
        """Record the creating library version and the index format version."""
        self._add_index_metadata(creator=get_version_string(), index_version=INDEX_VERSION)

    def _add_index_metadata(self, **kwargs):
        """Insert every keyword argument as a key/value row of ``index_metadata``."""
        self._cursor.executemany("INSERT INTO index_metadata (key, value) VALUES (?, ?)",
                                 kwargs.items())

    def set_metadata(self, metadata):
        """Insert the items of the mapping `metadata` into ``corpus_metadata``."""
        self._cursor.executemany("INSERT INTO corpus_metadata (key, value) VALUES (?, ?)",
                                 metadata.items())

    def add_feature(self, feature_name, domain, feature_values):
        """Declare a feature and create its per-feature index table.

        `domain` must be one of the keys of the internal feature counter
        (``NodeType.TERMINAL`` or ``NodeType.NONTERMINAL``). `feature_values`
        maps values to descriptions; if it is empty, the feature is treated as
        an open list whose values get ids on first use and are written to the
        database by `finalize`.

        Returns the row id of the new entry in the ``features`` table.
        Raises ValueError if the feature name is not purely alphabetic.
        """
        order_id = self._feature_count[domain]
        self._feature_count[domain] += 1

        self._cursor.execute("INSERT INTO features (order_id, name, domain) VALUES (?, ?, ?)",
                             (order_id, feature_name, domain.key))
        feature_id = self._cursor.lastrowid

        if len(feature_values) > 0:
            value_map = dict((feature_value, idx)
                             for idx, feature_value in enumerate(feature_values))
            self._cursor.executemany(
                "INSERT INTO feature_values (feature_id, value_id, value, description) VALUES (?, ?, ?, ?)",
                ((feature_id, value_map[value], value, description)
                 for value, description in feature_values.items()))
        else:
            # Open list: hand out consecutive ids the first time a value is seen.
            counter = count()
            value_map = defaultdict(lambda: next(counter))
            self._open_list_features.append((feature_id, value_map))

        self._feature_value_maps[feature_name] = (value_map, domain)
        self._serializer.add_feature_value_map(feature_name, domain, order_id, value_map)
        self._create_feature_value_index(feature_name)
        return feature_id

    def set_edge_labels(self, edge_labels):
        """Store the edge labels and pass the label -> id map to the serializer.

        `edge_labels` maps labels to descriptions. With auto-veerooting
        enabled it must contain ``DEFAULT_VROOT_EDGE_LABEL``.
        """
        if self._always_veeroot:
            assert DEFAULT_VROOT_EDGE_LABEL in edge_labels, "no neutral edge label"
        self._cursor.executemany(
            "INSERT INTO edge_labels (id, label, description) VALUES (?, ?, ?)",
            ((idx, label, description)
             for idx, (label, description) in enumerate(edge_labels.items())))
        self._edge_label_map = dict(self._cursor.execute("SELECT label, id FROM edge_labels"))
        self._serializer.set_edge_label_map(self._edge_label_map)

    def set_secedge_labels(self, secedge_labels):
        """Store the secondary edge labels and pass the label -> id map on."""
        self._cursor.executemany(
            "INSERT INTO secedge_labels (id, label, description) VALUES (?, ?, ?)",
            ((idx, label, description)
             for idx, (label, description) in enumerate(secedge_labels.items())))
        self._secedge_label_map = dict(self._cursor.execute("SELECT label, id FROM secedge_labels"))
        self._serializer.set_secedge_label_map(self._secedge_label_map)

    def _create_feature_value_index(self, feature_name):
        """Create the ``feature_iidx_<name>`` table and its INSERT statement.

        Raises ValueError if the name contains anything but letters. The name
        is spliced into SQL text (table names cannot be bound as parameters),
        so this check must also hold when assertions are disabled.
        """
        feature_name = str(feature_name)
        if not feature_name.isalpha():
            raise ValueError("feature name %r must consist of letters only" % (feature_name,))
        self._cursor.execute(_Tables.FEATURE_IIDX_TEMPLATE % (feature_name,))
        self._feature_iidx_stmts[feature_name] = \
            "INSERT INTO feature_iidx_%s (node_id, value_id) VALUES (?, ?)" % (feature_name,)

    def get_terminal_index_data(self, node, node_ids):
        """Return the ``node_data`` row values for a terminal node.

        The tuple is (index id, XML id, edge label id or None, converted Gorn
        address, node order), matching the terminal INSERT in
        `_store_node_data`.
        """
        return (node_ids[node.id].to_int(), node.id,
                self._edge_label_map.get(node.edge_label, None),
                gorn2db(node.gorn_address), node.order)

    def get_nonterminal_index_data(self, node, node_ids):
        """Return the ``node_data`` row values for a nonterminal node.

        The ten values follow the column order of the nonterminal INSERT in
        `_store_node_data`; note that ``node.children_type`` is what ends up
        in the ``token_order`` column.
        """
        return (node_ids[node.id].to_int(), node.id,
                self._edge_label_map.get(node.edge_label, None),
                gorn2db(node.gorn_address),
                (CONTINUOUS if node.is_continuous else DISCONTINUOUS),
                node.arity, node.token_arity,
                node_ids[node.left_corner].to_int(), node_ids[node.right_corner].to_int(),
                node.children_type)

    def _store_node_data(self, graph, node_ids):
        """Insert one ``node_data`` row for every node of `graph`."""
        nonterminals, terminals = graph.compute_node_information()
        self._cursor.executemany(
            "INSERT INTO node_data\n"
            "(id, xml_node_id, edge_label, gorn_address, continuity, arity, tokenarity, left_corner, right_corner, token_order)\n"
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (self.get_nonterminal_index_data(nt, node_ids) for nt in nonterminals))
        self._cursor.executemany(
            "INSERT INTO node_data (id, xml_node_id, edge_label, gorn_address, token_order, continuity) VALUES (?, ?, ?, ?, ?, 0)",
            (self.get_terminal_index_data(t, node_ids) for t in terminals))

    def _index_feature_values(self, graph, node_ids):
        """Buffer (node id, value id) rows for all feature values in `graph`.

        The buffers are written out whenever the graph counter is a multiple
        of 1000; `finalize` writes whatever remains.
        """
        for node in graph:
            for feature_name, feature_value in node.features.items():
                value_map, domain = self._feature_value_maps[feature_name]
                assert node.TYPE is domain
                self._insert_lists[feature_name].append(
                    (node_ids[node.id].to_int(), value_map[feature_value]))

        if self._graphs % 1000 == 0:
            self._flush_node_feature_values()

    def _index_secedges(self, graph, node_ids):
        """Insert one ``secedges`` row per secondary edge in `graph`."""
        for node in graph:
            if node.secedges is not None:
                self._cursor.executemany(
                    "INSERT INTO secedges (origin_id, target_id, label_id) VALUES (?, ?, ?)",
                    ((node_ids[node.id].to_int(),
                      node_ids[graph.nodes[target_node].id].to_int(),
                      self._secedge_label_map[label])
                     for label, target_node in node.secedges))

    def _flush_node_feature_values(self):
        """Write all buffered feature rows to their tables and reset the buffers."""
        for feature_name, values in self._insert_lists.items():
            self._cursor.executemany(self._feature_iidx_stmts[feature_name], values)
        self._insert_lists = defaultdict(list)

    def _convert_ids(self, graph, node_ids):
        """Replace, in place, all XML ids in `graph` by their index ids.

        This covers the graph id, the root id, the keys of ``graph.nodes``,
        each node's own id, and the targets of edges and secondary edges.
        """
        def _convert_edgelist(l):
            return [(label, node_ids[target_xml_id])
                    for label, target_xml_id in l]

        graph.id = self._graphs
        graph.root_id = node_ids[graph.root_id]
        # Iterate over a snapshot of the keys: the mapping is re-keyed inside
        # the loop, which must not happen while iterating over it directly.
        for xml_node_id in list(graph.nodes.keys()):
            node = graph.nodes.pop(xml_node_id)
            node.id = node_ids[node.id]
            graph.nodes[node_ids[xml_node_id]] = node
            if node.secedges:
                node.secedges = _convert_edgelist(node.secedges)
            if node.TYPE is NodeType.NONTERMINAL:
                node.edges = _convert_edgelist(node.edges)

    def add_graph(self, graph):
        """Index `graph` and store its serialized form.

        If ``graph.get_roots()`` raises KeyError, the graph is reported via
        ``logging.error`` and skipped. The graph object is modified in place
        (veerooting, id conversion).
        """
        try:
            roots = graph.get_roots()
        except KeyError as e:
            logging.error("Graph %s is faulty: node %s referenced more than once.",
                          graph.id, e.args[0])
            return

        if self._always_veeroot:
            veeroot_graph(graph, roots)
        else:
            assert len(roots) == 1, "No auto-veerooting, but several unconnected subgraphs %s in %s." % (roots, graph.id)

        node_ids = dict((xml_node_id, IndexNodeId(self._graphs, idx))
                        for idx, xml_node_id in enumerate(graph.nodes))
        # Remember the XML id now; _convert_ids() overwrites graph.id.
        xml_id = graph.id

        self._store_node_data(graph, node_ids)
        self._index_feature_values(graph, node_ids)
        self._index_secedges(graph, node_ids)
        self._convert_ids(graph, node_ids)

        self._cursor.execute(
            "INSERT INTO graphs (id, xml_graph_id, data) VALUES (?, ?, ?)",
            (self._graphs, xml_id, self._blob_type(self._serializer.serialize_graph(graph))))
        self._graphs += 1
        if self._graphs % 100 == 0:
            self._report(self._graphs)

    def finalize(self, optimize = True):
        """Write all pending data, build the SQL indices and commit.

        If `optimize` is true, ``VACUUM`` is run on the database. Afterwards
        a ``finished`` entry is added to the index metadata and the indexer
        drops its references to the connection and cursor, so it cannot be
        used any further.
        """
        self._report("finalize")
        self._flush_node_feature_values()

        self._report("inserting feature values")
        for feature_id, feature_value_map in self._open_list_features:
            self._cursor.executemany(
                "INSERT INTO feature_values (feature_id, value_id, value) VALUES (?, ?, ?)",
                ((feature_id, value_id, value)
                 for value, value_id in feature_value_map.items()))
        del self._open_list_features

        self._report("Committing database")
        self._db.commit()

        self._cursor.execute("CREATE INDEX feature_id_idx ON feature_values (feature_id)")
        for feature_name in self._feature_value_maps:
            self._report("creating index for feature '%s'" % (feature_name,))
            self._cursor.execute("CREATE INDEX %s_iidx_idx ON feature_iidx_%s (value_id)" % (feature_name, feature_name))

        self._report("creating index for xml node ids")
        self._cursor.execute("CREATE UNIQUE INDEX xml_node_id_idx ON node_data (xml_node_id)")

        self._report("creating index for xml graph ids")
        self._cursor.execute("CREATE UNIQUE INDEX xml_graph_id_idx ON graphs (xml_graph_id)")

        self._report("creating secedge indices")
        self._cursor.execute("CREATE INDEX se_origin_idx ON secedges (origin_id)")
        self._cursor.execute("CREATE INDEX se_target_idx ON secedges (target_id)")
        self._db.commit()

        if optimize:
            self._report("Optimizing database")
            self._db.execute("VACUUM")

        self._add_index_metadata(finished = True)
        self._db.commit()
        self._db = None
        self._cursor = None
|