File: transientdb.py

package info (click to toggle)
thuban 1.2.2-14
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 9,176 kB
  • sloc: python: 30,410; ansic: 6,181; xml: 4,234; cpp: 1,595; makefile: 145
file content (705 lines) | stat: -rw-r--r-- 26,724 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
# Copyright (C) 2003 by Intevation GmbH
# Authors:
# Bernhard Herzog <bh@intevation.de>
#
# This program is free software under the GPL (>=v2)
# Read the file COPYING coming with the software for details.

"""Database for transient data

This database is intended for data representations needed during the
lifetime of a Thuban session but which is not permanent. Examples of
this are for instance a join of two DBF files where the DBF files are
the permanent representation of the data and the join only exists in the
Thuban session and is reconstructed when the session is opened.
"""

__version__ = "$Revision: 2871 $"
# $Source$
# $Id: transientdb.py 2871 2009-03-20 13:10:25Z dpinte $

# Pysqlite version 1. and 2. behaves quiet differently
# Pysqlite uses a different paramstyle.  The older version
# support format and pyformat while pysqlite2 supports only qmark
# and named.
# The sqlite2 boolean variable is used to manage specific part of the code
try:

    # Using SQLITE 3.x
    import sqlite3 as sqlite
    sqlite2 = True
except ImportError:
    try :
        # Using SQLITE 2.x
        sqlite2 = True
        from pysqlite2 import dbapi2 as sqlite
    except ImportError:
        # Using SQLITE 1.x
        sqlite2 = False
        import sqlite
    
from base import TitledObject

import table

sql_type_map = {
    table.FIELDTYPE_INT: "INTEGER",
    table.FIELDTYPE_STRING: "VARCHAR",
    table.FIELDTYPE_DOUBLE: "FLOAT",
    }

type_converter_map = {
    table.FIELDTYPE_INT: int,
    table.FIELDTYPE_STRING: str,
    table.FIELDTYPE_DOUBLE: float,
    }

class TransientDatabase:

    def __init__(self, filename):
        self.filename = filename
        self.conn = sqlite.connect(filename)
        # Counters to produce unique table and column names
        self.num_tables = 0
        self.num_cols = 0
        # Since there's only once process using the SQLite database, we
        # might be able to get a tad more speed with default_synchronous
        # OFF. So far I haven't seen any measurable speedup, though.
        #self.execute("PRAGMA default_synchronous = OFF")

    def __del__(self):
        self.close()

    def close(self):
        if self.conn is not None:
            self.conn.close()
            self.conn = None

    def new_table_name(self):
        self.num_tables += 1
        return "Table%03d" % self.num_tables

    def new_column_name(self):
        self.num_cols += 1
        return "Col%03d" % self.num_cols

    def execute(self, *args):
        """execute the SQL statement in the database and return the result"""
        cursor = self.conn.cursor()
        cursor.execute(*args)
        result = cursor.fetchone()
        self.conn.commit()
        return result

    def cursor(self):
        return self.conn.cursor()


class ColumnReference:

    def __init__(self, name, type, internal_name):
        self.name = name
        self.type = type
        self.internal_name = internal_name


class TransientTableBase:

    """Base class for tables in the transient database"""

    def __init__(self, transient_db):
        """Initialize the table for use with the given transient db"""
        self.db = transient_db
        self.tablename = self.db.new_table_name()
        self.indexed_columns = {}
        self.read_record_cursor = None
        self.read_record_last_row = None
        self.read_record_last_result = None

    def create(self, columns):
        self.columns = columns
        self.name_to_column = {}
        self.orig_names = []
        self.internal_to_orig = {}
        self.orig_to_internal = {}
        self.column_map = {}

        # Create the column objects and fill various maps and lists
        for index in range(len(self.columns)):
            col = self.columns[index]
            self.name_to_column[col.name] = col
            self.orig_names.append(col.name)
            self.internal_to_orig[col.internal_name] = col.name
            self.orig_to_internal[col.name] = col.internal_name
            self.column_map[col.name] = col
            self.column_map[index] = col

        # Build the CREATE TABLE statement and create the table in the
        # database
        table_types = ["id INTEGER PRIMARY KEY"]
        for col in self.columns:
            table_types.append("%s %s" % (col.internal_name,
                                          sql_type_map[col.type]))
        table_stmt = "CREATE TABLE %s (\n    %s\n);" % (self.tablename,
                                                   ",\n    ".join(table_types))
        self.db.execute(table_stmt)

    def transient_table(self):
        """
        Return a table whose underlying implementation is in the transient db
        """
        return self

    def ensure_index(self, column):
        """Ensure that there's an index on the given column"""
        if not column in self.indexed_columns:
            internal_name = self.orig_to_internal[column]
            indexname = "Index_%s_%s" % (self.tablename, internal_name)
            stmt = "CREATE INDEX %s ON %s (%s);" % (indexname, self.tablename,
                                                    internal_name)
            self.db.execute(stmt)
            self.indexed_columns[column] = 1

    def NumColumns(self):
        return len(self.columns)

    def NumRows(self):
        result = self.db.execute("SELECT count(*) FROM %s;" % self.tablename)
        return int(result[0])

    def Columns(self):
        return self.columns

    def Column(self, col):
        return self.column_map[col]

    def HasColumn(self, col):
        """Return whether the table has a column with the given name or index
        """
        return self.column_map.has_key(col)

    def RowIdToOrdinal(self, gid):
        """Return the row ordinal given its id

        At the moment the transient tables are only used for tables that
        don't distinguish between row number and row id, so the value is
        returned unchanged.
        """
        return gid

    def RowOrdinalToId(self, num):
        """Return the rowid for given its ordinal

        At the moment the transient tables are only used for tables that
        don't distinguish between row number and row id, so the value is
        returned unchanged.
        """
        return num

    def ReadRowAsDict(self, index, row_is_ordinal = 0):
        """Return the entire row as a dictionary with column names as keys

        The row_is_ordinal is ignored because at the moment the
        transient tables are only used for DBF files where it doesn't
        matter.
        """
        # Implementation Strategy: Executing a completely new select
        # statement every time this method is called is too slow. The
        # most important usage is to read the records more or less
        # sequentially. This happens e.g. when drawing a layer with a
        # classification where the shapes are drawn in order of the
        # shape ids. Another pattern is that the same row is requested
        # several times in a row. This happens in the table view, for
        # instance.

        # We can exploit this to make access faster by having one cursor
        # open all the time and keeping the last row read around in case
        # the same row is accessed again the next time and if the row
        # index is larger than the row we have read last we simply fetch
        # rows from the cursor until we've reached the requested row. If
        # the requested row index is smaller then we start a new cursor.

        # FIXME: So far this scheme seems to work well enough. Obvious
        # improvements would be to start the cursor at exactly the
        # requested row (should be efficient and easy to do now that the
        # id is the primary key) and to perhaps to also start a new
        # cursor if the requested index is much larger than the last row
        # so that we don't read and discard lots of the rows.

        # Check whether we have to start a new cursor
        if self.read_record_cursor is None or index <self.read_record_last_row:
            stmt = ("SELECT %s FROM %s;"
                    % (", ".join([c.internal_name for c in self.columns]),
                       self.tablename))
            self.read_record_cursor = self.db.cursor()
            self.read_record_cursor.execute(stmt)
            self.read_record_last_row = -1
            self.read_record_last_result = None

        # Now we should have a cursor at a position less than or equal
        # to the index so the following if statement will always set
        # result to a suitable value
        assert index >= self.read_record_last_row

        if index == self.read_record_last_row:
            result = self.read_record_last_result
        else:
            for i in range(index - self.read_record_last_row):
                result = self.read_record_cursor.fetchone()
                self.read_record_last_result = result
        self.read_record_last_row = index
        return dict(zip(self.orig_names, result))

    def ReadValue(self, row, col, row_is_ordinal = 0):
        """Return the value of the specified row and column

        The col parameter may be the index of the column or its name.

        The row_is_ordinal is ignored because at the moment the
        transient tables are only used for DBF files where it doesn't
        matter.
        """
        # Depending on the actual access patterns of the table data, it
        # might be a bit faster in some circumstances to not implement
        # this via ReadRowAsDict, but this simple implementation should
        # be fast enough for most purposes.
        return self.ReadRowAsDict(row)[self.column_map[col].name]

    def ValueRange(self, col):
        # Performance notes:
        #
        # In sqlite 2.8.6 the min and max aggregate functions can use an
        # index but only when used as the only expression in the select
        # statement (i.e. 'select min(col), max(col) from tbl;' will not
        # use the index but 'select min(col) from tbl;' will) so we
        # query the minimum and maximum separately.
        #
        # With the separate statements we can take advantage of an index
        # if it exists. If the index doesn't exist, creating it first
        # and then using it in the query is slower than the queries
        # without an index. Creating the index is only an advantage if
        # the queries are performed multiple times. With the current use
        # patterns where ValueRange is only used occasionally by the
        # classification generation dialog creating the index only for
        # this usage is not really worth it, so we don't.
        col = self.column_map[col]
        iname = col.internal_name
        min = self.db.execute("SELECT min(%s) FROM %s;"
                              % (iname, self.tablename))[0]
        max = self.db.execute("SELECT max(%s) FROM %s;"
                              % (iname, self.tablename))[0]
        converter = type_converter_map[col.type]
        return (converter(min), converter(max))

    def UniqueValues(self, col):
        # Performance notes:
        #
        # In sqlite 2.8.6 there doesn't seem to be a way to query the
        # unique items that uses an index. I've tried
        #
        #   SELECT col FROM tbl GROUP BY col;
        #
        # and
        #
        #   SELECT DISTINCT col FROM tbl;
        #
        # and in both cases the index is not used. If the index isn't
        # used it doesn't make sense to call self.ensure_index.
        iname = self.column_map[col].internal_name
        cursor = self.db.cursor()
        cursor.execute("SELECT %s FROM %s GROUP BY %s;"
                       % (iname, self.tablename, iname))
        result = []
        while 1:
            row = cursor.fetchone()
            if row is None:
                break
            result.append(row[0])
        return result

    def Width(self, col):
        """Return the maximum width of values in the column

        The return value is the the maximum length of string
        representation of the values in the column (represented by index
        or name).
        """
        max = 0

        type  = self.column_map[col].type
        iname = self.column_map[col].internal_name
        cursor = self.db.cursor()
        cursor.execute("SELECT %s FROM %s;" % (iname, self.tablename))
        values = [ i[0] for i in cursor.fetchall()]
        if not values:
            return None

        if type == table.FIELDTYPE_DOUBLE:
            format = "%.12f"
        elif type == table.FIELDTYPE_INT:
            format = "%d"
        else:
            format = "%s"
        for value in values:
            if value is None: continue
            l = len(format % value)
            if l > max:
                max = l

        return max

    def SimpleQuery(self, left, comparison, right):
        """Return the indices of all rows that matching a condition.

        Parameters:
           left -- The column object for the left side of the comparison

           comparison -- The comparison operator as a string. It must be
                         one of '==', '!=', '<', '<=', '>=', '>'

           right -- The right hand side of the comparison. It must be
                    either a column object or a value, i.e. a string,
                    int or float.

        The return value is a sorted list of the indices of the rows
        where the condition is true.
        """
        if comparison not in ("==", "!=", "<", "<=", ">=", ">"):
            raise ValueError("Comparison operator %r not allowed" % comparison)

        if hasattr(right, "internal_name"):
            right_template = right.internal_name
            params = ()
        else:
            if sqlite2:
                right_template = "?"
            else: right_template = "%s"
            params = (right,)

        query = "SELECT id FROM %s WHERE %s %s %s ORDER BY id;" \
                % (self.tablename, left.internal_name, comparison,
                   right_template)

        cursor = self.db.cursor()
        cursor.execute(query, params)
        result = []
        while 1:
            row = cursor.fetchone()
            if row is None:
                break
            result.append(row[0])
        return result

    def Dependencies(self):
        """Placeholder for a method in a derived class.

        Return a sequence with the tables and other data objects that
        self depends on.
        """
        raise NotImplementedError


class TransientTable(TitledObject, TransientTableBase):

    """A Table in a transient DB that starts as the copy of a Thuban Table."""

    def __init__(self, transient_db, table):
        """Create a new table in the given transient DB as a copy of table

        The table argument can be any object implementing the Table
        interface.
        """
        TransientTableBase.__init__(self, transient_db)
        TitledObject.__init__(self, table.Title())
        self.create(table)

    def create(self, table):
        columns = []
        for col in table.Columns():
            columns.append(ColumnReference(col.name, col.type,
                                           self.db.new_column_name()))
        TransientTableBase.create(self, columns)

        # copy the input table to the transient db

        # A key to insert to use for the formatting of the insert
        # statement. The key must not be equal to any of the column
        # names so we construct one by building a string of x's that is
        # longer than any of the column names
        id_key = max([len(col.name) for col in self.columns]) * "x"

        if sqlite2:
            insert_template = "INSERT INTO %s (id, %s) VALUES (%s, %s);" \
                               % (self.tablename,
                                  ", ".join([col.internal_name
                                             for col in self.columns]),
                                  '?',
                                  ", ".join(["?" for col in self.columns]))

        else:
            insert_template = "INSERT INTO %s (id, %s) VALUES (%%(%s)s, %s);" \
                               % (self.tablename,
                                  ", ".join([col.internal_name
                                             for col in self.columns]),
                                  id_key,
                                  ", ".join(["%%(%s)s" % col.name
                                             for col in self.columns]))
        cursor = self.db.cursor()
        for i in range(table.NumRows()):
            row = table.ReadRowAsDict(i)
            row[id_key] = i
            if sqlite2:
                params = [i]
                for col in self.columns:
                  params.append(row[col.name])
                cursor.execute(insert_template, params)
            else:  
                cursor.execute(insert_template, row)
        self.db.conn.commit()



class TransientJoinedTable(TitledObject, TransientTableBase):

    """A Table in the transient DB that contains a join of two tables"""

    def __init__(self, transient_db, left_table, left_field,
                 right_table, right_field = None, outer_join = False):
        """Create a new table in the transient DB as a join of two tables.

        Both input tables, left_table and right_table must have a
        transient_table method that returns a table object for a table
        in the transient database. The join is performed on the condition
        that the value of the left_field column the the left table is
        equal to the value of the right_field in the right_table.

        The joined table contains all columns of the input tables,
        however, the column names of the right table may be changed
        slightly to make them unique in the joined table. This is
        currently done by appending a sufficient number of underscores
        ('_').
        """
        TransientTableBase.__init__(self, transient_db)
        self.dependencies = (left_table, right_table)
        self.left_table = left_table.transient_table()
        self.left_field = left_field
        self.right_table = right_table.transient_table()
        if right_field:
            self.right_field = right_field
        else:
            self.right_field = self.left_field
        self.outer_join = outer_join

        title = "Join of %(left)s and %(right)s" \
                % {"left": self.left_table.Title(),
                   "right": self.right_table.Title()}
        TitledObject.__init__(self, title)

        self.create()

    def create(self):
        """Internal: Create the table with the joined data"""
        self.tablename = self.db.new_table_name()

        self.right_table.ensure_index(self.right_field)

        # determine the internal column names to join on before
        # coalescing the column information because if the external
        # column names are the same they will be mapped to the same
        # internal name afterwards.
        internal_left_col = self.left_table.orig_to_internal[self.left_field]
        internal_right_col =self.right_table.orig_to_internal[self.right_field]

        # Coalesce the column information
        visited = {}
        columns = []
        newcolumns = []
        for table in (self.left_table, self.right_table):
            for col in table.Columns():
                colname = col.name
                # We can't allow multiple columns with the same
                # original name, so append '_' to this one until
                # it is unique.
                # FIXME: There should be a better solution.
                while colname in visited:
                    colname = colname + '_'
                columns.append((table.tablename, col))
                newcol = ColumnReference(colname, col.type,
                                            "Col%03d" % (len(newcolumns)+1))
                newcolumns.append(newcol)
                visited[colname] = 1
        TransientTableBase.create(self, newcolumns)

        # Copy the joined data to the table.
        newinternal_names = [col.internal_name for col in self.columns]
        internal_references = ["%s.%s" % (table, col.internal_name) 
                                                    for table, col in columns]
        if self.outer_join:
            join_operator = 'LEFT OUTER JOIN'
        else:
            join_operator = 'JOIN'
        stmt = ("INSERT INTO %s (id, %s) SELECT %s.id, %s FROM %s"
                " %s %s ON %s.%s = %s.%s;"
                % (self.tablename,
                   ", ".join(newinternal_names),
                   self.left_table.tablename,
                   ", ".join(internal_references),
                   self.left_table.tablename,
                   join_operator,
                   self.right_table.tablename,
                   self.left_table.tablename,
                   internal_left_col,
                   self.right_table.tablename,
                   internal_right_col))
        self.db.execute(stmt)

    def Dependencies(self):
        """Return a tuple with the two tables the join depends on."""
        return self.dependencies

    def JoinType(self):
        """Return the type of the join (either 'INNER' or 'LEFT OUTER')"""
        if self.outer_join:
            return "LEFT OUTER"
        else:
            return "INNER"


class AutoTransientTable(TitledObject):

    """Table that copies data to a transient table on demand.

    The AutoTransientTable takes another table as input and copies data
    to a table in a TransientDatabase instance on demand.
    """

    def __init__(self, transient_db, table):
        TitledObject.__init__(self, table.Title())
        self.transient_db = transient_db
        self.table = table
        self.t_table = None

    def Columns(self):
        return self.table.Columns()

    def Column(self, col):
        return self.table.Column(col)

    def HasColumn(self, col):
        """Return whether the table has a column with the given name or index
        """
        return self.table.HasColumn(col)

    def NumRows(self):
        return self.table.NumRows()

    def NumColumns(self):
        return self.table.NumColumns()

    def RowIdToOrdinal(self, gid):
        """Return the row ordinal given its id"""
        if self.t_table is not None:
            return self.t_table.RowIdToOrdinal(gid)
        else:
            return self.table.RowIdToOrdinal(gid)

    def RowOrdinalToId(self, num):
        """Return the rowid for given its ordinal"""
        if self.t_table is not None:
            return self.t_table.RowOrdinalToId(num)
        else:
            return self.table.RowOrdinalToId(num)

    def ReadRowAsDict(self, record, row_is_ordinal = 0):
        """Return the record no. record as a dict mapping field names to values
        """
        if self.t_table is not None:
            return self.t_table.ReadRowAsDict(record,
                                              row_is_ordinal = row_is_ordinal)
        else:
            return self.table.ReadRowAsDict(record,
                                            row_is_ordinal = row_is_ordinal)

    def ReadValue(self, row, col, row_is_ordinal = 0):
        """Return the value of the specified row and column

        The col parameter may be the index of the column or its name.
        """
        if self.t_table is not None:
            return self.t_table.ReadValue(row, col,
                                          row_is_ordinal = row_is_ordinal)
        else:
            return self.table.ReadValue(row, col,
                                        row_is_ordinal = row_is_ordinal)

    def copy_to_transient(self):
        """Internal: Create a transient table and copy the data into it"""
        self.t_table = TransientTable(self.transient_db, self)

    def transient_table(self):
        """
        Return a table whose underlying implementation is in the transient db
        """
        if self.t_table is None:
            self.copy_to_transient()
        return self.t_table

    def ValueRange(self, col):
        # Performance of sqlite vs. DBF for this method:
        #
        # If the table has been copied to the sqlite database it's
        # faster to use it even if there is no index on that column.
        # Otherwise it's faster to simply loop through all rows in the
        # DBF file. Copying the data to the sqlite database can take
        # very long for large amounts of data
        #
        # Of course if the table is not a DBF file the issue could be
        # different, although copying the data into sqlite first will
        # likely always be slower than simply querying the non-sqlite
        # table directly. Currently only DBFfiles and memory tables are
        # used as the underlying non-sqlite table, though.
        if self.t_table is not None:
            return self.t_table.ValueRange(col)
        else:
            return self.table.ValueRange(col)

    def UniqueValues(self, col):
        # The performance trade-offs for this method are basically the
        # same as for ValueRange except that currently there doesn't
        # seem to be a way to take advantage of indexes in this case in
        # sqlite. However, but it's still faster to query the transient
        # table if it already exists.
        if self.t_table is not None:
            return self.t_table.UniqueValues(col)
        else:
            return self.table.UniqueValues(col)

    def SimpleQuery(self, left, comparison, right):
        if self.t_table is None:
            self.copy_to_transient()
        # Make sure to use the column object of the transient table. The
        # left argument is always a column object so we can just ask the
        # t_table for the right object.
        if hasattr(right, "name"):
            return self.t_table.SimpleQuery(self.t_table.Column(left.name),
                                            comparison, 
                                            self.t_table.Column(right.name))
        else:
            return self.t_table.SimpleQuery(self.t_table.Column(left.name),
                                            comparison, right)

    def Dependencies(self):
        """Return a tuple containing the original table"""
        return (self.table,)

    def Width(self, col):
        return self.table.Width(col)

    def write_record(self, row, values):
        """Write the values to the given row.

        This is a very experimental feature which doesn't work in all
        cases, so you better know what you're doing when calling this
        method.
        """
        self.table.write_record(row, values)