1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
|
# SQL DDL that creates the database tables:
#
#   features        one row per feature, keyed by ``id``
#   relations       (parent, child, level) triples
#   meta            ``dialect`` and ``version`` text columns
#   directives      one ``directive`` text column
#   autoincrements  a counter ``n`` per ``base``
#   duplicates      (idspecid, newid) pairs, keyed by ``newid``
#
# NOTE(review): presumably executed as a single script when a new database is
# created -- confirm against the caller.
SCHEMA = """
CREATE TABLE features (
id text,
seqid text,
source text,
featuretype text,
start int,
end int,
score text,
strand text,
frame text,
attributes text,
extra text,
bin int,
primary key (id)
);
CREATE TABLE relations (
parent text,
child text,
level int,
primary key (parent, child, level)
);
CREATE TABLE meta (
dialect text,
version text
);
CREATE TABLE directives (
directive text
);
CREATE TABLE autoincrements (
base text,
n int,
primary key (base)
);
CREATE TABLE duplicates (
idspecid text,
newid text,
primary key (newid)
);
"""
# Default PRAGMA settings, keyed by pragma name.
# NOTE(review): presumably issued as ``PRAGMA <name>=<value>`` when a database
# connection is configured -- confirm against the caller.
default_pragmas = {
    "synchronous": "NORMAL",
    "journal_mode": "MEMORY",
    "main.page_size": 4096,
    "main.cache_size": 10000,
}
# Names of the nine GFF fields, in order.
_gffkeys = [
    "seqid",
    "source",
    "featuretype",
    "start",
    "end",
    "score",
    "strand",
    "frame",
    "attributes",
]

# The GFF fields followed by the "extra" column.
_gffkeys_extra = _gffkeys + ["extra"]

# Every column of the ``features`` table, in the order the schema declares
# them: the "id" key, the GFF fields plus "extra", then "bin".
_keys = ["id"] + _gffkeys_extra + ["bin"]
# Statement templates for the ``features`` table, all generated from ``_keys``
# so the column order is the same everywhere.

# Selects every column plus the rowid (aliased to ``file_order``).  The string
# deliberately ends in a space; it is a prefix, presumably so that callers can
# append further clauses.
_SELECT = "SELECT {}, features.rowid as file_order FROM features ".format(
    ", ".join(_keys)
)

# Inserts a full row: one "?" placeholder per column in ``_keys``.
_INSERT = "INSERT INTO features ({}) VALUES ({})".format(
    ", ".join(_keys),
    ",".join(["?"] * len(_keys)),
)

# "<column> = ?" for every column, comma separated.
_update_clause = ",".join("{} = ?".format(column) for column in _keys)

# Rewrites every column of the row matching ``id``.  Takes len(_keys) + 1
# parameters: a new value for each column, then the id of the row to update.
_UPDATE = "UPDATE features SET {} WHERE id = ?".format(_update_clause)

# TODO: create indexes once profiling shows which ones work best.
INDEXES = []
# Formatting idiosyncrasies of the attributes field, tracked in an attempt to
# keep file -> db -> file round trips invariant.
dialect = {
    # Does the attribute string begin with a semicolon?
    #
    #   ;ID=001;
    # vs
    #   ID=001;
    "leading semicolon": False,
    # Is there a semicolon after the last value?
    #
    #   ID=001; Name=gene1;
    # vs
    #   ID=001; Name=gene1
    "trailing semicolon": False,
    # Are GFF2 values wrapped in double quotes?
    #
    #   gene_id "GENE1"
    # vs
    #   gene_id GENE1
    "quoted GFF2 values": False,
    # Text between fields; some files put extra space around the semicolon:
    #
    #   ID=001;Name=gene1
    # vs
    #   ID=001; Name=gene1
    "field separator": ";",
    # Text between a key and its value.  Usually "=" for GFF3 and " " for GTF:
    #
    #   gene_id "GENE1"
    # vs
    #   gene_id="GENE1"
    "keyval separator": "=",
    # Text between multiple values of one key, usually a comma:
    #
    #   Parent=gene1,gene2,gene3
    "multival separator": ",",
    # General format of the file: GTF or GFF.
    "fmt": "gff3",
    # How several values for the same key are written:
    #
    #   Parent=gene1; Parent=gene2;     (the key is repeated)
    # vs
    #   Parent=gene1,gene2;
    "repeated keys": False,
    # Keys that, when present, are printed in this order.
    "order": ["ID", "Name", "gene_id", "transcript_id"],
}
# Module-level switches; neither is read in this part of the file.
# NOTE(review): judging by the names alone, the first presumably forces
# attribute values to always be returned as lists and the second turns off
# handling of URL-escaped characters -- confirm at the point of use.
always_return_list = True
ignore_url_escape_characters = False
# Names of the keyword arguments that are used by iterators.
_iterator_kwargs = (
    "data",
    "checklines",
    "transform",
    "force_dialect_check",
    "dialect",
    "from_string",
)
|