File: constants.py

package info (click to toggle)
python-gffutils 0.13-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 10,164 kB
  • sloc: python: 5,557; makefile: 57; sh: 13
file content (165 lines) | stat: -rw-r--r-- 3,142 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# SQL schema, kept as one script of CREATE TABLE statements. It defines six
# tables:
#
#   features        one row per feature, primary key ``id``
#   relations       (parent, child, level) triples; all three columns together
#                   form the primary key
#   meta            ``dialect`` and ``version`` text columns
#   directives      a single ``directive`` text column
#   autoincrements  ``base`` -> ``n`` counters, primary key ``base``
#   duplicates      ``idspecid`` / ``newid`` pairs, primary key ``newid``
#
# The columns of ``features`` carry the same names, in the same order, as the
# ``_keys`` list defined further down in this module.
SCHEMA = """

CREATE TABLE features (
    id text,
    seqid text,
    source text,
    featuretype text,
    start int,
    end int,
    score text,
    strand text,
    frame text,
    attributes text,
    extra text,
    bin int,
    primary key (id)
    );

CREATE TABLE relations (
    parent text,
    child text,
    level int,
    primary key (parent, child, level)
    );

CREATE TABLE meta (
    dialect text,
    version text
    );

CREATE TABLE directives (
    directive text
    );

CREATE TABLE autoincrements (
    base text,
    n int,
    primary key (base)
    );

CREATE TABLE duplicates (
    idspecid text,
    newid text,
    primary key (newid)
    );


"""

# Default pragma settings, as a mapping of pragma name -> value.
# NOTE(review): these look like SQLite PRAGMAs meant to be applied to a
# database connection; where and when they are applied is not visible in this
# module -- confirm against the code that consumes this dict.
default_pragmas = {
    "synchronous": "NORMAL",
    "journal_mode": "MEMORY",
    "main.page_size": 4096,
    "main.cache_size": 10000,
}

# The nine feature fields, from "seqid" through "attributes".
_gffkeys = [
    "seqid",
    "source",
    "featuretype",
    "start",
    "end",
    "score",
    "strand",
    "frame",
    "attributes",
]

# The same fields followed by the "extra" field.
_gffkeys_extra = _gffkeys + ["extra"]

# Every column of the ``features`` table, in schema order: the "id" column,
# then the fields above, then "bin".
_keys = ["id"] + _gffkeys_extra + ["bin"]

# SELECT prefix that also exposes the rowid as ``file_order``. The trailing
# space is part of the string, so further clauses can be appended directly.
_SELECT = "SELECT {}, features.rowid as file_order FROM features ".format(
    ", ".join(_keys)
)

# Parameterised INSERT with one "?" placeholder per column in ``_keys``.
_INSERT = "INSERT INTO features ({}) VALUES ({})".format(
    ", ".join(_keys),
    ",".join("?" for _ in _keys),
)


# Parameterised UPDATE that assigns every column in ``_keys`` and selects the
# row by id, so it takes len(_keys) + 1 bound values.
_update_clause = ",".join(column + " = ?" for column in _keys)
_UPDATE = "UPDATE features SET {} WHERE id = ?".format(_update_clause)


# Index definitions; left empty for now.
# TODO: create indexes once profiling shows which ones work best.
INDEXES = []


# This dictionary keeps track of idiosyncrasies in order to [attempt to]
# maintain invariance of file->db->file round trips. Each key names one
# formatting choice; the comment above each key shows the variants it
# distinguishes.
dialect = {
    # Whether there is an initial semicolon, e.g.,
    #
    #   ;ID=001;
    # vs
    #   ID=001;
    "leading semicolon": False,
    # Whether there is a semicolon after the last value, e.g.,
    #
    #   ID=001; Name=gene1;
    # vs
    #   ID=001; Name=gene1
    "trailing semicolon": False,
    # Whether GFF2 values are wrapped in double quotes, e.g.,
    #
    #   gene_id "GENE1"
    # vs
    #   gene_id GENE1
    "quoted GFF2 values": False,
    # The string separating fields. Sometimes there is extra space around
    # the semicolon, e.g.,
    #
    #   ID=001;Name=gene1
    # vs
    #   ID=001; Name=gene1
    "field separator": ";",
    # The string separating a key from its value. Usually "=" for GFF3 and
    # " " for GTF, e.g.,
    #
    #   gene_id "GENE1"
    # vs
    #   gene_id="GENE1"
    "keyval separator": "=",
    # The string separating multiple values of one key. Usually a comma, e.g.,
    #
    #   Parent=gene1,gene2,gene3
    "multival separator": ",",
    # General GTF or GFF format
    "fmt": "gff3",
    # How multiple values for the same key are handled, e.g.,
    #
    #   Parent=gene1; Parent=gene2;
    # vs
    #   Parent=gene1,gene2;
    #
    # (the first one has repeated keys)
    "repeated keys": False,
    # If these keys exist, then print them in this order.
    "order": ["ID", "Name", "gene_id", "transcript_id"],
}

# Module-level switches.
# NOTE(review): the code that reads these flags is not in this module. Judging
# by the names only, `always_return_list` selects whether values are always
# handed back as lists, and `ignore_url_escape_characters` whether URL-escaped
# characters are left untouched -- confirm against the consumers.
always_return_list = True
ignore_url_escape_characters = False

# Names of the keyword arguments that are used by iterators.
# NOTE(review): presumably used to pick these arguments out of a larger set
# of keyword arguments -- confirm against the caller.
_iterator_kwargs = (
    "data",
    "checklines",
    "transform",
    "force_dialect_check",
    "dialect",
    "from_string",
)