File: attr_test_cases.py

package info (click to toggle)
python-gffutils 0.13-3
  • links: PTS, VCS
  • area: main
  • in suites: forky
  • size: 10,164 kB
  • sloc: python: 5,557; makefile: 57; sh: 13
file content (181 lines) | stat: -rw-r--r-- 6,202 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
"""
Test cases used for the nosetest generator over in parser_test.py.

Each item is a 3-tuple.  The first two items are the attribute string and the
expected parsed dictionary.  The third, if not None, is a reconstructed string
that is acceptable -- this is needed for cases like this:

    name "fgenesh1_pg.C_chr_1000007"; transcriptId 873

where the quoting of values is not consistent.  So, in this case, the
following would be an acceptable reconstructed string (with quotes around the
873):

    name "fgenesh1_pg.C_chr_1000007"; transcriptId "873"


"""
# Each entry: (attribute string, expected parsed dict, acceptable reconstructed
# string or None).  The leading comment on each entry names the file the
# attribute string was taken from.  Long strings are broken at attribute
# boundaries so each key/value pair can be read on its own line.
attrs = [
    # c_elegans_WS199_shortened_gff.txt
    (
        "count=1;gene=amx-2;sequence=SAGE:ggcagagtcttttggca;transcript=B0019.1",
        {
            "count": ["1"],
            "gene": ["amx-2"],
            "sequence": ["SAGE:ggcagagtcttttggca"],
            "transcript": ["B0019.1"],
        },
        None,
    ),
    # ensembl_gtf.txt
    (
        (
            'gene_id "Y74C9A.6"; '
            'transcript_id "Y74C9A.6"; '
            'exon_number "1"; '
            'gene_name "Y74C9A.6"; '
            'transcript_name "NR_001477.2";'
        ),
        {
            "gene_id": ["Y74C9A.6"],
            "transcript_id": ["Y74C9A.6"],
            "exon_number": ["1"],
            "gene_name": ["Y74C9A.6"],
            "transcript_name": ["NR_001477.2"],
        },
        None,
    ),
    # F3-unique-3.v2.gff
    (
        (
            "g=A3233312322232122211;i=1;p=1.000;"
            "q=23,12,18,17,10,24,19,14,27,9,23,9,16,20,11,7,8,4,4,14;"
            "u=0,0,0,1"
        ),
        {
            "g": ["A3233312322232122211"],
            "i": ["1"],
            "p": ["1.000"],
            # Twenty comma-separated values; kept compact, ten per row.
            # fmt: off
            "q": [
                "23", "12", "18", "17", "10", "24", "19", "14", "27", "9",
                "23", "9", "16", "20", "11", "7", "8", "4", "4", "14",
            ],
            # fmt: on
            "u": ["0", "0", "0", "1"],
        },
        None,
    ),
    # glimmer_nokeyval.gff3
    #
    # The last attribute is a bare key with no "=" and no value; it is expected
    # to map to an empty list.
    (
        "ID=GL0000006;Name=GL0000006;Lack 3'-end;",
        {
            "ID": ["GL0000006"],
            "Name": ["GL0000006"],
            "Lack 3'-end": [],
        },
        None,
    ),
    # hybrid1.gff3
    #
    # The "%20" escapes in the Note value appear as plain spaces in both the
    # expected dictionary and the acceptable reconstructed string.
    (
        (
            "ID=A00469;"
            "Dbxref=AFFX-U133:205840_x_at,Locuslink:2688,Genbank-mRNA:A00469,"
            "Swissprot:P01241,PFAM:PF00103,AFFX-U95:1332_f_at,Swissprot:SOMA_HUMAN;"
            "Note=growth%20hormone%201;"
            "Alias=GH1"
        ),
        {
            "ID": ["A00469"],
            "Dbxref": [
                "AFFX-U133:205840_x_at",
                "Locuslink:2688",
                "Genbank-mRNA:A00469",
                "Swissprot:P01241",
                "PFAM:PF00103",
                "AFFX-U95:1332_f_at",
                "Swissprot:SOMA_HUMAN",
            ],
            "Note": ["growth hormone 1"],
            "Alias": ["GH1"],
        },
        (
            "ID=A00469;"
            "Dbxref=AFFX-U133:205840_x_at,Locuslink:2688,Genbank-mRNA:A00469,"
            "Swissprot:P01241,PFAM:PF00103,AFFX-U95:1332_f_at,Swissprot:SOMA_HUMAN;"
            "Note=growth hormone 1;"
            "Alias=GH1"
        ),
    ),
    # jgi_gff2.txt
    #
    # This file is inconsistent in how it quotes values: string values are
    # quoted but integers are not.  The only way to round-trip this unchanged
    # would be to track the "flavor" of each attribute, which may not be worth
    # the effort / processing time, so a fully quoted reconstruction is
    # accepted instead.
    (
        'name "fgenesh1_pg.C_chr_1000007"; transcriptId 873',
        {
            "name": ["fgenesh1_pg.C_chr_1000007"],
            "transcriptId": ["873"],
        },
        'name "fgenesh1_pg.C_chr_1000007"; transcriptId "873"',
    ),
    # mouse_extra_comma.gff3: line with an extra trailing comma
    #
    # Note the extra empty string at the end of the expected "Parent" list.
    (
        "Name=CDS:NC_000083.5:LOC100040603;Parent=XM_001475631.1,",
        {
            "Name": ["CDS:NC_000083.5:LOC100040603"],
            "Parent": ["XM_001475631.1", ""],
        },
        None,
    ),
    # mouse_extra_comma.gff3
    #
    # Note the empty ID field, and compare it with the "Lack 3'-end" attribute
    # from glimmer_nokeyval.gff3 earlier in this list.  Presumably
    # "Lack 3'-end" should be read as "True", while an empty ID should be read
    # as "None" or something similar.
    #
    # Also, "Lack 3'-end" has no trailing "=", whereas "ID" here does.
    #
    # Both parse to an empty list; only the reconstruction is tricky, so the
    # accepted reconstructed string drops the "=".
    (
        "ID=;Parent=XM_001475631.1",
        {
            "ID": [],
            "Parent": ["XM_001475631.1"],
        },
        "ID;Parent=XM_001475631.1",
    ),
    # ncbi_gff3.txt
    #
    # In the accepted reconstruction the "%20" escapes become spaces while
    # "%3B" is kept as-is; in the expected dictionary "%3B" appears as ";".
    (
        (
            "ID=NC_008596.1:speB:unknown_transcript_1;"
            "Parent=NC_008596.1:speB;"
            "locus_tag=MSMEG_1072;"
            "EC_number=3.5.3.11;"
            "note=identified%20by%20match%20to%20protein%20family%20HMM%20PF00491"
            "%3B%20match%20to%20protein%20family%20HMM%20TIGR01230;"
            "transl_table=11;"
            "product=agmatinase;"
            "protein_id=YP_885468.1;"
            "db_xref=GI:118469242;"
            "db_xref=GeneID:4535378;"
            "exon_number=1"
        ),
        {
            "ID": ["NC_008596.1:speB:unknown_transcript_1"],
            "Parent": ["NC_008596.1:speB"],
            "locus_tag": ["MSMEG_1072"],
            "EC_number": ["3.5.3.11"],
            "note": [
                "identified by match to protein family HMM PF00491; "
                "match to protein family HMM TIGR01230"
            ],
            "transl_table": ["11"],
            "product": ["agmatinase"],
            "protein_id": ["YP_885468.1"],
            "db_xref": ["GI:118469242", "GeneID:4535378"],
            "exon_number": ["1"],
        },
        (
            "ID=NC_008596.1:speB:unknown_transcript_1;"
            "Parent=NC_008596.1:speB;"
            "locus_tag=MSMEG_1072;"
            "EC_number=3.5.3.11;"
            "note=identified by match to protein family HMM PF00491"
            "%3B match to protein family HMM TIGR01230;"
            "transl_table=11;"
            "product=agmatinase;"
            "protein_id=YP_885468.1;"
            "db_xref=GI:118469242;"
            "db_xref=GeneID:4535378;"
            "exon_number=1"
        ),
    ),
    # wormbase_gff2_alt.txt
    (
        'CDS "cr01.sctg102.wum.2.1"',
        {
            "CDS": ["cr01.sctg102.wum.2.1"],
        },
        None,
    ),
]