File: test_greengenes.py

package info (click to toggle)
python-cogent 2024.5.7a1%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 74,600 kB
  • sloc: python: 92,479; makefile: 117; sh: 16
file content (243 lines) | stat: -rw-r--r-- 6,599 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
from unittest import TestCase

from cogent3.parse.greengenes import (
    MinimalGreengenesParser,
    SpecificGreengenesParser,
    make_ignore_f,
)


# consider project name
# remember to add yourself if you make changes


class ParseGreengenesRecordsTests(TestCase):
    """Tests for the Greengenes record parsers in ``cogent3.parse.greengenes``.

    The ``mock_data`` and ``real_data`` fixtures are module-level strings in
    this file; they are looked up when each test runs.
    """

    def test_MinimalGreengenesParser_mock(self):
        """Test MinimalGreengenesParser against mock data"""
        # The mock fixture uses custom record delimiters, so pass them in.
        records = list(
            MinimalGreengenesParser(
                mock_data.splitlines(), RecStart="my_starting", RecEnd="my_ending"
            )
        )

        exp = [
            {"a": "1", "b": "2", "c": "3", "d": "", "e": "5"},
            {"q": "asdasd", "c": "taco"},
        ]

        self.assertEqual(records, exp)

    def test_MinimalGreengenesParser_real(self):
        """Test MinimalGreengenesParser against real data"""
        records = list(MinimalGreengenesParser(real_data.splitlines()))

        # Check the count explicitly so a wrong number of records is reported
        # as an assertion failure instead of an unpacking ValueError.
        self.assertEqual(len(records), 2)
        record1, record2 = records

        self.assertEqual(record1["G2_chip_tax_string"], "Unclassified")
        self.assertEqual(
            record1["authors"],
            "Hernanandez-Eugenio,G., Silva-Rojas,H.V., Zelaya-Molina,L.X.",
        )
        # Keys with nothing after the "=" are kept, with an empty-string value.
        self.assertEqual(record1["bel3_div_ratio"], "")
        self.assertEqual(len(record1), 72)

        self.assertEqual(record2["ncbi_acc_w_ver"], "FJ832719.1")
        self.assertEqual(record2["timestamp"], "2010-03-23 14:08:27")
        self.assertEqual(
            record2["title"],
            "Developmental Microbial Ecology of the Crop of the Folivorous Hoatzin",
        )

    def test_SpecificGreengenesParser_real(self):
        """Test SpecificGreengenesParser against real data"""
        fields = ["prokMSA_id", "journal"]

        # Without an id filter, one tuple per record is expected, with values
        # in the same order as ``fields``.
        records = list(SpecificGreengenesParser(real_data.splitlines(), fields))
        exp = [("604868", ""), ("604867", "ISME J (2010) In press")]
        self.assertEqual(records, exp)

        # With an id filter, only the matching record is expected; the second
        # id does not occur in the fixture.
        ids = ["604867", "12312312323"]
        records = list(SpecificGreengenesParser(real_data.splitlines(), fields, ids))
        exp = [("604867", "ISME J (2010) In press")]
        self.assertEqual(records, exp)

    def test_make_ignore_f(self):
        """Properly ignore empty records and the start line"""
        f = make_ignore_f("testing")

        # Records the predicate must NOT ignore (it returns a falsy value).
        kept = (
            ["asasdasd", ""],
            ["test", ""],
            ["testing2", ""],
            ["testing", "asd"],
        )
        # Records the predicate must ignore (it returns a truthy value).
        ignored = (
            ["", ""],
            None,
            ["testing", ""],
        )

        # subTest reports every failing case rather than stopping at the first.
        for record in kept:
            with self.subTest(record=record, expect_ignored=False):
                self.assertFalse(f(record))
        for record in ignored:
            with self.subTest(record=record, expect_ignored=True):
                self.assertTrue(f(record))


# Mock fixture: two records wrapped in custom "my_starting"/"my_ending"
# marker lines and separated by one blank line. The "d" key in the first
# record has an empty value.
mock_data = (
    "my_starting\n"
    "a=1\n"
    "b=2\n"
    "c=3\n"
    "d=\n"
    "e=5\n"
    "my_ending\n"
    "\n"
    "my_starting\n"
    "q=asdasd\n"
    "c=taco\n"
    "my_ending\n"
)

# Real-data fixture: two records, each wrapped in BEGIN/END lines and holding
# 72 "key=value" lines; the tests parse it without overriding the record
# delimiters. Many values are empty, and some (e.g. "clone", "country",
# "isolation_source") end with a trailing space. Keep this text byte-for-byte,
# because the tests assert exact field values and the field count.
real_data = """BEGIN
G2_chip_tax_string=Unclassified
G2_chip_tax_string_format_2=Unclassified
HOMD_tax_string=
HOMD_tax_string_format_2=
Hugenholtz_tax_string=Unclassified
Hugenholtz_tax_string_format_2=Unclassified
Ludwig_tax_string=Unclassified
Ludwig_tax_string_format_2=Unclassified
Pace_tax_string=Unclassified
Pace_tax_string_format_2=Unclassified
RDP_tax_string=Unclassified
RDP_tax_string_format_2=Unclassified
Silva_tax_string=Unclassified
Silva_tax_string_format_2=Unclassified
authors=Hernanandez-Eugenio,G., Silva-Rojas,H.V., Zelaya-Molina,L.X.
bel3_div_ratio=
bellerophon=
blast_perc_ident_to_template=
clone=51a 
contact_info=Irrigacion, Universidad Autonoma Chapingo, Carretera Mexico-Texcoco Km 37.5, Texcoco, Mexico 56230, Mexico
core_set_member=
core_set_member2=
country=Mexico: Mexico City 
create_date=21-NOV-2009
db_name=
decision=clone
description=Uncultured bacterium clone 51a 16S ribosomal RNA gene, partial sequence
email=
gold_id=
img_oid=
isolate=
isolation_source=mesophilic anaerobic reactor fed with effluent from the chemical industry 
journal=
longest_insertion=
medline_ids=
ncbi_acc=
ncbi_acc_w_ver=FJ461956.1
ncbi_gi=213390944
ncbi_seq_length=1512
ncbi_tax_id=77133
ncbi_tax_string=Bacteria; environmental samples
ncbi_tax_string_format_2=Unclassified
non_ACGT_count=
non_ACGT_percent=
note=
organism=uncultured bacterium
perc_ident_to_invariant_core=
prokMSA_id=604868
prokMSAname=Microbial ecology industrial digestor mesophilic anaerobic reactor fed effluent chemical industry clone 51a
pubmed_ids=
remark=
replaced_by=
single_nt_runs_over_7=
small_gap_intrusions=
source=uncultured bacterium
span_aligned=1..2
specific_host=
status=0
strain=
study_id=38002
sub_species=
submit_date=24-OCT-2008
template=
timestamp=2010-03-23 14:08:27
title=Microbial ecology of industrial anaerobic digestor
unaligned_length=
update_date=21-NOV-2009
warning=
wigeon95=
wigeon99=
wigeon_std_dev=
aligned_seq=unaligned
END

BEGIN
G2_chip_tax_string=Unclassified
G2_chip_tax_string_format_2=Unclassified
HOMD_tax_string=
HOMD_tax_string_format_2=
Hugenholtz_tax_string=Unclassified
Hugenholtz_tax_string_format_2=Unclassified
Ludwig_tax_string=Unclassified
Ludwig_tax_string_format_2=Unclassified
Pace_tax_string=Unclassified
Pace_tax_string_format_2=Unclassified
RDP_tax_string=Unclassified
RDP_tax_string_format_2=Unclassified
Silva_tax_string=Unclassified
Silva_tax_string_format_2=Unclassified
authors=Brodie,E.L., Dominguez-Bello,M.G., Garcia-Amado,M.A., Godoy-Vitorino,F., Goldfarb,K.C., Michelangeli,F.
bel3_div_ratio=
bellerophon=
blast_perc_ident_to_template=
clone=J3Q101_11C02 
contact_info=Biology, University of Puerto Rico, Rio Piedras Campus, PO Box 23360, San Juan, PR 00931-3360, USA
core_set_member=
core_set_member2=
country=Venezuela 
create_date=10-DEC-2009
db_name=
decision=clone
description=Uncultured bacterium clone J3Q101_11C02 16S ribosomal RNA gene, partial sequence
email=
gold_id=
img_oid=
isolate=
isolation_source=crop contents 
journal=ISME J (2010) In press
longest_insertion=
medline_ids=
ncbi_acc=
ncbi_acc_w_ver=FJ832719.1
ncbi_gi=226447371
ncbi_seq_length=1326
ncbi_tax_id=77133
ncbi_tax_string=Bacteria; environmental samples
ncbi_tax_string_format_2=Unclassified
non_ACGT_count=
non_ACGT_percent=
note=
organism=uncultured bacterium
perc_ident_to_invariant_core=
prokMSA_id=604867
prokMSAname=Microbial Ecology Crop Folivorous Hoatzin crop contents clone J3Q101_11C02
pubmed_ids=
remark=
replaced_by=
single_nt_runs_over_7=
small_gap_intrusions=
source=uncultured bacterium
span_aligned=1..2
specific_host=
status=0
strain=
study_id=37901
sub_species=
submit_date=16-MAR-2009
template=
timestamp=2010-03-23 14:08:27
title=Developmental Microbial Ecology of the Crop of the Folivorous Hoatzin
unaligned_length=
update_date=10-DEC-2009
warning=
wigeon95=
wigeon99=
wigeon_std_dev=
aligned_seq=unaligned
END
"""