1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243
|
from unittest import TestCase
from cogent3.parse.greengenes import (
MinimalGreengenesParser,
SpecificGreengenesParser,
make_ignore_f,
)
# consider project name
# remember to add yourself if you make changes
class ParseGreengenesRecordsTests(TestCase):
    """Tests for the parsers imported from ``cogent3.parse.greengenes``.

    The ``mock_data`` and ``real_data`` fixtures are module-level strings
    defined in this file; they are only looked up when a test runs.
    """

    def test_MinimalGreengenesParser_mock(self):
        """Test MinimalGreengenesParser against mock data"""
        parser = MinimalGreengenesParser(
            mock_data.splitlines(), RecStart="my_starting", RecEnd="my_ending"
        )
        expected = [
            {"a": "1", "b": "2", "c": "3", "d": "", "e": "5"},
            {"q": "asdasd", "c": "taco"},
        ]
        self.assertEqual(list(parser), expected)

    def test_MinimalGreengenesParser_real(self):
        """Test MinimalGreengenesParser against real data"""
        records = list(MinimalGreengenesParser(real_data.splitlines()))
        # Check the count explicitly so a wrong number of records is reported
        # as an assertion failure rather than an unpacking ValueError.
        self.assertEqual(len(records), 2)
        first, second = records

        self.assertEqual(first["G2_chip_tax_string"], "Unclassified")
        self.assertEqual(
            first["authors"],
            "Hernanandez-Eugenio,G., Silva-Rojas,H.V., Zelaya-Molina,L.X.",
        )
        # Keys with nothing after the "=" are expected to map to "".
        self.assertEqual(first["bel3_div_ratio"], "")
        self.assertEqual(len(first), 72)

        self.assertEqual(second["ncbi_acc_w_ver"], "FJ832719.1")
        self.assertEqual(second["timestamp"], "2010-03-23 14:08:27")
        self.assertEqual(
            second["title"],
            "Developmental Microbial Ecology of the Crop of the Folivorous Hoatzin",
        )

    def test_SpecificGreengenesParser_real(self):
        """Test SpecificGreengenesParser against real data"""
        fields = ["prokMSA_id", "journal"]

        # Without an id filter, one tuple per record is expected, with values
        # in the same order as ``fields``.
        observed = list(SpecificGreengenesParser(real_data.splitlines(), fields))
        expected = [("604868", ""), ("604867", "ISME J (2010) In press")]
        self.assertEqual(observed, expected)

        # With an id filter, only the matching record is expected; the second
        # id does not occur in the fixture.
        ids = ["604867", "12312312323"]
        observed = list(
            SpecificGreengenesParser(real_data.splitlines(), fields, ids)
        )
        expected = [("604867", "ISME J (2010) In press")]
        self.assertEqual(observed, expected)

    def test_make_ignore_f(self):
        """Properly ignore empty records and the start line"""
        ignore = make_ignore_f("testing")

        # Lines that must NOT be ignored.
        kept = (
            ["asasdasd", ""],
            ["test", ""],
            ["testing2", ""],
            ["testing", "asd"],
        )
        # Lines that must be ignored: empty pair, missing line, bare start line.
        ignored = (
            ["", ""],
            None,
            ["testing", ""],
        )

        # subTest reports every failing input instead of stopping at the first.
        for line in kept:
            with self.subTest(line=line):
                self.assertFalse(ignore(line))
        for line in ignored:
            with self.subTest(line=line):
                self.assertTrue(ignore(line))
# Mock input for test_MinimalGreengenesParser_mock: two records of key=value
# lines, each wrapped in the custom "my_starting"/"my_ending" delimiters that
# the test passes as RecStart/RecEnd. The "d=" line has an empty value.
mock_data = """my_starting
a=1
b=2
c=3
d=
e=5
my_ending
my_starting
q=asdasd
c=taco
my_ending
"""
# Input for the *_real tests: two records delimited by BEGIN/END lines, each
# holding 72 key=value lines (many with an empty value). The records carry
# prokMSA_id 604868 and 604867 respectively.
# NOTE(review): presumably an excerpt of a Greengenes records dump -- confirm.
real_data = """BEGIN
G2_chip_tax_string=Unclassified
G2_chip_tax_string_format_2=Unclassified
HOMD_tax_string=
HOMD_tax_string_format_2=
Hugenholtz_tax_string=Unclassified
Hugenholtz_tax_string_format_2=Unclassified
Ludwig_tax_string=Unclassified
Ludwig_tax_string_format_2=Unclassified
Pace_tax_string=Unclassified
Pace_tax_string_format_2=Unclassified
RDP_tax_string=Unclassified
RDP_tax_string_format_2=Unclassified
Silva_tax_string=Unclassified
Silva_tax_string_format_2=Unclassified
authors=Hernanandez-Eugenio,G., Silva-Rojas,H.V., Zelaya-Molina,L.X.
bel3_div_ratio=
bellerophon=
blast_perc_ident_to_template=
clone=51a
contact_info=Irrigacion, Universidad Autonoma Chapingo, Carretera Mexico-Texcoco Km 37.5, Texcoco, Mexico 56230, Mexico
core_set_member=
core_set_member2=
country=Mexico: Mexico City
create_date=21-NOV-2009
db_name=
decision=clone
description=Uncultured bacterium clone 51a 16S ribosomal RNA gene, partial sequence
email=
gold_id=
img_oid=
isolate=
isolation_source=mesophilic anaerobic reactor fed with effluent from the chemical industry
journal=
longest_insertion=
medline_ids=
ncbi_acc=
ncbi_acc_w_ver=FJ461956.1
ncbi_gi=213390944
ncbi_seq_length=1512
ncbi_tax_id=77133
ncbi_tax_string=Bacteria; environmental samples
ncbi_tax_string_format_2=Unclassified
non_ACGT_count=
non_ACGT_percent=
note=
organism=uncultured bacterium
perc_ident_to_invariant_core=
prokMSA_id=604868
prokMSAname=Microbial ecology industrial digestor mesophilic anaerobic reactor fed effluent chemical industry clone 51a
pubmed_ids=
remark=
replaced_by=
single_nt_runs_over_7=
small_gap_intrusions=
source=uncultured bacterium
span_aligned=1..2
specific_host=
status=0
strain=
study_id=38002
sub_species=
submit_date=24-OCT-2008
template=
timestamp=2010-03-23 14:08:27
title=Microbial ecology of industrial anaerobic digestor
unaligned_length=
update_date=21-NOV-2009
warning=
wigeon95=
wigeon99=
wigeon_std_dev=
aligned_seq=unaligned
END
BEGIN
G2_chip_tax_string=Unclassified
G2_chip_tax_string_format_2=Unclassified
HOMD_tax_string=
HOMD_tax_string_format_2=
Hugenholtz_tax_string=Unclassified
Hugenholtz_tax_string_format_2=Unclassified
Ludwig_tax_string=Unclassified
Ludwig_tax_string_format_2=Unclassified
Pace_tax_string=Unclassified
Pace_tax_string_format_2=Unclassified
RDP_tax_string=Unclassified
RDP_tax_string_format_2=Unclassified
Silva_tax_string=Unclassified
Silva_tax_string_format_2=Unclassified
authors=Brodie,E.L., Dominguez-Bello,M.G., Garcia-Amado,M.A., Godoy-Vitorino,F., Goldfarb,K.C., Michelangeli,F.
bel3_div_ratio=
bellerophon=
blast_perc_ident_to_template=
clone=J3Q101_11C02
contact_info=Biology, University of Puerto Rico, Rio Piedras Campus, PO Box 23360, San Juan, PR 00931-3360, USA
core_set_member=
core_set_member2=
country=Venezuela
create_date=10-DEC-2009
db_name=
decision=clone
description=Uncultured bacterium clone J3Q101_11C02 16S ribosomal RNA gene, partial sequence
email=
gold_id=
img_oid=
isolate=
isolation_source=crop contents
journal=ISME J (2010) In press
longest_insertion=
medline_ids=
ncbi_acc=
ncbi_acc_w_ver=FJ832719.1
ncbi_gi=226447371
ncbi_seq_length=1326
ncbi_tax_id=77133
ncbi_tax_string=Bacteria; environmental samples
ncbi_tax_string_format_2=Unclassified
non_ACGT_count=
non_ACGT_percent=
note=
organism=uncultured bacterium
perc_ident_to_invariant_core=
prokMSA_id=604867
prokMSAname=Microbial Ecology Crop Folivorous Hoatzin crop contents clone J3Q101_11C02
pubmed_ids=
remark=
replaced_by=
single_nt_runs_over_7=
small_gap_intrusions=
source=uncultured bacterium
span_aligned=1..2
specific_host=
status=0
strain=
study_id=37901
sub_species=
submit_date=16-MAR-2009
template=
timestamp=2010-03-23 14:08:27
title=Developmental Microbial Ecology of the Crop of the Folivorous Hoatzin
unaligned_length=
update_date=10-DEC-2009
warning=
wigeon95=
wigeon99=
wigeon_std_dev=
aligned_seq=unaligned
END
"""
|