1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242
|
#!/usr/bin/env python
# Copyright 2001-2004 by Brad Chapman. All rights reserved.
# Revisions copyright 2007-2016 by Peter Cock. All rights reserved.
# Revisions copyright 2013 by Kai Blin. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Test the GenBank parser and make sure everything is working smoothly.
"""
# standard library
from __future__ import print_function
import os
from Bio._py3k import StringIO
import warnings
from Bio import BiopythonParserWarning
# GenBank stuff to test
from Bio import GenBank
from Bio.GenBank import utils
from Bio.Alphabet import _get_base_alphabet, ProteinAlphabet
gb_file_dir = os.path.join(os.getcwd(), 'GenBank')
test_files = ['noref.gb', 'cor6_6.gb', 'iro.gb', 'pri1.gb', 'arab1.gb',
'protein_refseq.gb', 'extra_keywords.gb', 'one_of.gb',
'NT_019265.gb', 'origin_line.gb', 'blank_seq.gb',
'dbsource_wrap.gb', 'gbvrl1_start.seq', 'NC_005816.gb',
'no_end_marker.gb', 'wrong_sequence_indent.gb',
'invalid_locus_line_spacing.gb', 'empty_feature_qualifier.gb',
'invalid_misc_feature.gb', '1MRR_A.gp']
# We only test writing on a subset of the examples:
write_format_files = ['noref.gb', 'cor6_6.gb', 'iro.gb', 'pri1.gb', 'arab1.gb',
'extra_keywords.gb', 'one_of.gb', 'origin_line.gb']
# don't test writing on protein_refseq, since it is horribly nasty
# don't test writing on the CONTIG refseq, because the wrapping of
# locations won't work exactly
# don't test writing on blank_seq because it lacks a sequence type
# don't test dbsource_wrap because it is a junky RefSeq file
files_to_parse = []
for file in test_files:
files_to_parse.append(os.path.join(gb_file_dir, file))
# parse the bioperl test files
# comment this out for now -- there are a bunch of junky records in here
# that no longer exist in GenBank -- do we really need to support those?
# files_to_parse = [os.path.join(os.getcwd(), 'GenBank', 'bioperl_test.gb')]
# parse the biojava test files
# files_to_parse += [os.path.join(os.getcwd(), 'GenBank', 'biojava_test.gb')]
# test the parsers
feature_parser = GenBank.FeatureParser(debug_level=0)
record_parser = GenBank.RecordParser(debug_level=0)
all_parsers = [feature_parser, record_parser]
print("Testing parsers...")
for parser in all_parsers:
for filename in files_to_parse:
if not os.path.isfile(filename):
print("Missing test input file: %s" % filename)
continue
handle = open(filename, 'r')
iterator = GenBank.Iterator(handle, parser)
while True:
with warnings.catch_warnings():
warnings.simplefilter("ignore", BiopythonParserWarning)
# e.g. BiopythonParserWarning: Premature end of file in sequence data
cur_record = next(iterator)
if cur_record is None:
break
if isinstance(parser, GenBank.FeatureParser):
print("***Record from %s with the FeatureParser"
% filename.split(os.path.sep)[-1])
print("Seq: %r" % cur_record.seq)
print("Id: %s" % cur_record.id)
print("Name: %s" % cur_record.name)
print("Description %s" % cur_record.description)
print("Annotations***")
ann_keys = sorted(cur_record.annotations)
for ann_key in ann_keys:
if ann_key != 'references':
print("Key: %s" % ann_key)
print("Value: %s" %
cur_record.annotations[ann_key])
else:
print("References*")
for reference in cur_record.annotations[ann_key]:
print(str(reference))
print("Features")
for feature in cur_record.features:
print(feature)
if isinstance(_get_base_alphabet(cur_record.seq.alphabet),
ProteinAlphabet):
assert feature.strand is None
else:
# Assuming no mixed strand examples...
assert feature.strand is not None
print("DB cross refs %s" % cur_record.dbxrefs)
elif isinstance(parser, GenBank.RecordParser):
print("***Record from %s with the RecordParser"
% filename.split(os.path.sep)[-1])
print("sequence length: %i" % len(cur_record.sequence))
print("locus: %s" % cur_record.locus)
print("definition: %s" % cur_record.definition)
print("accession: %s" % cur_record.accession)
for reference in cur_record.references:
print("reference title: %s" % reference.title)
for feature in cur_record.features:
print("feature key: %s" % feature.key)
print("location: %s" % feature.location)
print("num qualifiers: %i" % len(feature.qualifiers))
for qualifier in feature.qualifiers:
print("key: %s value: %s" % (qualifier.key, qualifier.value))
handle.close()
# test writing GenBank format
print("Testing writing GenBank format...")
def do_comparison(good_record, test_record):
"""Compare two records to see if they are the same.
Ths compares the two GenBank record, and will raise an AssertionError
if two lines do not match, showing the non-matching lines.
"""
good_handle = StringIO(good_record)
test_handle = StringIO(test_record)
while True:
good_line = good_handle.readline()
test_line = test_handle.readline()
if not(good_line) and not(test_line):
break
if not(good_line):
raise AssertionError("Extra info in Test: %r" % test_line)
if not(test_line):
raise AssertionError("Extra info in Expected: %r" % good_line)
test_normalized = ' '.join(x for x in test_line.split() if x)
good_normalized = ' '.join(x for x in good_line.split() if x)
assert test_normalized == good_normalized, \
"Expected does not match Test.\nExpect: %r\nTest: %r\n" % \
(good_line, test_line)
def t_write_format():
record_parser = GenBank.RecordParser(debug_level=0)
for file in write_format_files:
print("Testing GenBank writing for %s..." % os.path.basename(file))
cur_handle = open(os.path.join("GenBank", file), "r")
compare_handle = open(os.path.join("GenBank", file), "r")
iterator = GenBank.Iterator(cur_handle, record_parser)
compare_iterator = GenBank.Iterator(compare_handle)
while True:
cur_record = next(iterator)
compare_record = next(compare_iterator)
if cur_record is None or compare_record is None:
break
print("\tTesting for %s" % cur_record.version)
output_record = str(cur_record) + "\n"
do_comparison(compare_record, output_record)
cur_handle.close()
compare_handle.close()
t_write_format()
def t_cleaning_features():
"""Test the ability to clean up feature values.
"""
parser = GenBank.FeatureParser(feature_cleaner=utils.FeatureValueCleaner())
handle = open(os.path.join("GenBank", "arab1.gb"))
iterator = GenBank.Iterator(handle, parser)
first_record = next(iterator)
# test for cleaning of translation
translation_feature = first_record.features[1]
test_trans = translation_feature.qualifiers["translation"][0]
assert ' ' not in test_trans, \
"Did not clean spaces out of the translation"
assert '\012' not in test_trans, \
"Did not clean newlines out of the translation"
handle.close()
print("Testing feature cleaning...")
t_cleaning_features()
def t_ensembl_locus():
line = "LOCUS HG531_PATCH 1000000 bp DNA HTG 18-JUN-2011\n"
s = GenBank.Scanner.GenBankScanner()
c = GenBank._FeatureConsumer(True)
s._feed_first_line(c, line)
assert c.data.name == "HG531_PATCH", c.data.name
assert c._expected_size == 1000000, c._expected_size
line = "LOCUS HG531_PATCH 759984 bp DNA HTG 18-JUN-2011\n"
s = GenBank.Scanner.GenBankScanner()
c = GenBank._FeatureConsumer(True)
s._feed_first_line(c, line)
assert c.data.name == "HG531_PATCH", c.data.name
assert c._expected_size == 759984, c._expected_size
line = "LOCUS HG506_HG1000_1_PATCH 814959 bp DNA HTG 18-JUN-2011\n"
s = GenBank.Scanner.GenBankScanner()
c = GenBank._FeatureConsumer(True)
s._feed_first_line(c, line)
assert c.data.name == "HG506_HG1000_1_PATCH", c.data.name
assert c._expected_size == 814959, c._expected_size
line = "LOCUS HG506_HG1000_1_PATCH 1219964 bp DNA HTG 18-JUN-2011\n"
s = GenBank.Scanner.GenBankScanner()
c = GenBank._FeatureConsumer(True)
s._feed_first_line(c, line)
assert c.data.name == "HG506_HG1000_1_PATCH", c.data.name
assert c._expected_size == 1219964, c._expected_size
print("Done")
print("Testing EnsEMBL LOCUS lines...")
t_ensembl_locus()
|