1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416
|
# Copyright 2007 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
import os
from sets import Set
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from StringIO import StringIO
from Bio.Alphabet import generic_protein, generic_rna, generic_dna
#Longer list including alignment only file formats we can read AND write:
test_write_read_alignment_formats = SeqIO._FormatToWriter.keys()[:]
# test_files is a list of tuples containing:
# - string: file format
# - boolean: alignment (requires all seqs be same length)
# - string: relative filename
# - integer: number of sequences
test_files = [ \
#Following examples are also used in test_Clustalw.py
("clustal",True, 'Clustalw/cw02.aln', 2),
("clustal",True, 'Clustalw/opuntia.aln', 7),
#Following nucleic examples are also used in test_Fasta2.py
("fasta", False, 'Nucleic/lupine.nu', 1),
("fasta", False, 'Nucleic/elderberry.nu', 1),
("fasta", False, 'Nucleic/phlox.nu', 1),
("fasta", False, 'Nucleic/centaurea.nu', 1),
("fasta", False, 'Nucleic/wisteria.nu', 1),
("fasta", False, 'Nucleic/sweetpea.nu', 1),
("fasta", False, 'Nucleic/lavender.nu', 1),
#Following protein examples are also used in test_Fasta2.py
("fasta", False, 'Amino/aster.pro', 1),
("fasta", False, 'Amino/loveliesbleeding.pro', 1),
("fasta", False, 'Amino/rose.pro', 1),
("fasta", False, 'Amino/rosemary.pro', 1),
#Following examples are also used in test_Fasta.py
("fasta", False, 'Fasta/f001', 1), #Protein
("fasta", False, 'Fasta/f002', 3), #DNA
#("fasta", False, 'Fasta/f003', 2), #Protein with comments
("fasta", False, 'Fasta/fa01', 2), #Protein with gaps
#Following examples are also used in test_GFF.py
("fasta", False, 'GFF/NC_001802.fna', 1), #upper case
("fasta", False, 'GFF/NC_001802lc.fna', 1), #lower case
("fasta", True, 'GFF/multi.fna', 3), #Trivial nucleotide alignment
#Following example is also used in test_registry.py
("fasta", False, 'Registry/seqs.fasta', 2), #contains blank line
#Following example is also used in test_Nexus.py
("nexus", True, 'Nexus/test_Nexus_input.nex', 9),
#Following examples are also used in test_SwissProt.py
("swiss", False, 'SwissProt/sp001', 1),
("swiss", False, 'SwissProt/sp002', 1),
("swiss", False, 'SwissProt/sp003', 1),
("swiss", False, 'SwissProt/sp004', 1),
("swiss", False, 'SwissProt/sp005', 1),
("swiss", False, 'SwissProt/sp006', 1),
("swiss", False, 'SwissProt/sp007', 1),
("swiss", False, 'SwissProt/sp008', 1),
("swiss", False, 'SwissProt/sp009', 1),
("swiss", False, 'SwissProt/sp010', 1),
("swiss", False, 'SwissProt/sp011', 1),
("swiss", False, 'SwissProt/sp012', 1),
("swiss", False, 'SwissProt/sp013', 1),
("swiss", False, 'SwissProt/sp014', 1),
("swiss", False, 'SwissProt/sp015', 1),
("swiss", False, 'SwissProt/sp016', 1),
#Following example is also used in test_registry.py
("swiss", False, 'Registry/EDD_RAT.dat', 1),
#Following examples are also used in test_GenBank.py
("genbank",False, 'GenBank/noref.gb', 1),
("genbank",False, 'GenBank/cor6_6.gb', 6),
("genbank",False, 'GenBank/iro.gb', 1),
("genbank",False, 'GenBank/pri1.gb', 1),
("genbank",False, 'GenBank/arab1.gb', 1),
("genbank",False, 'GenBank/protein_refseq.gb', 1), #Old version
("genbank",False, 'GenBank/protein_refseq2.gb', 1), #Revised version
("genbank",False, 'GenBank/extra_keywords.gb', 1),
("genbank",False, 'GenBank/one_of.gb', 1),
("genbank",False, 'GenBank/NT_019265.gb', 1),
("genbank",False, 'GenBank/origin_line.gb', 1),
("genbank",False, 'GenBank/blank_seq.gb', 1),
("genbank",False, 'GenBank/dbsource_wrap.gb', 1),
# The next example is a truncated copy of gbvrl1.seq from
# ftp://ftp.ncbi.nih.gov/genbank/gbvrl1.seq.gz
# This includes an NCBI header, and the first three records:
("genbank",False, 'GenBank/gbvrl1_start.seq', 3),
#Following files are also used in test_GFF.py
("genbank",False, 'GFF/NC_001422.gbk', 1),
#Following files are currently only used here:
("embl", False, 'EMBL/TRBG361.embl', 1),
("embl", False, 'EMBL/DD231055_edited.embl', 1),
("embl", False, 'EMBL/SC10H5.embl', 1), # Pre 2006 style ID line
("embl", False, 'EMBL/U87107.embl', 1), # Old ID line with SV line
("stockholm", True, 'Stockholm/simple.sth', 2),
("stockholm", True, 'Stockholm/funny.sth', 5),
#Following PHYLIP files are currently only used here (test_SeqIO)
#and are mostly from Joseph Felsenstein's PHYLIP v3.6 documentation:
("phylip", True, 'Phylip/reference_dna.phy', 6),
("phylip", True, 'Phylip/reference_dna2.phy', 6),
("phylip", True, 'Phylip/hennigian.phy', 10),
("phylip", True, 'Phylip/horses.phy', 10),
("phylip", True, 'Phylip/random.phy', 10),
("phylip", True, 'Phylip/interlaced.phy', 3),
("phylip", True, 'Phylip/interlaced2.phy', 4),
]
# This is a list of two-tuples. Each tuple contains a
# list of SeqRecord objects and a description (string)
test_records = [
([], "zero records"),
([SeqRecord(Seq("CHSMAIKLSSEHNIPSGIANAL",generic_protein), id="Alpha"),
SeqRecord(Seq("HNGFTALEGEIHHLTHGEKVAF",generic_protein), id="Gamma"),
SeqRecord(Seq("DITHGVG",generic_protein), id="delta")],
"three peptides of different lengths"),
([SeqRecord(Seq("CHSMAIKLSSEHNIPSGIANAL",generic_protein), id="Alpha"),
SeqRecord(Seq("VHGMAHPLGAFYNTPHGVANAI",generic_protein), id="Beta"),
SeqRecord(Seq("HNGFTALEGEIHHLTHGEKVAF",generic_protein), id="Gamma")],
"three proteins alignment"),
([SeqRecord(Seq("AATAAACCTTGCTGGCCATTGTGATCCATCCA",generic_dna), id="X"),
SeqRecord(Seq("ACTCAACCTTGCTGGTCATTGTGACCCCAGCA",generic_dna), id="Y"),
SeqRecord(Seq("TTTCCTCGGAGGCCAATCTGGATCAAGACCAT",generic_dna), id="Z")],
"three DNA sequence alignment"),
([SeqRecord(Seq("AATAAACCTTGCTGGCCATTGTGATCCATCCA",generic_dna), id="X",
name="The\nMystery\rSequece:%sX" % os.linesep),
SeqRecord(Seq("ACTCAACCTTGCTGGTCATTGTGACCCCAGCA",generic_dna), id="Y",
description="an%sevil\rdescription right\nhere" % os.linesep),
SeqRecord(Seq("TTTCCTCGGAGGCCAATCTGGATCAAGACCAT",generic_dna), id="Z")],
"3 DNA seq alignment with CR/LF in name/descr"),
([SeqRecord(Seq("CHSMAIKLSSEHNIPSGIANAL",generic_protein), id="Alpha"),
SeqRecord(Seq("VHGMAHPLGAFYNTPHGVANAI",generic_protein), id="Beta"),
SeqRecord(Seq("VHGMAHPLGAFYNTPHGVANAI",generic_protein), id="Beta"),
SeqRecord(Seq("HNGFTALEGEIHHLTHGEKVAF",generic_protein), id="Gamma")],
"alignment with repeated record"),
]
# Meddle with the annotation too:
assert test_records[4][1] == "3 DNA seq alignment with CR/LF in name/descr"
# Add a list of strings,
test_records[4][0][2].annotations["note"] = ["Note%salso" % os.linesep \
+ "\r\nhas\n evil line\rbreaks!", "Wow"]
# Add a simple string
test_records[4][0][2].annotations["comment"] = "More%sof" % os.linesep \
+ "\r\nthese\n evil line\rbreaks!"
# Add a float too:
test_records[4][0][2].annotations["weight"] = 2.5
def records_match(record_one, record_two) :
"""This is meant to be a strict comparison for exact agreement"""
if record_one.id <> record_two.id :
return False
if record_one.name <> record_two.name :
return False
if record_one.description <> record_two.description :
return False
if record_one.seq.tostring() <> record_two.seq.tostring() :
return False
#Close enough... should I check for features, annotation etc?
return True
def record_summary(record, indent=" ") :
"""Returns a concise summary of a SeqRecord object as a string"""
if record.id == record.name :
answer = "%sID and Name='%s',\n%sSeq='" % (indent, record.id, indent)
else :
answer = "%sID = '%s', Name='%s',\n%sSeq='" % (indent, record.id, record.name, indent)
if len(record.seq) > 50 :
answer += record.seq[:40].tostring() + "..." + record.seq[-7:].tostring()
else :
answer += record.seq.tostring()
answer += "', length=%i" % (len(record.seq))
return answer
def col_summary(col_text) :
if len(col_text) < 65 :
return col_text
else :
return col_text[:60] + "..." + col_text[-5:]
def alignment_summary(alignment, index=" ") :
"""Returns a concise summary of an Alignment object as a string"""
answer = []
alignment_len = alignment.get_alignment_length()
rec_count = len(alignment.get_all_seqs())
for i in range(min(5,alignment_len)) :
answer.append(index + col_summary(alignment.get_column(i)) \
+ " alignment column %i" % i)
if alignment_len > 5 :
i = alignment_len - 1
answer.append(index + col_summary("|" * rec_count) \
+ " ...")
answer.append(index + col_summary(alignment.get_column(i)) \
+ " alignment column %i" % i)
return "\n".join(answer)
def check_simple_write_read(records, indent=" ") :
#print indent+"Checking we can write and then read back these records"
for format in SeqIO._FormatToWriter.keys() :
print indent+"Checking can write/read as '%s' format" % format
#Going to write to a handle...
handle = StringIO()
try :
SeqIO.write(sequences=records, handle=handle, format=format)
except ValueError, e :
#This is often expected to happen, for example when we try and
#write sequences of different lengths to an alignment file.
print indent+"Failed: %s" % str(e)
#Carry on to the next format:
continue
handle.flush()
handle.seek(0)
#Now ready to read back from the handle...
try :
records2 = list(SeqIO.parse(handle=handle, format=format))
except ValueError, e :
#This is BAD. We can't read our own output.
#I want to see the output when called from the test harness,
#run_tests.py (which can be funny about new lines on Windows)
handle.seek(0)
raise ValueError("%s\n\n%s\n\n%s" \
% (str(e), repr(handle.read()), repr(records)))
assert len(records2) == t_count
for i in range(t_count) :
#Check the bare minimum (ID and sequence) as
#many formats can't store more than that.
#
#Note some formats allow spaces, others don't.
#Assume spaces are turned into underscores.
records[i].id.replace(" ","_") == records2[i].id.replace(" ","_")
records[i].seq.data == records2[i].seq.data
for (t_format, t_alignment, t_filename, t_count) in test_files :
print "Testing reading %s format file %s" % (t_format, t_filename)
assert os.path.isfile(t_filename), t_filename
#Try as an iterator using handle
records = list(SeqIO.parse(handle=open(t_filename,"r"), format=t_format))
assert len(records) == t_count
#Try using the iterator with a for loop
records2 = []
for record in SeqIO.parse(handle=open(t_filename,"r"), format=t_format) :
records2.append(record)
assert len(records2) == t_count
#Try using the iterator with the next() method
records3 = []
seq_iterator = SeqIO.parse(handle=open(t_filename,"r"), format=t_format)
while True :
try :
record = seq_iterator.next()
except StopIteration :
record = None
if record :
records3.append(record)
else :
break
#Try a mixture of next() and list (a torture test!)
seq_iterator = SeqIO.parse(handle=open(t_filename,"r"), format=t_format)
try :
record = seq_iterator.next()
except StopIteration :
record = None
if record is not None :
records4 = [record]
records4.extend(list(seq_iterator))
else :
records4 = []
assert len(records4) == t_count
#Try a mixture of next() and for loop (a torture test!)
seq_iterator = SeqIO.parse(handle=open(t_filename,"r"), format=t_format)
try :
record = seq_iterator.next()
except StopIteration :
record = None
if record is not None :
records5 = [record]
for record in seq_iterator :
records5.append(record)
else :
records5 = []
assert len(records5) == t_count
for i in range(t_count) :
record = records[i]
#Check returned expected object type
assert isinstance(record, SeqRecord)
assert isinstance(record.seq, Seq)
assert isinstance(record.id, basestring)
assert isinstance(record.name, basestring)
assert isinstance(record.description, basestring)
assert record.id <> ""
if "accessions" in record.annotations :
accs = record.annotations["accessions"]
#Check for blanks, or entries with leading/trailing spaces
for acc in accs :
assert acc and acc == acc.strip(), \
"Bad accession in annotations: %s" % repr(acc)
assert len(Set(accs)) == len(accs), \
"Repeated accession in annotations: %s" % repr(accs)
for ref in record.dbxrefs :
assert ref and ref == ref.strip(), \
"Bad cross reference in dbxrefs: %s" % repr(ref)
assert len(record.dbxrefs) == len(record.dbxrefs), \
"Repeated cross reference in dbxrefs: %s" % repr(record.dbxrefs)
#Check the lists obtained by the different methods agree
assert records_match(record, records2[i])
assert records_match(record, records3[i])
assert records_match(record, records4[i])
assert records_match(record, records5[i])
if i < 3 :
print record_summary(record)
# Only printed the only first three records: 0,1,2
if t_count > 4 :
print " ..."
if t_count > 3 :
print record_summary(records[-1])
# Check Bio.SeqIO.read(...)
if t_count == 1 :
record = SeqIO.read(handle=open(t_filename), format=t_format)
assert isinstance(record, SeqRecord)
else :
try :
record = SeqIO.read(open(t_filename), t_format)
assert False, "Bio.SeqIO.read(...) should have failed"
except ValueError :
#Expected to fail
pass
if t_alignment :
print "Testing reading %s format file %s as an alignment" \
% (t_format, t_filename)
#Using SeqIO.to_alignment(SeqIO.parse(...))
alignment = SeqIO.to_alignment(SeqIO.parse( \
handle=open(t_filename,"r"), format=t_format))
assert len(alignment.get_all_seqs()) == t_count
alignment_len = alignment.get_alignment_length()
#Check the record order agrees, and double check the
#sequence lengths all agree too.
for i in range(t_count) :
assert records_match(records[i], alignment.get_all_seqs()[i])
assert len(records[i].seq) == alignment_len
print alignment_summary(alignment)
#Some alignment file formats have magic characters which mean
#use the letter in this position in the first sequence.
#They should all have been converted by the parser, but if
#not reversing the record order might expose an error. Maybe.
records.reverse()
check_simple_write_read(records)
print "Finished tested reading files"
print
print "Starting testing writing records"
print "(Note that some of these are expected to 'fail' and say why)"
print
for (records, descr) in test_records :
print "Testing can write/read %s" % descr
for format in SeqIO._FormatToWriter.keys() :
print " Checking can write/read as '%s' format" % format
#################
# Write records #
#################
handle = StringIO()
try :
SeqIO.write(records, handle, format)
except ValueError, e:
#This is expected to happen on many of the examples.
print " Failed: %s" % str(e)
continue #goto next test
#################
# Read records #
#################
handle.seek(0)
try :
new_records = list(SeqIO.parse(handle, format))
except ValueError, e :
#THIS INDICATES A SIGNIFICANT PROBLEM,
#as we can't read the file we just wrote!
print " FAILED: %s" % str(e)
continue #goto next test
#################
# Check records #
#################
assert len(new_records) == len(records)
for record, new_record in zip(records, new_records) :
assert record.id == new_record.id
assert record.seq.tostring() == new_record.seq.tostring()
#Using records_match(record, new_record) is too strict
#Close now, after checking, so that it can be used at the console for debugging
handle.close()
print "Finished tested writing files"
|