File: test_SeqIO.py

package info (click to toggle)
python-biopython 1.45-3
  • links: PTS, VCS
  • area: main
  • in suites: lenny
  • size: 18,192 kB
  • ctags: 12,310
  • sloc: python: 83,505; xml: 13,834; ansic: 7,015; cpp: 1,855; sql: 1,144; makefile: 179
file content (416 lines) | stat: -rw-r--r-- 17,512 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
# Copyright 2007 by Peter Cock.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

import os
from sets import Set
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from StringIO import StringIO
from Bio.Alphabet import generic_protein, generic_rna, generic_dna

#Longer list including alignment only file formats we can read AND write:
test_write_read_alignment_formats = SeqIO._FormatToWriter.keys()[:]

# test_files is a list of tuples containing:
# - string:  file format
# - boolean: alignment (requires all seqs be same length)
# - string:  relative filename
# - integer: number of sequences

test_files = [ \
#Following examples are also used in test_Clustalw.py
    ("clustal",True,  'Clustalw/cw02.aln', 2),
    ("clustal",True,  'Clustalw/opuntia.aln', 7),
#Following nucleic examples are also used in test_Fasta2.py
    ("fasta",  False, 'Nucleic/lupine.nu', 1),
    ("fasta",  False, 'Nucleic/elderberry.nu', 1),
    ("fasta",  False, 'Nucleic/phlox.nu', 1),
    ("fasta",  False, 'Nucleic/centaurea.nu', 1),
    ("fasta",  False, 'Nucleic/wisteria.nu', 1),
    ("fasta",  False, 'Nucleic/sweetpea.nu', 1),
    ("fasta",  False, 'Nucleic/lavender.nu', 1),
#Following protein examples are also used in test_Fasta2.py
    ("fasta",  False, 'Amino/aster.pro', 1),
    ("fasta",  False, 'Amino/loveliesbleeding.pro', 1),
    ("fasta",  False, 'Amino/rose.pro', 1),
    ("fasta",  False, 'Amino/rosemary.pro', 1),
#Following examples are also used in test_Fasta.py
    ("fasta",  False, 'Fasta/f001', 1), #Protein
    ("fasta",  False, 'Fasta/f002', 3), #DNA
    #("fasta", False, 'Fasta/f003', 2), #Protein with comments
    ("fasta",  False, 'Fasta/fa01', 2), #Protein with gaps
#Following examples are also used in test_GFF.py
    ("fasta",  False, 'GFF/NC_001802.fna', 1), #upper case
    ("fasta",  False, 'GFF/NC_001802lc.fna', 1), #lower case
    ("fasta",  True,  'GFF/multi.fna', 3), #Trivial nucleotide alignment
#Following example is also used in test_registry.py
    ("fasta",  False, 'Registry/seqs.fasta', 2), #contains blank line
#Following example is also used in test_Nexus.py
    ("nexus",  True,  'Nexus/test_Nexus_input.nex', 9),
#Following examples are also used in test_SwissProt.py
    ("swiss",  False, 'SwissProt/sp001', 1),
    ("swiss",  False, 'SwissProt/sp002', 1),
    ("swiss",  False, 'SwissProt/sp003', 1),
    ("swiss",  False, 'SwissProt/sp004', 1),
    ("swiss",  False, 'SwissProt/sp005', 1),
    ("swiss",  False, 'SwissProt/sp006', 1),
    ("swiss",  False, 'SwissProt/sp007', 1),
    ("swiss",  False, 'SwissProt/sp008', 1),
    ("swiss",  False, 'SwissProt/sp009', 1),
    ("swiss",  False, 'SwissProt/sp010', 1),
    ("swiss",  False, 'SwissProt/sp011', 1),
    ("swiss",  False, 'SwissProt/sp012', 1),
    ("swiss",  False, 'SwissProt/sp013', 1),
    ("swiss",  False, 'SwissProt/sp014', 1),
    ("swiss",  False, 'SwissProt/sp015', 1),
    ("swiss",  False, 'SwissProt/sp016', 1),
#Following example is also used in test_registry.py
    ("swiss",  False, 'Registry/EDD_RAT.dat', 1),
#Following examples are also used in test_GenBank.py
    ("genbank",False, 'GenBank/noref.gb', 1),
    ("genbank",False, 'GenBank/cor6_6.gb', 6),
    ("genbank",False, 'GenBank/iro.gb', 1),
    ("genbank",False, 'GenBank/pri1.gb', 1),
    ("genbank",False, 'GenBank/arab1.gb', 1),
    ("genbank",False, 'GenBank/protein_refseq.gb', 1), #Old version
    ("genbank",False, 'GenBank/protein_refseq2.gb', 1), #Revised version
    ("genbank",False, 'GenBank/extra_keywords.gb', 1),
    ("genbank",False, 'GenBank/one_of.gb', 1),
    ("genbank",False, 'GenBank/NT_019265.gb', 1),
    ("genbank",False, 'GenBank/origin_line.gb', 1),
    ("genbank",False, 'GenBank/blank_seq.gb', 1),
    ("genbank",False, 'GenBank/dbsource_wrap.gb', 1),
# The next example is a truncated copy of gbvrl1.seq from
# ftp://ftp.ncbi.nih.gov/genbank/gbvrl1.seq.gz
# This includes an NCBI header, and the first three records:
    ("genbank",False, 'GenBank/gbvrl1_start.seq', 3),
#Following files are also used in test_GFF.py
    ("genbank",False, 'GFF/NC_001422.gbk', 1),
#Following files are currently only used here:
    ("embl",      False, 'EMBL/TRBG361.embl', 1),
    ("embl",      False, 'EMBL/DD231055_edited.embl', 1),
    ("embl",      False, 'EMBL/SC10H5.embl', 1), # Pre 2006 style ID line
    ("embl",      False, 'EMBL/U87107.embl', 1), # Old ID line with SV line
    ("stockholm", True,  'Stockholm/simple.sth', 2),
    ("stockholm", True,  'Stockholm/funny.sth', 5),
#Following PHYLIP files are currently only used here (test_SeqIO)
#and are mostly from Joseph Felsenstein's PHYLIP v3.6 documentation:
    ("phylip",    True,  'Phylip/reference_dna.phy', 6),
    ("phylip",    True,  'Phylip/reference_dna2.phy', 6),
    ("phylip",    True,  'Phylip/hennigian.phy', 10),
    ("phylip",    True,  'Phylip/horses.phy', 10),
    ("phylip",    True,  'Phylip/random.phy', 10),
    ("phylip",    True,  'Phylip/interlaced.phy', 3),
    ("phylip",    True,  'Phylip/interlaced2.phy', 4),
    ]

# This is a list of two-tuples.  Each tuple contains a
# list of SeqRecord objects and a description (string)
test_records = [
    ([], "zero records"),
    ([SeqRecord(Seq("CHSMAIKLSSEHNIPSGIANAL",generic_protein), id="Alpha"),
      SeqRecord(Seq("HNGFTALEGEIHHLTHGEKVAF",generic_protein), id="Gamma"),
      SeqRecord(Seq("DITHGVG",generic_protein), id="delta")],
     "three peptides of different lengths"),
    ([SeqRecord(Seq("CHSMAIKLSSEHNIPSGIANAL",generic_protein), id="Alpha"),
      SeqRecord(Seq("VHGMAHPLGAFYNTPHGVANAI",generic_protein), id="Beta"),
      SeqRecord(Seq("HNGFTALEGEIHHLTHGEKVAF",generic_protein), id="Gamma")],
     "three proteins alignment"),
    ([SeqRecord(Seq("AATAAACCTTGCTGGCCATTGTGATCCATCCA",generic_dna), id="X"),
      SeqRecord(Seq("ACTCAACCTTGCTGGTCATTGTGACCCCAGCA",generic_dna), id="Y"),
      SeqRecord(Seq("TTTCCTCGGAGGCCAATCTGGATCAAGACCAT",generic_dna), id="Z")],
     "three DNA sequence alignment"),
    ([SeqRecord(Seq("AATAAACCTTGCTGGCCATTGTGATCCATCCA",generic_dna), id="X",
                name="The\nMystery\rSequece:%sX" % os.linesep),
      SeqRecord(Seq("ACTCAACCTTGCTGGTCATTGTGACCCCAGCA",generic_dna), id="Y",
                description="an%sevil\rdescription right\nhere" % os.linesep),
      SeqRecord(Seq("TTTCCTCGGAGGCCAATCTGGATCAAGACCAT",generic_dna), id="Z")],
     "3 DNA seq alignment with CR/LF in name/descr"),
    ([SeqRecord(Seq("CHSMAIKLSSEHNIPSGIANAL",generic_protein), id="Alpha"),
      SeqRecord(Seq("VHGMAHPLGAFYNTPHGVANAI",generic_protein), id="Beta"),
      SeqRecord(Seq("VHGMAHPLGAFYNTPHGVANAI",generic_protein), id="Beta"),
      SeqRecord(Seq("HNGFTALEGEIHHLTHGEKVAF",generic_protein), id="Gamma")],
     "alignment with repeated record"),
    ]
# Meddle with the annotation too:
assert test_records[4][1] == "3 DNA seq alignment with CR/LF in name/descr"
# Add a list of strings,
test_records[4][0][2].annotations["note"] = ["Note%salso" % os.linesep \
                                    + "\r\nhas\n evil line\rbreaks!", "Wow"]
# Add a simple string
test_records[4][0][2].annotations["comment"] = "More%sof" % os.linesep \
                                          + "\r\nthese\n evil line\rbreaks!"
# Add a float too:
test_records[4][0][2].annotations["weight"] = 2.5

def records_match(record_one, record_two) :
    """This is meant to be a strict comparison for exact agreement"""
    if record_one.id <> record_two.id :
        return False
    if record_one.name <> record_two.name :
        return False
    if record_one.description <> record_two.description :
        return False
    if record_one.seq.tostring() <> record_two.seq.tostring() :
        return False
    #Close enough... should I check for features, annotation etc?
    return True

def record_summary(record, indent=" ") :
    """Returns a concise summary of a SeqRecord object as a string"""
    if record.id == record.name :
        answer = "%sID and Name='%s',\n%sSeq='" % (indent, record.id, indent)
    else :
        answer = "%sID = '%s', Name='%s',\n%sSeq='" % (indent, record.id, record.name, indent)
    if len(record.seq) > 50 :
        answer += record.seq[:40].tostring() + "..." + record.seq[-7:].tostring()
    else :
        answer += record.seq.tostring()
    answer += "', length=%i" % (len(record.seq))
    return answer

def col_summary(col_text) :
    if len(col_text) < 65 :
        return col_text
    else :
        return col_text[:60] + "..." + col_text[-5:]

def alignment_summary(alignment, index=" ") :
    """Returns a concise summary of an Alignment object as a string"""
    answer = []
    alignment_len = alignment.get_alignment_length()
    rec_count = len(alignment.get_all_seqs())
    for i in range(min(5,alignment_len)) :
        answer.append(index + col_summary(alignment.get_column(i)) \
                            + " alignment column %i" % i)
    if alignment_len > 5 :
        i = alignment_len - 1
        answer.append(index + col_summary("|" * rec_count) \
                            + " ...")
        answer.append(index + col_summary(alignment.get_column(i)) \
                            + " alignment column %i" % i)
    return "\n".join(answer)


def check_simple_write_read(records, indent=" ") :
    #print indent+"Checking we can write and then read back these records"
    for format in SeqIO._FormatToWriter.keys() :
        print indent+"Checking can write/read as '%s' format" % format
        
        #Going to write to a handle...
        handle = StringIO()
        
        try :
            SeqIO.write(sequences=records, handle=handle, format=format)
        except ValueError, e :
            #This is often expected to happen, for example when we try and
            #write sequences of different lengths to an alignment file.
            print indent+"Failed: %s" % str(e)
            #Carry on to the next format:
            continue

        handle.flush()
        handle.seek(0)
        #Now ready to read back from the handle...
        try :
            records2 = list(SeqIO.parse(handle=handle, format=format))
        except ValueError, e :
            #This is BAD.  We can't read our own output.
            #I want to see the output when called from the test harness,
            #run_tests.py (which can be funny about new lines on Windows)
            handle.seek(0)
            raise ValueError("%s\n\n%s\n\n%s" \
                              % (str(e), repr(handle.read()), repr(records)))

        assert len(records2) == t_count
        for i in range(t_count) :
            #Check the bare minimum (ID and sequence) as
            #many formats can't store more than that.
            #
            #Note some formats allow spaces, others don't.
            #Assume spaces are turned into underscores.
            records[i].id.replace(" ","_") == records2[i].id.replace(" ","_")
            records[i].seq.data == records2[i].seq.data

for (t_format, t_alignment, t_filename, t_count) in test_files :
    print "Testing reading %s format file %s" % (t_format, t_filename)
    assert os.path.isfile(t_filename), t_filename

    #Try as an iterator using handle
    records  = list(SeqIO.parse(handle=open(t_filename,"r"), format=t_format))
    assert len(records)  == t_count

    #Try using the iterator with a for loop
    records2 = []
    for record in SeqIO.parse(handle=open(t_filename,"r"), format=t_format) :
        records2.append(record)
    assert len(records2) == t_count

    #Try using the iterator with the next() method
    records3 = []
    seq_iterator = SeqIO.parse(handle=open(t_filename,"r"), format=t_format)
    while True :
        try :
            record = seq_iterator.next()
        except StopIteration :
            record = None
        if record :
            records3.append(record)
        else :
            break

    #Try a mixture of next() and list (a torture test!)
    seq_iterator = SeqIO.parse(handle=open(t_filename,"r"), format=t_format)
    try :
        record = seq_iterator.next()
    except StopIteration :
        record = None
    if record is not None :
        records4 = [record]
        records4.extend(list(seq_iterator))
    else :
        records4 = []
    assert len(records4) == t_count

    #Try a mixture of next() and for loop (a torture test!)
    seq_iterator = SeqIO.parse(handle=open(t_filename,"r"), format=t_format)
    try :
        record = seq_iterator.next()
    except StopIteration :
        record = None
    if record is not None :
        records5 = [record]
        for record in seq_iterator :
            records5.append(record)
    else :
        records5 = []
    assert len(records5) == t_count

    for i in range(t_count) :
        record = records[i]

        #Check returned expected object type
        assert isinstance(record, SeqRecord)
        assert isinstance(record.seq, Seq)
        assert isinstance(record.id, basestring)
        assert isinstance(record.name, basestring)
        assert isinstance(record.description, basestring)
        assert record.id <> ""

        if "accessions" in record.annotations :
            accs = record.annotations["accessions"]
            #Check for blanks, or entries with leading/trailing spaces
            for acc in accs :
                assert acc and acc == acc.strip(), \
                    "Bad accession in annotations: %s" % repr(acc)
            assert len(Set(accs)) == len(accs), \
                   "Repeated accession in annotations: %s" % repr(accs)
        for ref in record.dbxrefs :
            assert ref and ref == ref.strip(), \
                "Bad cross reference in dbxrefs: %s" % repr(ref)
        assert len(record.dbxrefs) == len(record.dbxrefs), \
               "Repeated cross reference in dbxrefs: %s" % repr(record.dbxrefs)
                
            
        #Check the lists obtained by the different methods agree
        assert records_match(record, records2[i])
        assert records_match(record, records3[i])
        assert records_match(record, records4[i])
        assert records_match(record, records5[i])

        if i < 3 :
            print record_summary(record)
    # Only printed the only first three records: 0,1,2 
    if t_count > 4 :
        print " ..."
    if t_count > 3 :
        print record_summary(records[-1])

    # Check Bio.SeqIO.read(...)
    if t_count == 1 :
        record = SeqIO.read(handle=open(t_filename), format=t_format)
        assert isinstance(record, SeqRecord)
    else :
        try :
            record = SeqIO.read(open(t_filename), t_format)
            assert False, "Bio.SeqIO.read(...) should have failed"
        except ValueError :
            #Expected to fail
            pass


    if t_alignment :
        print "Testing reading %s format file %s as an alignment" \
              % (t_format, t_filename)

        #Using SeqIO.to_alignment(SeqIO.parse(...))
        alignment = SeqIO.to_alignment(SeqIO.parse( \
                    handle=open(t_filename,"r"), format=t_format))
        assert len(alignment.get_all_seqs()) == t_count

        alignment_len = alignment.get_alignment_length()

        #Check the record order agrees, and double check the
        #sequence lengths all agree too.
        for i in range(t_count) :
            assert records_match(records[i], alignment.get_all_seqs()[i])
            assert len(records[i].seq) == alignment_len

        print alignment_summary(alignment)

    #Some alignment file formats have magic characters which mean
    #use the letter in this position in the first sequence.
    #They should all have been converted by the parser, but if
    #not reversing the record order might expose an error.  Maybe.
    records.reverse()
    check_simple_write_read(records)

print "Finished tested reading files"
print
print "Starting testing writing records"
print "(Note that some of these are expected to 'fail' and say why)"
print
for (records, descr) in test_records :
    print "Testing can write/read %s" % descr
    for format in SeqIO._FormatToWriter.keys() :
        print " Checking can write/read as '%s' format" % format

        #################
        # Write records #
        #################
        handle = StringIO()
        try :
            SeqIO.write(records, handle, format)
        except ValueError, e:
            #This is expected to happen on many of the examples.
            print " Failed: %s" % str(e)
            continue #goto next test

        #################
        # Read records  #
        #################
        handle.seek(0)
        try :
            new_records = list(SeqIO.parse(handle, format))
        except ValueError, e :
            #THIS INDICATES A SIGNIFICANT PROBLEM,
            #as we can't read the file we just wrote!
            print " FAILED: %s" % str(e)
            continue #goto next test

        #################
        # Check records #
        #################
        assert len(new_records) == len(records)
        for record, new_record in zip(records, new_records) :
            assert record.id == new_record.id
            assert record.seq.tostring() == new_record.seq.tostring()
            #Using records_match(record, new_record) is too strict

        #Close now, after checking, so that it can be used at the console for debugging
        handle.close()
        
print "Finished tested writing files"