File: test_GenBank.py

package info (click to toggle)
python-biopython 1.54-1
links: PTS, VCS
area: main
in suites: squeeze
size: 25,400 kB
ctags: 10,975
sloc: python: 116,757; xml: 33,167; ansic: 8,622; sql: 1,488; makefile: 147
file content (213 lines) | stat: -rw-r--r-- 8,144 bytes
#!/usr/bin/env python
"""Test the GenBank parser and make sure everything is working smoothly.
"""
# standard library
import os
import copy
import cStringIO

# Biopython
from Bio import File

# GenBank stuff to test
from Bio import GenBank
from Bio.GenBank import utils

gb_file_dir = os.path.join(os.getcwd(), 'GenBank')

test_files = ['noref.gb', 'cor6_6.gb', 'iro.gb', 'pri1.gb', 'arab1.gb',
              'protein_refseq.gb', 'extra_keywords.gb', 'one_of.gb',
              'NT_019265.gb', 'origin_line.gb', 'blank_seq.gb',
              'dbsource_wrap.gb', 'gbvrl1_start.seq', 'NC_005816.gb']

# We only test writing on a subset of the examples:
write_format_files = ['noref.gb', 'cor6_6.gb', 'iro.gb', 'pri1.gb', 'arab1.gb',
                      'extra_keywords.gb', 'one_of.gb', 'origin_line.gb']
# don't test writing on protein_refseq, since it is horribly nasty
# don't test writing on the CONTIG refseq, because the wrapping of
# locations won't work exactly
# don't test writing on blank_seq because it lacks a sequence type
# don't test dbsource_wrap because it is a junky RefSeq file

files_to_parse = []
for file in test_files:
    files_to_parse.append(os.path.join(gb_file_dir, file))

# parse the bioperl test files
# comment this out for now -- there are a bunch of junky records in here
# that no longer exist in GenBank -- do we really need to support those?
# files_to_parse = [os.path.join(os.getcwd(), 'GenBank', 'bioperl_test.gb')]

# parse the biojava test files
# files_to_parse += [os.path.join(os.getcwd(), 'GenBank', 'biojava_test.gb')]

# test the parsers
feature_parser = GenBank.FeatureParser(debug_level = 0)
record_parser = GenBank.RecordParser(debug_level = 0)

all_parsers = [feature_parser, record_parser]
print "Testing parsers..."
for parser in all_parsers:
    for filename in files_to_parse:
        if not os.path.isfile(filename):
            print "Missing test input file: %s" % filename
            continue
        
        handle = open(filename, 'r')
        iterator = GenBank.Iterator(handle, parser)
        
        while 1:
            cur_record = iterator.next()

            if cur_record is None:
                break

            if isinstance(parser, GenBank.FeatureParser):
                print "***Record from %s with the FeatureParser" \
                      % filename.split(os.path.sep)[-1]
                print "Seq:", repr(cur_record.seq)
                print "Id:", cur_record.id
                print "Name:", cur_record.name
                print "Description", cur_record.description
                print "Annotations***"
                ann_keys = cur_record.annotations.keys()
                ann_keys.sort()
                for ann_key in ann_keys:
                    if ann_key != 'references':
                        print "Key: %s" % ann_key
                        print "Value: %s" % \
                              cur_record.annotations[ann_key]
                    else:
                        print "References*"
                        for reference in cur_record.annotations[ann_key]:
                            print str(reference) 
                print "Feaures"
                for feature in cur_record.features:
                    print feature
                print "DB cross refs", cur_record.dbxrefs
            elif isinstance(parser, GenBank.RecordParser):
                print "***Record from %s with the RecordParser" \
                      % filename.split(os.path.sep)[-1]
                print "sequence length: %i" % len(cur_record.sequence)
                print "locus:", cur_record.locus
                print "definition:", cur_record.definition
                print "accession:", cur_record.accession
                for reference in cur_record.references:
                    print "reference title:", reference.title

                for feature in cur_record.features:
                    print "feature key:", feature.key
                    print "location:", feature.location
                    print "num qualifiers:", len(feature.qualifiers)
                    for qualifier in feature.qualifiers:
                        print "key:", qualifier.key, "value:", qualifier.value
                         
        handle.close()
        
#The dictionaries code has been deprecated
#print "Testing dictionaries..."
#...

# test writing GenBank format
print "Testing writing GenBank format..."

def do_comparison(good_record, test_record):
    """Compare two records to see if they are the same.

    Ths compares the two GenBank record, and will raise an AssertionError
    if two lines do not match, showing the non-matching lines.
    """
    good_handle = cStringIO.StringIO(good_record)
    test_handle = cStringIO.StringIO(test_record)

    while 1:
        good_line = good_handle.readline()
        test_line = test_handle.readline()

        if not(good_line) and not(test_line):
            break
        if not(good_line):
            raise AssertionError("Extra info in Test: `%s`" % test_line)
        if not(test_line):
            raise AssertionError("Extra info in Expected: `%s`" % good_line)
        test_normalized = ' '.join([x for x in test_line.split() if x])
        good_normalized = ' '.join([x for x in good_line.split() if x])
        assert test_normalized == good_normalized, \
               "Expected does not match Test.\nExpect:`%s`\nTest  :`%s`\n" % \
               (good_line, test_line)
    
def t_write_format():
    record_parser = GenBank.RecordParser(debug_level = 0)

    for file in write_format_files:
        print "Testing GenBank writing for %s..." % os.path.basename(file)
        cur_handle = open(os.path.join("GenBank", file), "r")
        compare_handle = open(os.path.join("GenBank", file), "r")
        
        iterator = GenBank.Iterator(cur_handle, record_parser)
        compare_iterator = GenBank.Iterator(compare_handle)
        
        while 1:
            cur_record = iterator.next()
            compare_record = compare_iterator.next()
            
            if cur_record is None or compare_record is None:
                break

            print "\tTesting for %s" % cur_record.version

            output_record = str(cur_record) + "\n"
            do_comparison(compare_record, output_record)

        cur_handle.close()

t_write_format()

def t_cleaning_features():
    """Test the ability to clean up feature values.
    """
    parser = GenBank.FeatureParser(feature_cleaner = \
                                   utils.FeatureValueCleaner())
    handle = open(os.path.join("GenBank", "arab1.gb"))
    iterator = GenBank.Iterator(handle, parser)

    first_record = iterator.next()
    
    # test for cleaning of translation
    translation_feature = first_record.features[1]
    test_trans = translation_feature.qualifiers["translation"][0]
    assert test_trans.find(" ") == -1, \
      "Did not clean spaces out of the translation"
    assert test_trans.find("\012") == -1, \
      "Did not clean newlines out of the translation"

print "Testing feature cleaning..."
t_cleaning_features()

def t_bioformat():
    """Test converting GenBank into different formats using Bioformat.
    """
    from Bio import formats
    from Bio import SeqRecord

    test_file = os.path.join("GenBank", "iro.gb")
    test_handle = open(test_file)
    format = formats["sequence"].identify(test_handle)
    assert format.name == "genbank-records", \
      "Identified format incorrectly: %s" % format.name

    all_records = []
    for record in SeqRecord.io.readFile(test_handle):
        all_records.append(record)

    assert len(all_records) == 1
    assert all_records[0].id == "AL109817.1", \
      "Unexpected record id: %s" % all_records[0].id
    assert all_records[0].seq[0:10] == "cacaggccca", \
      "Unexpected sequence: %s" % all_records[0].seq
    assert all_records[0].description == \
      "Homo sapiens mRNA full length insert cDNA clone EUROIMAGE 125195.", \
      "Unexpected description: %s" % all_records[0].description

print "Testing format conversions..."
# t_bioformat() # XXX this is mucked up right now and still under work