File: test_genbank.py

package info (click to toggle)
python-cogent 1.4.1-1.2
links: PTS, VCS
area: non-free
in suites: squeeze
size: 13,260 kB
ctags: 20,087
sloc: python: 116,163; ansic: 732; makefile: 74; sh: 9
file content (400 lines) | stat: -rw-r--r-- 18,166 bytes
#!/usr/bin/env python
"""Unit tests for the GenBank database parsers.
"""
from cogent.parse.genbank import parse_locus, parse_single_line, \
    indent_splitter, parse_sequence, block_consolidator, parse_organism, \
    parse_feature, location_line_tokenizer, parse_simple_location_segment, \
    parse_location_line, parse_reference, parse_source, \
    Location, LocationList
from cogent.util.unit_test import TestCase, main

__author__ = "Rob Knight"
__copyright__ = "Copyright 2007-2009, The Cogent Project"
__credits__ = ["Rob Knight", "Gavin Huttley"]
__license__ = "GPL"
__version__ = "1.4.1"
__maintainer__ = "Rob Knight"
__email__ = "rob@spot.colorado.edu"
__status__ = "Production"

class GenBankTests(TestCase):
    """Tests of the GenBank main functions."""
    
    def test_parse_locus(self):
        """parse_locus should give correct results on specimen locus lines"""
        line = 'LOCUS       AF108830                5313 bp    mRNA    linear   PRI 19-MAY-1999'
        result = parse_locus(line)
        self.assertEqual(len(result), 6)
        self.assertEqual(result['locus'], 'AF108830')
        self.assertEqual(result['length'], 5313)    #note: int, not str
        self.assertEqual(result['mol_type'], 'mRNA')
        self.assertEqual(result['topology'], 'linear')
        self.assertEqual(result['db'], 'PRI')
        self.assertEqual(result['date'], '19-MAY-1999')
        #should work if some of the fields are missing
        line = 'LOCUS       AF108830                5313'
        result = parse_locus(line)
        self.assertEqual(len(result), 2)
        self.assertEqual(result['locus'], 'AF108830')
        self.assertEqual(result['length'], 5313)    #note: int, not str
    
    def test_parse_single_line(self):
        """parse_single_line should split off the label and return the rest"""
        line_1 = 'VERSION     AF108830.1  GI:4868112\n'
        self.assertEqual(parse_single_line(line_1), 'AF108830.1  GI:4868112')
        #should work if leading spaces
        line_2 = '      VERSION     AF108830.1  GI:4868112\n'
        self.assertEqual(parse_single_line(line_2), 'AF108830.1  GI:4868112')
    
    def test_indent_splitter(self):
        """indent_splitter should split lines at correct locations"""
        #if lines have same indent, should not group together
        lines = [
        'abc    xxx',
        'def    yyy'
        ]
        self.assertEqual(list(indent_splitter(lines)),\
            [[lines[0]], [lines[1]]])
        #if second line is indented, should group with first
        lines = [
        'abc    xxx',
        ' def    yyy'
        ]
        self.assertEqual(list(indent_splitter(lines)),\
            [[lines[0], lines[1]]])
        
        #if both lines indented but second is more, should group with first
        lines = [
        ' abc    xxx',
        '  def    yyy'
        ]
        self.assertEqual(list(indent_splitter(lines)),\
            [[lines[0], lines[1]]])
        
        #if both lines indented equally, should not group
        lines = [
        '   abc    xxx',
        '   def    yyy'
        ]
        self.assertEqual(list(indent_splitter(lines)), \
            [[lines[0]], [lines[1]]])
        
        #for more complex situation, should produce correct grouping
        lines = [
        '  xyz',    #0 -
        '  xxx',    #1 -
        '   yyy',   #2
        '   uuu',   #3
        '   iii',   #4
        '  qaz',    #5 -
        '  wsx',    #6 -
        '   az',    #7
        '   sx',    #8
        '        gb',#9
        '   bg',    #10
        '  aaa',    #11 -
        ]
        self.assertEqual(list(indent_splitter(lines)), \
            [[lines[0]], lines[1:5], [lines[5]], lines[6:11], [lines[11]]])
        
        #real example from genbank file
        lines = \
"""LOCUS       NT_016354           92123751 bp    DNA     linear   CON 29-AUG-2006
DEFINITION  Homo sapiens chromosome 4 genomic contig, reference assembly.
ACCESSION   NT_016354 NT_006109 NT_006204 NT_006245 NT_006302 NT_006371
            NT_006397 NT_016393 NT_016589 NT_016599 NT_016606 NT_022752
            NT_022753 NT_022755 NT_022760 NT_022774 NT_022797 NT_022803
            NT_022846 NT_022960 NT_025694 NT_028147 NT_029273 NT_030643
            NT_030646 NT_030662 NT_031780 NT_031781 NT_031791 NT_034703
            NT_034705 NT_037628 NT_037629 NT_079512
VERSION     NT_016354.18  GI:88977422
KEYWORDS    .
SOURCE      Homo sapiens (human)
  ORGANISM  Homo sapiens
            Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
            Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
            Catarrhini; Hominidae; Homo.
?
REFERENCE   2  (bases 1 to 92123751)
  AUTHORS   International Human Genome Sequencing Consortium.
  TITLE     Finishing the euchromatic sequence of the human genome""".split('\n')
        self.assertEqual(list(indent_splitter(lines)), \
            [[lines[0]],[lines[1]],lines[2:8],[lines[8]],[lines[9]],lines[10:15],\
            [lines[15]], lines[16:]])
    
    def test_parse_sequence(self):
        """parse_sequence should strip bad chars out of sequence lines"""
        lines = """
ORIGIN
        1 gggagcgcgg cgcgggagcc cgaggctgag actcaccgga ggaagcggcg cgagcgcccc
       61   gccatcgtcc \t\t cggctgaagt 123 \ngcagtg  \n
      121 cctgggctta agcagtcttc45ccacctcagc 
//\n\n\n""".split('\n')
        result = parse_sequence(lines)
        self.assertEqual(result, 'gggagcgcggcgcgggagcccgaggctgagactcaccggaggaagcggcgcgagcgccccgccatcgtcccggctgaagtgcagtgcctgggcttaagcagtcttcccacctcagc')
    
    def test_block_consolidator(self):
        """block_consolidator should join the block together."""
        lines = """  ORGANISM  Homo sapiens
            Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
            Mammalia; Eutheria; Euarchontoglires; Primates; Catarrhini;
            Hominidae; Homo.""".split('\n')
        label, data = block_consolidator(lines)
        self.assertEqual(label, 'ORGANISM')
        self.assertEqual(data, ['Homo sapiens',
'            Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;',
'            Mammalia; Eutheria; Euarchontoglires; Primates; Catarrhini;',
'            Hominidae; Homo.'])
        lines = r"""COMMENT
                    Contact: Spindel ER
                    Division of Neuroscience""".splitlines()
        label, data = block_consolidator(lines)
        self.assertEqual(label, "COMMENT")
        self.assertEqual(data, ['', '                    Contact: Spindel ER',
                                '                    Division of Neuroscience'])
    
    def test_parse_organism(self):
        """parse_organism should return species, taxonomy (up to genus)"""
        #note: lines modified to include the following:
        # - multiword names
        # - multiword names split over a line break
        # - periods and other punctuation in names
        lines = """  ORGANISM  Homo sapiens
        Eukaryota; Metazoa; Chordata Craniata; Vertebrata; Euteleostomi;
        Mammalia; Eutheria; Euarchontoglires; Primates \t abc.  2.; Catarrhini
        Hominidae; Homo.""".split('\n')
        species, taxonomy = parse_organism(lines)
        self.assertEqual(species, 'Homo sapiens')
        self.assertEqual(taxonomy, ['Eukaryota', 'Metazoa', \
            'Chordata Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia', \
            'Eutheria', 'Euarchontoglires', 'Primates abc. 2.', \
            'Catarrhini Hominidae', 'Homo'])
    
    def test_parse_feature(self):
        """parse_feature should return dict containing annotations of feature"""
        example_feature=\
"""     CDS             complement(join(102262..102647,105026..105217,
                     106638..106719,152424..152682,243209..243267))
                     /gene="nad1"
                     /note="Protein sequence is in conflict with the conceptual
                     translation; author given translation (not conceptual
                     translation)
                     start codon is created by C to U RNA editing"
                     /codon_start=1
                     /exception="RNA editing"
                     /product="NADH dehydrogenase subunit 1"
                     /protein_id="NP_064011.1"
                     /db_xref="GI:9838451"
                     /db_xref="IPI:12345"
                     /translation="MYIAVPAEILGIILPLLLGVAFLVLAERKVMAFVQRRKGPDVVG
                     SFGLLQPLADGSKLILKEPISPSSANFSLFRMAPVTTFMLSLVARAVVPFDYGMVLSD
                     PNIGLLYLFAISSLGVYGIIIAGWSSNSKYAFLGALRSAAQMVPYEVSIGLILITVLI
                     CVGPRNSSEIVMAQKQIWSGIPLFPVLVMFFISCLAETNRAPFDLPEAERELVAGYNV
                     EYSSMGSALFFLGEYANMILMSGLCTSLSPGGWPPILDLPISKRIPGSIWFSIKVILF
                     LFLYIWVRAAFPRYRYDQLMGLGRKVFLPLSLARVVAVSGVLVTFQWLP"""
        result = parse_feature(example_feature.split('\n'))
        self.assertEqual(result['type'], 'CDS')
        self.assertEqual(result['raw_location'], \
            ['complement(join(102262..102647,105026..105217,', \
'                     106638..106719,152424..152682,243209..243267))'])
        self.assertEqual(result['gene'], ['nad1'])
        self.assertEqual(result['note'], ['Protein sequence is in conflict with the conceptual translation; author given translation (not conceptual translation) start codon is created by C to U RNA editing'])
        self.assertEqual(result['codon_start'], ['1'])
        self.assertEqual(result['exception'], ['RNA editing'])
        self.assertEqual(result['product'], ['NADH dehydrogenase subunit 1'])
        self.assertEqual(result['protein_id'],['NP_064011.1'])
        self.assertEqual(result['db_xref'], ['GI:9838451','IPI:12345'])
        self.assertEqual(result['translation'],['MYIAVPAEILGIILPLLLGVAFLVLAERKVMAFVQRRKGPDVVGSFGLLQPLADGSKLILKEPISPSSANFSLFRMAPVTTFMLSLVARAVVPFDYGMVLSDPNIGLLYLFAISSLGVYGIIIAGWSSNSKYAFLGALRSAAQMVPYEVSIGLILITVLICVGPRNSSEIVMAQKQIWSGIPLFPVLVMFFISCLAETNRAPFDLPEAERELVAGYNVEYSSMGSALFFLGEYANMILMSGLCTSLSPGGWPPILDLPISKRIPGSIWFSIKVILFLFLYIWVRAAFPRYRYDQLMGLGRKVFLPLSLARVVAVSGVLVTFQWLP'])
        self.assertEqual(len(result), 11)
        
        short_feature = ['D-loop          15418..16866']
        result = parse_feature(short_feature)
        self.assertEqual(result['type'], 'D-loop')
        self.assertEqual(result['raw_location'], ['15418..16866'])
        #can get more than one = in a line
        #from AF260826  
        bad_feature = \
"""     tRNA            1173..1238
                     /note="codon recognized: AUC; Cove score = 16.56"
                     /product="tRNA-Ile"
                     /anticodon=(pos:1203..1205,aa:Ile)"""
        result = parse_feature(bad_feature.split('\n'))
        self.assertEqual(result['note'], \
            ['codon recognized: AUC; Cove score = 16.56'])
        #need not always have an = in a line
        #from NC_001807
        bad_feature = \
'''     mRNA            556
     /partial
     /citation=[6]
     /product="H-strand"'''
        result = parse_feature(bad_feature.split('\n'))
        self.assertEqual(result['partial'], [''])
    
    def test_location_line_tokenizer(self):
        """location_line_tokenizer should tokenize location lines"""
        llt =location_line_tokenizer
        self.assertEqual(list(llt(['123..456'])), ['123..456'])
        self.assertEqual(list(llt(['complement(123..456)'])), \
            ['complement(', '123..456', ')'])
        self.assertEqual(list(llt(['join(1..2,3..4)'])), \
            ['join(', '1..2', ',', '3..4', ')'])
        self.assertEqual(list(llt([\
            'join(complement(1..2, join(complement( 3..4),',\
            '\n5..6), 7..8\t))'])),\
            ['join(','complement(','1..2',',','join(','complement(','3..4',\
            ')', ',', '5..6',')',',','7..8',')',')'])
    
    def test_parse_simple_location_segment(self):
        """parse_simple_location_segment should parse simple segments"""
        lsp = parse_simple_location_segment
        l = lsp('37')
        self.assertEqual(l._data, 37)
        self.assertEqual(str(l), '37')
        self.assertEqual(l.Strand, 1)
        l = lsp('40..50')
        first, second = l._data
        self.assertEqual(first._data, 40)
        self.assertEqual(second._data, 50)
        self.assertEqual(str(l), '40..50')
        self.assertEqual(l.Strand, 1)
        #should handle ambiguous starts and ends
        l = lsp('>37')
        self.assertEqual(l._data, 37)
        self.assertEqual(str(l), '>37')
        l = lsp('<37')
        self.assertEqual(l._data, 37)
        self.assertEqual(str(l), '<37')
        l = lsp('<37..>42')
        first, second = l._data
        self.assertEqual(first._data, 37)
        self.assertEqual(second._data, 42)
        self.assertEqual(str(first), '<37')
        self.assertEqual(str(second), '>42')
        self.assertEqual(str(l), '<37..>42')
    
    def test_parse_location_line(self):
        """parse_location_line should give correct list of location objects"""
        llt = location_line_tokenizer
        r = parse_location_line(llt(['123..456']))
        self.assertEqual(str(r), '123..456')
        r = parse_location_line(llt(['complement(123..456)']))
        self.assertEqual(str(r), 'complement(123..456)')
        r = parse_location_line(llt(['complement(123..456, 345..678)']))
        self.assertEqual(str(r), \
            'join(complement(345..678),complement(123..456))')
        r = parse_location_line(llt(['complement(join(123..456, 345..678))']))
        self.assertEqual(str(r), \
            'join(complement(345..678),complement(123..456))')
        r = parse_location_line(\
            llt(['join(complement(123..456), complement(345..678))']))
        self.assertEqual(str(r), \
            'join(complement(123..456),complement(345..678))')
        #try some nested joins and complements
        r = parse_location_line(llt(\
            ['complement(join(1..2,3..4,complement(5..6),',
              'join(7..8,complement(9..10))))']))
        self.assertEqual(str(r), \
          'join(9..10,complement(7..8),5..6,complement(3..4),complement(1..2))')
    
    def test_parse_reference(self):
        """parse_reference should give correct fields"""
        r = \
"""REFERENCE   2  (bases 1 to 2587)
  AUTHORS   Janzen,D.M. and Geballe,A.P.
  TITLE     The effect of eukaryotic release factor depletion on translation
            termination in human cell lines
  JOURNAL   (er) Nucleic Acids Res. 32 (15), 4491-4502 (2004)
   PUBMED   15326224"""
        result = parse_reference(r.split('\n'))
        self.assertEqual(len(result), 5)
        self.assertEqual(result['reference'], '2  (bases 1 to 2587)')
        self.assertEqual(result['authors'], 'Janzen,D.M. and Geballe,A.P.')
        self.assertEqual(result['title'], \
            'The effect of eukaryotic release factor depletion ' + \
            'on translation termination in human cell lines')
        self.assertEqual(result['journal'], \
            '(er) Nucleic Acids Res. 32 (15), 4491-4502 (2004)')
        self.assertEqual(result['pubmed'], '15326224')
    
    def test_parse_source(self):
        """parse_source should split into source and organism"""
        s = \
"""SOURCE      African elephant.
  ORGANISM  Mitochondrion Loxodonta africana
            Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
            Mammalia; Eutheria; Proboscidea; Elephantidae; Loxodonta.""".split('\n')
        r = parse_source(s)
        self.assertEqual(len(r), 3)
        self.assertEqual(r['source'], 'African elephant.')
        self.assertEqual(r['species'], 'Mitochondrion Loxodonta africana')
        self.assertEqual(r['taxonomy'], ['Eukaryota','Metazoa', 'Chordata',\
            'Craniata', 'Vertebrata', 'Euteleostomi', 'Mammalia',\
            'Eutheria', 'Proboscidea', 'Elephantidae', 'Loxodonta'])
        
class LocationTests(TestCase):
    """Tests of the Location class."""
    def test_init(self):
        """Location should init with 1 or 2 values, plus params."""
        l = Location(37)
        self.assertEqual(str(l), '37')
        l = Location(37, Ambiguity = '>')
        self.assertEqual(str(l), '>37')
        l = Location(37, Ambiguity='<')
        self.assertEqual(str(l), '<37')
        l = Location(37, Accession='AB123')
        self.assertEqual(str(l), 'AB123:37')
        l = Location(37, Accession='AB123', Db='Kegg')
        self.assertEqual(str(l), 'Kegg::AB123:37')

        l1 = Location(37)
        l2 = Location(42)
        l = Location([l1,l2])
        self.assertEqual(str(l), '37..42')
        l3 = Location([l1,l2], IsBounds=True)
        self.assertEqual(str(l3), '(37.42)')
        l4 = Location([l1,l2], IsBetween=True)
        self.assertEqual(str(l4), '37^42')
        l5 = Location([l4,l3])
        self.assertEqual(str(l5), '37^42..(37.42)')
        l5 = Location([l4,l3], Strand=-1)
        self.assertEqual(str(l5), 'complement(37^42..(37.42))')

class LocationListTests(TestCase):
    """Tests of the LocationList class."""
    def test_extract(self):
        """LocationList extract should return correct sequence"""
        l = Location(3)
        l2_a = Location(5)
        l2_b = Location(7)
        l2 = Location([l2_a,l2_b], Strand=-1)
        l3_a = Location(10)
        l3_b = Location(12)
        l3 = Location([l3_a, l3_b])
        ll = LocationList([l, l2, l3])
        s = ll.extract('ACGTGCAGTCAGTAGCAT')
        #               123456789012345678
        self.assertEqual(s, 'G'+'TGC'+'CAG')
        #check a case where it wraps around
        l5_a = Location(16)
        l5_b = Location(4)
        l5 = Location([l5_a,l5_b])
        ll = LocationList([l5])
        s = ll.extract('ACGTGCAGTCAGTAGCAT')
        self.assertEqual(s, 'CATACGT')

if __name__ == '__main__':
    from sys import argv
    if len(argv) > 2 and argv[1] == 'x':
        filename = argv[2]
        lines = open(filename)
        for i in indent_splitter(lines):
            print '******'
            print i[0]
            for j in indent_splitter(i[1:]):
                print '?????'
                for line in j:
                    print line
    else:
        main()