File: test_SeqUtils.py

package info (click to toggle)
python-biopython 1.54-1
  • links: PTS, VCS
  • area: main
  • in suites: squeeze
  • size: 25,400 kB
  • ctags: 10,975
  • sloc: python: 116,757; xml: 33,167; ansic: 8,622; sql: 1,488; makefile: 147
file content (133 lines) | stat: -rw-r--r-- 5,063 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# Copyright 2007-2009 by Peter Cock.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

import os

from Bio.SeqUtils import GC, quick_FASTA_reader
from Bio.SeqUtils.CheckSum import crc32, crc64, gcg, seguid
from Bio.SeqUtils.lcc import lcc_simp, lcc_mult
from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq, MutableSeq
from Bio.Alphabet import single_letter_alphabet
from Bio import SeqIO


######################
# quick_FASTA_reader #
######################

dna_fasta_filename = "Fasta/f002"

tuple_records = quick_FASTA_reader(dna_fasta_filename)
assert len(tuple_records)==3
seq_records = list(SeqIO.parse(open(dna_fasta_filename),"fasta"))
assert len(seq_records)==3
for tuple_record, seq_record in zip(tuple_records, seq_records):
    assert tuple_record == (seq_record.description, seq_record.seq.tostring())
    print "%s has GC%% of %0.1f" % (seq_record.name, GC(seq_record.seq))

##############
# CodonUsage #
##############

print
print "Codon Adaption Index (CAI)"
CAI = CodonAdaptationIndex()
# Note - this needs a whole number of codons, and a DNA seq AS A STRING.
print "Example CAI %0.5f using E. coli (default)" \
      % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG")

#We need a FASTA file of CDS sequences to count the codon usage...
dna_fasta_filename = "fasta.tmp"
dna_genbank_filename = "GenBank/NC_005816.gb"
record = SeqIO.read(open(dna_genbank_filename), "genbank")
records = []
for feature in record.features:
    if feature.type == "CDS" \
    and not feature.sub_features:
        start = feature.location.start.position
        end = feature.location.end.position
        table = int(feature.qualifiers["transl_table"][0])
        if feature.strand == -1:
            seq = record.seq[start:end].reverse_complement()
        else:
            seq = record.seq[start:end]
        #Double check we have the CDS sequence expected
        #TODO - Use any cds_start option if/when added to deal with the met
        assert "M" + str(seq[3:].translate(table)) \
               == feature.qualifiers["translation"][0]+"*"
        records.append(SeqRecord(seq, id=feature.qualifiers["protein_id"][0],
                                 description=feature.qualifiers["product"][0]))
del start, end, table, seq
if os.path.isfile(dna_fasta_filename):
    os.remove(dna_fasta_filename)
handle = open(dna_fasta_filename, "w")
SeqIO.write(records, handle, "fasta")
handle.close()

CAI = CodonAdaptationIndex()
# Note - this needs a FASTA file which containing non-ambiguous DNA coding
# sequences - which should each be a whole number of codons.
CAI.generate_index(dna_fasta_filename)
print "Example CAI %0.5f using %s" \
      % (CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG"),
         record.annotations["source"])

os.remove(dna_fasta_filename)
del record, records
del dna_genbank_filename
del dna_fasta_filename

print

###################
# crc64 collision #
###################

# Example of crc64 collision from Sebastian Bassi using the
# immunoglobulin lambda light chain variable region from Homo sapiens
# Both sequences share the same CRC64 checksum: 44CAAD88706CC153
str_light_chain_one = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
                + "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
                + "YCSSYAGSSTLVFGGGTKLTVL"
str_light_chain_two = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
                + "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
                + "YCCSYAGSSTWVFGGGTKLTVL"

#Explicit testing of crc64 collision:
assert str_light_chain_one != str_light_chain_two
assert crc32(str_light_chain_one) != crc32(str_light_chain_two)
assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
assert gcg(str_light_chain_one) != gcg(str_light_chain_two)
assert seguid(str_light_chain_one) != seguid(str_light_chain_two)

###########################
# main checksum/LCC tests #
###########################

#Print some output, which the test harness will check
examples = [str_light_chain_one, str_light_chain_two,
            "ATGCGTATCGATCGCGATACGATTAGGCGGAT"]

for i, seq_str in enumerate(examples):
    print "Example %i, length %i, %s..." % (i+1, len(seq_str), seq_str[:10])

    #Avoid cross platforms with printing floats by doing conversion explicitly
    def simple_LCC(s):
        return "%0.2f" % lcc_simp(s)

    def windowed_LCC(s):
        return ", ".join(["%0.2f" % v for v in lcc_mult(s,20)])

    for checksum in [crc32, crc64, gcg, seguid, simple_LCC, windowed_LCC]:
        #First using a string:
        value = checksum(seq_str)
        print " %s = %s" % (checksum.__name__, value)
        #Secondly check it works with a Seq object
        assert value == checksum(Seq(seq_str, single_letter_alphabet))
        #Finally check it works with a MutableSeq object
        assert value == checksum(MutableSeq(seq_str, single_letter_alphabet))