1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
|
# Copyright 2007-2009 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
import os
import unittest
from Bio import SeqIO
from Bio.Alphabet import single_letter_alphabet
from Bio.Seq import Seq, MutableSeq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import GC, seq1, seq3
from Bio.SeqUtils.lcc import lcc_simp, lcc_mult
from Bio.SeqUtils.CheckSum import crc32, crc64, gcg, seguid
from Bio.SeqUtils.CodonUsage import CodonAdaptationIndex
def u_crc32(seq):
# NOTE - On Python 2 crc32 could return a signed int, but on Python 3 it is
# always unsigned
# Docs suggest should use crc32(x) & 0xffffffff for consistency.
return crc32(seq) & 0xffffffff
def simple_LCC(s):
# Avoid cross platforms with printing floats by doing conversion explicitly
return "%0.2f" % lcc_simp(s)
def windowed_LCC(s):
return ", ".join("%0.2f" % v for v in lcc_mult(s, 20))
class SeqUtilsTests(unittest.TestCase):
def setUp(self):
# Example of crc64 collision from Sebastian Bassi using the
# immunoglobulin lambda light chain variable region from Homo sapiens
# Both sequences share the same CRC64 checksum: 44CAAD88706CC153
self.str_light_chain_one = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
+ "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
+ "YCSSYAGSSTLVFGGGTKLTVL"
self.str_light_chain_two = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
+ "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
+ "YCCSYAGSSTWVFGGGTKLTVL"
def test_codon_usage_ecoli(self):
"""Test Codon Adaptation Index (CAI) using default E. coli data."""
CAI = CodonAdaptationIndex()
self.assertEqual("%0.5f" % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG"),
"0.09978")
def test_codon_usage_custom(self):
"""Test Codon Adaptation Index (CAI) using FASTA file for background."""
# We need a FASTA file of CDS sequences to count the codon usage...
dna_fasta_filename = "fasta.tmp"
dna_genbank_filename = "GenBank/NC_005816.gb"
record = SeqIO.read(dna_genbank_filename, "genbank")
records = []
for feature in record.features:
if feature.type == "CDS" and len(feature.location.parts) == 1:
start = feature.location.start.position
end = feature.location.end.position
table = int(feature.qualifiers["transl_table"][0])
if feature.strand == -1:
seq = record.seq[start:end].reverse_complement()
else:
seq = record.seq[start:end]
# Double check we have the CDS sequence expected
# TODO - Use any cds_start option if/when added to deal with the met
a = "M" + str(seq[3:].translate(table))
b = feature.qualifiers["translation"][0] + "*"
self.assertEqual(a, b, "%r vs %r" % (a, b))
records.append(SeqRecord(seq, id=feature.qualifiers["protein_id"][0],
description=feature.qualifiers["product"][0]))
with open(dna_fasta_filename, "w") as handle:
SeqIO.write(records, handle, "fasta")
CAI = CodonAdaptationIndex()
# Note - this needs a FASTA file which containing non-ambiguous DNA coding
# sequences - which should each be a whole number of codons.
CAI.generate_index(dna_fasta_filename)
# Now check codon usage index (CAI) using this species
self.assertEqual(record.annotations["source"],
"Yersinia pestis biovar Microtus str. 91001")
self.assertEqual("%0.5f" % CAI.cai_for_gene("ATGCGTATCGATCGCGATACGATTAGGCGGATG"),
"0.67213")
os.remove(dna_fasta_filename)
def test_crc_checksum_collision(self):
# Explicit testing of crc64 collision:
self.assertNotEqual(self.str_light_chain_one, self.str_light_chain_two)
self.assertNotEqual(crc32(self.str_light_chain_one), crc32(self.str_light_chain_two))
self.assertEqual(crc64(self.str_light_chain_one), crc64(self.str_light_chain_two))
self.assertNotEqual(gcg(self.str_light_chain_one), gcg(self.str_light_chain_two))
self.assertNotEqual(seguid(self.str_light_chain_one), seguid(self.str_light_chain_two))
def seq_checksums(self, seq_str, exp_crc32, exp_crc64, exp_gcg, exp_seguid,
exp_simple_LCC, exp_window_LCC):
for s in [seq_str,
Seq(seq_str, single_letter_alphabet),
MutableSeq(seq_str, single_letter_alphabet)]:
self.assertEqual(exp_crc32, u_crc32(s))
self.assertEqual(exp_crc64, crc64(s))
self.assertEqual(exp_gcg, gcg(s))
self.assertEqual(exp_seguid, seguid(s))
self.assertEqual(exp_simple_LCC, simple_LCC(s))
self.assertEqual(exp_window_LCC, windowed_LCC(s))
def test_checksum1(self):
self.seq_checksums(self.str_light_chain_one,
2994980265,
"CRC-44CAAD88706CC153",
9729,
"BpBeDdcNUYNsdk46JoJdw7Pd3BI",
"1.03",
"0.00, 1.00, 0.96, 0.96, 0.96, 0.65, 0.43, 0.35, 0.35, 0.35, 0.35, 0.53, 0.59, 0.26")
def test_checksum2(self):
self.seq_checksums(self.str_light_chain_two,
802105214,
"CRC-44CAAD88706CC153",
9647,
"X5XEaayob1nZLOc7eVT9qyczarY",
"1.07",
"0.00, 1.00, 0.96, 0.96, 0.96, 0.65, 0.43, 0.35, 0.35, 0.35, 0.35, 0.53, 0.59, 0.26")
def test_checksum3(self):
self.seq_checksums("ATGCGTATCGATCGCGATACGATTAGGCGGAT",
817679856,
"CRC-6234FF451DC6DFC6",
7959,
"8WCUbVjBgiRmM10gfR7XJNjbwnE",
"1.98",
"0.00, 2.00, 1.99, 1.99, 2.00, 1.99, 1.97, 1.99, 1.99, 1.99, 1.96, 1.96, 1.96, 1.96")
def test_GC(self):
seq = "ACGGGCTACCGTATAGGCAAGAGATGATGCCC"
self.assertEqual(GC(seq), 56.25)
def test_seq1_seq3(self):
s3 = "MetAlaTyrtrpcysthrLYSLEUILEGlYPrOGlNaSnaLapRoTyRLySSeRHisTrpLysThr"
s1 = "MAYWCTKLIGPQNAPYKSHWKT"
self.assertEqual(seq1(s3), s1)
self.assertEqual(seq3(s1).upper(), s3.upper())
self.assertEqual(seq1(seq3(s1)), s1)
self.assertEqual(seq3(seq1(s3)).upper(), s3.upper())
if __name__ == "__main__":
runner = unittest.TextTestRunner(verbosity=2)
unittest.main(testRunner=runner)
|