1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
|
#!/usr/bin/env python
import io
import unittest
import gemmi
try:
from Bio import SeqIO
except ImportError:
SeqIO = None
# https://rest.uniprot.org/uniprotkb/P00698.fasta
FASTA1 = """\
>sp|P00698|LYSC_CHICK Lysozyme C OS=Gallus gallus OX=9031 GN=LYZ PE=1 SV=1
MRSLLILVLCFLPLAALGKVFGRCELAAAMKRHGLDNYRGYSLGNWVCAAKFESNFNTQA
TNRNTDGSTDYGILQINSRWWCNDGRTPGSRNLCNIPCSALLSSDITASVNCAKKIVSDG
NGMNAWVAWRNRCKGTDVQAWIRGCRL
""" # noqa: W291 - trailing whitespace
# https://en.wikipedia.org/wiki/FASTA_format
FASTA2 = """\
>MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken
MADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID
FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA
DIDGDGQVNYEEFVQMMTAK*
>gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV
EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG
LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL
GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX
IENY
""" # noqa: W291 - trailing whitespace
# https://www.metagenomics.wiki/tools/fastq/multi-fasta-format
FASTA3 = """\
>sequenceID-001 description
AAGTAGGAATAATATCTTATCATTATAGATAAAAACCTTCTGAATTTGCTTAGTGTGTAT
ACGACTAGACATATATCAGCTCGCCGATTATTTGGATTATTCCCTG
>sequenceID-002 description
CAGTAAAGAGTGGATGTAAGAACCGTCCGATCTACCAGATGTGATAGAGGTTGCCAGTAC
AAAAATTGCATAATAATTGATTAATCCTTTAATATTGTTTAGAATATATCCGTCAGATAA
TCCTAAAAATAACGATATGATGGCGGAAATCGTC
>sequenceID-003 description
CTTCAATTACCCTGCTGACGCGAGATACCTTATGCATCGAAGGTAAAGCGATGAATTTAT
CCAAGGTTTTAATTTG
""" # noqa: W291 - trailing whitespace
# from _entity_poly.pdbx_seq_one_letter_code from 5I55 and 1PFE
FASTA4 = """\
>
(MSE)EFVAKLFKFFKDLLGKFLGNN
>
(DSN)A(N2C)(MVA)(DSN)A(NCY)(MVA)
"""
# https://biopython.org/docs/1.81/api/Bio.SeqIO.PirIO.html
PIR1 = """\
>P1;S27231
rhodopsin - northern leopard frog
MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY
VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG
GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP
EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES
ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI
YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA*
>P1;I51200
rhodopsin - African clawed frog
MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF
VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG
GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP
EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES
LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI
YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA*
>P1;JN0120
rhodopsin - Japanese lamprey
MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF
VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG
GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP
EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES
ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL
YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA*
""" # noqa: W291 - trailing whitespace
# https://www.bioinformatics.nl/tools/crab_pir.html
PIR2 = """\
>P1;CRAB_ANAPL
ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR
SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH
GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ
SDVPERSIPI TREEKPAIAG AQRK*
>P1;CRAB_BOVIN
ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR
PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV
HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK
QASGPERTIP ITREEKPAVT AAPKK*
""" # noqa: W291 - trailing whitespace
@unittest.skipIf(SeqIO is None, "BioPython not installed.")
class TestReadingSeq(unittest.TestCase):
def test_fasta(self):
for string in (FASTA1, FASTA2, FASTA3, FASTA4):
gemmi_seqs = gemmi.read_pir_or_fasta(string)
biopy_seqs = list(SeqIO.parse(io.StringIO(string), 'fasta'))
self.assertEqual(len(gemmi_seqs), len(biopy_seqs))
for (gseq, bseq) in zip(gemmi_seqs, biopy_seqs):
self.assertEqual(gseq.header, bseq.description)
self.assertEqual(gseq.seq, bseq.seq.rstrip('*'))
def test_pir(self):
for string in (PIR1, PIR2):
gemmi_seqs = gemmi.read_pir_or_fasta(string)
biopy_seqs = list(SeqIO.parse(io.StringIO(string), 'pir'))
self.assertEqual(len(gemmi_seqs), len(biopy_seqs))
for (gseq, bseq) in zip(gemmi_seqs, biopy_seqs):
g_first, g_desc = gseq.header.splitlines()
g_type, _, g_id = g_first.partition(';')
self.assertEqual(g_type, bseq.annotations["PIR-type"])
self.assertEqual(g_id, bseq.id)
self.assertEqual(g_desc, bseq.description)
self.assertEqual(gseq.seq, bseq.seq)
def test_code_conversion_aa(self):
seq1 = gemmi.read_pir_or_fasta(FASTA1)[0].seq
seq3 = gemmi.expand_one_letter_sequence(seq1, gemmi.ResidueKind.AA)
self.assertEqual(seq1, gemmi.one_letter_code(seq3))
def test_code_conversion_dna(self):
seq1 = gemmi.read_pir_or_fasta(FASTA3)[0].seq
kind = gemmi.ResidueKind.DNA
seq3 = gemmi.expand_one_letter_sequence(seq1, kind)
self.assertEqual(seq1, gemmi.one_letter_code(seq3))
self.assertEqual(seq1, gemmi.pdbx_one_letter_code(seq3, kind))
def test_code_conversion_rna(self):
seq1 = 'GGCGAUACCAGCCGAAAGGCCCUUGGCAGCGCC' # from 8d2b
kind = gemmi.ResidueKind.RNA
seq3 = gemmi.expand_one_letter_sequence(seq1, kind)
self.assertEqual(seq1, gemmi.one_letter_code(seq3))
self.assertEqual(seq1, gemmi.pdbx_one_letter_code(seq3, kind))
def test_code_with_brackets(self):
# test 1PFE _entity_poly.pdbx_seq_one_letter_code[_can]
seq1 = gemmi.read_pir_or_fasta(FASTA4)[1].seq
kind = gemmi.ResidueKind.AA
seq3 = gemmi.expand_one_letter_sequence(seq1, kind)
self.assertEqual(seq3, ['DSN', 'ALA', 'N2C', 'MVA',
'DSN', 'ALA', 'NCY', 'MVA'])
self.assertEqual(seq1, gemmi.pdbx_one_letter_code(seq3, kind))
if __name__ == '__main__':
unittest.main()
|