1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
|
"""Example using Bio.SeqIO to load a FASTA file as a dictionary.
An example function (get_accession_num) is defined to demonstrate
a non-trivial naming scheme where the dictionary key is based on
the record identifier.
The first version uses Bio.SeqIO.parse() and loads the entire
FASTA file into memory as a Python dictionary of SeqRecord
objects. This is *not* suitable for large files.
The second version used Bio.SeqIO.index() which is suitable
for FASTA files with millions of records.
See also Bio.SeqIO.index_db() and the examples in the main tutorial.
"""
from __future__ import print_function
from Bio.Alphabet import generic_dna
from Bio import SeqIO
def get_accession_num(seq_record):
accession_atoms = seq_record.id.split('|')
gb_name = accession_atoms[3]
# strip the version info before returning
return gb_name[:-2]
# In Memory
# =========
# This next bit of code uses Bio.SeqIO.parse() to load a FASTA file,
# and then turns it into an in-memory python dictionary.
# This is *not* suitable for FASTA files with millions of entries.
rec_iterator = SeqIO.parse("ls_orchid.fasta", "fasta", generic_dna)
orchid_dict = SeqIO.to_dict(rec_iterator, get_accession_num)
for id_num in orchid_dict:
print('id number: %s' % id_num)
print('description: %s' % orchid_dict[id_num].description)
print('sequence: %s' % orchid_dict[id_num].seq)
# Indexed
# =======
# This next version uses the Bio.SeqIO.index() function which will index
# the FASTA file without loading all the records into memory at once.
# This is suitable for FASTA files with millions of entries.
orchid_dict = SeqIO.index("ls_orchid.fasta", "fasta", generic_dna)
for id_num in orchid_dict:
print('id number: %s' % id_num)
print('description: %s' % orchid_dict[id_num].description)
print('sequence: %s' % orchid_dict[id_num].seq)
|