File: fasta_iterator.py

package info (click to toggle)
python-biopython 1.78+dfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 65,756 kB
  • sloc: python: 221,141; xml: 178,777; ansic: 13,369; sql: 1,208; makefile: 131; sh: 70
file content (31 lines) | stat: -rw-r--r-- 940 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
#

"""Example using Bio.SeqIO to parse a FASTA file."""


from Bio import SeqIO


def extract_organisms(file_to_parse, fmt):
    """Extract species names from sequence description lines.

    Each record's description is split on whitespace and its second token
    is taken as the species name.

    Args:
        file_to_parse: Path of the sequence file to read.
        fmt: Format name passed through to ``SeqIO.parse`` (e.g. "fasta").

    Returns:
        A sorted list of the distinct species names found.

    Raises:
        IndexError: If a record's description has fewer than two
            whitespace-separated tokens.
    """
    # Open the file in a context manager so the handle is always closed,
    # even if parsing raises. The records are consumed inside the ``with``
    # block because the parser reads from the handle while it is iterated.
    with open(file_to_parse) as handle:
        all_species = {
            record.description.split()[1]
            for record in SeqIO.parse(handle, fmt)
        }

    # Sorting the set also converts it to a list.
    return sorted(all_species)


if __name__ == "__main__":
    # Demo run: list the distinct species named in the example orchid file.
    print("Using Bio.SeqIO on a FASTA file")
    all_species = extract_organisms(file_to_parse="ls_orchid.fasta", fmt="fasta")
    species_count = len(all_species)
    print("number of species: %i" % species_count)
    print("species names: %s" % all_species)