File: pyani_files.py

package info (click to toggle)
python-pyani 0.2.13-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 159,944 kB
  • sloc: python: 3,106; makefile: 86; sh: 30
file content (52 lines) | stat: -rw-r--r-- 1,664 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Copyright 2013-2017, The James Hutton Insitute
# Author: Leighton Pritchard
#
# This code is part of the pyani package, and is governed by its licence.
# Please see the LICENSE file that should have been included as part of
# this package.

"""Code to help handle files for average nucleotide identity calculations."""

import os

from Bio import SeqIO


# Get a list of FASTA files from the input directory
def get_fasta_files(dirname):
    """Returns a list of FASTA files in the passed directory

    - dirname - path to input directory
    """
    infiles = get_input_files(dirname, '.fasta', '.fas', '.fa', '.fna',
                              '.fsa_nt')
    return infiles


# Get list of FASTA files in a directory
def get_input_files(dirname, *ext):
    """Returns files in passed directory, filtered by extension.

    - dirname - path to input directory
    - *ext - list of arguments describing permitted file extensions
    """
    filelist = [f for f in os.listdir(dirname) if
                os.path.splitext(f)[-1] in ext]
    return [os.path.join(dirname, f) for f in filelist]


# Get lengths of input sequences
def get_sequence_lengths(fastafilenames):
    """Returns dictionary of sequence lengths, keyed by organism.

    Biopython's SeqIO module is used to parse all sequences in the FASTA
    file corresponding to each organism, and the total base count in each
    is obtained.

    NOTE: ambiguity symbols are not discounted.
    """
    tot_lengths = {}
    for fn in fastafilenames:
        tot_lengths[os.path.splitext(os.path.split(fn)[-1])[0]] = \
            sum([len(s) for s in SeqIO.parse(fn, 'fasta')])
    return tot_lengths