File: clustal_run.py

package info (click to toggle)

python-biopython 1.73+dfsg-1

links: PTS, VCS
area: main
in suites: buster
size: 57,852 kB
sloc: python: 169,977; xml: 97,539; ansic: 15,653; sql: 1,208; makefile: 159; sh: 63

file content (74 lines) | stat: -rw-r--r-- 2,288 bytes

#!/usr/bin/env python
# Copyright 2000 Brad Chapman.  All rights reserved.
#
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Run clustalw and parse the output.

Example code to show how to create a clustalw command line, run clustalw
and parse the results into an object that can be dealt with easily.
"""
# standard library

from __future__ import print_function

import sys
import subprocess

# biopython
from Bio.Alphabet import Gapped, IUPAC
from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio.SubsMat import FreqTable

# Build the ClustalW command line.
# This assumes the clustalw executable is somewhere on your PATH; if it is
# not, pass the full path of the executable to ClustalwCommandline via
# cmd="...".
cline = ClustalwCommandline(infile='opuntia.fasta', outfile='test.aln')

# Run the alignment. The command is handed over as a single string, so it
# goes through the shell everywhere except on Windows.
return_code = subprocess.call(str(cline), shell=(sys.platform != "win32"))

# Check the exit status explicitly instead of using ``assert``: assert
# statements are removed when Python runs with -O, which would let a failed
# ClustalW run slip through and leave the parsing step below reading a
# missing or stale output file.
if return_code != 0:
    raise RuntimeError("Calling ClustalW failed (return code %i)"
                       % return_code)

# Read the ClustalW output file back in as an alignment object
gapped_dna = Gapped(IUPAC.unambiguous_dna)
alignment = AlignIO.read("test.aln", "clustal", alphabet=gapped_dna)

print(alignment)

# show the description and sequence of the first record
for label, value in (('first description: %s', alignment[0].description),
                     ('first sequence: %s', alignment[0].seq)):
    print(label % value)

# report the length of the alignment
alignment_length = alignment.get_alignment_length()
print('length %i' % alignment_length)

print(alignment)

# summary object used for the statistics printed below
summary_align = AlignInfo.SummaryInfo(alignment)

# consensus sequence
consensus = summary_align.dumb_consensus()
print('consensus %s' % consensus)

# position specific score matrix, laid out along the consensus
# (a fresh chars_to_ignore list is passed to each call on purpose)
my_pssm = summary_align.pos_specific_score_matrix(
    consensus, chars_to_ignore=['N'])

print(my_pssm)

# expected base frequencies handed to the information content calculation
expect_freq = dict(A=.3, G=.2, T=.3, C=.2)

freq_table_info = FreqTable.FreqTable(
    expect_freq, FreqTable.FREQ, IUPAC.unambiguous_dna)

# information content, called with 5 and 30 as the first two arguments
info_content = summary_align.information_content(
    5, 30, chars_to_ignore=['N'], e_freq_table=freq_table_info)

print("relative info content: %f" % info_content)