1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
|
from ost.bindings import utils
from ost import settings, io, seq, LogError
import os
import subprocess
def ClustalW(seq1, seq2=None, clustalw=None, keep_files=False, nopgap=False,
clustalw_option_string=False):
'''
Runs a ClustalW multiple sequence alignment. The results are returned as a
:class:`~ost.seq.AlignmentHandle` instance.
There are two ways to use this function:
- align exactly two sequences:
:param seq1: sequence_one
:type seq1: :class:`~ost.seq.SequenceHandle` or :class:`str`
:param seq2: sequence_two
:type seq2: :class:`~ost.seq.SequenceHandle` or :class:`str`
The two sequences can be specified as two separate function parameters
(`seq1`, `seq2`). The type of both parameters can be either
:class:`~ost.seq.SequenceHandle` or :class:`str`, but must be the same for
both parameters.
- align two or more sequences:
:param seq1: sequence_list
:type seq1: :class:`~ost.seq.SequenceList`
:param seq2: must be :class:`None`
Two or more sequences can be specified by using a
:class:`~ost.seq.SequenceList`. It is then passed as the first function
parameter (`seq1`). The second parameter (`seq2`) must be :class:`None`.
:param clustalw: path to ClustalW executable (used in :func:`~ost.settings.Locate`)
:type clustalw: :class:`str`
:param nopgap: turn residue-specific gaps off
:type nopgap: :class:`bool`
:param clustalw_option_string: additional ClustalW flags (see http://www.clustal.org/download/clustalw_help.txt)
:type clustalw_option_string: :class:`str`
:param keep_files: do not delete temporary files
:type keep_files: :class:`bool`
.. note ::
- In the passed sequences ClustalW will convert lowercase to uppercase, and
change all '.' to '-'. OST will convert and '?' to 'X' before aligning
sequences with ClustalW.
- If a :attr:`sequence name <ost.seq.SequenceHandle.name>` contains spaces,
only the part before the space is considered as sequence name. To avoid
surprises, you should remove spaces from the sequence name.
- Sequence names must be unique (:class:`ValueError` exception raised
otherwise).
ClustalW will accept only IUB/IUPAC amino acid and nucleic acid codes:
======= ======================= ======= ============================
Residue Name Residue Name
======= ======================= ======= ============================
A alanine P proline
B aspartate or asparagine Q glutamine
C cystine R arginine
D aspartate S serine
E glutamate T threonine
F phenylalanine U selenocysteine
G glycine V valine
H histidine W tryptophan
I isoleucine Y tyrosine
K lysine Z glutamate or glutamine
L leucine X any
M methionine \\* translation stop
N asparagine \\- gap of indeterminate length
======= ======================= ======= ============================
'''
clustalw_path=settings.Locate(('clustalw', 'clustalw2'),
explicit_file_name=clustalw)
if seq2!=None:
if isinstance(seq1, seq.SequenceHandle) and isinstance(seq2, seq.SequenceHandle):
seq_list=seq.CreateSequenceList()
seq_list.AddSequence(seq1)
seq_list.AddSequence(seq2)
elif isinstance(seq1, str) and isinstance(seq2, str):
seqh1=seq.CreateSequence("seq1", seq1)
seqh2=seq.CreateSequence("seq2", seq2)
seq_list=seq.CreateSequenceList()
seq_list.AddSequence(seqh1)
seq_list.AddSequence(seqh2)
else:
LogError("WARNING: Specify at least two Sequences")
return
elif isinstance(seq1, seq.SequenceList):
seq_list=seq1
else:
LogError("WARNING: Specify either two SequenceHandles or one SequenceList")
return
sequence_names = set()
for s in seq_list:
# we cut out anything after a space to be consistent with ClustalW behaviour
sequence_names.add(s.GetName().split(' ')[0])
if len(sequence_names) < len(seq_list):
raise ValueError("ClustalW can only process sequences with unique identifiers!")
new_list = seq.CreateSequenceList()
for s in seq_list:
ss = s.Copy()
for i,c in enumerate(ss):
if c=='?':
ss[i]='X'
new_list.AddSequence(ss)
seq_list = new_list
temp_dir=utils.TempDirWithFiles((seq_list,))
out=os.path.join(temp_dir.dirname, 'out.fasta')
command='%s -infile="%s" -output=fasta -outfile="%s"' % (clustalw_path,
temp_dir.files[0],
out)
if nopgap:
command+=" -nopgap"
if clustalw_option_string!=False:
command=command+" "+clustalw_option_string #see useful flags: http://toolkit.tuebingen.mpg.de/clustalw/help_params
subprocess.run(command, shell=True, stdout=subprocess.DEVNULL)
aln=io.LoadAlignment(out)
for sequence in seq_list:
for seq_num,aln_seq in enumerate(aln.sequences):
if aln_seq.GetName()==sequence.GetName():
break
aln.SetSequenceOffset(seq_num,sequence.offset)
if sequence.HasAttachedView():
aln.AttachView(seq_num,sequence.GetAttachedView().Copy())
if not keep_files:
temp_dir.Cleanup()
return aln
|