File: clustalw.py

package info (click to toggle)
openstructure 2.11.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 206,240 kB
  • sloc: cpp: 188,571; python: 36,686; ansic: 34,298; fortran: 3,275; sh: 312; xml: 146; makefile: 29
file content (148 lines) | stat: -rw-r--r-- 5,618 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from ost.bindings import utils
from ost import settings, io, seq, LogError
import os
import subprocess

def ClustalW(seq1, seq2=None, clustalw=None, keep_files=False, nopgap=False, 
             clustalw_option_string=False):
  '''
  Runs a ClustalW multiple sequence alignment. The results are returned as a
  :class:`~ost.seq.AlignmentHandle` instance.
  
  There are two ways to use this function:
  
   - align exactly two sequences:
   
      :param seq1: sequence_one
      :type seq1: :class:`~ost.seq.SequenceHandle` or :class:`str`
      
      :param seq2: sequence_two
      :type seq2: :class:`~ost.seq.SequenceHandle` or :class:`str`
  
      The two sequences can be specified as two separate function parameters 
      (`seq1`, `seq2`). The type of both parameters can be either
      :class:`~ost.seq.SequenceHandle` or :class:`str`, but must be the same for
      both parameters.
      
   - align two or more sequences:
   
      :param seq1: sequence_list
      :type seq1: :class:`~ost.seq.SequenceList`
      
      :param seq2: must be :class:`None`
      
      Two or more sequences can be specified by using a
      :class:`~ost.seq.SequenceList`. It is then passed as the first function 
      parameter (`seq1`). The second parameter (`seq2`) must be :class:`None`.
      
       
  :param clustalw: path to ClustalW executable (used in :func:`~ost.settings.Locate`)
  :type clustalw: :class:`str`
  :param nopgap: turn residue-specific gaps off
  :type nopgap: :class:`bool`
  :param clustalw_option_string: additional ClustalW flags (see http://www.clustal.org/download/clustalw_help.txt)
  :type clustalw_option_string: :class:`str`
  :param keep_files: do not delete temporary files
  :type keep_files: :class:`bool`

  .. note ::
   
    - In the passed sequences ClustalW will convert lowercase to uppercase, and
      change all '.' to '-'. OST will convert and '?' to 'X' before aligning
      sequences with ClustalW.
    - If a :attr:`sequence name <ost.seq.SequenceHandle.name>` contains spaces,
      only the part before the space is considered as sequence name. To avoid
      surprises, you should remove spaces from the sequence name.
    - Sequence names must be unique (:class:`ValueError` exception raised
      otherwise).

  ClustalW will accept only IUB/IUPAC amino acid and nucleic acid codes:

  ======= ======================= ======= ============================ 
  Residue  Name                   Residue  Name 
  ======= ======================= ======= ============================
     A    alanine                    P    proline
     B    aspartate or asparagine    Q    glutamine
     C    cystine                    R    arginine
     D    aspartate                  S    serine
     E    glutamate                  T    threonine
     F    phenylalanine              U    selenocysteine
     G    glycine                    V    valine
     H    histidine                  W    tryptophan
     I    isoleucine                 Y    tyrosine
     K    lysine                     Z    glutamate or glutamine
     L    leucine                    X    any
     M    methionine                 \\*   translation stop
     N    asparagine                 \\-   gap of indeterminate length
  ======= ======================= ======= ============================ 

  '''
  clustalw_path=settings.Locate(('clustalw', 'clustalw2'), 
                                explicit_file_name=clustalw)
  
  if seq2!=None:
    if isinstance(seq1, seq.SequenceHandle) and isinstance(seq2, seq.SequenceHandle):
      seq_list=seq.CreateSequenceList()
      seq_list.AddSequence(seq1)
      seq_list.AddSequence(seq2)
    elif isinstance(seq1, str) and isinstance(seq2, str):
      seqh1=seq.CreateSequence("seq1", seq1)
      seqh2=seq.CreateSequence("seq2", seq2)
      seq_list=seq.CreateSequenceList()
      seq_list.AddSequence(seqh1)
      seq_list.AddSequence(seqh2)
    else:
      LogError("WARNING: Specify at least two Sequences")
      return
  elif isinstance(seq1, seq.SequenceList):
    seq_list=seq1
  else:
    LogError("WARNING: Specify either two SequenceHandles or one SequenceList")
    return

  sequence_names = set()
  for s in seq_list:
    # we cut out anything after a space to be consistent with ClustalW behaviour
    sequence_names.add(s.GetName().split(' ')[0])
  if len(sequence_names) < len(seq_list):
    raise ValueError("ClustalW can only process sequences with unique identifiers!")


  new_list = seq.CreateSequenceList()
  for s in seq_list:
    ss = s.Copy()
    for i,c in enumerate(ss):
      if c=='?':
        ss[i]='X'
    new_list.AddSequence(ss)

  seq_list = new_list

  
  temp_dir=utils.TempDirWithFiles((seq_list,))
  out=os.path.join(temp_dir.dirname, 'out.fasta')
  command='%s -infile="%s" -output=fasta -outfile="%s"' % (clustalw_path,
                                                           temp_dir.files[0],
                                                           out)
  if nopgap:
    command+=" -nopgap"
  if clustalw_option_string!=False:
    command=command+" "+clustalw_option_string  #see useful flags: http://toolkit.tuebingen.mpg.de/clustalw/help_params

  subprocess.run(command, shell=True, stdout=subprocess.DEVNULL) 

  aln=io.LoadAlignment(out)
  

  for sequence in seq_list:
    for seq_num,aln_seq in enumerate(aln.sequences):
      if aln_seq.GetName()==sequence.GetName():
        break
    aln.SetSequenceOffset(seq_num,sequence.offset)
    if sequence.HasAttachedView():
      aln.AttachView(seq_num,sequence.GetAttachedView().Copy())

  if not keep_files:
    temp_dir.Cleanup()
  
  return aln