File: kclust.py

package info (click to toggle)
openstructure 2.11.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 206,240 kB
  • sloc: cpp: 188,571; python: 36,686; ansic: 34,298; fortran: 3,275; sh: 312; xml: 146; makefile: 29
file content (152 lines) | stat: -rw-r--r-- 4,490 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import subprocess, os, tempfile
from ost import io, seq, settings

"""
Wrapper for kClust a protein sequence clustering algorithm

unpublished, but mentioned in:

Michael Remmert, Andreas Biegert, Andreas Hauser & Johannes Soeding
HHblits: lighting-fast iterative protein sequence searching by
HMM-HMM alignment
Nature Methods 9, 173-175 (2012)

source code can be downloaded from:

ftp://toolkit.genzentrum.lmu.de/pub/
"""

class cluster:

  """
  Holds the information of one cluster
  
  .. attribute:: sequences
    
    SequenceList containing all sequences of the cluster
  
  .. attribute:: representative_id
  
    a string, that contains the name of the representative sequence of the cluster
    as estimated by kClust. 
    
  .. attribute:: alignment

    alignment handle containing a multiple sequence alignment of all sequences of
    the cluster. Gets only calculated when binding is called with appropriate flag.
  """

  def __init__(self, sequences, representative_id):

    self.sequences=sequences
    self.representative_id=representative_id
    self.alignment=None

def _SetupFiles(sequences):

  # create temporary directory
  tmp_dir_name=tempfile.mkdtemp()
  io.SaveSequenceList(sequences, os.path.join(tmp_dir_name,'fastadb.fasta'))
  return tmp_dir_name

def _CleanupFiles(tmp_dir_name):

  import shutil
  shutil.rmtree(tmp_dir_name)

def _ParseOutput(tmp_dir_name):

  with open(os.path.join(tmp_dir_name,'headers.dmp'),'r') as f:
    header_data=f.readlines()
  with open(os.path.join(tmp_dir_name,'clusters.dmp'),'r') as f:
    cluster_data=f.readlines()
  sequences=io.LoadSequenceList(os.path.join(tmp_dir_name,'fastadb.fasta'))

  clusters=dict()
  header_mapper=dict()
  for line in header_data:
    header_mapper[int(line.split()[0])]=line.split()[1].strip().strip('>')

  #find numeric ids of the representatives of the clusters
  unique_representatives=list()
  for line in cluster_data[1:]:
    actual_cluster=int(line.split()[1])
    try:
      unique_representatives.index(actual_cluster)
    except:
      unique_representatives.append(actual_cluster)

  #assign every header to its corresponding cluster, where the
  #cluster id is given by the id of the representative of the cluster
  for idx in unique_representatives:
    clusters[idx]=seq.CreateSequenceList()
  for line in cluster_data[1:]:
    clusters[int(line.split()[1])].AddSequence(sequences.FindSequence(header_mapper[int(line.split()[0])]))

  #translate into final output

  res=list()
  for k, v in clusters.items():
    res.append(cluster(v, header_mapper[k]))

  return res
  
def _RunkClust(tmp_dir_name, clustering_thresh, create_alignments):

  bitscore=clustering_thresh*0.060269-0.68498

  executable=settings.Locate('kClust') 

  cmd=[]
  cmd.append(executable)
  cmd.append('-i')
  cmd.append(os.path.join(tmp_dir_name,'fastadb.fasta'))
  cmd.append('-d')
  cmd.append(tmp_dir_name)
  cmd.append('-s')
  cmd.append(str(bitscore))

  cmd=' '.join(cmd)
  ps=subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  stdout, stderr = ps.communicate()

  result=_ParseOutput(tmp_dir_name)

  if(create_alignments):
    from ost.bindings import clustalw
    for c in result:
      if len(c.sequences)>1:
        c.alignment=clustalw.ClustalW(c.sequences)
      else:
        aln=seq.CreateAlignment()
        aln.AddSequence(c.sequences[0])
        c.alignment=aln

  return result

def kClust(sequences, clustering_thresh=30, create_alignments=False):

  """
  Uses kClust to generate clusters of amino acid sequences.
  
  :param sequences: All sequences you want to cluster. 
  :type sequences: :class:`ost.seq.SequenceList`

  :param clustering_thres: Sequence identity threshold to build clusters. Note,
                           that clustering_thresh is more a rule of thumb, since
                           compositional bias in the sequence can also play a role.
                           The value gets transformed in a bitscore, that is used
                           as an input parameter of kClust 

  :param create_alignments: Flag, wether the alignments of the clusters get calculated.
                            Requires clustalw in the path.

  :returns: A list of cluster instances
  
  :raises: :class:`~ost.settings.FileNotFound` if kClust could not be located.
  """

  tmp_dir_name=_SetupFiles(sequences)
  result=_RunkClust(tmp_dir_name, clustering_thresh, create_alignments)
  _CleanupFiles(tmp_dir_name)
  return result