File: __init__.py

package info (click to toggle)
python-biopython 1.42-2
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 17,584 kB
  • ctags: 12,272
  • sloc: python: 80,461; xml: 13,834; ansic: 7,902; cpp: 1,855; sql: 1,144; makefile: 203
file content (352 lines) | stat: -rw-r--r-- 13,279 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
from Numeric import *
from cluster import *


def _treesort(order, nodeorder, nodecounts, tree):
  nNodes = len(tree)
  nElements = nNodes + 1
  neworder = zeros(nElements,'d')
  clusterids = range(nElements)
  for i in range(nNodes):
    i1 = tree[i].left
    i2 = tree[i].right
    if i1 < 0:
      order1 = nodeorder[-i1-1]
      count1 = nodecounts[-i1-1]
    else:
      order1 = order[i1]
      count1 = 1
    if i2 < 0:
      order2 = nodeorder[-i2-1]
      count2 = nodecounts[-i2-1]
    else:
      order2 = order[i2]
      count2 = 1
    # If order1 and order2 are equal, their order is determined by the order in which they were clustered
    if i1 < i2:
      if order1 < order2:
        increase = count1
      else:
        increase = count2
      for j in range(nElements):
        clusterid = clusterids[j]
        if clusterid==i1 and order1>=order2: neworder[j] += increase
        if clusterid==i2 and order1<order2: neworder[j] += increase
        if clusterid==i1 or clusterid==i2: clusterids[j] = -i-1
    else:
      if order1<=order2:
        increase = count1
      else:
        increase = count2
      for j in range(nElements):
        clusterid = clusterids[j]
        if clusterid==i1 and order1>order2: neworder[j] += increase
        if clusterid==i2 and order1<=order2: neworder[j] += increase
        if clusterid==i1 or clusterid==i2: clusterids[j] = -i-1
  return argsort(neworder)

def _savetree(jobname, tree, order, transpose):
  if transpose==0:
    extension = ".gtr"
    keyword = "GENE"
  else:
    extension = ".atr"
    keyword = "ARRY"
  nnodes = len(tree)
  outputfile = open(jobname+extension, "w");
  nodeindex = 0
  nodeID = [''] * (nnodes)
  nodecounts = zeros(nnodes)
  nodeorder = zeros(nnodes,'d')
  nodedist = array([node.distance for node in tree])
  for nodeindex in range(nnodes):
    min1 = tree[nodeindex].left
    min2 = tree[nodeindex].right
    nodeID[nodeindex] = "NODE%dX" % (nodeindex+1)
    outputfile.write(nodeID[nodeindex])
    outputfile.write("\t")
    if min1 < 0:
      index1 = -min1-1
      order1 = nodeorder[index1]
      counts1 = nodecounts[index1]
      outputfile.write(nodeID[index1]+"\t")
      nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index1])
    else:
      order1 = order[min1]
      counts1 = 1
      outputfile.write("%s%dX\t" % (keyword, min1))
    if min2 < 0:
      index2 = -min2-1
      order2 = nodeorder[index2]
      counts2 = nodecounts[index2]
      outputfile.write(nodeID[index2]+"\t")
      nodedist[nodeindex] = max(nodedist[nodeindex],nodedist[index2])
    else:
      order2 = order[min2];
      counts2 = 1;
      outputfile.write("%s%dX\t" % (keyword, min2))
    outputfile.write(str(1.0-nodedist[nodeindex]))
    outputfile.write("\n")
    nodecounts[nodeindex] = counts1 + counts2
    nodeorder[nodeindex] = (counts1*order1+counts2*order2) / (counts1+counts2)
  outputfile.close()
  # Now set up order based on the tree structure
  index = _treesort(order, nodeorder, nodecounts, tree)
  return index

class DataFile:
  """DataFile reads a file containing gene expression data following
     Michael Eisen's format for Cluster/TreeView. A DataFile object
     has the following members:
data:     a matrix containing the gene expression data
mask:     a matrix containing only 1's and 0's, denoting which values are
          present (1) or missing (0). If all elements of mask are one
          (no missing data), then None is returned instead of the mask
geneid:   a list containing a unique identifier for each gene
          (e.g., ORF name)
genename: a list containing an additional description for each gene
          (e.g., gene name)
gweight:  the weight to be used for each gene when calculating the distance
gorder:   an array of real numbers indicating the preferred order of the
          genes in the output file
expid:    a list containing a unique identifier for each experimental
          condition
eweight:  the weight to be used for each experimental condition when
          calculating the distance
eorder:   an array of real numbers indication the preferred order in the
          output file of the experimental conditions
uniqid:   the string that was used instead of UNIQID in the input file."""
  def __init__(self, filename=None):
    """Reads a data file in the format corresponding to Michael Eisen's Cluster/TreeView program, and stores the data in a DataFile object"""
    self.data = None
    self.mask = None
    self.geneid = None
    self.genename = None
    self.gweight = None
    self.gorder = None
    self.expid = None
    self.eweight = None
    self.eorder = None
    self.uniqid = None
    if not filename: return
    inputfile = open(filename)
    lines = inputfile.readlines()
    inputfile.close()
    lines = [line.strip("\r\n").split("\t") for line in lines]
    line = lines[0]
    n = len(line)
    self.uniqid = line[0]
    self.expid = []
    cols = {0: "GENEID"}
    for word in line[1:]:
      if word=="NAME":
        cols[line.index(word)] = word
        self.genename = []
      elif word=="GWEIGHT":
        cols[line.index(word)] = word
        self.gweight = []
      elif word=="GORDER":
        cols[line.index(word)] = word
        self.gorder = []
      else: self.expid.append(word)
    self.geneid = []
    self.data = []
    self.mask = []
    needmask = 0
    for line in lines[1:]:
      assert len(line)==n, "Line with %d columns found (expected %d)" % (len(line), n)
      if line[0]=="EWEIGHT":
        i = max(cols) + 1
        self.eweight = map(float, line[i:])
        continue
      if line[0]=="EORDER":
        i = max(cols) + 1
        self.eorder = map(float, line[i:])
        continue
      rowdata = []
      rowmask = []
      n = len(line)
      for i in range(n):
        word = line[i]
        if i in cols:
          if cols[i]=="GENEID": self.geneid.append(word)
          if cols[i]=="NAME": self.genename.append(word)
          if cols[i]=="GWEIGHT": self.gweight.append(float(word))
          if cols[i]=="GORDER": self.gorder.append(float(word))
          continue
        if not word:
          rowdata.append(0.0)
          rowmask.append(0)
          needmask = 1
        else:
          rowdata.append(float(word))
          rowmask.append(1)
      self.data.append(rowdata)
      self.mask.append(rowmask)
    self.data = array(self.data)
    if needmask: self.mask = array(self.mask)
    else: self.mask = None
    if self.gweight: self.gweight = array(self.gweight)
    if self.gorder: self.gorder = array(self.gorder)

  def treecluster(self, transpose=0, method='m', dist='e'):
    if transpose==0: weight = self.eweight
    else: weight = self.gweight
    return treecluster(self.data, self.mask, weight, transpose, method, dist)

  def kcluster(self, nclusters=2, transpose=0, npass=1, method='a', dist='e', initialid=None):
    if transpose==0: weight = self.eweight
    else: weight = self.gweight
    clusterid, error, nfound = kcluster(self.data, nclusters, self.mask, weight, transpose, npass, method, dist, initialid)
    return clusterid, error, nfound

  def somcluster(self, transpose=0, nxgrid=2, nygrid=1, inittau=0.02, niter=1, dist='e'):
    if transpose==0: weight = self.eweight
    else: weight = self.gweight
    clusterid, celldata = somcluster(self.data, self.mask, weight, transpose, nxgrid, nygrid, inittau, niter, dist)
    return clusterid, celldata

  def clustercentroids(self, clusterid=None, method='a', transpose=0):
    cdata, cmask = clustercentroids(self.data, self.mask, clusterid, method, transpose)
    return cdata, cmask

  def clusterdistance(self, index1=[0], index2=[0], method='a', dist='e', transpose=0):
    if transpose==0: weight = self.eweight
    else: weight = self.gweight
    return clusterdistance(self.data, self.mask, weight, index1, index2, method, dist, transpose)

  def distancematrix(self, transpose=0, dist='e'):
    if transpose==0: weight = self.eweight
    else: weight = self.gweight
    return distancematrix(self.data, self.mask, weight, transpose, dist)

  def save(self, jobname, geneclusters=None, expclusters=None):
    """save(jobname, geneclusters=None, expclusters=None)
saves the clustering results. The saved files follow the convention for
Java TreeView program, which can therefore be used to view the clustering
result.
Arguments:
jobname:   The base name of the files to be saved. The filenames are
           jobname.cdt, jobname.gtr, and jobname.atr for hierarchical
           clustering, and jobname-K*.cdt, jobname-K*.kgg, jobname-K*.kag
           for k-means clustering results
geneclusters=None:  For hierarchical clustering results, geneclusters
           is an (ngenes-1 x 2) array that describes the hierarchical
           clustering result for genes. This array can be calculated
           by the hierarchical clustering methods implemented in
           treecluster.
           For k-means clustering results, geneclusters is a vector
           containing ngenes integers, describing to which cluster a
           given gene belongs. This vector can be calculated by kcluster.
expclusters=None:  For hierarchical clustering results, expclusters
           is an (nexps-1 x 2) array that describes the hierarchical
           clustering result for experimental conditions. This array can
           be calculated by the hierarchical clustering methods implemented
           in treecluster.
           For k-means clustering results, expclusters is a vector
           containing nexps integers, describing to which cluster a
           given experimental condition belongs. This vector can be
           calculated by kcluster.
"""
    (ngenes,nexps) = shape(self.data)
    if self.gorder==None: gorder = arange(ngenes)
    else: gorder = self.gorder
    if self.eorder==None: eorder = arange(nexps)
    else: eorder = self.eorder
    if geneclusters and expclusters:
      assert type(geneclusters)==type(expclusters), "found one k-means and one hierarchical clustering solution in geneclusters and expclusters"
    gid = 0
    aid = 0
    filename = jobname
    postfix = ""
    if type(geneclusters)==Tree:
      # Hierarchical clustering result
      geneindex = _savetree(jobname, geneclusters, gorder, 0)
      gid = 1
    elif geneclusters:
      # k-means clustering result
      filename = jobname + "_K"
      k = max(geneclusters+1)
      kggfilename = "%s_K_G%d.kgg" % (jobname, k)
      geneindex = self._savekmeans(kggfilename, geneclusters, gorder, 0)
      postfix = "_G%d" % k
    else:
      geneindex = argsort(gorder)
    if type(expclusters)==Tree:
      # Hierarchical clustering result
      expindex = _savetree(jobname, expclusters, eorder, 1)
      aid = 1
    elif expclusters:
      # k-means clustering result
      filename = jobname + "_K"
      k = max(expclusters+1)
      kagfilename = "%s_K_A%d.kag" % (jobname, k)
      expindex = self._savekmeans(kagfilename, expclusters, eorder, 1)
      postfix += "_A%d" % k
    else:
      expindex = argsort(eorder)
    filename = filename + postfix
    self._savedata(filename,gid,aid,geneindex,expindex)

  def _savekmeans(self, filename, clusterids, order, transpose):
    if transpose==0:
      label = self.uniqid
      names = self.geneid
    else:
      label = "ARRAY"
      names = self.expid
    outputfile = open(filename, "w");
    if not outputfile: raise "Error: Unable to open output file"
    outputfile.write(label + "\tGROUP\n")
    index = argsort(order)
    n = len(names)
    sortedindex = zeros(n)
    counter = 0
    cluster = 0
    while counter < n:
      for j in index:
        if clusterids[j]==cluster:
          outputfile.write("%s\t%s\n" % (names[j], cluster))
          sortedindex[counter] = j
          counter+=1
      cluster+=1
    outputfile.close();
    return index

  def _savedata(self, jobname, gid, aid, geneindex, expindex):
    if self.genename==None: genename = self.geneid
    else: genename = self.genename
    (ngenes, nexps) = shape(self.data)
    outputfile = open(jobname+'.cdt', 'w')
    if not outputfile: return "Error: Unable to open output file"
    if self.mask: mask = self.mask
    else: mask = ones((ngenes,nexps))
    if self.gweight: gweight = self.gweight
    else: gweight = ones(ngenes)
    if self.eweight: eweight = self.eweight
    else: eweight = ones(nexps)
    if gid: outputfile.write ('GID\t')
    outputfile.write(self.uniqid)
    outputfile.write('\tNAME\tGWEIGHT')
    # Now add headers for data columns
    for j in expindex: outputfile.write('\t%s' % self.expid[j])
    outputfile.write('\n')
    if aid:
      outputfile.write("AID")
      if gid: outputfile.write('\t')
      outputfile.write("\t\t")
      for j in expindex: outputfile.write ('\tARRY%dX' % j)
      outputfile.write('\n')
    outputfile.write('EWEIGHT')
    if gid: outputfile.write('\t')
    outputfile.write('\t\t')
    for j in expindex: outputfile.write('\t%f' % eweight[j])
    outputfile.write('\n')
    for i in geneindex:
      if gid: outputfile.write('GENE%dX\t' % i)
      outputfile.write("%s\t%s\t%f" % (self.geneid[i], genename[i], gweight[i]))
      for j in expindex:
        outputfile.write('\t')
        if mask[i][j]: outputfile.write(str(self.data[i][j]))
      outputfile.write('\n')
    outputfile.close()