File: hhsearch.py

package info (click to toggle)
openstructure 2.11.1-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 206,256 kB
  • sloc: cpp: 188,571; python: 36,686; ansic: 34,298; fortran: 3,275; sh: 312; xml: 146; makefile: 29
file content (121 lines) | stat: -rw-r--r-- 3,586 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""
Author: Marco Biasini
"""

import re
from ost import seq

class HHSearchHit:
  def __init__(self, summary, alignment):
    self.summary=summary
    self.alignment=alignment

class HitSummary:
  def __init__(self, pdb_id, chain, prob, e_value, query_start, query_end, 
               template_start, template_end):
   self.prob=prob
   self.pdb_id=pdb_id
   self.chain=chain
   self.e_value=e_value   
   self.query_start=query_start
   self.query_end=query_end
   self.template_start=template_start
   self.template_end=template_end

class HHSearchResult:
  """
  Read HHSearch result file. The result is stored in a list of hh search hits.
  
  Usage:
  
  result=HHSearchResult('output.hhr')
  for hit in result.hits:
    print hit.pdb_id, hit.chain
    print hit.alignment.ToString(80)
  """
  def __init__(self, filename, pipe_separated=False):
    self.pipe_separated=pipe_separated    
    self._Read(filename)
  def _Read(self, filename):
    ifile=open(filename)
    self._ReadHeader(ifile)
    summaries=self._ReadHitSummaries(ifile)
    self.hits=self._ReadHitDetails(ifile, summaries)
    
  def _ReadHeader(self, ifile):
    header_lines=[]
    for line in ifile:
      stripped_line=line.strip()
      if stripped_line=='':
        break
      header_lines.append(stripped_line)
    for header_line in header_lines:
      var, value=re.split('\s+', header_line, 1)
      setattr(self, var.lower(), value)
      
  def _ReadHitSummaries(self, ifile):
    summary_lines=[]
    skip_header=True
    for line in ifile:
      if skip_header==True:
        skip_header=False
        continue

      stripped_line=line.strip()
      if stripped_line=='':
        break
      summary_lines.append(line)
    summaries=[]
    for summary_line in summary_lines:
      pdb_id, chain=(None, None)
      if self.pipe_separated:
        parts=summary_line[4:37].split('|')
        pdb_id=parts[1][:4]
        chain=parts[1][4]
      else:
        pdb_id=summary_line[4:8]
        chain=summary_line[9]
      prob=float(summary_line[36:40])
      e_value=0.0
      query_range=summary_line[76:84].split('-')
      query_start=int(query_range[0].strip())
      query_end=int(query_range[1].strip())
      template_range=summary_line[86:94].split('-')
      template_start=int(template_range[0].strip())
      template_end=int(template_range[1].strip())
      summaries.append(HitSummary(pdb_id, chain, prob, e_value, query_start,
                                  query_end, template_start, template_end))
    return summaries

  def _ReadHitDetails(self, ifile, summaries):
    hits=[]
    for summary in summaries:
      alignment=self._ReadHitDetail(ifile)
      hits.append(HHSearchHit(summary, alignment))
    return hits
  def _ReadHitDetail(self, ifile):
    skip_header=True
    q_seq, t_seq=('', '')
    for line in ifile:
      if skip_header==True:
        if line.startswith('>'):
          skip_header=False
        continue
      if line.startswith('No'):
        break
      if line.strip()=='':
        continue        
      if line.startswith('Q ss_pred') or line.startswith('Q Consensus'):
        continue
      if line.startswith('T ss_pred') or line.startswith('T Consensus'):
        continue
      if line.startswith(' '):
        continue
      if line.startswith('Q'):
        q_seq+=re.split('\s+', line)[3]
      if line.startswith('T'):
        t_seq+=re.split('\s+', line)[3]        
    ali=seq.AlignmentHandle()
    ali.AddSequence(seq.Sequence.FromString('query', q_seq))
    ali.AddSequence(seq.Sequence.FromString('target', t_seq))    
    return ali