File: pdb_color.py

package info (click to toggle)
python-cogent 1.5.3-2
  • links: PTS, VCS
  • area: main
  • in suites: jessie, jessie-kfreebsd
  • size: 16,424 kB
  • ctags: 24,343
  • sloc: python: 134,200; makefile: 100; ansic: 17; sh: 10
file content (273 lines) | stat: -rw-r--r-- 9,697 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
#!/usr/bin/env python
from __future__ import division
from cogent.app.muscle_v38 import muscle_seqs
from cogent.app.util import get_tmp_filename
from cogent.parse.fasta import MinimalFastaParser
from numpy import array

__author__ = "Rob Knight"
__copyright__ = "Copyright 2007-2012, The Cogent Project"
__credits__ = ["Rob Knight", "Gavin Huttley", "Jeremy Widmann"]
__license__ = "GPL"
__version__ = "1.5.3"
__maintainer__ = "Rob Knight"
__email__ = "rob@spot.colorado.edu"
__status__ = "Production"

three_to_one = {'ALA':'A','CYS':'C','ASP':'D','GLU':'E', 'PHE':'F', 'GLY':'G',
    'HIS':'H','ILE':'I','LYS':'K','LEU':'L','FME':'M','MET':'M','MSE':'M',
    'ASN':'N','PRO':'P',
    'GLN':'Q','ARG':'R','SER':'S','THR':'T','SEC':'U','VAL':'V','TRP':'W',
    'TYR':'Y'}
nucleotides = {'A':'A','C':'C','G':'G','U':'U','T':'T','I':'I',\
    '1MA':'A','2MA':'A','SRA':'A','MIA':'A','MAD':'A','A2M':'A','AVC':'A',
    'PPU':'A','AET':'A','A23':'A','5AA':'A','T6A':'A','MTU':'A','LCA':'A',
    '5MC':'C','CCC':'C','OMC':'C','CH':'C','10C':'C',
    '1MG':'G','QUO':'G','G7M':'G','GDP':'G','YG':'G','7MG':'G','OMG':'G',
    'M2G':'G','2MG':'G','GTP':'G','YYG':'G',
    'PSU':'U','H2U':'U','5MU':'U','4SU':'U','ONE':'U','+U':'U','DHU':'U',
    'IU':'U','SSU':'U','OMU':'U','UR3':'U','MNU':'U','5BU':'U','S4U':'U',
    'UMS':'U'
    }


def get_aligned_muscle(seq1,seq2):
    """Returns aligned sequences and frac_same using MUSCLE.

        This needs to be moved to the muscle app controller
    """
    outname = get_tmp_filename()
    res = muscle_seqs([seq1,seq2], add_seq_names=True, WorkingDir="/tmp", out_filename=outname)
    seq1_aligned,seq2_aligned =list(MinimalFastaParser(res['MuscleOut'].read()))
    res.cleanUp()
    del(res)
    seq1_aligned = seq1_aligned[1][1:]
    seq2_aligned = seq2_aligned[1][1:]
    frac_same = (array(seq1_aligned, 'c') == array(seq2_aligned, 'c')).sum(0)\
            / min(len(seq1), len(seq2))
    
    return seq1_aligned,seq2_aligned,frac_same

def get_chains(lines):
    """From list of lines in pdb records, returns dict {chain:[(pos,residue)]}.

    Keeps original 1-based numbering. All residues will be returned.
    """
    chains = {}
    last_resnum = None
    ter=False
    prev_chains=[]
    for line in lines:
        #skip if not an atom line
        if not (line.startswith('ATOM') or line.startswith('HETATM')):
            #If chain terminated, record chain.
            if line.startswith('TER'):
                ter=True
                prev_chains.append(chain)
            else:
                continue
        else:
            residue = line[17:20].strip()
            chain = line[21].strip()
            try:
                resnum = int(line[22:27].strip())
            #Some resnum columns have non-integer values
            except ValueError:
                resnum = line[22:27].strip()
            #Continue until next chain is found.
            if ter:
                if chain in prev_chains:
                    continue
                else:
                    ter=False
                    if chain not in chains:
                        chains[chain] = []
                    curr_chain = chains[chain]
            else:
                if chain not in chains:
                    chains[chain] = []
                curr_chain = chains[chain]

                if resnum != last_resnum:
                    curr_chain.append((resnum,residue))
                last_resnum = resnum
    return chains



def ungapped_to_pdb_numbers(chain_list):
    """From a chain list, map seq position -> res number."""
    return dict(enumerate([i[0]for i in chain_list]))

def chains_to_seqs(chains):
    """Returns sequences as an array of chars for each chain.
    
    Will concatenate all the residues that exist; it's the job of alignment
    or similar to align it with other seqs. Can get numbering 
    """
    result = {}
    chain_to_seq_type = {}
    for chain_id, residues in chains.items():
        seq_type = None
        curr_seq = []
        
        first_res = residues[0][1].strip()
        if first_res in three_to_one:
            seq_type = 'Protein'
        else:
            seq_type = 'Nucleotide'
            
        for res_id, seq in residues:
            seq = seq.strip()
            if seq_type == 'Nucleotide':
                curr_seq.append(nucleotides.get(seq, 'N'))
            else:
                curr_seq.append(three_to_one.get(seq, '?'))

        result[chain_id] = ''.join(curr_seq)
        chain_to_seq_type[chain_id]=seq_type
    return result, chain_to_seq_type

def get_best_muscle_hits(subject_seq, query_aln,threshold,use_shorter=True):
    """Returns subset of query_aln with alignment scores above threshold.
    
        - subject_seq is sequence aligned against query_aln seqs.
        - query_aln is dict or Alignment object with candidate seqs to be
        aligned with subject_seq.
        - threshold is an alignment score (fraction shared aligned length)
        which returned seqs must be above when aligned w/ subject_seq.
        - use_shorter (default=True) is to decide whether to use the length
        of the shorter sequence to calculate the alignment score.
    """
    keep={}
    #best = 0
    for query_label, query_seq in query_aln.items():
        subject_aligned, query_aligned, frac_same = \
            get_aligned_muscle(subject_seq,query_seq)
        #if frac_same > best:
        if frac_same > threshold: 
            keep[query_label]=query_seq
            #best=frac_same
    return keep

def get_matching_chains(subject_seq, pdb_lines,\
    subject_type='Protein',threshold=0.8):
    """Returns PDB chains that match subject_seq.
    
        - subject_seq must be a sequence string.
        - pdb_lines must be a list of lines from a PDB record.
        - subject_type must be the type of sequence that subject_seq is.  This
        is used to build blast database.
    """
    #Get PDB sequence info
    chains = get_chains(pdb_lines)
    #Get pdb numbering
    ungapped_to_pdb = {}
    for k,v in chains.items():
        ungapped_to_pdb[k]=ungapped_to_pdb_numbers(v)
    #Get pdb alignment and sequence types
    pdb_aln, pdb_types = chains_to_seqs(chains)
    
    #Get only sequences of subject_type
    pdb_matching = \
        dict([(k,pdb_aln[k]) for k,v in pdb_types.items() if v == subject_type])
    
    #if there is more than one chain in pdb_matching
    if len(pdb_matching) > 1:
        #Get best hits using MUSCLE.
        pdb_matching = \
            get_best_muscle_hits(subject_seq, pdb_matching, threshold)
    
    return pdb_matching, ungapped_to_pdb

def align_subject_to_pdb(subject_seq, pdb_matching):
    """Returns pairwise aligned subject_seq and pdb_matching alignment.
    
        - result will be a dict:
            {pdb_chain:(aligned_subject, aligned_pdb)}
    """
    result = {}
    for pdb_chain, pdb_seq in pdb_matching.items():
        subject_aligned, pdb_aligned,frac_same = \
            get_aligned_muscle(subject_seq, pdb_seq)
        result[pdb_chain]=(subject_aligned,pdb_aligned)
    
    return result


#####The following code must be in the .pml script:####
def iterate_blocks(seq, max_len=100):
    """Yields successive blocks up to max_len from seq."""
    curr = 0
    while curr < len(seq):
        yield seq[curr:curr+max_len]
        curr += max_len

def make_color_list(colors, prefix="color_"):
    """Makes list of colors, sequentially numbered after prefix."""
    return [(prefix+str(i+1), color) for i, color in enumerate(colors)]

def set_color_list(color_list):
    """Uses cmd to set all the items in a list of colors as named colors."""
    for name, color in color_list:
        cmd.set_color(name, color)

def set_seq_colors(colors, indices, chain_id):
    """Takes list of colors same length as seq, index mapping, and chain id."""
    for i, color in enumerate(colors):
        idx = indices[i].split('+')
        for block in iterate_blocks(idx):
            curr = '+'.join(block)
            cmd.color(color, "chain %s and resi %s" % (chain_id, curr))

def set_show_shapes(indices, chain_id, shape="sticks"):
    """Takes list of indices and a chain id and sets to shape.
    """
    for block in iterate_blocks(indices):
        str_indices = '+'.join(map(str,block))
        cmd.show(shape,"chain %s and resi %s" % (chain_id, str_indices))
    

#pymol coloring functions string:
PYMOL_FUNCTION_STRING = \
'''
def iterate_blocks(seq, max_len=100):
    """Yields successive blocks up to max_len from seq."""
    curr = 0
    while curr < len(seq):
        yield seq[curr:curr+max_len]
        curr += max_len

def make_color_list(colors, prefix="color_"):
    """Makes list of colors, sequentially numbered after prefix."""
    return [(prefix+str(i+1), color) for i, color in enumerate(colors)]

def set_color_list(color_list):
    """Uses cmd to set all the items in a list of colors as named colors."""
    for name, color in color_list:
        cmd.set_color(name, color)

def set_seq_colors(colors, indices, chain_id):
    """Takes list of colors same length as seq, index mapping, and chain id."""
    for i, color in enumerate(colors):
        idx = indices[i].split('+')
        for block in iterate_blocks(idx):
            curr = '+'.join(block)
            cmd.color(color, "chain %s and resi %s" % (chain_id, curr))

def set_show_shapes(indices, chain_id, shape="sticks"):
    """Takes list of indices and a chain id and sets to shape.
    """
    str_indices = "+".join(map(str,indices))
    cmd.show(shape,"chain %s and resi %s" % (chain_id, str_indices))

'''

MAIN_FUNCTION_STRING = \
'''
cmd.hide()
cmd.show("cartoon")
cmd.color("white")
    
'''