1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
|
"""
Colorspace conversion routines.
Inspired by agapython/util/Dibase.py from Corona lite,
but reimplemented to avoid licensing issues.
Encoding Table
A C G T
A 0 1 2 3
C 1 0 3 2
G 2 3 0 1
T 3 2 1 0
"""
import string
import sys
__author__ = 'Marcel Martin'

# Python 3 has no xrange; alias it to range so the name exists on both
# major versions. sys.version_info is compared numerically, unlike the
# sys.version string, whose lexicographic ordering is not a reliable
# version test.
if sys.version_info[0] >= 3:
    xrange = range
def _initialize_dicts():
    """
    Build the lookup tables used for colorspace conversion.

    Returns a tuple (enc, dec):
    enc maps a two-character pair of bases to a color character. Pairs of
    A/C/G/T map to '0'-'3' (XOR of the base indices); any pair that
    contains 'N' or '.' maps to '4'.
    dec maps a base followed by a color character to the next base. A
    leading 'N' or '.', or the color '4', decodes to 'N'.
    """
    bases = "ACGT"
    unknowns = "N."

    enc = {}
    for idx1, base1 in enumerate(bases):
        # Any pair with an unknown base on either side has no defined color.
        for unknown in unknowns:
            enc[unknown + base1] = '4'
            enc[base1 + unknown] = '4'
        for idx2, base2 in enumerate(bases):
            # The color of a pair is the XOR of the two base indices.
            enc[base1 + base2] = str(idx1 ^ idx2)
    for first in unknowns:
        for second in unknowns:
            enc[first + second] = '4'

    dec = {}
    for idx1, base1 in enumerate(bases):
        color = str(idx1)
        dec['.' + color] = 'N'
        dec['N' + color] = 'N'
        dec[base1 + '4'] = 'N'
        for idx2, base2 in enumerate(bases):
            # XOR is its own inverse: base1 followed by color (idx1 ^ idx2)
            # yields base2.
            dec[base1 + str(idx1 ^ idx2)] = base2
    dec['N4'] = 'N'

    return (enc, dec)
def encode(s):
    """
    Convert a sequence of nucleotides to color space.

    The result starts with the first nucleotide of s, followed by one
    color character for each pair of adjacent characters, looked up in
    ENCODE. Only uppercase characters are allowed. An empty s is returned
    unchanged. bytes input is accepted as long as ENCODE contains bytes
    keys.

    Raises KeyError if a pair of adjacent characters is not in ENCODE.

    >>> encode("ACGGTC")
    'A13012'
    """
    if not s:
        return s
    # Slice instead of indexing: on Python 3, indexing bytes yields an int,
    # which cannot be concatenated with the bytes values from ENCODE.
    first = s[0:1]
    colors = [ENCODE[s[i:i + 2]] for i in range(len(s) - 1)]
    # s[:0] is an empty sequence of the same type as s (str or bytes), so
    # the join works for both.
    return first + s[:0].join(colors)
def decode(s):
    """
    Decode a sequence of colors to nucleotide space.

    The first character in s must be a nucleotide; every following
    character is a color that, combined with the previously decoded
    nucleotide, is looked up in DECODE to give the next nucleotide.
    Only uppercase characters are allowed. Input shorter than two
    characters is returned unchanged. bytes input is accepted as long as
    DECODE contains bytes keys.

    Raises KeyError if a nucleotide/color combination is not in DECODE.

    >>> decode("A13012")
    'ACGGTC'
    """
    if len(s) < 2:
        return s
    # Use one-element slices instead of indexing/iterating: on Python 3,
    # those yield ints for bytes input, and int + int would not form a
    # valid DECODE key.
    current = s[0:1]
    decoded = [current]
    for i in range(1, len(s)):
        current = DECODE[current + s[i:i + 1]]
        decoded.append(current)
    # s[:0] is an empty sequence of the same type as s (str or bytes).
    return s[:0].join(decoded)
# Module-level lookup tables used by encode() and decode().
(ENCODE, DECODE) = _initialize_dicts()

# sys.version_info is compared numerically, unlike the sys.version string,
# whose lexicographic ordering is not a reliable version test.
if sys.version_info[0] >= 3:
    def _str_dict_to_bytes(d):
        """
        Return a copy of d with every key and value converted from str
        to ASCII-encoded bytes.
        """
        return dict((k.encode('ascii'), v.encode('ascii')) for k, v in d.items())

    # On Python 3, str and bytes are distinct types: add a bytes variant
    # of every entry so the tables can be queried with either.
    ENCODE.update(_str_dict_to_bytes(ENCODE))
    DECODE.update(_str_dict_to_bytes(DECODE))
|