1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
|
# Copyright 2002 by Yves Bastide and Brad Chapman.
# Copyright 2007 by Sebastian Bassi
# All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Functions to calculate assorted sequence checksums."""
# crc32, crc64, gcg, and seguid
# crc64 is adapted from BioPerl
from __future__ import print_function
from binascii import crc32 as _crc32
from Bio._py3k import _as_bytes
def crc32(seq):
    """Return the crc32 checksum for a sequence (string or Seq object).

    The checksum is case sensitive:

    >>> crc32("ACGTACGTACGT")
    20049947
    >>> crc32("acgtACGTacgt")
    1688586483

    """
    # NOTE - Python 2 gives a signed int here, Python 3 an unsigned one.
    # The docs suggest using crc32(x) & 0xffffffff for consistency.
    # TODO - Should we return crc32(x) & 0xffffffff here?
    try:
        # Usual route: take the text form of the object and encode it
        payload = _as_bytes(str(seq))
    except AttributeError:
        # Fallback: hand the object itself to _as_bytes
        payload = _as_bytes(seq)
    return _crc32(payload)
def _init_table_h():
_table_h = []
for i in range(256):
l = i
part_h = 0
for j in range(8):
rflag = l & 1
l >>= 1
if part_h & 1:
l |= (1 << 31)
part_h >>= 1
if rflag:
part_h ^= 0xd8000000
_table_h.append(part_h)
return _table_h
# Initialisation: build the lookup table once, when the module is loaded,
# so that crc64 can index into it for every character it processes.
_table_h = _init_table_h()
def crc64(s):
    """Return the crc64 checksum for a sequence (string or Seq object).

    The result is the text "CRC-" followed by sixteen upper-case hex
    digits (the high 32-bit word, then the low 32-bit word).

    The checksum is case sensitive:

    >>> crc64("ACGTACGTACGT")
    'CRC-C4FBB762C4A87EBD'
    >>> crc64("acgtACGTacgt")
    'CRC-DA4509DC64A87EBD'

    """
    low = 0
    high = 0
    for char in s:
        # The table index must come from the low word *before* it is shifted
        table_index = (low ^ ord(char)) & 0xFF
        # The bottom byte of the high word moves to the top of the low word
        carried = (high & 0xFF) << 24
        low = (low >> 8) | carried
        high = (high >> 8) ^ _table_h[table_index]
    return "CRC-%08X%08X" % (high, low)
def gcg(seq):
    """Return the GCG checksum (int) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string), return the
    checksum used by the GCG program, as an int from 0 to 9999.

    Each character is converted to upper case before use, so the case is
    not important. Characters are weighted 1, 2, ..., 57 by position and
    the weights start again at 1 for the 58th character.

    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.

    >>> gcg("ACGTACGTACGT")
    5688
    >>> gcg("acgtACGTacgt")
    5688

    """
    # str() handles both plain strings and Seq objects and does not raise
    # AttributeError for either, so no fallback branch is needed here.
    text = str(seq)
    checksum = sum(
        (position % 57 + 1) * ord(char.upper())
        for position, char in enumerate(text)
    )
    return checksum % 10000
def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string), return its
    SEGUID string (a SEquence Globally Unique IDentifier): the SHA-1 digest
    of the upper-cased sequence, base64 encoded, with the trailing "="
    padding removed.

    Note that the case is not important:

    >>> seguid("ACGTACGTACGT")
    'If6HIvcnRSQDVNiAoefAzySc6i4'
    >>> seguid("acgtACGTacgt")
    'If6HIvcnRSQDVNiAoefAzySc6i4'

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032

    """
    import base64
    import hashlib

    # str() handles both plain strings and Seq objects and does not raise
    # AttributeError for either, so no fallback branch is needed here.
    text = str(seq)
    digest = hashlib.sha1(_as_bytes(text.upper())).digest()
    # b64encode emits no line breaks, so there are no newlines to strip.
    encoded = base64.b64encode(digest)
    if not isinstance(encoded, str):
        # Python 3: b64encode returns bytes, but the SEGUID is text.
        encoded = encoded.decode("ascii")
    return encoded.rstrip("=")
# When executed as a script (rather than imported), run the doctests
# embedded in the docstrings of this module.
if __name__ == "__main__":
    from Bio._utils import run_doctest
    run_doctest()
|