1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
|
/*++
Module Name:
Seed.h
Abstract:
Headers for code to handle seeds in the SNAP sequencer.
Authors:
Bill Bolosky, August, 2011
Environment:
User mode service.
Revision History:
Adapted from Matei Zaharia's Scala implementation.
--*/
#pragma once
#include "Compat.h"
#include "Tables.h"
#include "Util.h"
const unsigned LargestSeedSize = 32;
struct Seed {
//
// We exclude seeds with "N" in them. This checks for that (or other garbage).
//
static bool DoesTextRepresentASeed(const char *textBases, unsigned seedLen);
inline Seed(const char *textBases, unsigned seedLen)
{
bases = 0;
reverseComplement = 0;
for (unsigned i = 0; i < seedLen; i++) {
_uint64 encodedBase = BASE_VALUE[textBases[i]];
_ASSERT(255 != encodedBase);
bases |= encodedBase << ((seedLen - i - 1) * 2);
reverseComplement |= (encodedBase ^ 0x3) << (i * 2);
}
}
inline Seed() {}
inline Seed(_int64 i_bases, _int64 i_reverseComplement)
: bases(i_bases), reverseComplement(i_reverseComplement)
{
}
inline _uint64 getLowBases(unsigned keySizeInBytes) const { // Returns the lowest bases as an unsigned
if (keySizeInBytes == 8) {
return bases;
} else {
return bases & (((_uint64)1 << (keySizeInBytes * 8)) - 1);
}
}
inline unsigned getHighBases(unsigned keySizeInBytes) const { // Returns any high as an unsigned. If seedLen <= 16, returns 0.
if (keySizeInBytes == 8) {
return 0;
} else {
return (unsigned)(bases >> (keySizeInBytes * 8));
}
}
inline _uint64 getBases() const {
return bases;
}
inline _uint64 getRCBases() const {
return reverseComplement;
}
inline Seed operator~() const
//
// Compute the reverse complement of this. We just copy it from when our constructor ran.
//
{
Seed rc;
rc.bases = reverseComplement;
rc.reverseComplement = bases;
return rc;
}
inline bool isBiggerThanItsReverseComplement() const {
return bases > reverseComplement;
}
inline bool isOwnReverseComplement() const {
return bases == reverseComplement;
}
inline bool operator>(Seed &peer) const {
return bases > peer.bases;
}
inline bool operator>=(Seed &peer) const {
return bases >= peer.bases;
}
inline bool operator<(Seed &peer) const {
return bases < peer.bases;
}
inline bool operator<=(Seed &peer) const {
return bases <= peer.bases;
}
inline bool operator==(Seed &peer) const {
return bases == peer.bases;
}
inline bool operator!=(Seed &peer) const {
return bases != peer.bases;
}
inline void setBase(int i, int seedLen, int value) {
int shift = (seedLen - i - 1) * 2;
_int64 mask = (_int64) 3 << shift;
bases = (bases & ~mask) | (((_int64) value << shift) & mask);
int shift2 = i * 2;
_int64 mask2 = (_int64) 3 << shift2;
reverseComplement = (reverseComplement & ~mask2) | (((_int64) (value ^ 3) << shift2) & mask2);
}
inline int getBase(int i, int seedLen) {
return (int) (bases >> ((seedLen - i - 1) * 2)) & 3;
}
inline void shiftIn(int b, int seedLen) {
int shift = (seedLen - 1) * 2;
bases = ((bases << 2) & ~((_uint64) 3 << (shift + 2))) | (b & 3);
reverseComplement = ((_uint64) reverseComplement >> 2) | (((_uint64) (b ^ 3) & 3) << shift);
}
inline void toString(char* o_bases, int seedLength) {
for (int i = (seedLength - 1) * 2; i >= 0; i -= 2) {
*o_bases++ = VALUE_BASE[(bases >> i) & 3];
}
}
static Seed fromBases(_int64 bases, int seedLength);
inline _uint64 hash64()
{
return 1+util::hash64(min(bases, reverseComplement));
}
inline unsigned hash()
{
return (unsigned) hash64();
}
static _uint64 hash64(const char* sequence, int length)
{
if (length <= MaxBases) {
Seed s(sequence, length);
return s.hash64();
} else {
// string compare seq & reverse, hash smallest one
for (int i = 0; i < length / 2; i++) {
char c = sequence[i];
char r = COMPLEMENT[sequence[length - 1 - i]];
if (c < r) {
return util::hash(sequence, length);
} else if (r < c) {
char* rc = (char*) alloca(length);
util::toComplement(rc, sequence, length);
return util::hash64(rc, length);
}
}
return util::hash(sequence, length); // rc palindrome
}
}
static const int MaxBases = 32;
private:
_uint64 bases;
//
// Since we pretty much always compute the reverse complement of a seed, we just keep it
// here. That way we only execute the loop once: when the constructor runs.
//
_uint64 reverseComplement;
};
|