1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
|
package bbmin;
import java.util.Arrays;
import structures.LongHashSet;
import structures.LongList;
/**
* Generates an array of minimal hash codes (as positive 64-bit longs) for an input sequence.<br>
* The resulting array is guaranteed to contain the minimal hash code<br>
* for every window, with no duplicates.
* On average this is expected to yield 2*(L-K)/W hash codes for sequence length L and window size W.
*
* @author Brian Bushnell
* @date October 8, 2021
*
*/
public class Minimizer {
public static void main(String[] args){
int k=4, w=7;
String seq="ACGTCTGAGCCTTGACACATGACT";
try {
k=Integer.parseInt(args[0]);
w=Integer.parseInt(args[1]);
seq=args[2];
} catch (NumberFormatException e) {
//e.printStackTrace();
System.err.println("Usage: bbmin.Minimizer kmerlen window seq\n"
+ "E.G.\n"
+ "bbmin.Minimizer 4 7 ACGTCTGAGCCTTGACACATGACT");
System.exit(1);
}
Minimizer minnow=new Minimizer(k, w);
long[] array=minnow.minimize(seq.getBytes());
System.err.println(Arrays.toString(array));
}
public Minimizer(int k_, int window_){this(k_, window_, 2);}
public Minimizer(int k_, int window_, int bitsPerSymbol_){
k=k_;
window=window_;
bitsPerSymbol=bitsPerSymbol_;
shift=bitsPerSymbol*k;
shift2=shift-bitsPerSymbol;
mask=(shift>63 ? -1L : ~((-1L)<<shift));
}
public long[] minimize(String str){
return minimize(str.getBytes());
}
public long[] minimize(byte[] bases){
return minimize(bases, new LongList(16), new LongHashSet(16));
}
/** This method is typically faster since you don't need to construct a new set each time. */
public long[] minimize(byte[] bases, LongList list, LongHashSet set){
list.clear();
//If the set is way too big, resize it
if(set.capacity()*(long)window>100L+16L*bases.length){
set.resizeDestructive(16);
}else{
set.clear();
}
long kmersProcessed=0;
long kmer=0;
long rkmer=0;
int len=0;
long bestCode=Long.MAX_VALUE;
int bestPosition=-1;
long bestKmer=-1;
long bestRkmer=-1;
int currentWindow=0;
for(int i=0; i<bases.length; i++){
byte b=bases[i];
long x=baseToNumber[b];
long x2=baseToComplementNumber[b];
kmer=((kmer<<2)|x)&mask;
rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
if(x<0){
len=0;
rkmer=0;
}else{
len++;
}
if(len>=k){
kmersProcessed++;
currentWindow++;
final long hashcode=hash(kmer, rkmer);
System.err.println("i="+i+", code="+hashcode);
//Track the best code in the window and its state
if(hashcode>=minCode && hashcode<=bestCode){
bestCode=hashcode;
bestPosition=i;
bestKmer=kmer;
bestRkmer=rkmer;
}
//Once the window size is met, store the best code,
//and backtrack to its position to start the next window
if(currentWindow>=window && bestPosition>=0){
if(!set.contains(bestCode)){
set.add(bestCode);
list.add(bestCode);
}
i=bestPosition;
kmer=bestKmer;
rkmer=bestRkmer;
len=k;
bestCode=Long.MAX_VALUE;
bestPosition=-1;
currentWindow=0;
}
}
}
list.sort();//optional
return list.toArray();
}
public static long canon(long kmer, long rkmer){return max(kmer, rkmer);}
public static long hash(long kmer, long rkmer){return hash(canon(kmer, rkmer));}
public static long hash(long key) {
key = (~key) + (key << 21); // key = (key << 21) - key - 1;
key = key ^ (key >>> 24);
key = (key + (key << 3)) + (key << 8); // key * 265
key = key ^ (key >>> 14);
key = (key + (key << 2)) + (key << 4); // key * 21
key = key ^ (key >>> 28);
key = key + (key << 31);
return key;
}
private static final long max(long x, long y){return x>y ? x : y;}
public final int k;
public final int window;
public final int bitsPerSymbol; //2 for nucleotides, 5 for amino acids.
private final int shift;
private final int shift2;
private final long mask;
private final long minCode=0;
static final byte[] baseToNumber = new byte[128];
static final byte[] baseToComplementNumber = new byte[128];
static {
Arrays.fill(baseToNumber, (byte)-1);
Arrays.fill(baseToComplementNumber, (byte)-1);
baseToNumber['A']=baseToNumber['a']=baseToComplementNumber['T']=baseToComplementNumber['t']=0;
baseToNumber['C']=baseToNumber['c']=baseToComplementNumber['G']=baseToComplementNumber['c']=1;
baseToNumber['G']=baseToNumber['g']=baseToComplementNumber['C']=baseToComplementNumber['g']=2;
baseToNumber['T']=baseToNumber['t']=baseToComplementNumber['A']=baseToComplementNumber['a']=3;
}
}
|