File: kmer.h

package info (click to toggle)
fermi-lite 0.1-5
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, buster, sid
  • size: 652 kB
  • sloc: ansic: 5,157; makefile: 63; sh: 13
file content (106 lines) | stat: -rw-r--r-- 2,880 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#ifndef BFC_KMER_H
#define BFC_KMER_H

#include <stdint.h>

typedef struct {
	uint64_t x[4];
} bfc_kmer_t;

static inline void bfc_kmer_append(int k, uint64_t x[4], int c)
{ // IMPORTANT: 0 <= c < 4
	uint64_t mask = (1ULL<<k) - 1;
	x[0] = (x[0]<<1 | (c&1))  & mask;
	x[1] = (x[1]<<1 | (c>>1)) & mask;
	x[2] = x[2]>>1 | (1ULL^(c&1))<<(k-1);
	x[3] = x[3]>>1 | (1ULL^c>>1) <<(k-1);
}

static inline void bfc_kmer_change(int k, uint64_t x[4], int d, int c) // d-bp from the 3'-end of k-mer; 0<=d<k
{ // IMPORTANT: 0 <= c < 4
	uint64_t t = ~(1ULL<<d);
	x[0] = (uint64_t) (c&1)<<d | (x[0]&t);
	x[1] = (uint64_t)(c>>1)<<d | (x[1]&t);
	t = ~(1ULL<<(k-1-d));
	x[2] = (uint64_t)(1^(c&1))<<(k-1-d) | (x[2]&t);
	x[3] = (uint64_t)(1^ c>>1)<<(k-1-d) | (x[3]&t);
}

// Thomas Wang's integer hash functions. See <https://gist.github.com/lh3/59882d6b96166dfc3d8d> for a snapshot.
static inline uint64_t bfc_hash_64(uint64_t key, uint64_t mask)
{
	key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1;
	key = key ^ key >> 24;
	key = ((key + (key << 3)) + (key << 8)) & mask; // key * 265
	key = key ^ key >> 14;
	key = ((key + (key << 2)) + (key << 4)) & mask; // key * 21
	key = key ^ key >> 28;
	key = (key + (key << 31)) & mask;
	return key;
}

static inline uint64_t bfc_hash_64_inv(uint64_t key, uint64_t mask)
{
	uint64_t tmp;
 
	// Invert key = key + (key << 31)
	tmp = (key - (key << 31));
	key = (key - (tmp << 31)) & mask;
 
	// Invert key = key ^ (key >> 28)
	tmp = key ^ key >> 28;
	key = key ^ tmp >> 28;
 
	// Invert key *= 21
	key = (key * 14933078535860113213ull) & mask;
 
	// Invert key = key ^ (key >> 14)
	tmp = key ^ key >> 14;
	tmp = key ^ tmp >> 14;
	tmp = key ^ tmp >> 14;
	key = key ^ tmp >> 14;
 
	// Invert key *= 265
	key = (key * 15244667743933553977ull) & mask;
 
	// Invert key = key ^ (key >> 24)
	tmp = key ^ key >> 24;
	key = key ^ tmp >> 24;
 
	// Invert key = (~key) + (key << 21)
	tmp = ~key;
	tmp = ~(key - (tmp << 21));
	tmp = ~(key - (tmp << 21));
	key = ~(key - (tmp << 21)) & mask;
 
	return key;
}

static inline uint64_t bfc_kmer_hash(int k, const uint64_t x[4], uint64_t h[2])
{
	int t = k>>1, u = ((x[1]>>t&1) > (x[3]>>t&1)); // the middle base is always different
	uint64_t mask = (1ULL<<k) - 1, ret;
	h[0] = bfc_hash_64((x[u<<1|0] + x[u<<1|1]) & mask, mask);
	h[1] = bfc_hash_64(h[0] ^ x[u<<1|1], mask);
	ret = (h[0] ^ h[1]) << k | ((h[0] + h[1]) & mask);
	h[0] = (h[0] + h[1]) & mask;
	return ret;
}

static inline void bfc_kmer_hash_inv(int k, const uint64_t h[2], uint64_t y[2])
{
	uint64_t mask = (1ULL<<k) - 1, t = (h[0] - h[1]) & mask;
	y[1] = bfc_hash_64_inv(h[1], mask) ^ t;
	y[0] = (bfc_hash_64_inv(t, mask) - y[1]) & mask;
}

static inline char *bfc_kmer_2str(int k, const uint64_t y[2], char *buf)
{
	int l;
	for (l = 0; l < k; ++l)
		buf[k - 1 - l] = "ACGT"[(y[1]>>l&1)<<1 | (y[0]>>l&1)];
	buf[k] = 0;
	return buf;
}

#endif