File: kb_collector.h

package info (click to toggle)
kmc 3.2.4%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,716 kB
  • sloc: cpp: 38,308; python: 664; makefile: 216; perl: 179; sh: 34
file content (120 lines) | stat: -rw-r--r-- 2,738 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/*
  This file is a part of KMC software distributed under GNU GPL 3 licence.
  The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
  
  Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
  
  Version: 3.2.4
  Date   : 2024-02-09
*/

#ifndef _KB_COLLECTOR_H
#define _KB_COLLECTOR_H

#include "defs.h"
#include "params.h"
#include "kmer.h"
#include "queues.h"
#include "radix.h"
#include "rev_byte.h"
#include <string>
#include <algorithm>
#include <numeric>
#include <array>
#include <vector>
#include <stdio.h>
using namespace std;

//----------------------------------------------------------------------------------
// Class collecting kmers belonging to a single bin.
// Extended k-mers (super k-mers) are handed over through PutExtendedKmer() and the
// collected data is released by Flush(); both are defined outside this header.
// NOTE(review): the member descriptions below are derived from the member names and
// from update_n_plus_x_recs() (defined in this header); confirm against the .cpp file.
class CKmerBinCollector
{
	// Pairs of (range, number of n_plus_x records in that range).
	list<pair<uint64, uint64>> expander_parts; //range, n_plus_x_recs_in_range
	// Presumably the n_plus_x_recs value and buffer position at the time the previous
	// expander part was recorded - TODO confirm. Both start at 0.
	uint64 prev_n_plus_x_recs = 0;
	uint64 prev_pos = 0;

	// Result of comparing the forward window with the reverse-complement window
	// in update_n_plus_x_recs().
	enum comparision_state  { kmer_smaller, rev_smaller, equals };
	// Presumably the identifier of the bin served by this collector (see _bin_no in the constructor).
	uint32 bin_no;
	// Non-owning pointers to shared objects - presumably taken from CKMCQueues in the constructor.
	CBinPartQueue *bin_part_queue;
	CBinDesc *bd;
	// Length of a k-mer in symbols; update_n_plus_x_recs() requires it to be at least 4.
	uint32 kmer_len;
	// Raw output buffer with its capacity and current write position (presumably in bytes).
	uchar* buffer;
	uint32 buffer_size;
	uint32 buffer_pos;

	// Presumably the number of super k-mers gathered for the current expander pack.
	uint32 super_kmer_no = 0;
	// Constant equal to 4096 (1 << 12); by its name, the upper limit of super k-mers per expander pack.
	const uint32 max_super_kmers_expander_pack = 1ul << 12; 
	
	// Presumably the memory pool the buffer is obtained from - TODO confirm.
	CMemoryPool *pmm_bins;
	// Counters; n_plus_x_recs is the one increased by update_n_plus_x_recs().
	uint32 n_recs;
	uint32 n_plus_x_recs;
	uint32 n_super_kmers;	
	uint32 max_x;

	// Presumably the number of bytes needed to store one k-mer.
	uint32 kmer_bytes;
	// Presumably true when k-mers are counted in canonical form (both strands) - TODO confirm.
	bool both_strands;

	// Adds to n_plus_x_recs the number of records estimated for the sequence seq of n symbols;
	// DIVIDE_FACTOR is the divisor applied to the length of each run accumulated in the scan.
	template<unsigned DIVIDE_FACTOR> void update_n_plus_x_recs(char* seq, uint32 n);

public:
	// Creates a collector for the bin _bin_no using a buffer of _buffer_size.
	CKmerBinCollector(CKMCQueues& Queues, CKMCParams& Params, uint32 _buffer_size, uint32 _bin_no);
	// Accepts one extended k-mer: seq points to its symbols, n is its length.
	void PutExtendedKmer(char* seq, uint32 n);	
	// Releases the data collected so far (defined outside this header).
	void Flush();
};

//---------------------------------------------------------------------------------
template<unsigned DIVIDE_FACTOR> void CKmerBinCollector::update_n_plus_x_recs(char* seq, uint32 n)
{
	uchar kmer, rev;
	uint32 kmer_pos = 4;
	uint32 rev_pos = kmer_len;
	uint32 x;

	kmer = (seq[0] << 6) + (seq[1] << 4) + (seq[2] << 2) + seq[3];
	rev = ((3 - seq[kmer_len - 1]) << 6) + ((3 - seq[kmer_len - 2]) << 4) + ((3 - seq[kmer_len - 3]) << 2) + (3 - seq[kmer_len - 4]);

	x = 0;
	comparision_state current_state, new_state;
	if (kmer < rev)
		current_state = kmer_smaller;
	else if (rev < kmer)
		current_state = rev_smaller;
	else
		current_state = equals;


	for (uint32 i = 0; i < n - kmer_len; ++i)
	{
		rev >>= 2;
		rev += (3 - seq[rev_pos++]) << 6;
		kmer <<= 2;
		kmer += seq[kmer_pos++];

		if (kmer < rev)
			new_state = kmer_smaller;
		else if (rev < kmer)
			new_state = rev_smaller;
		else
			new_state = equals;

		if (new_state == current_state)
		{
			if (current_state == equals)
				++n_plus_x_recs;
			else
				++x;
		}
		else
		{
			current_state = new_state;
			n_plus_x_recs += 1 + x / DIVIDE_FACTOR;

			x = 0;
		}
	}
	n_plus_x_recs += 1 + x / DIVIDE_FACTOR;
}

#endif

// ***** EOF