File: splitter.h

package info (click to toggle)
kmc 3.2.4%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,716 kB
  • sloc: cpp: 38,308; python: 664; makefile: 216; perl: 179; sh: 34
file content (199 lines) | stat: -rw-r--r-- 5,747 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
/*
  This file is a part of KMC software distributed under GNU GPL 3 licence.
  The homepage of the KMC project is http://sun.aei.polsl.pl/kmc
  
  Authors: Sebastian Deorowicz, Agnieszka Debudaj-Grabysz, Marek Kokot
  
  Version: 3.2.4
  Date   : 2024-02-09
*/

#ifndef _SPLITTER_H
#define _SPLITTER_H

#include "defs.h"
#include "kmer.h"
#include "kb_collector.h"
#include "queues.h"
#include "s_mapper.h"
#include "../kmc_api/mmer.h"
#include <stdio.h>
#include <memory>
#include <vector>
#include "small_k_buf.h"
#include "bam_utils.h"

using namespace std;

//************************************************************************************************************
// CSplitter class - splits kmers into bins according to their signatures
//************************************************************************************************************
class CSplitter
{
	uint64 total_kmers = 0;	
	uchar *part;
	uint64_t part_size, part_pos;
	std::vector<std::unique_ptr<CKmerBinCollector>> bins;
	CBinPartQueue *bin_part_queue;
	CMemoryPool *pmm_reads;
	int64 mem_part_pmm_bins;
	int64 mem_part_pmm_reads;

	char codes[256];
	InputType file_type;
	bool both_strands;

	uint32_t curr_read_len = 0;

	uint32 kmer_len;
	//uint32 prefix_len;
	uint32 signature_len;
	uint32 n_bins;	
	uint64 n_reads;//for multifasta its a sequences counter	

	CSignatureMapper* s_mapper;

	bool homopolymer_compressed;

	CntHashEstimator* ntHashEstimator;

	bool GetSeqLongRead(char *seq, uint32 &seq_size, uchar header_marker);

	bool GetSeq(char *seq, uint32 &seq_size, ReadType read_type);

	void HomopolymerCompressSeq(char* seq, uint32 &seq_size);

public:
	static uint32 MAX_LINE_SIZE;
	
	CSplitter(CKMCParams &Params, CKMCQueues &Queues); 
	void InitBins(CKMCParams &Params, CKMCQueues &Queues);	
	void CalcStats(uchar* _part, uint64 _part_size, ReadType read_type, uint32* _stats);
	bool ProcessReadsOnlyEstimate(uchar* _part, uint64 _part_size, ReadType read_type);
	bool ProcessReads(uchar *_part, uint64 _part_size, ReadType read_type);
	template<typename COUNTER_TYPE> bool ProcessReadsSmallK(uchar *_part, uint64 _part_size, ReadType read_type, CSmallKBuf<COUNTER_TYPE>& small_k_buf);
	void Complete();
	inline void GetTotal(uint64 &_n_reads);
	inline uint64 GetTotalKmers();
};

//----------------------------------------------------------------------------------
// Return the number of reads processed by splitter
void CSplitter::GetTotal(uint64 &_n_reads)
{
	_n_reads = n_reads;
}

//----------------------------------------------------------------------------------
// Return the number of kmers processed by splitter (!!! only for small k optimization)
uint64 CSplitter::GetTotalKmers()
{
	return total_kmers;
}

//************************************************************************************************************
// CWSplitter class - wrapper for multithreading purposes
//************************************************************************************************************

//----------------------------------------------------------------------------------
class CWSplitter {
	CPartQueue *pq;
	CBinPartQueue *bpq;
	CMemoryPool *pmm_fastq;

	std::unique_ptr<CSplitter> spl;
	uint64 n_reads;

public:
	CWSplitter(CKMCParams &Params, CKMCQueues &Queues);	
	void operator()();
	void GetTotal(uint64 &_n_reads);
	~CWSplitter();
};

//************************************************************************************************************
// CWStatsSplitter class - wrapper for multithreading purposes
//************************************************************************************************************

//----------------------------------------------------------------------------------
class CWStatsSplitter {
	CStatsPartQueue *spq;
	CMemoryPool *pmm_fastq, *pmm_stats;
	uint32 *stats;
	std::unique_ptr<CSplitter> spl;
	uint32 signature_len;
	KMC::IProgressObserver* progressObserver;
public:
	CWStatsSplitter(CKMCParams &Params, CKMCQueues &Queues);
	~CWStatsSplitter();

	void operator()();
	void GetStats(uint32* _stats);
};


//************************************************************************************************************
// CWSmallKSplitter class - wrapper for multithreading purposes
//************************************************************************************************************
//----------------------------------------------------------------------------------
template <typename COUNTER_TYPE> class CWSmallKSplitter {
	CPartQueue *pq;	
	CMemoryPool *pmm_fastq, *pmm_small_k;	
	CSmallKBuf<COUNTER_TYPE> small_k_buf;

	std::unique_ptr<CSplitter> spl;
	uint64 n_reads;
	uint64 total_kmers;
	uint32 kmer_len;

public:
	CWSmallKSplitter(CKMCParams &Params, CKMCQueues &Queues);
	~CWSmallKSplitter();

	void operator()();
	void GetTotal(uint64 &_n_reads);

	CSmallKBuf<COUNTER_TYPE> GetResult()
	{
		return small_k_buf;
	}

	uint64 GetTotalKmers()
	{
		if (spl)
			return spl->GetTotalKmers();
		return total_kmers;
	}

	void Release()
	{
		pmm_small_k->free(small_k_buf.buf);
	}
};



//************************************************************************************************************
// CWEstimateOnlySplitter class - wrapper for multithreading purposes
//************************************************************************************************************

//----------------------------------------------------------------------------------
class CWEstimateOnlySplitter {
	CPartQueue* pq;
	CBinPartQueue* bpq;
	CMemoryPool* pmm_fastq;

	std::unique_ptr<CSplitter> spl;
	uint64 n_reads;

public:
	CWEstimateOnlySplitter(CKMCParams& Params, CKMCQueues& Queues);
	void operator()();
	void GetTotal(uint64& _n_reads);
	~CWEstimateOnlySplitter();
};



#endif

// ***** EOF