File: references.hpp

package info (click to toggle)
sortmerna 4.3.7-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 134,048 kB
  • sloc: cpp: 24,424; ansic: 15,923; python: 1,453; sh: 224; makefile: 31
file content (97 lines) | stat: -rw-r--r-- 3,034 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
/*
 @copyright 2016-2021  Clarity Genomics BVBA
 @copyright 2012-2016  Bonsai Bioinformatics Research Group
 @copyright 2014-2016  Knight Lab, Department of Pediatrics, UCSD, La Jolla

 @parblock
 SortMeRNA - next-generation reads filter for metatranscriptomic or total RNA
 This is a free software: you can redistribute it and/or modify
 it under the terms of the GNU Lesser General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 SortMeRNA is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU Lesser General Public License for more details.

 You should have received a copy of the GNU Lesser General Public License
 along with SortMeRNA. If not, see <http://www.gnu.org/licenses/>.
 @endparblock

 @contributors Jenya Kopylova   jenya.kopylov@gmail.com
			   Laurent Noé      laurent.noe@lifl.fr
			   Pierre Pericard  pierre.pericard@lifl.fr
			   Daniel McDonald  wasade@gmail.com
			   Mikaël Salson    mikael.salson@lifl.fr
			   Hélène Touzet    helene.touzet@lifl.fr
			   Rob Knight       robknight@ucsd.edu
*/

/*
 * FILE: references.hpp
 * Created: Nov 06, 2017 Mon
 */

#pragma once

#include <cstdint>
#include <string>
#include <vector>
#include <algorithm>

#include "common.hpp" // Format, FASTA_HEADER_START, FASTQ_HEADER_START

// Forward declarations: this header only uses these types by reference
// (see References::load), so their full definitions are not required here.
class Refstats;
struct Runopts;

class References {
public:
	struct BaseRecord
	{
		size_t nid; // position of the sequence in the Reference file [0...'number of sequences in the ref.file - 1']
		std::string id; // ID from header
		std::string header;
		std::string sequence;
		std::string quality; // "" (fasta) | "xxx..." (fastq)
		BIO_FORMAT format; // FASTA | FATSQ
		bool isEmpty;
		BaseRecord(): isEmpty(true) {}
		void clear()
		{
			header.clear();
			sequence.clear();
			quality.clear();
			isEmpty = true;
		}

		std::string getId() {
			std::string id = header.substr(0, header.find(' '));
			id.erase(id.begin(), std::find_if(id.begin(), id.end(), [](auto ch) {return !(ch == FASTA_HEADER_START || ch == FASTQ_HEADER_START);}));
			return id;
		}
	};

	std::vector<BaseRecord> buffer; // Container for references TODO: change name?

	References(): num(0), part(0) {}
	//~References() {}

	void load(uint32_t idx_num, uint32_t idx_part, Runopts & opts, Refstats & refstats); // load references into the buffer given index number and index part
	void convert_fix(std::string & seq); // convert sequence to numberical form and fix ambiguous chars
	std::string convertChar(int idx); // convert numerical form to char string
	/*
	* For debugging needs.
	* find a reference index given a header.
	*/
	int findref(const std::string& id);
	void unload();

public:
	uint16_t num; // number of the reference file currently loaded
	uint16_t part; // part of the reference file currently loaded

//private:
//	bool load_for_search;
}; // ~class References