File: FileRecordMgr.h

package info (click to toggle)
bedtools 2.27.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 54,804 kB
  • sloc: cpp: 38,072; sh: 7,307; makefile: 2,241; python: 163
file content (152 lines) | stat: -rw-r--r-- 4,451 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/*
 * FileRecordMgr.h
 *
 *  Created on: Nov 8, 2012
 *      Author: nek3d
 */

#ifndef FILERECORDMGR_H_
#define FILERECORDMGR_H_

#include <string>
#include "string.h"
#include <set>
//#include "DualQueue.h"

//include headers for all FileReader and derivative classes.
#include "BufferedStreamMgr.h"
#include "FileReader.h"
#include "SingleLineDelimTextFileReader.h"
#include "BamFileReader.h"

//record manager and all record classes
#include "RecordMgr.h"

#include "RecordKeyVector.h"
#include "BlockMgr.h"

class Record;
class NewGenomeFile;

class FileRecordMgr {
public:
	FileRecordMgr(const string & filename);
	virtual ~FileRecordMgr();
	bool open(bool inheader=false);
	void close();
	virtual bool eof();
	void setFileIdx(int fileIdx) { _fileIdx = fileIdx; }
	int getFileIdx() const { return _fileIdx; }


	//This is an all-in-one method to give the user a new record that is initialized with
	//the next entry in the data file.
	//NOTE!! User MUST pass back the returned pointer to deleteRecord method for cleanup!
	//Also Note! User must check for NULL returned, meaning we failed to get the next record.
	virtual Record *getNextRecord(RecordKeyVector *keyList = NULL);
	void deleteRecord(const Record *);
	virtual void deleteRecord(RecordKeyVector *keyList);



	const string &getFileName() const { return _filename;}
	bool hasHeader() const { return _fileReader->hasHeader(); }
	const string &getHeader() const { return _fileReader->getHeader(); }

	bool recordsHaveName() const {
		return _bufStreamMgr->getTypeChecker().recordTypeHasName(_recordType);
	}
	bool recordsHaveScore() const {
		return _bufStreamMgr->getTypeChecker().recordTypeHasScore(_recordType);
	}
	bool recordsHaveStrand() const {
		return _bufStreamMgr->getTypeChecker().recordTypeHasStrand(_recordType);
	}

	FileRecordTypeChecker::FILE_TYPE getFileType() const {
		return _fileType;
	}
	FileRecordTypeChecker::RECORD_TYPE getRecordType() const {
		return _recordType;
	}
	const string &getFileTypeName() const {
		return _bufStreamMgr->getTypeChecker().getFileTypeName();
	}

	const string &getRecordTypeName() const {
		return _bufStreamMgr->getTypeChecker().getRecordTypeName();
	}

	const BamTools::RefVector &getBamReferences();

	int getNumFields() const { return _fileReader->getNumFields(); }

	//File statistics
	unsigned long getTotalRecordLength() const { return _totalRecordLength; } //sum of length of all returned records
	unsigned long getTotalMergedRecordLength() const { return _totalMergedRecordLength; } // sum of all merged intervals



	//Setting the freeListBlockSize is optional. If the user never calls this,
	//the blockSize defaults to 512.
	void setFreeListBlockSize(int blockSize) { _freeListBlockSize = blockSize; }

	//special: For BAM files, our default is to not use all the
	//tag information in a BAM file, which reduces the run time in some
	//cases by more than 50%. But setting this method to true
	//will use all the tags, if more information is desired.
	//MUST BE CALLED BEFORE THE BAM FILE IS OPEN.
	void setFullBamFlags(bool flag) { _useFullBamTags = flag; }
	void setGenomeFile(NewGenomeFile *genomeFile) {
		_genomeFile = genomeFile;
		_hasGenomeFile = true;
	}

	void setIsSorted(bool val) { _isSortedInput = val; }
	void setIoBufSize(int val) { _ioBufSize = val; }
	void setNoEnforceCoordSort(bool val) { _noEnforceCoordSort = val; }
	void setIsGroupBy(bool val) { _isGroupBy = val; }

protected:
	int _fileIdx;
	string _filename;
	BufferedStreamMgr *_bufStreamMgr;

	FileReader *_fileReader;
	FileRecordTypeChecker::FILE_TYPE _fileType;
	FileRecordTypeChecker::RECORD_TYPE _recordType;
	RecordMgr *_recordMgr;
	bool _isSortedInput;
	int _freeListBlockSize;
	bool _useFullBamTags;

	//members for enforcing sorted order.
        std::set<string> _foundChroms;
	string _prevChrom;
	int _prevStart;
	int _prevChromId;


	bool _mustBeForward;
	bool _mustBeReverse;

	//available stats after run
	unsigned long _totalRecordLength;
	unsigned long _totalMergedRecordLength;

	BlockMgr *_blockMgr;
	BamTools::BamReader *_bamReader;
	bool _hasGenomeFile;
	NewGenomeFile *_genomeFile;
	int _ioBufSize;
	bool _noEnforceCoordSort; //only true for GroupBy
	bool _isGroupBy; //hopefully also only true for GroupBy

	void allocateFileReader(bool inheader=false);
	void testInputSortOrder(Record *record);
	void assignChromId(Record *);
	void sortError(const Record *record, bool genomeFileError);
};


#endif /* FILERECORDMGR_H_ */