File: FastQFile.h

package info (click to toggle)
libstatgen 1.0.15-8
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 4,588 kB
  • sloc: cpp: 49,624; ansic: 1,408; makefile: 320; sh: 60
file content (232 lines) | stat: -rw-r--r-- 8,943 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
/*
 *  Copyright (C) 2010  Regents of the University of Michigan
 *
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef __FASTQ_VALIDATOR_H__
#define __FASTQ_VALIDATOR_H__

#include <iostream>
#include <map>
#include <string>
#include <vector>
#include "StringBasics.h"
#include "InputFile.h"
#include "BaseComposition.h"
#include "FastQStatus.h"

/// Class for reading/validating a fastq file.
///
/// The declarations below suggest two usage patterns (the definitions are not
/// in this header, so confirm against the implementation file):
///  - validateFastQFile() to process a whole file in a single call, or
///  - openFile(), then readFastQSequence() while keepReadingFile() returns
///    true, then closeFile().
///
/// The lines of the most recently read sequence are exposed through the
/// public String members (myRawSequence, mySequenceIdLine, ...) so callers
/// can look at them without copying.
class FastQFile
{
 public:
    /// Constructor.
    /// \param minReadLength The minimum length that a base sequence must be for
    ///                      it to be valid (defaults to 10).
    /// \param numPrintableErrors The maximum number of errors that should be reported
    ///                           in detail before suppressing the errors
    ///                           (defaults to 20).
    FastQFile(int minReadLength = 10, int numPrintableErrors = 20);
    
    /// Disable messages - do not write to cout.
    void disableMessages();
    
    /// Enable messages - write to cout.
    void enableMessages();
    
    /// Disable Unique Sequence ID checking
    /// (Unique Sequence ID checking is enabled by default).
    void disableSeqIDCheck();
    
    /// Enable Unique Sequence ID checking.
    /// (Unique Sequence ID checking is enabled by default).
    void enableSeqIDCheck();
    
    /// Interleaved.
    /// NOTE(review): presumably turns on the "file is interleaved" check that
    /// is tracked by the private myInterleaved flag; the definition is not
    /// visible in this header - confirm against the implementation.
    void interleaved();
    
    /// Set the number of errors after which to quit reading/validating a file,
    /// defaults to -1.
    /// \param maxErrors # of errors before quitting, 
    /// -1 indicates to not quit until the entire file has been read/validated (default), 
    /// 0 indicates to quit without reading/validating anything.
    void setMaxErrors(int maxErrors);

    /// Open a FastQFile.
    /// Use the specified SPACE_TYPE to determine BASE, COLOR, or UNKNOWN.
    /// \param fileName name of the fastq file to open.
    /// \param spaceType the space type to use for the file; defaults to
    ///                  BaseAsciiMap::UNKNOWN.
    /// \return a FastQStatus::Status describing the outcome of the open (the
    ///         possible values are declared in FastQStatus.h).
    FastQStatus::Status openFile(const char* fileName,
                                 BaseAsciiMap::SPACE_TYPE spaceType = BaseAsciiMap::UNKNOWN);

    /// Close a FastQFile.
    /// \return a FastQStatus::Status describing the outcome of the close.
    FastQStatus::Status closeFile();

    /// Check to see if the file is open.
    /// \return true if the file is open, false if not.
    bool isOpen();

    /// Check to see if the file is at the end of the file.
    /// \return true if at the end of the file, false if not.
    bool isEof();

    /// Returns whether or not to keep reading the file,
    /// it stops reading (false) if eof or there is a problem reading the file.
    bool keepReadingFile();

    /// Validate the specified fastq file
    /// \param filename fastq file to be validated.
    /// \param printBaseComp whether or not to print the base composition for the file.
    ///                      true means print it, false means do not.
    /// \param spaceType the spaceType to use for validation - BASE_SPACE, COLOR_SPACE,
    ///                  or UNKNOWN (UNKNOWN means to determine the spaceType to
    ///                  validate against from the first character of the first
    ///                  sequence).
    /// \param printQualAvg  whether or not to print the quality averages for the file.
    ///                      true means to print it, false (default) means do not.
    /// \return the fastq validation status,  SUCCESS on a successfully
    /// validated fastq file.
    FastQStatus::Status validateFastQFile(const String &filename,  
                                          bool printBaseComp,
                                          BaseAsciiMap::SPACE_TYPE spaceType,
                                          bool printQualAvg = false);

    /// Read 1 FastQSequence, validating it.
    /// \return a FastQStatus::Status describing the outcome of the
    ///         read/validation of that sequence.
    FastQStatus::Status readFastQSequence();

    ///////////////////////
    /// @name  Public Sequence Line variables.
    /// Keep public variables for a sequence's line so they can be accessed
    /// without having to do string copies.
    /// NOTE(review): presumably these hold the lines of the sequence most
    /// recently read by readFastQSequence() - confirm against the
    /// implementation.
    //@{
    String myRawSequence;
    String mySequenceIdLine;
    String mySequenceIdentifier;
    String myPlusLine;
    String myQualityString;
    //@}

    /// Get the space type used for this file.
    /// \return the space type reported by the base composition tracker
    ///         (myBaseComposition).
    inline BaseAsciiMap::SPACE_TYPE getSpaceType()
    {
        return(myBaseComposition.getSpaceType());
    }
    
private:
    // Validates a single fastq sequence from myFile.
    bool validateFastQSequence();

    // Reads and validates the sequence identifier line of a fastq sequence.
    bool validateSequenceIdentifierLine();

    // Reads and validates the raw sequence line(s) and the plus line.  Both are
    // included in one method since it is unknown when the raw sequence line
    // ends until you find the plus line that divides it from the quality
    // string.  Since this method will read the plus line to know when the
    // raw sequence ends, it also validates that line.
    bool validateRawSequenceAndPlusLines();

    // Reads and validates the quality string line(s).
    bool validateQualityStringLines();

    // Method to validate a line that contains part of the raw sequence.
    // offset specifies where in the sequence to start validating.
    bool validateRawSequence(int offset);

    // Method to validate the "+" line that separates the raw sequence and the
    // quality string.
    bool validateSequencePlus();

    // Method to validate the quality string.
    // offset specifies where in the quality string to start validating.
    bool validateQualityString(int offset);

    // Helper method to read a line from the input file into a string.
    // It also tracks the line number.
    void readLine();

    // Helper method for printing the contents of myErrorString.  It will
    // only print the errors until the maximum number of reportable errors is
    // reached.
    void reportErrorOnLine();

    // Reset the member data for each fastq file.
    void reset();

    // Reset the member data for each sequence.
    void resetForEachSequence();

    // Log the specified message if enabled.
    void logMessage(const char* message);

    // Determine if it is time to quit by checking if we are to quit after a
    // certain number of errors and that many errors have been encountered.
    bool isTimeToQuit();

    // NOTE(review): undocumented in the original.  Presumably prints the
    // average quality per cycle computed from myQualPerCycle and
    // myCountPerCycle - confirm against the implementation.
    void printAvgQual();

    //////////////////////////////////////////////////////////////////////
    // Following member data elements are reset for each validated sequence.
    //

    // Buffer for storing the contents of the line read.
    // Stored as member data so memory allocation is only done once.
    String myLineBuffer;

    // Buffer for storing the error string.  This prevents the reallocation of
    // the string buffer for each error.
    String myErrorString;

    // NOTE(review): undocumented in the original.  Judging by the name this is
    // scratch storage for a partially read quality string - confirm against
    // the implementation.
    String myTempPartialQuality;

    //////////////////////////////////////////////////////////////////////
    // Following member data elements are reset for each validated file.
    //
    IFILE myFile; // Input file to be read.
    String myFileName; // Name of file being processed.
    int myNumErrors;   // Tracks the number of errors.
    unsigned int myLineNum;    // Track the line number - used for reporting errors.
    BaseComposition myBaseComposition;  // Tracks the base composition.
    std::vector<int> myQualPerCycle;  // Tracks the quality by cycle.
    std::vector<int> myCountPerCycle;  // Tracks the number of entries by cycle.

    // Whether or not to check the sequence identifier for uniqueness.
    // Checking may use up a lot of memory.
    bool myCheckSeqID;

    // Whether or not to check that the file is interleaved.
    // Disabled by myCheckSeqID
    // NOTE(review): the exact interaction with myCheckSeqID is not visible in
    // this header - confirm against the implementation.
    bool myInterleaved;

    // Previous sequence id for checking interleaved.
    std::string myPrevSeqID;

    // Map to track which identifiers have appeared in the file.
    // NOTE(review): the meaning of the unsigned int value is not shown here
    // (possibly the line number of the identifier) - confirm.
    std::map<std::string, unsigned int> myIdentifierMap;
 
    //////////////////////////////////////////////////////////////////////
    // Following member data do not change for each call to the validator.
    //
    int myMinReadLength; // Min Length for a read.
    int myNumPrintableErrors;  // Max number of errors to print the details of.

    // Number of errors after which to quit reading/validating a file.
    // Defaults to -1.
    //   -1 indicates to not quit until the entire file has been read/validated.
    //    0 indicates to quit without reading/validating anything.
    int myMaxErrors;

    // Whether or not messages should be printed.  
    // Defaulted to false (they should be printed).
    bool myDisableMessages;

    // Track if there is a problem reading the file.  If there are read
    // problems, stop reading the file.
    bool myFileProblem;
};

#endif