1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
|
/*++
Module Name:
FASTA.cpp
Abstract:
FASTA reader
Authors:
Bill Bolosky, August, 2011
Environment:
User mode service.
Revision History:
Adapted from Matei Zaharia's Scala implementation.
--*/
#include "stdafx.h"
#include "Compat.h"
#include "FASTA.h"
#include "Error.h"
#include "exit.h"
using namespace std;
const Genome *
ReadFASTAGenome(
const char *fileName,
const char *pieceNameTerminatorCharacters,
bool spaceIsAPieceNameTerminator,
unsigned chromosomePaddingSize)
{
//
// We need to know a bound on the size of the genome before we create the Genome object.
// A bound is the number of bytes in the FASTA file, because we store at most one base per
// byte. Get the file size to use for this bound.
//
_int64 fileSize = QueryFileSize(fileName);
bool isValidGenomeCharacter[256];
for (int i = 0; i < 256; i++) {
isValidGenomeCharacter[i] = false;
}
isValidGenomeCharacter['A'] = isValidGenomeCharacter['T'] = isValidGenomeCharacter['C'] = isValidGenomeCharacter['G'] = isValidGenomeCharacter['N'] = true;
isValidGenomeCharacter['a'] = isValidGenomeCharacter['t'] = isValidGenomeCharacter['c'] = isValidGenomeCharacter['g'] = isValidGenomeCharacter['n'] = true;
FILE *fastaFile = fopen(fileName, "r");
if (fastaFile == NULL) {
WriteErrorMessage("Unable to open FASTA file '%s' (even though we already got its size)\n",fileName);
return NULL;
}
const size_t lineBufferSize = 4096;
char lineBuffer[lineBufferSize];
//
// Count the chromosomes
//
unsigned nChromosomes = 0;
while (NULL != fgets(lineBuffer,lineBufferSize,fastaFile)) {
if (lineBuffer[0] == '>') {
nChromosomes++;
}
}
rewind(fastaFile);
Genome *genome = new Genome(fileSize + (nChromosomes+1) * (size_t)chromosomePaddingSize, fileSize + (nChromosomes+1) * (size_t)chromosomePaddingSize, chromosomePaddingSize, nChromosomes + 1);
char *paddingBuffer = new char[chromosomePaddingSize+1];
for (unsigned i = 0; i < chromosomePaddingSize; i++) {
paddingBuffer[i] = 'n';
}
paddingBuffer[chromosomePaddingSize] = '\0';
bool warningIssued = false;
bool inAContig = false;
while (NULL != fgets(lineBuffer,lineBufferSize,fastaFile)) {
if (lineBuffer[0] == '>') {
inAContig = true;
//
// A new contig. Add in the padding first.
//
genome->addData(paddingBuffer);
//
// Now supply the chromosome name.
//
if (NULL != pieceNameTerminatorCharacters) {
for (int i = 0; i < strlen(pieceNameTerminatorCharacters); i++) {
char *terminator = strchr(lineBuffer+1, pieceNameTerminatorCharacters[i]);
if (NULL != terminator) {
*terminator = '\0';
}
}
}
if (spaceIsAPieceNameTerminator) {
char *terminator = strchr(lineBuffer, ' ');
if (NULL != terminator) {
*terminator = '\0';
}
terminator = strchr(lineBuffer, '\t');
if (NULL != terminator) {
*terminator = '\0';
}
}
char *terminator = strchr(lineBuffer, '\n');
if (NULL != terminator) {
*terminator = '\0';
}
terminator = strchr(lineBuffer, '\r');
if (NULL != terminator) {
*terminator = '\0';
}
genome->startContig(lineBuffer+1);
} else {
if (!inAContig) {
WriteErrorMessage("\nFASTA file doesn't beging with a contig name (i.e., the first line doesn't start with '>').\n");
soft_exit(1);
}
//
// Convert it to upper case and truncate the newline before adding it to the genome.
//
char *newline = strchr(lineBuffer, '\n');
if (NULL != newline) {
*newline = 0;
}
//
// But convert any 'N' to 'n'. This is so we don't match the N from the genome with N
// in reads (where we just do a straight text comparison.
//
size_t lineLen = strlen(lineBuffer);
for (unsigned i = 0; i < lineLen; i++) {
lineBuffer[i] = toupper(lineBuffer[i]);
}
for (unsigned i = 0; i < lineLen; i++) {
if ('N' == lineBuffer[i]) {
lineBuffer[i] = 'n';
}
if (!isValidGenomeCharacter[(unsigned char)lineBuffer[i]]) {
if (!warningIssued) {
WriteErrorMessage("\nFASTA file contained a character that's not a valid base (or N): '%c', full line '%s'; \nconverting to 'N'. This may happen again, but there will be no more warnings.\n", lineBuffer[i], lineBuffer);
warningIssued = true;
}
lineBuffer[i] = 'N';
}
}
genome->addData(lineBuffer);
}
}
//
// And finally add padding at the end of the genome.
//
genome->addData(paddingBuffer);
genome->fillInContigLengths();
genome->sortContigsByName();
fclose(fastaFile);
delete [] paddingBuffer;
return genome;
}
//
// TODO: Reduce code duplication with the mutator.
//
bool AppendFASTAGenome(const Genome *genome, FILE *fasta, const char *prefix="")
{
int nContigs = genome->getNumContigs();
const Genome::Contig *contigs = genome->getContigs();
for (int i = 0; i < nContigs; ++i) {
const Genome::Contig &contig = contigs[i];
GenomeLocation start = contig.beginningLocation;
GenomeLocation end = i + 1 < nContigs ? contigs[i + 1].beginningLocation : genome->getCountOfBases();
GenomeDistance size = end - start;
const char *bases = genome->getSubstring(start, size);
fprintf(fasta, ">%s%s\n", prefix, contig.name);
fwrite(bases, 1, size, fasta);
fputc('\n', fasta);
}
return !ferror(fasta);
}
|