File: ref_read.h

package info (click to toggle)
bowtie 1.1.1-2
links: PTS, VCS
area: main
in suites: jessie, jessie-kfreebsd
size: 14,864 kB
ctags: 5,533
sloc: cpp: 32,737; perl: 2,084; ansic: 1,241; sh: 1,066; makefile: 344; python: 133
file content (304 lines) | stat: -rw-r--r-- 7,743 bytes
parent folder | download | duplicates (3)
#ifndef REF_READ_H_
#define REF_READ_H_

#include <iostream>
#include <cassert>
#include <vector>
#include <string>
#include <ctype.h>
#include <fstream>
#include <stdexcept>
#include <seqan/sequence.h>
#include "alphabet.h"
#include "assert_helpers.h"
#include "filebuf.h"
#include "word_io.h"
#include "endian_swap.h"

using namespace std;
using namespace seqan;

class RefTooLongException : public exception {

public:
	RefTooLongException() {
#ifdef BOWTIE_64BIT_INDEX
		// This should never happen!
		msg = "Error: Reference sequence has more than 2^64-1 characters!  "
		      "Please divide the reference into smaller chunks and index each "
			  "independently.";
#else
		msg = "Error: Reference sequence has more than 2^32-1 characters!  "
		      "Please build a large index by passing the --large-index option "
			  "to bowtie2-build";
#endif
	}

	~RefTooLongException() throw() {}

	const char* what() const throw() {
		return msg.c_str();
	}

protected:

	string msg;

};


/**
 * Encapsulates a stretch of the reference containing only unambiguous
 * characters.  From an ordered list of RefRecords, one can (almost)
 * deduce the "shape" of the reference sequences (almost because we
 * lose information about stretches of ambiguous characters at the end
 * of reference sequences).
 */
struct RefRecord {
	RefRecord() : off(), len(), first() { }
	RefRecord(TIndexOffU _off, TIndexOffU _len, bool _first) :
		off(_off), len(_len), first(_first)
	{ }

	RefRecord(FILE *in, bool swap) {
		assert(in != NULL);
		if(!fread(&off, OFF_SIZE, 1, in)) {
			cerr << "Error reading RefRecord offset from FILE" << endl;
			throw 1;
		}
		if(swap) off = endianSwapU(off);
		if(!fread(&len, OFF_SIZE, 1, in)) {
			cerr << "Error reading RefRecord offset from FILE" << endl;
			throw 1;
		}
		if(swap) len = endianSwapU(len);
		first = fgetc(in) ? true : false;
	}

	void write(std::ostream& out, bool be) {
		writeU<TIndexOffU>(out, off, be);
		writeU<TIndexOffU>(out, len, be);
		out.put(first ? 1 : 0);
	}

	TIndexOffU off; /// Offset of the first character in the record
	TIndexOffU len; /// Length of the record
	bool   first; /// Whether this record is the first for a reference sequence
};

enum {
	REF_READ_FORWARD = 0, // don't reverse reference sequence
	REF_READ_REVERSE,     // reverse entire reference sequence
	REF_READ_REVERSE_EACH // reverse each unambiguous stretch of reference
};

/**
 * Parameters governing treatment of references as they're read in.
 */
struct RefReadInParams {
	RefReadInParams(bool col, int r, bool nsToA, bool bisulf) :
		color(col), reverse(r), nsToAs(nsToA), bisulfite(bisulf) { }
	// extract colors from reference
	bool color;
	// reverse each reference sequence before passing it along
	int reverse;
	// convert ambiguous characters to As
	bool nsToAs;
	// bisulfite-convert the reference
	bool bisulfite;
};

extern RefRecord
fastaRefReadSize(
	FileBuf& in,
	const RefReadInParams& rparms,
	bool first,
	BitpairOutFileBuf* bpout = NULL);

extern std::pair<size_t, size_t>
fastaRefReadSizes(
	vector<FileBuf*>& in,
	vector<RefRecord>& recs,
	vector<uint32_t>& plens,
	const RefReadInParams& rparms,
	BitpairOutFileBuf* bpout,
	TIndexOff& numSeqs);

extern void
reverseRefRecords(
	const vector<RefRecord>& src,
	vector<RefRecord>& dst,
	bool recursive = false,
	bool verbose = false);

/**
 * Reads the next sequence from the given FASTA file and appends it to
 * the end of dst, optionally reversing it.
 */
template <typename TStr>
static RefRecord fastaRefReadAppend(FileBuf& in,
                                    bool first,
                                    TStr& dst,
                                    RefReadInParams& rparms,
                                    string* name = NULL)
{
	typedef typename Value<TStr>::Type TVal;
	int c;
	static int lastc = '>';
	if(first) {
		c = in.getPastWhitespace();
		if(c != '>') {
			cerr << "Reference file does not seem to be a FASTA file" << endl;
			throw 1;
		}
		lastc = c;
	}
	assert_neq(-1, lastc);

	// RefRecord params
	size_t len = 0;
	size_t off = 0;
	first = true;

	size_t ilen = length(dst);

	// Chew up the id line; if the next line is either
	// another id line or a comment line, keep chewing
	int lc = -1; // last-DNA char variable for color conversion
	c = lastc;
	if(c == '>' || c == '#') {
		do {
			while (c == '#') {
				if((c = in.getPastNewline()) == -1) {
					lastc = -1;
					goto bail;
				}
			}
			assert_eq('>', c);
			while(true) {
				c = in.get();
				if(c == -1) {
					lastc = -1;
					goto bail;
				}
				if(c == '\n' || c == '\r') {
					while(c == '\r' || c == '\n') c = in.get();
					if(c == -1) {
						lastc = -1;
						goto bail;
					}
					break;
				}
				if (name) name->push_back(c);
			}
			// c holds the first character on the line after the name
			// line
		} while (c == '>' || c == '#');
	} else {
		ASSERT_ONLY(int cc = toupper(c));
		assert(cc != 'A' && cc != 'C' && cc != 'G' && cc != 'T');
		first = false;
	}

	// Skip over an initial stretch of gaps or ambiguous characters.
	// For colorspace we skip until we see two consecutive unambiguous
	// characters (i.e. the first unambiguous color).
	while(true) {
		int cat = dna4Cat[c];
		if(rparms.nsToAs && cat == 2) {
			c = 'A';
		}
		int cc = toupper(c);
		if(rparms.bisulfite && cc == 'C') c = cc = 'T';
		if(cat == 1) {
			// This is a DNA character
			if(rparms.color) {
				if(lc != -1) {
					// Got two consecutive unambiguous DNAs
					break; // to read-in loop
				}
				// Keep going; we need two consecutive unambiguous DNAs
				lc = charToDna5[(int)c];
				// The 'if(off > 0)' takes care of the case where
				// the reference is entirely unambiguous and we don't
				// want to incorrectly increment off.
				if(off > 0) off++;
			} else {
				break; // to read-in loop
			}
		} else if(cat == 2) {
			if(lc != -1 && off == 0) {
				off++;
			}
			lc = -1;
			off++; // skip it
		} else if(c == '>') {
			lastc = '>';
			goto bail;
		}
		c = in.get();
		if(c == -1) {
			lastc = -1;
			goto bail;
		}
	}
	if(first && rparms.color && off > 0) {
		// Handle the case where the first record has ambiguous
		// characters but we're in color space; one of those counts is
		// spurious
		off--;
	}
	assert(!rparms.color || lc != -1);
	assert_eq(1, dna4Cat[c]);

	// in now points just past the first character of a sequence
	// line, and c holds the first character
	while(true) {
		// Note: can't have a comment in the middle of a sequence,
		// though a comment can end a sequence
		int cat = dna4Cat[c];
		assert_neq(2, cat);
		if(cat == 1) {
			// Consume it
			if(!rparms.color || lc != -1) len++;
			// Add it to referenece buffer
			if(rparms.color) {
				appendValue(dst, (Dna)dinuc2color[charToDna5[(int)c]][lc]);
			} else if(!rparms.color) {
				appendValue(dst, (Dna)(char)c);
			}
			assert_lt((uint8_t)(Dna)dst[length(dst)-1], 4);
			lc = charToDna5[(int)c];
		}
		c = in.get();
		if(rparms.nsToAs && dna4Cat[c] == 2) c = 'A';
		if (c == -1 || c == '>' || c == '#' || dna4Cat[c] == 2) {
			lastc = c;
			break;
		}
		if(rparms.bisulfite && toupper(c) == 'C') c = 'T';
	}

  bail:
	// Optionally reverse the portion that we just appended.
	// ilen = length of buffer before this last sequence was appended.
	if(rparms.reverse == REF_READ_REVERSE_EACH) {
		// Find limits of the portion we just appended
		size_t nlen = length(dst);
		assert_eq(nlen - ilen, len);
		if(len > 0) {
			size_t halfway =  ilen + (len>>1);
			// Reverse it in-place
			for(size_t i = ilen; i < halfway; i++) {
				size_t diff = i-ilen;
				size_t j = nlen-diff-1;
				TVal tmp = dst[i];
				dst[i] = dst[j];
				dst[j] = tmp;
			}
		}
	}
	return RefRecord((TIndexOffU)off, (TIndexOffU)len, first);
}

#endif /*ndef REF_READ_H_*/