File: reference.h

package info (click to toggle)
bowtie 1.2.2%2Bdfsg-4
links: PTS, VCS
area: main
in suites: buster
size: 16,704 kB
sloc: cpp: 35,614; perl: 5,903; ansic: 1,247; sh: 1,128; python: 483; makefile: 426
file content (745 lines) | stat: -rw-r--r-- 23,609 bytes
#ifndef REFERENCE_H_
#define REFERENCE_H_

#include <stdexcept>
#include "endian_swap.h"
#include "mm.h"
#include "shmem.h"
#include "timer.h"
#include "btypes.h"


/**
 * Concrete reference representation that bulk-loads the reference from
 * the bit-pair-compacted binary file and stores it in memory also in
 * bit-pair-compacted format.  The user may request reference
 * characters either on a per-character bases or by "stretch" using
 * getBase(...) and getStretch(...) respectively.
 *
 * Most of the complexity in this class is due to the fact that we want
 * to represent references with ambiguous (non-A/C/G/T) characters but
 * we don't want to use more than two bits per base.  This means we
 * need a way to encode the ambiguous stretches of the reference in a
 * way that is external to the bitpair sequence.  To accomplish this,
 * we use the RefRecords vector, which is stored in the .3.ebwt index
 * file.  The bitpairs themselves are stored in the .4.ebwt index file.
 *
 * Once it has been loaded, a BitPairReference is read-only, and is
 * safe for many threads to access at once.
 */
class BitPairReference {

public:
	/**
	 * Load from .3.ebwt/.4.ebwt Bowtie index files.
	 */
	BitPairReference(const string& in,
	                 bool color,
	                 bool sanity,
	                 std::vector<string>* infiles,
	                 std::vector<String<Dna5> >* origs,
	                 bool infilesSeq,
	                 bool loadSequence, // as opposed to just records
	                 bool useMm,
	                 bool useShmem,
	                 bool mmSweep,
	                 bool verbose,
	                 bool startVerbose) :
	buf_(NULL),
	sanityBuf_(NULL),
	loaded_(true),
	sanity_(sanity),
	useMm_(useMm),
	useShmem_(useShmem),
	verbose_(verbose)
	{
		string s3 = in + ".3." + gEbwt_ext;
		string s4 = in + ".4." + gEbwt_ext;

		FILE *f3, *f4;
		if((f3 = fopen(s3.c_str(), "rb")) == NULL) {
			cerr << "Could not open reference-string index file " << s3 << " for reading." << endl;
			cerr << "This is most likely because your index was built with an older version" << endl
			     << "(<= 0.9.8.1) of bowtie-build.  Please re-run bowtie-build to generate a new" << endl
			     << "index (or download one from the Bowtie website) and try again." << endl;
			loaded_ = false;
			return;
		}
		if((f4 = fopen(s4.c_str(), "rb")) ==NULL) {
			cerr << "Could not open reference-string index file " << s4 << " for reading." << endl;
			loaded_ = false;
			return;
		}
#ifdef BOWTIE_MM
		char *mmFile = NULL;
		if(useMm_) {
			if(verbose_ || startVerbose) {
				cerr << "  Memory-mapping reference index file " << s4 << ": ";
				logTime(cerr);
			}
			struct stat sbuf;
			if (stat(s4.c_str(), &sbuf) == -1) {
				perror("stat");
				cerr << "Error: Could not stat index file " << s4.c_str() << " prior to memory-mapping" << endl;
				throw 1;
			}
			mmFile = (char*)mmap((void *)0, sbuf.st_size,
			                     PROT_READ, MAP_SHARED, fileno(f4), 0);
			if(mmFile == (void *)(-1) || mmFile == NULL) {
				perror("mmap");
				cerr << "Error: Could not memory-map the index file " << s4.c_str() << endl;
				throw 1;
			}
			if(mmSweep) {
				TIndexOff sum = 0;
				for(off_t i = 0; i < sbuf.st_size; i += 1024) {
					sum += (TIndexOff) mmFile[i];
				}
				if(startVerbose) {
					cerr << "  Swept the memory-mapped ref index file; checksum: " << sum << ": ";
					logTime(cerr);
				}
			}
		}
#endif

		// Read endianness sentinel, set 'swap'
		uint32_t one;
		bool swap = false;
		one = readU<int32_t>(f3, swap);
		if(one != 1) {
			if(useMm_) {
				cerr << "Error: Can't use memory-mapped files when the index is the opposite endianness" << endl;
				throw 1;
			}
			assert_eq(0x1000000, one);
			swap = true; // have to endian swap U32s
		}

		// Read # records
		TIndexOffU sz;
		sz = readU<TIndexOffU>(f3, swap);
		if(sz == 0) {
			cerr << "Error: number of reference records is 0 in " << s3 << endl;
			throw 1;
		}

		// Read records
		nrefs_ = 0;
		nNoGapRefs_ = 0;

		// Cumulative count of all unambiguous characters on a per-
		// stretch 8-bit alignment (i.e. count of bytes we need to
		// allocate in buf_)
		TIndexOffU cumsz = 0;
		TIndexOffU cumlen = 0;
		TIndexOffU unambiglen = 0;
		TIndexOffU maxlen = 0;
		// For each unambiguous stretch...
		for(TIndexOffU i = 0; i < sz; i++) {
			recs_.push_back(RefRecord(f3, swap));
		}
		for(TIndexOffU i = 0; i < sz; i++) {
			if(recs_[i].first) {
				if(nrefs_ > 0) {
					refLens_.push_back(cumlen);
				}
				// Stupid hack to get around the fact that, for
				// colorspace Ebwts, not only do we omit reference
				// sequences that are *all* gaps from
				// nPat/plen/refnames, but we also omit reference
				// sequences that never have a stretch of more than 1
				// unambiguous character.
				if(unambiglen > 0 && (!color || maxlen > 1)) {
					refApproxLens_.push_back(cumlen);
				}
				// More hackery to detect references that won't be
				// in the Ebwt even though they have non-zero length
				bool willBeInEbwt = true;
				if(recs_[i].len > 0 && color) {
					// Omit until we prove it should be kept
					willBeInEbwt = false;
					for(uint32_t j = i; j < sz; j++) {
						if(j > i && recs_[j].first) break;
						if(recs_[j].len >= 2) {
							willBeInEbwt = true;
							break;
						}
					}
				}
				if(recs_[i].len > 0 && willBeInEbwt) {
					// Remember that this is the first record for this
					// reference sequence (and the last record for the one
					// before)
					refRecOffs_.push_back(i);
					refOffs_.push_back(cumsz);
					expandIdx_.push_back(nrefs_);
					shrinkIdx_.push_back(nNoGapRefs_);
					nNoGapRefs_++;
					isGaps_.push_back(false);
				} else {
					shrinkIdx_.push_back(nNoGapRefs_);
					isGaps_.push_back(true);
				}
				cumlen = 0;
				unambiglen = 0;
				maxlen = 0;
				nrefs_++;
				assert_eq(nNoGapRefs_, expandIdx_.size());
				assert_eq(nrefs_, shrinkIdx_.size());
			} else if(i == 0) {
				//cerr << "First record in reference index file was not marked as 'first'" << endl;
				//throw 1;
			}
			cumUnambig_.push_back(cumsz);
			cumRefOff_.push_back(cumlen);
			cumsz += recs_[i].len;
#ifdef ACCOUNT_FOR_ALL_GAP_REFS
			cumlen += recs_[i].off;
			cumlen += recs_[i].len;
#else
			if(recs_[i].len > 0) {
				cumlen += recs_[i].off;
				cumlen += recs_[i].len;
			}
#endif
			unambiglen += recs_[i].len;
			if(recs_[i].len > maxlen) maxlen = recs_[i].len;
		}
		if(verbose_ || startVerbose) {
			cerr << "Read " << nrefs_ << " reference strings (" << nNoGapRefs_ << " non-empty) from " << sz << " records: ";
			logTime(cerr);
		}
		// Store a cap entry for the end of the last reference seq
		refRecOffs_.push_back((TIndexOffU)recs_.size());
		refOffs_.push_back(cumsz);
		if(unambiglen > 0 && (!color || maxlen > 1)) {
			refApproxLens_.push_back(cumlen);
		}
		refLens_.push_back(cumlen);
		cumUnambig_.push_back(cumsz);
		cumRefOff_.push_back(cumlen);
		bufSz_ = cumsz;
		assert_eq(nNoGapRefs_, refApproxLens_.size());
		assert_eq(sz, recs_.size());
		if (f3 != NULL) fclose(f3); // done with .3.ebwt file
		// Round cumsz up to nearest byte boundary
		if((cumsz & 3) != 0) {
			cumsz += (4 - (cumsz & 3));
		}
		bufAllocSz_ = cumsz >> 2;
		assert_eq(0, cumsz & 3); // should be rounded up to nearest 4
		if(!loadSequence) return;
		if(useMm_) {
#ifdef BOWTIE_MM
			buf_ = (uint8_t*)mmFile;
			if(sanity_) {
				FILE *ftmp = fopen(s4.c_str(), "rb");
				sanityBuf_ = new uint8_t[cumsz >> 2];
				size_t ret = fread(sanityBuf_, 1, cumsz >> 2, ftmp);
				if(ret != (cumsz >> 2)) {
					cerr << "Only read " << ret << " bytes (out of " << (cumsz >> 2) << ") from reference index file " << s4 << endl;
					throw 1;
				}
				fclose(ftmp);
				for(size_t i = 0; i < (cumsz >> 2); i++) {
					assert_eq(sanityBuf_[i], buf_[i]);
				}
			}
#else
			cerr << "Shouldn't be at " << __FILE__ << ":" << __LINE__ << " without BOWTIE_MM defined" << endl;
			throw 1;
#endif
		} else {
			bool shmemLeader = true;
			if(!useShmem_) {
				// Allocate a buffer to hold the reference string
				try {
					buf_ = new uint8_t[cumsz >> 2];
					if(buf_ == NULL) throw std::bad_alloc();
				} catch(std::bad_alloc& e) {
					cerr << "Error: Ran out of memory allocating space for the bitpacked reference.  Please" << endl
						 << "re-run on a computer with more memory." << endl;
					throw 1;
				}
			} else {
				shmemLeader = ALLOC_SHARED_U8(
					(s4 + "[ref]"), (cumsz >> 2), &buf_,
					"ref", (verbose_ || startVerbose));
			}
			if(shmemLeader) {
				// Open the bitpair-encoded reference file
				FILE *f4 = fopen(s4.c_str(), "rb");
				if(f4 == NULL) {
					cerr << "Could not open reference-string index file " << s4 << " for reading." << endl;
					cerr << "This is most likely because your index was built with an older version" << endl
						 << "(<= 0.9.8.1) of bowtie-build.  Please re-run bowtie-build to generate a new" << endl
						 << "index (or download one from the Bowtie website) and try again." << endl;
					loaded_ = false;
					return;
				}
				// Read the whole thing in
				size_t ret = fread(buf_, 1, cumsz >> 2, f4);
				// Didn't read all of it?
				if(ret != (cumsz >> 2)) {
					cerr << "Only read " << ret << " bytes (out of " << (cumsz >> 2) << ") from reference index file " << s4 << endl;
					throw 1;
				}
				// Make sure there's no more
				char c;
				ret = fread(&c, 1, 1, f4);
				assert_eq(0, ret); // should have failed
				fclose(f4);
				if(useShmem_) NOTIFY_SHARED(buf_, (cumsz >> 2));
			} else {
				if(useShmem_) WAIT_SHARED(buf_, (cumsz >> 2));
			}
		}

		// Populate byteToU32_
		bool big = currentlyBigEndian();
		for(int i = 0; i < 256; i++) {
			uint32_t word = 0;
			if(big) {
				word |= ((i >> 0) & 3) << 24;
				word |= ((i >> 2) & 3) << 16;
				word |= ((i >> 4) & 3) << 8;
				word |= ((i >> 6) & 3) << 0;
			} else {
				word |= ((i >> 0) & 3) << 0;
				word |= ((i >> 2) & 3) << 8;
				word |= ((i >> 4) & 3) << 16;
				word |= ((i >> 6) & 3) << 24;
			}
			byteToU32_[i] = word;
		}

#ifndef NDEBUG
		if(sanity_) {
			// Compare the sequence we just read from the compact index
			// file to the true reference sequence.
			std::vector<seqan::String<seqan::Dna5> > *os; // for holding references
			std::vector<seqan::String<seqan::Dna5> > osv; // for holding references
			if(infiles != NULL) {
				if(infilesSeq) {
					for(size_t i = 0; i < infiles->size(); i++) {
						// Remove initial backslash; that's almost
						// certainly being used to protect the first
						// character of the sequence from getopts (e.g.,
						// when the first char is -)
						if((*infiles)[i].at(0) == '\\') {
							(*infiles)[i].erase(0, 1);
						}
						osv.push_back(String<Dna5>((*infiles)[i]));
					}
				} else {
					readSequenceFiles<seqan::String<seqan::Dna5>, seqan::Fasta>(*infiles, osv);
				}
				os = &osv;
			} else {
				assert(origs != NULL);
				os = origs;
			}

			// Never mind; reference is always letters, even if index
			// and alignment run are colorspace

			// If we're building a colorspace index, we need to convert
			// osv to colors first
//			if(color) {
//				for(size_t i = 0; i < os->size(); i++) {
//					size_t olen = seqan::length((*os)[i]);
//					for(size_t j = 0; j < olen-1; j++) {
//						int b1 = (int)(*os)[i][j];
//						assert_geq(b1, 0); assert_leq(b1, 4);
//						int b2 = (int)(*os)[i][j+1];
//						assert_geq(b2, 0); assert_leq(b2, 4);
//						(*os)[i][j] = (Dna5)dinuc2color[b1][b2];
//						assert((b1 != 4 && b2 != 4) || (int)(*os)[i][j] == 4);
//					}
//					seqan::resize((*os)[i], olen-1);
//				}
//			}
			// Go through the loaded reference files base-by-base and
			// sanity check against what we get by calling getBase and
			// getStretch
			size_t refi = 0;
			int longestStretch = 0;
			int curStretch = 0;
			for(size_t i = 0; i < os->size(); i++) {
				size_t olen = seqan::length((*os)[i]);
				for(size_t j = 0; j < olen; j++) {
					if((int)(*os)[i][j] < 4) {
						curStretch++;
						if(curStretch > longestStretch) longestStretch = curStretch;
					} else {
						curStretch = 0;
					}
				}
				if(longestStretch == 0 || (color && longestStretch == 1)) {
					continue;
				}
				longestStretch = 0;
				size_t olenU32 = (olen + 12) / 4;
				uint32_t *buf = new uint32_t[olenU32];
				uint8_t *bufadj = (uint8_t*)buf;
				bufadj += getStretch(buf, refi, 0, olen);
				for(size_t j = 0; j < olen; j++) {
					assert_eq((int)(*os)[i][j], (int)bufadj[j]);
					assert_eq((int)(*os)[i][j], (int)getBase(refi, j));
				}
				refi++;
				delete[] buf;
			}
		}
#endif
	}

	~BitPairReference() {
		if(buf_ != NULL && !useMm_ && !useShmem_) delete[] buf_;
		if(sanityBuf_ != NULL) delete[] sanityBuf_;
	}

	/**
	 * Return a single base of the reference.  Calling this repeatedly
	 * is not an efficient way to retrieve bases from the reference;
	 * use loadStretch() instead.
	 *
	 * This implementation scans linearly through the records for the
	 * unambiguous stretches of the target reference sequence.  When
	 * there are many records, binary search would be more appropriate.
	 */
	int getBase(size_t tidx, size_t toff) const {
		uint64_t reci = refRecOffs_[tidx];   // first record for target reference sequence
		uint64_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
		assert_gt(recf, reci);
		uint64_t bufOff = refOffs_[tidx];
		uint64_t off = 0;
		// For all records pertaining to the target reference sequence...
		for(uint64_t i = reci; i < recf; i++) {
			assert_geq(toff, off);
			off += recs_[i].off;
			if(toff < off) {
				return 4;
			}
			assert_geq(toff, off);
			uint64_t recOff = off + recs_[i].len;
			if(toff < recOff) {
				toff -= off;
				bufOff += toff;
				assert_lt(bufOff, bufSz_);
				const uint64_t bufElt = (bufOff) >> 2;
				const uint64_t shift = (bufOff & 3) << 1;
				return ((buf_[bufElt] >> shift) & 3);
			}
			bufOff += recs_[i].len;
			off = recOff;
			assert_geq(toff, off);
		} // end for loop over records
		return 4;
	}

	/**
	 * Load a stretch of the reference string into memory at 'dest'.
	 *
	 * This implementation scans linearly through the records for the
	 * unambiguous stretches of the target reference sequence.  When
	 * there are many records, binary search would be more appropriate.
	 */
	int getStretchNaive(uint32_t *destU32,
	                    size_t tidx,
	                    size_t toff,
	                    size_t count) const
	{
		uint8_t *dest = (uint8_t*)destU32;
		uint64_t reci = refRecOffs_[tidx];   // first record for target reference sequence
		uint64_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
		assert_gt(recf, reci);
		uint64_t cur = 0;
		uint64_t bufOff = refOffs_[tidx];
		uint64_t off = 0;
		// For all records pertaining to the target reference sequence...
		for(uint64_t i = reci; i < recf; i++) {
			assert_geq(toff, off);
			off += recs_[i].off;
			for(; toff < off && count > 0; toff++) {
				dest[cur++] = 4;
				count--;
			}
			if(count == 0) break;
			assert_geq(toff, off);
			if(toff < off + recs_[i].len) {
				bufOff += (TIndexOffU)(toff - off); // move bufOff pointer forward
			} else {
				bufOff += recs_[i].len;
			}
			off += recs_[i].len;
			for(; toff < off && count > 0; toff++) {
				assert_lt(bufOff, bufSz_);
				const uint64_t bufElt = (bufOff) >> 2;
				const uint64_t shift = (bufOff & 3) << 1;
				dest[cur++] = (buf_[bufElt] >> shift) & 3;
				bufOff++;
				count--;
			}
			if(count == 0) break;
			assert_geq(toff, off);
		} // end for loop over records
		// In any chars are left after scanning all the records,
		// they must be ambiguous
		while(count > 0) {
			count--;
			dest[cur++] = 4;
		}
		assert_eq(0, count);
		return 0;
	}

	/**
	 * Load a stretch of the reference string into memory at 'dest'.
	 *
	 * This implementation scans linearly through the records for the
	 * unambiguous stretches of the target reference sequence.  When
	 * there are many records, binary search would be more appropriate.
	 */
	int getStretch(uint32_t *destU32,
	               size_t tidx,
	               size_t toff,
	               size_t count) const
	{
		ASSERT_ONLY(size_t origCount = count);
		ASSERT_ONLY(size_t origToff = toff);
		if(count == 0) return 0;
		uint8_t *dest = (uint8_t*)destU32;
#ifndef NDEBUG
		uint32_t *destU32_2 = new uint32_t[(origCount >> 2) + 2];
		int off2 = getStretchNaive(destU32_2, tidx, origToff, origCount);
		uint8_t *dest_2 = ((uint8_t*)destU32_2) + off2;
#endif
		destU32[0] = 0x04040404; // Add Ns, which we might end up using later
		uint64_t reci = refRecOffs_[tidx];   // first record for target reference sequence
		uint64_t recf = refRecOffs_[tidx+1]; // last record (exclusive) for target seq
		assert_gt(recf, reci);
		uint64_t cur = 4; // keep a cushion of 4 bases at the beginning
		uint64_t bufOff = refOffs_[tidx];
		uint64_t off = 0;
		int64_t offset = 4;
		bool firstStretch = true;
		ASSERT_ONLY(bool binarySearched = false);
		uint64_t left  = reci;
		uint64_t right = recf;
		uint64_t mid   = 0;
		// For all records pertaining to the target reference sequence...
		for(uint64_t i = reci; i < recf; i++) {
			uint64_t origBufOff = bufOff;
			assert_geq(toff, off);
		if (firstStretch && recf > reci + 16){
			// binary search finds smallest i s.t. toff >= cumRefOff_[i]
			while (left < right-1) {
				mid = left + ((right - left) >> 1);
				if (cumRefOff_[mid] <= toff)
					left = mid;
				else
					right = mid;
			}
			off = cumRefOff_[left];
			bufOff = cumUnambig_[left];
			origBufOff = bufOff;
			i = left;
			assert(cumRefOff_[i+1] == 0 || cumRefOff_[i+1] > toff);
			ASSERT_ONLY(binarySearched = true);
		}
		off += recs_[i].off; // skip Ns at beginning of stretch
		assert_gt(count, 0);
		if(toff < off) {
			size_t cpycnt = min((size_t)(off - toff), count);
			memset(&dest[cur], 4, cpycnt);
			count -= cpycnt;
			toff += cpycnt;
			cur += cpycnt;
			if(count == 0) break;
		}
		assert_geq(toff, off);
		if(toff < off + recs_[i].len) {
			bufOff += toff - off; // move bufOff pointer forward
		} else {
			bufOff += recs_[i].len;
		}
		off += recs_[i].len;
		assert(off == cumRefOff_[i+1] || cumRefOff_[i+1] == 0);
		assert(!binarySearched || toff < off);
		if(toff < off) {
			if(firstStretch) {
				if(toff + 8 < off && count > 8) {
					// We already added some Ns, so we have to do
					// a fixup at the beginning of the buffer so
					// that we can start clobbering at cur >> 2
					if(cur & 3) {
						offset -= (cur & 3);
					}
					uint64_t curU32 = cur >> 2;
					// Do the initial few bases
					if(bufOff & 3) {
						const uint64_t bufElt = (bufOff) >> 2;
						const int64_t low2 = bufOff & 3;
						// Lots of cache misses on the following line
						destU32[curU32] = byteToU32_[buf_[bufElt]];
						for(int j = 0; j < low2; j++) {
							((char *)(&destU32[curU32]))[j] = 4;
						}
						curU32++;
						offset += low2;
						const int64_t chars = 4 - low2;
						count -= chars;
						bufOff += chars;
						toff += chars;
					}
					assert_eq(0, bufOff & 3);
					uint64_t bufOffU32 = bufOff >> 2;
					uint64_t countLim = count >> 2;
					uint64_t offLim = ((off - (toff + 4)) >> 2);
					uint64_t lim = min(countLim, offLim);
					// Do the fast thing for as far as possible
					for(uint64_t j = 0; j < lim; j++) {
						// Lots of cache misses on the following line
						destU32[curU32] = byteToU32_[buf_[bufOffU32++]];
#ifndef NDEBUG
						if(dest_2 != NULL) {
							assert_eq(dest[(curU32 << 2) + 0], dest_2[(curU32 << 2) - offset + 0]);
							assert_eq(dest[(curU32 << 2) + 1], dest_2[(curU32 << 2) - offset + 1]);
							assert_eq(dest[(curU32 << 2) + 2], dest_2[(curU32 << 2) - offset + 2]);
							assert_eq(dest[(curU32 << 2) + 3], dest_2[(curU32 << 2) - offset + 3]);
						}
#endif
						curU32++;
					}
					toff += (lim << 2);
					assert_leq(toff, off);
					assert_leq((lim << 2), count);
					count -= (lim << 2);
					bufOff = bufOffU32 << 2;
					cur = curU32 << 2;
				}
				// Do the slow thing for the rest
				for(; toff < off && count > 0; toff++) {
					assert_lt(bufOff, bufSz_);
					const uint64_t bufElt = (bufOff) >> 2;
					const uint64_t shift = (bufOff & 3) << 1;
					dest[cur++] = (buf_[bufElt] >> shift) & 3;
					bufOff++;
					count--;
				}
				firstStretch = false;
			} else {
				// Do the slow thing
				for(; toff < off && count > 0; toff++) {
					assert_lt(bufOff, bufSz_);
					const uint64_t bufElt = (bufOff) >> 2;
					const uint64_t shift = (bufOff & 3) << 1;
					dest[cur++] = (buf_[bufElt] >> shift) & 3;
					bufOff++;
					count--;
				}
			}
		}
		if(count == 0) break;
		assert_eq(recs_[i].len, bufOff - origBufOff);
		assert_geq(toff, off);
	} // end for loop over records
	// In any chars are left after scanning all the records,
	// they must be ambiguous
	while(count > 0) {
		count--;
		dest[cur++] = 4;
	}
	assert_eq(0, count);
#ifndef NDEBUG
		delete[] destU32_2;
#endif
		return (int)offset;
	}

	/// Return the number of reference sequences.
	TIndexOffU numRefs() const {
		return nrefs_;
	}

	/// Return the number of reference sequences that don't consist
	/// entirely of stretches of ambiguous characters.
	uint32_t numNonGapRefs() const {
		return nNoGapRefs_;
	}

	/**
	 *
	 */
	uint32_t shrinkIdx(uint32_t idx) const {
		assert_lt(idx, shrinkIdx_.size());
		return shrinkIdx_[idx];
	}

	/**
	 *
	 */
	uint32_t expandIdx(uint32_t idx) const {
		assert_lt(idx, expandIdx_.size());
		return expandIdx_[idx];
	}

	/// Return the lengths of reference sequences.
	TIndexOffU approxLen(TIndexOffU elt) const {
		assert_lt(elt, nrefs_);
		return refApproxLens_[elt];
	}

	/// Return the lengths of reference sequences.
	uint32_t len(uint32_t elt) const {
		assert_lt(elt, nrefs_);
		return refLens_[elt];
	}

	/// Return true iff ref 'elt' is all gaps
	bool isAllGaps(uint32_t elt) const {
		assert_lt(elt, nrefs_);
		assert_eq(isGaps_.size(), nrefs_);
		return isGaps_[elt];
	}

	/// Return true iff buf_ and all the vectors are populated.
	bool loaded() const {
		return loaded_;
	}

	/**
	 * Return constant reference to the RefRecord list.
	 */
	const std::vector<RefRecord>& refRecords() const { return recs_; }

protected:

	uint32_t byteToU32_[256];

	std::vector<RefRecord> recs_;       /// records describing unambiguous stretches
	std::vector<uint32_t>  refApproxLens_; /// approx lens of ref seqs (excludes trailing ambig chars)
	std::vector<TIndexOffU>  refLens_;    /// approx lens of ref seqs (excludes trailing ambig chars)
	std::vector<TIndexOffU>  refOffs_;    /// buf_ begin offsets per ref seq
	std::vector<TIndexOffU>  cumUnambig_;    /// # unambig ref chars up to each record
	std::vector<TIndexOffU>  cumRefOff_;    /// # ref chars up to each record
	std::vector<TIndexOffU>  refRecOffs_; /// record begin/end offsets per ref seq
	std::vector<uint32_t>  expandIdx_; /// map from small idxs (e.g. w/r/t plen) to large ones (w/r/t refnames)
	std::vector<uint32_t>  shrinkIdx_; /// map from large idxs to small
	std::vector<bool>      isGaps_;    /// ref i is all gaps?
	uint8_t *buf_;      /// the whole reference as a big bitpacked byte array
	uint8_t *sanityBuf_;/// for sanity-checking buf_
	TIndexOffU bufSz_;    /// size of buf_
	TIndexOffU bufAllocSz_;
	TIndexOffU nrefs_;      /// the number of reference sequences
	uint32_t nNoGapRefs_; /// the number of reference sequences that aren't totally ambiguous
	bool     loaded_;   /// whether it's loaded
	bool     sanity_;   /// do sanity checking
	bool     useMm_;    /// load the reference as a memory-mapped file
	bool     useShmem_; /// load the reference into shared memory
	bool     verbose_;
};

#endif