1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307
|
/*++
Module Name:
GenomeIndex.h
Abstract:
Headers for the index builder for the SNAP sequencer
Authors:
Bill Bolosky, August, 2011
Environment:
User mode service.
Revision History:
Adapted from Matei Zaharia's Scala implementation.
--*/
#pragma once
#include "HashTable.h"
#include "Seed.h"
#include "Genome.h"
#include "ApproximateCounter.h"
#include "GenericFile_map.h"
class GenomeIndex {
public:
const Genome *getGenome() {return genome;}
//
// This looks up a seed and its reverse complement, and returns the number and list of hits for each.
// It guarantees that if the lookup succeeds that hits[-1] and rcHits[-1] are valid memory with
// arbirtary values. The -32 version is used for indices with 32 bit genome offsets; using the version
// that doesn't match the genome index GenomeLocation size is an error. Check the index type with
// doesGenomeIndexHave64BitLocations();
//
// The 64 bit version requires the called to supply a special location for storing a single forward or
// reverse hit. This is because in the hash table (but not overflow table), these may be stored in
// 5-7 bytes in order to save space. This means that there's no address in the hash table that can
// be pointed to as a return value. When only a single hit is returned, *hits == singleHit, so there's
// no need to check on the caller's side.
//
void lookupSeed(Seed seed, _int64 *nHits, const GenomeLocation **hits, _int64 *nRCHits, const GenomeLocation **rcHits, GenomeLocation *singleHit, GenomeLocation *singleRCHit);
void lookupSeed32(Seed seed, _int64 *nHits, const unsigned **hits, _int64 *nRCHits, const unsigned **rcHits);
bool doesGenomeIndexHave64BitLocations() const {return locationSize > 4;}
//
// Looks up a seed and its reverse complement, restricting the search to a given range of locations,
// and returns the number and list of hits for each.
//
// virtual void lookupSeed(Seed seed, unsigned minLocation, unsigned maxLocation,
// unsigned *nHits, const unsigned **hits, unsigned *nRCHits, const unsigned **rcHits) = 0;
//
// This issues a compiler prefetch for the genome data.
//
inline void prefetchGenomeData(GenomeLocation genomeLocation) const {
genome->prefetchData(genomeLocation);
}
inline int getSeedLength() const { return seedLen; }
void dropIndex(); // This frees the memory for the index, but leaves the genome itself around.
virtual ~GenomeIndex();
//
// run the indexer from command line arguments
//
static void runIndexer(int argc, const char **argv);
static GenomeIndex *loadFromDirectory(char *directoryName, bool map, bool prefetch);
int getMajorVersion();
int getMinorVersion();
protected:
int seedLen;
unsigned hashTableKeySize;
unsigned nHashTables;
const Genome *genome;
int majorVersion;
int minorVersion;
bool largeHashTable;
unsigned locationSize;
//
// The overflow table is indexed by numbers > than the number of bases in the genome.
// The hash table(s) point into the overflow table when they have a seed that's got more
// than one instance in the genome. For locationSize <= 4, the table is made of 32
// bit entries (and pointed to by overflowTable32), otherwise it's 64 bit entries.
//
_uint64 overflowTableSize;
unsigned *overflowTable32;
_int64 *overflowTable64;
GenericFile_map *mappedOverflowTable;
void *tablesBlob; // All of the hash tables in one giant blob
GenericFile_map *mappedTables;
//
// We have to build the overflow table in two stages. While we're walking the genome, we first
// assign tentative overflow table locations, and build up a list of places where each repeat
// occurs. Once we've read the whole thing (and so know the exact number of instances of each
// repeated seed) we build the actual overflow table and go back and update the entries in the
// hash table.
//
// The list of repeats works as a singly linked list, headed by the hash table entry. The entries
// use the index in the overflow table as links, rather than using real pointers, in order to save
// space. So that we can dynamically allocate overflow entries while still using indices to
// find them, they're built in a two level table.
//
struct OverflowBackpointer {
_int64 nextIndex;
GenomeLocation genomeLocation;
};
class OverflowBackpointerAnchor {
public:
OverflowBackpointerAnchor(_int64 maxOverflowEntries_);
~OverflowBackpointerAnchor();
OverflowBackpointer *getBackpointer(_int64 index);
void trimTo(_int64 trimToIndex, FILE *trimFile);
void loadFromFile(FILE *tripFile);
private:
ExclusiveLock lock;
static const unsigned batchSize;
_int64 maxOverflowEntries;
OverflowBackpointer **table;
static OverflowBackpointer spilledTableSlot; // This value is used to indicate that the table slot in question has been spilled
};
//
// Build a genome index and write it to a directory. If you don't already have a saved index
// the only way to get one is to build it into a directory and then load it from the directory.
// NB: This deletes the Genome that's passed into it.
//
static bool BuildIndexToDirectory(const Genome *genome, int seedLen, double slack,
const char *directory,
unsigned maxThreads, unsigned chromosomePaddingSize, bool forceExact,
unsigned hashTableKeySize, bool large, const char *histogramFileName,
unsigned locationSize, bool smallMemory);
//
// Allocate set of hash tables indexed by seeds with bias
//
static SNAPHashTable** allocateHashTables(unsigned* o_nTables, GenomeDistance countOfBases, double slack,
int seedLen, unsigned hashTableKeySize, bool large, unsigned locationSize, double* biasTable = NULL);
static const unsigned GenomeIndexFormatMajorVersion = 7;
static const unsigned GenomeIndexFormatMinorVersion = 1; // Index version 5.0 has Ns in the FASTA stored in lower case in the index so that they won't match Ns in reads. 5.1 does away with that.
static const unsigned largestBiasTable = 32; // Can't be bigger than the biggest seed size, which is set in Seed.h. Bigger than 32 means a new Seed structure.
static const unsigned largestKeySize = 8;
static double *hg19_biasTables[largestKeySize+1][largestBiasTable+1];
static double *hg19_biasTables_large[largestKeySize+1][largestBiasTable+1];
static void ComputeBiasTable(const Genome* genome, int seedSize, double* table, unsigned maxThreads, bool forceExact, unsigned hashTableKeySize, bool large);
struct ComputeBiasTableThreadContext {
SingleWaiterObject *doneObject;
volatile int *runningThreadCount;
GenomeDistance genomeChunkStart;
GenomeDistance genomeChunkEnd;
unsigned nHashTables;
unsigned hashTableKeySize;
std::vector<ApproximateCounter> *approxCounters;
const Genome *genome;
volatile _int64 *nBasesProcessed;
unsigned seedLen;
volatile _int64 *validSeeds;
bool large;
ExclusiveLock *approximateCounterLocks;
};
static void ComputeBiasTableWorkerThreadMain(void *param);
struct OverflowBackpointer;
struct BuildHashTablesThreadContext {
unsigned nThreads;
unsigned whichThread;
SingleWaiterObject *doneObject;
volatile int *runningThreadCount;
GenomeLocation genomeChunkStart;
GenomeLocation genomeChunkEnd;
const Genome *genome;
volatile _int64 *nBasesProcessed;
unsigned seedLen;
volatile _int64 *noBaseAvailable;
volatile _int64 *nonSeeds;
volatile _int64 *seedsWithMultipleOccurrences;
volatile _int64 *bothComplementsUsed;
GenomeIndex *index;
OverflowBackpointerAnchor *overflowAnchor;
volatile _int64 *nextOverflowBackpointer;
volatile _int64 *genomeLocationsInOverflowTable;
unsigned hashTableKeySize;
bool large;
unsigned locationSize;
//
// The "small memory" option causes SNAP to write out the backpointer table as it's
// built in order to save the memory it uses, because it's accessed sequentially as
// the table is built. In order to be able to tell when it's safe to free some of the
// table, each thread occasionally records the largest backpointer index that it's done
// with. It's safe to free table chunks that are less than the least of these. The lock
// is used to keep more than one thread from trying to spill table at a time.
//
_int64 *lastBackpointerIndexUsedByThread;
ExclusiveLock *backpointerSpillLock;
FILE *backpointerSpillFile;
ExclusiveLock *hashTableLocks;
ExclusiveLock *overflowTableLock;
};
struct PerHashTableBatch {
PerHashTableBatch() : nUsed(0) {}
static const unsigned nSeedsPerBatch = 1000;
unsigned nUsed;
struct Entry {
bool usingComplement;
_uint64 lowBases;
GenomeLocation genomeLocation;
};
Entry entries[nSeedsPerBatch];
bool addSeed(GenomeLocation genomeLocation, _uint64 seedLowBases, bool seedUsingComplement) {
_ASSERT(nUsed < nSeedsPerBatch);
entries[nUsed].lowBases = seedLowBases;
entries[nUsed].usingComplement = seedUsingComplement;
entries[nUsed].genomeLocation = genomeLocation;
nUsed++;
return nUsed >= nSeedsPerBatch;
}
void clear()
{
nUsed = 0;
}
};
struct IndexBuildStats {
IndexBuildStats() : noBaseAvailable(0), nonSeeds(0), bothComplementsUsed(0), genomeLocationsInOverflowTable(0),
unrecordedSkippedSeeds(0), seedsWithMultipleOccurrences(0) {}
_int64 noBaseAvailable;
_int64 nonSeeds;
_int64 bothComplementsUsed;
_int64 genomeLocationsInOverflowTable;
_int64 seedsWithMultipleOccurrences;
_uint64 unrecordedSkippedSeeds;
};
static const _int64 printPeriod;
virtual void indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large);
virtual void completeIndexing(PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large);
static void BuildHashTablesWorkerThreadMain(void *param);
void BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context);
static void ApplyHashTableUpdate(BuildHashTablesThreadContext *context, _uint64 whichHashTable, GenomeLocation genomeLocation, _uint64 lowBases, bool usingComplement,
_int64 *bothComplementsUsed, _int64 *genomeLocationsInOverflowTable, _int64 *seedsWithMultipleOccurrences, bool large);
static int BackwardsUnsignedCompare(const void *, const void *);
static int BackwardsInt64Compare(const void *, const void *);
GenomeIndex();
SNAPHashTable **hashTables;
static _int64 AddOverflowBackpointer(
_int64 previousOverflowBackpointer,
BuildHashTablesThreadContext*context,
GenomeLocation genomeLocation);
void fillInLookedUpResults32(const unsigned *subEntry, _int64 *nHits, const unsigned **hits);
void fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation);
};
extern Genome::Contig ContigForInvalidGenomeLocation;
|