/*
 * File: embedhash.cpp
 *
 * Source listing metadata (not part of the program):
 *   package: wham-align 0.1.5-8
 *   links: PTS, VCS
 *   area: main
 *   in suites: bookworm, bullseye, sid, trixie
 *   size: 892 kB
 *   sloc: cpp: 8,769; sh: 76; makefile: 52
 *   file content (712 lines) | stat: -rw-r--r-- 19,350 bytes
 */
/**
 *    WHAM - high-throughput sequence aligner
 *    Copyright (C) 2011  WHAM Group, University of Wisconsin
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*	$Id: hash.cpp 152 2012-07-22 10:52:53Z yinan $ */

#include <stdlib.h>
#include <memory.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <math.h>
#include "hash.h"
#include "embedhash.h"
#include "bitread.h"
#include "error.h"
#include "pair.h"
#include "edit_distance.h"
#include "util.h"
#include "rdtsc.h"

#define BITWISE_ALIGNMENT
//#define DEBUG_PRINT_LIST

unsigned long long statEmbedHashLookup = 0;
unsigned long long statEmbedHashLookupEntry = 0;

/*
 *	EmbedHashTable::EmbedHashTable()
 *	Start from an all-zero object: every byte of the instance is
 *	cleared, so all pointer members are NULL and all counters are 0.
 */
EmbedHashTable::EmbedHashTable() {
  memset(this, 0, sizeof(*this));
}

/*
 *	EmbedHashTable::~EmbedHashTable()
 *	Release the bucket array and the overflow pool. Both arrays are
 *	obtained with malloc() (preProcessInit(), preProcessEnd(), load()),
 *	so they have to be returned with free(); handing malloc'ed memory
 *	to delete[] is undefined behavior. free(NULL) is a no-op, which
 *	makes this safe after remove() has already cleared the pointers.
 */
EmbedHashTable::~EmbedHashTable() {
  free(buckets);
  buckets = NULL;

  free(overflowPool);
  overflowPool = NULL;
}

/*
 *	EmbedHashTable::init()
 *	Record the indexed sequence and the alignment parameters, choose
 *	the number of hash buckets and select the table layout.
 *
 *	seq        - sequence the table refers to
 *	len        - stored in length; lenSeq becomes len * BITS_PER_BASE
 *	nBucket    - requested bucket count, 0 to have it estimated
 *	numError   - stored in both nMismatch and nMaxError
 *	nPartition - only used by the automatic bucket estimate
 *
 *	The bucket count (requested or estimated) is passed through
 *	nextPrime(), and every bucket starts out counted as empty.
 */
void EmbedHashTable::init(CompactSequence * seq, int len, unsigned int nBucket,
    int numError, int nPartition) {
  sequence = seq;

  length = len;
  lenSeq = length * BITS_PER_BASE;

  nMismatch = numError;
  nMaxError = nMismatch;
  nMaxGap = 0;
  maxQual = MAX_INT;

  if (nBucket != 0) {
    /* the caller fixed the table size */
    numBucket = nBucket;
  } else {
    /*
     *	estimate: the smaller of the number of entries in the
     *	sequence and 8^(length / nPartition * (nPartition - nMismatch))
     */
    const double entryCount = (double) seq->getNum();
    const double keySpace = pow(8.0,
        length / nPartition * (nPartition - nMismatch));

    if (entryCount < keySpace)
      numBucket = (int) entryCount;
    else
      numBucket = (int) keySpace;
  }

  numBucket = nextPrime(numBucket);
  numEmpty = numBucket;

  /*
   *	sequences with fewer than COMPRESS_TABLE_SIZE entries use the
   *	compressed table layout; larger ones use the normal layout.
   */
  compressedTable = (seq->getNum() < COMPRESS_TABLE_SIZE);
}

/*
 *	HashTable::preProcessInit()
 *	Allocate and initialize the hash bucket array and bitmap arrays 
 *	for collision bits and empty bits.
 */
int EmbedHashTable::preProcessInit() {
  /*	allocate hash buckets.	*/
  buckets = (unsigned int *) malloc((int64) numBucket * sizeof(unsigned int));
  if (buckets == NULL
    )
    return ERR_MEM;

  /*
   *	allocate bitmap arrays to identify empty buckets
   *	and collision buckets.
   */
  emptyBits = (unsigned char *) malloc((int64) numBucket / BITS_PER_BYTE + 1);
  if (emptyBits == NULL
    )
    return ERR_MEM;

  collisionBits = (unsigned char *) malloc(
      (int64) numBucket / BITS_PER_BYTE + 1);
  if (collisionBits == NULL
    )
    return ERR_MEM;

  /* Initialization */
  memset(buckets, 0, numBucket * sizeof(unsigned int));
  memset(emptyBits, 0, numBucket / BITS_PER_BYTE + 1);
  memset(collisionBits, 0, numBucket / BITS_PER_BYTE + 1);

  return SUCCESS;
}

/*
 *	EmbedHashTable::preProcessEnd()
 *	Finish the counting pass started by preProcessInit() and
 *	preProcessInsert(): allocate the overflow pool, turn the per-bucket
 *	overflow counts into positions inside that pool, and fold the empty
 *	and collision bitmaps into the bucket values.
 *	For empty buckets, the bucket values are set to be HASH_EMPTY.
 *	For compressed tables, the collision bit of each non-empty bucket
 *	is merged into the bucket value through HASH_COLLISION_MASK.
 *
 *	Returns SUCCESS, ERR_PARA when the arrays from preProcessInit() are
 *	missing, or ERR_MEM when an allocation fails.
 */
int EmbedHashTable::preProcessEnd() {
  uint32 i;
  uint32 tmp, sum = 0;
  uint32 collision;
  /* only referenced by the disabled histogram code further down */
  const double ln2 = log(2);

  if (buckets == NULL)
  {
    elog(ERROR, "ERROR: unallocated bucket array in hash table.\n");
    return ERR_PARA;
  }

  if (emptyBits == NULL || collisionBits == NULL)
  {
    elog(ERROR, "ERROR: unallocated bitmap in hash table.\n");
    return ERR_PARA;
  }

  /*
   *	NOTE(review): if numOverflowEntry is declared unsigned this
   *	check can never fire -- confirm against the class declaration.
   */
  if (numOverflowEntry < 0)
    return ERR_PARA;

  /* allocate overflow pool and bitmaps */
  overflowPool = (unsigned int *) malloc(
      (int64) numOverflowEntry * sizeof(unsigned int));
  if (overflowPool == NULL
    )
    return ERR_MEM;

  memset(overflowPool, 0, numOverflowEntry * sizeof(unsigned int));

  /*
   *	uncompressed tables mark the end of each overflow list in a
   *	separate bitmap; compressed tables use HASH_SET_END on the pool
   *	entry itself (see the loop below).
   */
  if (!compressedTable) {
    overflowBits = (unsigned char *) malloc(
        (int64) numOverflowEntry / BITS_PER_BYTE + 1);
    if (overflowBits == NULL
      )
      return ERR_MEM;

    memset(overflowBits, 0,
        (numOverflowEntry / BITS_PER_BYTE + 1) * sizeof(char));
  }

  /*
   *	scan the hash buckets to apply the empty bits and
   *	collision bits.
   */
  for (i = 0; i < numBucket; i++) {
    /*
     *	The current value of the bucket is the number of collision
     *	entries in each bucket. We accumulate this value to compute
     *	the position of the last entries in the overflow array for each bucket,
     *	and store the position into the bucket. For the non-collision
     *	buckets, the values will be updated to the position of segment
     *	in the function insert.
     */
    tmp = buckets[i];
    sum += tmp;
    buckets[i] = sum;
    /*
     *	flag pool entry sum - 1 as the end of a list. A bucket that
     *	contributes no overflow entries (tmp == 0) leaves sum unchanged
     *	and therefore flags the same entry as the bucket before it.
     */
    if (buckets[i] > 0) {
      if (compressedTable) {
        HASH_SET_END(overflowPool[buckets[i] - 1]);
      } else {
        BITMAP_SET(overflowBits[(buckets[i] - 1) / BITS_PER_BYTE],
            (buckets[i] - 1) % BITS_PER_BYTE);
      }
    }

    /* update histogram */
    /*		int h = 0;
     if (tmp > 0)
     h = (int)ceil(log(tmp)/ln2);
     if (h >= nHistogram)
     histogram[nHistogram - 1]++;
     else
     histogram[h]++;
     */

    /*	set the values for empty buckets */
    if (!BITMAP_IS(emptyBits[i / BITS_PER_BYTE], i % BITS_PER_BYTE))
      buckets[i] = HASH_EMPTY;
    else {
      if (compressedTable) {
        /* apply the collision bit to the most significant bit of the bucket */
        collision =
            BITMAP_IS(collisionBits[i / BITS_PER_BYTE], i % BITS_PER_BYTE);
        buckets[i] |= HASH_COLLISION_MASK(collision);
      }
    }
  }

  /*	free the bitmap arrays */
  free(emptyBits);
  emptyBits = NULL;

  /*
   *	the collision bitmap is only released for compressed tables;
   *	for uncompressed tables insert() and lookup() still read it.
   */
  if (compressedTable) {
    free(collisionBits);
    collisionBits = NULL;
  }

//	setScanThreshold(0.001);

  /* log the table statistics gathered during the counting pass */
  elog(
      DEBUG2,
      "  numBucket|   numEmpty|  Collision|   numEntry| Col Rat| Emp Rat|Avg List|Avg Miss\n");
  elog(
      DEBUG2,
      "%11u %11u %11u %11u %8.2f %8.2f %8.2f %8.2f\n",
      numBucket,
      numEmpty,
      numCollision,
      numEntry,
      (double) (numCollision) / (numBucket - numEmpty),
      (double) (numEmpty) / (numBucket),
      (double) (numOverflowEntry) / numCollision,
      (double) (numOverflowEntry + numBucket - numEmpty - numCollision)
          / (numBucket - numEmpty));

  return SUCCESS;
}

/*
 *	EmbedHashTable::setScanThreshold()
 *	Currently a no-op. The histogram-based selection of maxScan is
 *	disabled, so the threshold r is accepted but not used. The former
 *	implementation computed three locals (sum, total, top) and then
 *	discarded them; they now live inside the disabled block so that it
 *	can be restored as one piece.
 */
void EmbedHashTable::setScanThreshold(double r) {
  (void) r; /* unused while the histogram code below stays disabled */

  /*
   int64 sum = 0;
   int64 total = numBucket;
   int64 top = total - total * r;

   elog(INFO, "Scan threshold: %f\n", r);
   elog(INFO, "Hash List Length Histogram:\n");
   elog(INFO, "Empty: %d\n", numEmpty);
   for (int i = 0; i < nHistogram - 1; i++) {
   sum += histogram[i];
   if (sum <= top)
   maxScan = 0x1 << (i + 1);
   elog(INFO, "List length <= %d: %d (%.2f%%)\n", (0x1 << i), histogram[i], histogram[i] * 100.0 / total);
   }
   sum += histogram[nHistogram - 1];
   if (sum <= top)
   maxScan = 0x1 << nHistogram;
   elog(INFO, "List length for the rest: %d (%.2f%%)\n", histogram[nHistogram - 1], histogram[nHistogram - 1] * 100.0 / total);
   elog(INFO, "Choose maximum of scan: %d\n", maxScan);
   */
}

/*
 *	EmbedHashTable::preProcessInsert()
 *	Counting pass for one key. Nothing is stored yet; only the
 *	statistics and bitmaps are updated:
 *	  - first key in a bucket: set its empty bit, one empty bucket less;
 *	  - second key in a bucket: set its collision bit and reserve two
 *	    overflow entries (both keys go to the overflow array);
 *	  - any further key: reserve one more overflow entry.
 *	buckets[] holds the number of overflow entries reserved per bucket.
 */
void EmbedHashTable::preProcessInsert(int64 * key) {
  uint32 slot;

  numEntry++;

  /* map the key to its bucket */
  HASH_FUNCTION(key, numBucket, words, slot);

  const uint32 byteIdx = slot / BITS_PER_BYTE;
  const uint32 bitIdx = slot % BITS_PER_BYTE;

  /* bucket seen for the first time */
  if (!BITMAP_IS(emptyBits[byteIdx], bitIdx)) {
    BITMAP_SET(emptyBits[byteIdx], bitIdx);
    numEmpty--;
    return;
  }

  /* bucket already known to collide: one more overflow entry */
  if (BITMAP_IS(collisionBits[byteIdx], bitIdx)) {
    numOverflowEntry++;
    buckets[slot]++;
    return;
  }

  /* first collision: both colliding entries move to the overflow array */
  BITMAP_SET(collisionBits[byteIdx], bitIdx);
  numCollision++;
  numOverflowEntry += 2;
  buckets[slot] += 2;
}

/*
 *	EmbedHashTable::buildInit()
 *	Does no work and always returns SUCCESS. The overflow pool is
 *	allocated and cleared in preProcessEnd(), not here.
 */
int EmbedHashTable::buildInit() {
  return SUCCESS;
}

/*
 *	EmbedHashTable::insert()
 *	Store the position of one segment (entry) in the hash table.
 *
 *	key    - key of the segment, hashed to select the bucket
 *	offset - position of the segment to record
 *
 *	If the collision bit of the bucket is 0, the position is stored
 *	directly in the bucket. Otherwise the bucket holds a position in
 *	the overflow array (set up by preProcessEnd()); it is decremented
 *	first and the entry is written at the resulting position, so the
 *	overflow list of a bucket is filled from its last slot backwards.
 *
 *	The unused scratch locals and the unused sequence fetch of the
 *	earlier version were removed; they had no effect on the table.
 */
void EmbedHashTable::insert(int64 * key, unsigned int offset) {
  uint32 curOverflowEntry;
  uint32 bucketId;
  bool collision;

  /*	compute the hash value */
  HASH_FUNCTION(key, numBucket, words, bucketId);

  /*
   *	compressed tables keep the collision flag inside the bucket
   *	value, uncompressed tables keep it in a separate bitmap.
   */
  if (compressedTable)
    collision = HASH_IS_COLLISION(buckets[bucketId]);
  else
    collision =
        BITMAP_IS(collisionBits[bucketId / BITS_PER_BYTE], bucketId % BITS_PER_BYTE);

  if (!collision) {
    /*	store the position into the bucket */
    if (compressedTable)
      buckets[bucketId] = HASH_GET_OFFSET(offset);
    else
      buckets[bucketId] = offset;
  } else {
    /* step back to the next free slot of this bucket's overflow list */
    buckets[bucketId]--;

    /* get the overflow list position */
    if (compressedTable)
      curOverflowEntry = HASH_GET_OFFSET(buckets[bucketId]);
    else
      curOverflowEntry = buckets[bucketId];

    /*
     *	write the position of the new segment into that slot.
     *	NOTE(review): HASH_SET_OFFSET presumably keeps the end-of-list
     *	flag already stored in the entry -- confirm in hash.h.
     */
    if (compressedTable)
      overflowPool[curOverflowEntry] =
          HASH_SET_OFFSET(overflowPool[curOverflowEntry], offset);
    else
      overflowPool[curOverflowEntry] = offset;
  }
}

/*
 *	EmbedHashTable::lookup()
 *	Search the segment (key) in the hash table and verify every
 *	candidate position found for it. For each candidate the matching
 *	portion of the sequence is extracted, PairAligner::pairAlign is
 *	called to align it against the query, and candidates within
 *	nMaxError errors are added to the hit set.
 *
 *	orgkey    - query passed to pairAlign() and to hits->add()
 *	key       - key that is hashed to select the bucket
 *	keyOffset - subtracted from the bit position of a candidate and,
 *	            divided by BITS_PER_BASE, from its base position
 *	quals     - not referenced in this function
 *	s         - strand, forwarded to hits->add()
 *	rid       - forwarded to hits->add()
 *	hits      - hit set receiving the verified candidates
 *	noGap     - when true the gap allowance is 0, otherwise nMaxGap
 *
 *	Returns SUCCESS, or MSG_HITSETFULL once hits->add() reports that
 *	the hit set is full (the overflow scan stops at that point).
 *
 *	NOTE(review): num, diff, sid, soffset, startOffset and nScanEntry
 *	are never read for any decision in this function.
 */
unsigned int EmbedHashTable::lookup(int64 * orgkey, int64 * key, int keyOffset,
    char * quals, strand s, int rid, HitSet * hits, bool noGap) {
  int num;
  uint32 bucketId;
  uint32 seqOffset, entryOffset, startOffset;
  int64 tspace1[16], tspace2[16];
  /* scratch buffers; both pointers start in the middle of their array */
  int64 * diff = &tspace1[8], *target = &tspace2[8];
  int64 * seqVector;
  bool collision;
  uint32 sid, soffset;
  int ret;
  ErrorVector error;
  int nScanEntry = 0;
  int rett = SUCCESS;
  int maxGap = 0;

  /* gaps are only considered when the caller allows them */
  if (!noGap)
    maxGap = nMaxGap;

  seqVector = sequence->getSequence();

  /*	compute the hash value */
  HASH_FUNCTION(key, numBucket, words, bucketId);

  /*
   *	compressed tables keep the collision flag inside the bucket
   *	value, uncompressed tables keep it in a separate bitmap.
   */
  if (compressedTable)
    collision = HASH_IS_COLLISION(buckets[bucketId]);
  else
    collision =
        BITMAP_IS(collisionBits[bucketId / BITS_PER_BYTE], bucketId % BITS_PER_BYTE);

  statEmbedHashLookup++;

  if (!collision) {
    /* the bucket itself holds the only candidate (or HASH_EMPTY) */
    /*	get the position of potential matched portion */
    if (compressedTable)
      seqOffset = HASH_GET_OFFSET(buckets[bucketId]);
    else
      seqOffset = buckets[bucketId];

    if (seqOffset != HASH_EMPTY)
    {
      statEmbedHashLookupEntry++;
      HASH_DEBUG(printf(" %u", seqOffset - keyOffset / BITS_PER_BASE));

      /* debug-only block; lenRest and lenKey are not declared in this function */
#ifdef DEBUG_HASH_PRINT
      BitRead::extract(seqVector, target,
          seqOffset * BITS_PER_BASE_LL + lenRest,
          lenKey);
      if (BitRead::compare(target, key))
      printf("*");
#endif
      /*
       *	get the potential matched portion; the window starts maxGap
       *	bases early and is 2 * maxGap bases longer than the query.
       */
      BitRead::extract(seqVector, target,
          seqOffset * BITS_PER_BASE_LL - maxGap * BITS_PER_BASE - keyOffset,
          lenSeq + maxGap * 2 * BITS_PER_BASE);

      nScanEntry++;

      /**
       *	perform the pairwise alignment under the constraint
       *	on the number of errors.
       */
      error = PairAligner::pairAlign(orgkey, target, length, nMaxError, maxGap);

      if (error.num <= nMaxError) {
        /* convert the candidate into the start position of the match */
        seqOffset = seqOffset - maxGap + error.offset
            - keyOffset / BITS_PER_BASE;

        if (maxGap != 0) {
          /**
           * 	if supports indel, re-extract the matched portion
           * 	with proper offset and length
           **/
          BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL,
          error.len * BITS_PER_BASE);
        }

        ret = hits->add(orgkey, target, seqOffset, s, &error, error.qual, rid);
        if (ret == MSG_HITSETFULL)
        {
          HASH_DEBUG(printf(" HIT"));
          rett = ret;
        }
      }
    }
//		else
//			stat_empty++;
  } else {
//		stat_collision++;
    /* get the position of the overflow list */
    if (compressedTable)
      entryOffset = HASH_GET_OFFSET(buckets[bucketId]);
    else
      entryOffset = buckets[bucketId];

    startOffset = entryOffset;

    /*
     *	scan the overflow list; the loop ends at the entry flagged as
     *	end of list, when the hit set is full, or at the end of the pool.
     */
    while (entryOffset < numOverflowEntry) {
      /*	get the position of potential matched portion */
      if (compressedTable)
        seqOffset = HASH_GET_OFFSET(overflowPool[entryOffset]);
      else
        seqOffset = overflowPool[entryOffset];

      if (seqOffset != HASH_EMPTY)
      {
        statEmbedHashLookupEntry++;
        HASH_DEBUG(printf(" %u", seqOffset - keyOffset / BITS_PER_BASE));
        /* debug-only block; lenRest and lenKey are not declared in this function */
#ifdef DEBUG_HASH_PRINT
        BitRead::extract(seqVector, target,
            seqOffset * BITS_PER_BASE_LL + lenRest,
            lenKey);
        if (BitRead::compare(target, key))
        printf("*");
#endif

        /*	get the potential matched portion (same window as above) */
        BitRead::extract(seqVector, target,
            seqOffset * BITS_PER_BASE_LL - maxGap * BITS_PER_BASE - keyOffset,
            lenSeq + maxGap * 2 * BITS_PER_BASE);

        nScanEntry++;

        /**
         *	perform the pairwise alignment under the constraint
         *	on the number of errors.
         */
        error = PairAligner::pairAlign(orgkey, target, length, nMaxError,
            maxGap);

        if (error.num <= nMaxError) {
          /* convert the candidate into the start position of the match */
          seqOffset = seqOffset - maxGap + error.offset
              - keyOffset / BITS_PER_BASE;

          if (maxGap != 0) {
            /**
             * 	if supports indel, re-extract the matched portion
             * 	with proper offset and length
             **/
            BitRead::extract(seqVector, target, seqOffset * BITS_PER_BASE_LL,
            error.len * BITS_PER_BASE);
          }

          ret = hits->add(orgkey, target, seqOffset, s, &error, error.qual,
              rid);
          if (ret == MSG_HITSETFULL)
          {
            HASH_DEBUG(printf(" HIT"));
            rett = ret;
            break;
          }
        }
      }

//			if (entryOffset > startOffset + HASH_OVERFLOW_LIST_SCAN_BOUND)
//			if (entryOffset > startOffset + 64 * 16)
//				break;

      /* stop after the entry that carries the end-of-list flag */
      if (compressedTable) {
        if (HASH_IS_END(overflowPool[entryOffset]))
          break;
      } else {
        if (BITMAP_IS(overflowBits[entryOffset / BITS_PER_BYTE], entryOffset % BITS_PER_BYTE))
          break;
      }

      entryOffset++;
    }
  }

  HASH_DEBUG(printf("\n"));

  return rett;
}

/*
 *	EmbedHashTable::save()
 *	Write the in-memory hash table to an open file. The layout is:
 *	the raw image of this object (pointer members included), the
 *	bucket array, the overflow pool and, for uncompressed tables only,
 *	the collision bitmap followed by the overflow bitmap.
 *
 *	Returns SUCCESS, or ERR_FILE as soon as a write comes up short.
 */
int EmbedHashTable::save(FILE * file) {
  /* table header: the object itself */
  if (fwrite(this, sizeof(EmbedHashTable), 1, file) != 1)
    return ERR_FILE;

  if (fwrite(buckets, sizeof(uint32), numBucket, file) != numBucket)
    return ERR_FILE;

  if (fwrite(overflowPool, sizeof(uint32), numOverflowEntry, file)
      != numOverflowEntry)
    return ERR_FILE;

  /* compressed tables carry their flags inside the entries */
  if (compressedTable)
    return SUCCESS;

  if (fwrite(collisionBits, sizeof(unsigned char),
      numBucket / BITS_PER_BYTE + 1, file) != numBucket / BITS_PER_BYTE + 1)
    return ERR_FILE;

  if (fwrite(overflowBits, sizeof(unsigned char),
      numOverflowEntry / BITS_PER_BYTE + 1, file)
      != numOverflowEntry / BITS_PER_BYTE + 1)
    return ERR_FILE;

  return SUCCESS;
}

/*
 *	HashTable::load()
 *	This function is used to load the on-disk copy of hash table 
 *	into memory.
 */
int EmbedHashTable::load(FILE * file, CompactSequence * seq) {
  size_t ret;

  ret = fread(this, sizeof(EmbedHashTable), 1, file);
  if (ret != 1) {
    elog(ERROR, "failed to load hash table head.\n");
    return ERR_FILE;
  }

  sequence = seq;

  /* we can use smaller block to reduct the memory consumption */
  buckets = (uint32 *) malloc((int64) numBucket * sizeof(uint32));
  if (buckets == NULL)
    return ERR_MEM;
  ret = fread(buckets, sizeof(uint32), numBucket, file);
  if (ret != numBucket) {
    elog(ERROR, "failed to load buckets.\n");
    return ERR_FILE;
  }

  overflowPool = (uint32 *) malloc((int64) numOverflowEntry * sizeof(uint32));
  if (overflowPool == NULL)
    return ERR_MEM;
  ret = fread(overflowPool, sizeof(uint32), numOverflowEntry, file);
  if (ret != numOverflowEntry) {
    elog(ERROR, "failed to load overflow array.\n");
    return ERR_FILE;
  }

  if (!compressedTable) {
    collisionBits = (unsigned char *) malloc(
        (int64) (numBucket / BITS_PER_BYTE + 1) * sizeof(unsigned char));
    if (collisionBits == NULL)
      return ERR_MEM;
    ret = fread(collisionBits, sizeof(unsigned char),
        numBucket / BITS_PER_BYTE + 1, file);
    if (ret != numBucket / BITS_PER_BYTE + 1) {
      elog(ERROR, "failed to load collision bits.\n");
      return ERR_FILE;
    }

    overflowBits = (unsigned char *) malloc(
        (int64) (numOverflowEntry / BITS_PER_BYTE + 1) * sizeof(unsigned char));
    if (overflowBits == NULL)
      return ERR_MEM;
    ret = fread(overflowBits, sizeof(unsigned char),
        numOverflowEntry / BITS_PER_BYTE + 1, file);
    if (ret != numOverflowEntry / BITS_PER_BYTE + 1) {
      elog(ERROR, "failed to load overflow bits. %d %d\n", ret,
          numOverflowEntry / BITS_PER_BYTE + 1);
      return ERR_FILE;
    }
  }

  return SUCCESS;
}

/*
 *	HashTable::remove
 *	free the space occupied by the hash index.
 */
int EmbedHashTable::remove() {
  if (buckets) {
    free(buckets);
    buckets = NULL;
  }

  if (overflowPool) {
    free(overflowPool);
    overflowPool = NULL;
  }

  if (emptyBits) {
    free(emptyBits);
    emptyBits = NULL;
  }

  if (collisionBits) {
    free(collisionBits);
    collisionBits = NULL;
  }

  return SUCCESS;
}

/*
 * nextPrime()
 * Return the smallest odd prime that is greater than or equal to
 * the first odd number not below num (num itself when it is odd,
 * num + 1 when it is even).
 *
 * At most 500 odd candidates are tested by trial division. If none
 * of them is prime, or the candidates would run past the range of
 * unsigned int, the first untested candidate is returned as is.
 *
 * The divisor test now includes the square root itself, so odd
 * composites such as 9, 15, 25 or 49 are no longer reported as
 * prime, and 1 is no longer treated as a prime.
 */
unsigned int EmbedHashTable::nextPrime(unsigned int num) {
  unsigned int candidate = num / 2 * 2 + 1;
  unsigned int tries, j;

  for (tries = 0; tries < 500; tries++) {
    if (candidate >= 3) {
      /*
       * trial division by odd numbers up to sqrt(candidate);
       * "j <= candidate / j" is "j * j <= candidate" without overflow.
       */
      for (j = 3; j <= candidate / j; j += 2) {
        if (candidate % j == 0)
          break;
      }

      /* the loop ran out of divisors: candidate is prime */
      if (j > candidate / j)
        return candidate;
    }

    /* stop instead of wrapping around past the largest unsigned int */
    if (candidate + 2 < candidate)
      break;

    candidate += 2;
  }

  return candidate;
}