File: model.cpp

package info (click to toggle)
wham-align 0.1.5-8
links: PTS, VCS
area: main
in suites: bookworm, bullseye, forky, sid, trixie
size: 892 kB
sloc: cpp: 8,769; sh: 76; makefile: 52
file content (223 lines) | stat: -rw-r--r-- 6,339 bytes
/**
 *    WHAM - high-throughput sequence aligner
 *    Copyright (C) 2011  WHAM Group, University of Wisconsin
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*	$Id: model.cpp 165 2012-11-26 10:23:16Z yinan $ */

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include "model.h"
#include "error.h"

/*
 *	AlignerModel::computeNumIndex(int nError, int nPartition)
 *	Given the number of errors and the number of partitions,
 *	compute the number of required hash tables, according to the
 *	formula C(nPartition-1, nError).
 *
 *	Returns 1 when nError <= 0 and 0 when nError > nPartition-1.
 */

int AlignerModel::computeNumIndex(int nError, int nPartition) {
  const int n = nPartition - 1;

  /* choosing nothing: exactly one combination */
  if (nError <= 0)
    return 1;

  /* cannot choose more elements than are available */
  if (n < nError)
    return 0;

  /* C(n, k) == C(n, n-k); iterate over the smaller of the two */
  const int k = nError < n - nError ? nError : n - nError;

  /*
   * Multiply and divide in lock-step. After step i the value is
   * C(n-k+i, i), so every division is exact and the intermediate value
   * never grows far beyond the final result. Multiplying all factors
   * first overflows 64 bits even when the result itself is small.
   */
  unsigned long long x = 1;
  for (int i = 1; i <= k; i++)
    x = x * (unsigned long long) (n - k + i) / i;

  return (int) x;
}

/*
 *	AlignerModel::computeNumLookup(int nError, int nPartition)
 *	Given the number of errors and the number of partitions,
 *	compute the number of lookups for each alignment, according
 *	to the formula C(nPartition, nError).
 *
 *	Returns 1 when nError <= 0 and 0 when nError > nPartition.
 */

int AlignerModel::computeNumLookup(int nError, int nPartition) {
  /* choosing nothing: exactly one combination */
  if (nError <= 0)
    return 1;

  /* cannot choose more elements than are available */
  if (nPartition < nError)
    return 0;

  /* C(n, k) == C(n, n-k); iterate over the smaller of the two */
  const int k =
      nError < nPartition - nError ? nError : nPartition - nError;

  /*
   * Multiply and divide in lock-step. After step i the value is
   * C(nPartition-k+i, i), so every division is exact and the
   * intermediate value never grows far beyond the final result.
   * Multiplying all factors first overflows 64 bits even when the
   * result itself is small.
   */
  unsigned long long x = 1;
  for (int i = 1; i <= k; i++)
    x = x * (unsigned long long) (nPartition - k + i) / i;

  return (int) x;
}

/*
 *	AlignerModel::estimateNumPartition(unsigned int nEntry, int length,
 *	                                   int nError, bool memory)
 *	Estimate the optimal number of partitions based on a cost model, given
 *	parameters of the aligner.
 *
 *	The candidates nError+1 .. nError+5 are evaluated and the one with the
 *	smallest estimated lookup cost is returned. When memory is true,
 *	candidates whose estimated index size is not below the value reported
 *	by getFreeMemory() are skipped. Returns 0 when no candidate is
 *	accepted.
 */
int AlignerModel::estimateNumPartition(unsigned int nEntry, int length,
    int nError, bool memory) {
  /*
   * getFreeMemory() takes no arguments, so query it once instead of once
   * per candidate, and only when the memory check is requested.
   */
  const unsigned int szMemory = memory ? getFreeMemory() : 0;

  double min = 1000000000;
  int bestp = 0;

  for (int p = nError + 1; p <= nError + 5; p++) {
    if (memory && estimateIndexSpace(nEntry, length, nError, p) >= szMemory)
      continue;

    const unsigned int nLookup = computeNumLookup(nError, p);

    /*	number of buckets: 4^(...) capped by the number of entries */
    const double nSpace = pow(4.0, length / p * (p - nError));
    const unsigned int nBucket =
        (double) nEntry < nSpace ? nEntry : (unsigned int) nSpace;

    double costLookup = 0;

    /*	probability of empty bucket */
    const double p0 = pow(1 - double(1) / nBucket, nEntry);
    costLookup += p0;

    /*	probability of non-overflow bucket */
    const double p1 = nEntry * (double(1) / nBucket)
        * pow(1 - double(1) / nBucket, nEntry - 1);
    costLookup += 2 * p1;

    /*
     * Overflow bucket. The term is p2 * (X / (nBucket * p2)) with
     * p2 = 1 - p0 - p1; p2 cancels, and evaluating the reduced form
     * avoids 0/0 = NaN when p2 is exactly zero (a NaN cost can never
     * compare below min, so the candidate would be silently dropped).
     */
    costLookup += (nEntry - nEntry * p1) / nBucket;

    const double cost = costLookup * nLookup;
    if (cost < min) {
      min = cost;
      bestp = p;
    }
  }

  return bestp;
}

/*
 *	AlignerModel::isFitMemory(unsigned int nEntry, int length,
 *	                          int nMismatch, int nPartition)
 *	Report whether the estimated index size plus a 200 MB reserve is
 *	below the memory size reported by getFreeMemory(). Both values are
 *	logged at DEBUG1 level.
 */
bool AlignerModel::isFitMemory(unsigned int nEntry, int length, int nMismatch,
    int nPartition) {
  const unsigned int indexMB = estimateIndexSpace(nEntry, length, nMismatch,
      nPartition);
  const unsigned int memoryMB = getFreeMemory();

  elog(DEBUG1, "Estimated index size: %d MB\n", indexMB);
  elog(DEBUG1, "Free memory size: %d MB\n", memoryMB);

  /*	keep 200 MB of headroom on top of the index itself */
  const unsigned int requiredMB = indexMB + 200;
  if (requiredMB < memoryMB)
    return true;
  return false;
}

/*
 *	AlignerModel::getNumHashtableFitMemory(unsigned int nEntry, int length,
 *	                                       int nMismatch, int nPartition)
 *	Return how many hash tables fit in memory, capped at the number of
 *	hash tables given by computeNumIndex(nMismatch, nPartition).
 *
 *	The budget is the value reported by getFreeMemory() minus the
 *	sequence size estimate and a 200 MB reserve. Returns 0 when the
 *	budget is exhausted. When getFreeMemory() reports 0 (memory size
 *	unavailable), all hash tables are assumed to fit.
 */
unsigned int AlignerModel::getNumHashtableFitMemory(unsigned int nEntry,
    int length, int nMismatch, int nPartition) {
  /*	memory left untouched, in MB */
  const unsigned int szReserved = 200;

  const unsigned int szSeq = nEntry / 32 * 3 * sizeof(int) / 1024 / 1024;
  const unsigned int nHashtable = computeNumIndex(nMismatch, nPartition);
  const unsigned int szIndex = estimateHashtableSpace(nEntry, length,
      nMismatch, nPartition);
  const unsigned int szMemory = getFreeMemory();

  elog(DEBUG1, "Estimated hashtable size: %d MB\n", szIndex);
  elog(DEBUG1, "Free memory size: %d MB\n", szMemory);

  /*
   * getFreeMemory() returns 0 when it cannot determine the memory size.
   * Without a figure to budget against, do not restrict the number of
   * hash tables.
   */
  if (szMemory == 0)
    return nHashtable;

  /*
   * All quantities are unsigned: subtracting first would wrap around to
   * a huge budget when memory is at or below the reserve, and every
   * hash table would wrongly be reported as fitting.
   */
  if (szMemory <= szSeq + szReserved)
    return 0;

  const unsigned int nFit = (szMemory - szSeq - szReserved) / szIndex;
  return nFit < nHashtable ? nFit : nHashtable;
}

/*
 *	AlignerModel::getFreeMemory()
 *	Return the MemTotal value of /proc/meminfo converted to MB, or 0 when
 *	the file cannot be opened or the value cannot be parsed.
 *
 *	NOTE: despite its name, this function reports the total memory size,
 *	not the amount of currently free memory.
 */
unsigned int AlignerModel::getFreeMemory() {
  unsigned long long memTotalKB = 0;
  FILE * file;

  file = fopen("/proc/meminfo", "r");
  if (file == NULL) //on non-linux system
    return 0;

  /*
   * Only MemTotal contributes to the result, so it is the only field
   * read. The conversion count is checked so that a parse failure yields
   * 0 instead of an indeterminate value, and a 64-bit variable is used
   * so that a large kB count does not overflow before the division.
   */
  if (fscanf(file, "MemTotal: %llu kB", &memTotalKB) != 1)
    memTotalKB = 0;
  fclose(file);

  return (unsigned int) (memTotalKB / 1024);
}

/*
 *	AlignerModel::estimateIndexSpace(unsigned int nEntry, int length,
 *	                                 int nError, int nPartition)
 *	Estimate the size of the whole index in MB: the sequence size estimate
 *	plus the size of one hash table multiplied by the number of hash
 *	tables. The result is never less than 1.
 */
unsigned int AlignerModel::estimateIndexSpace(unsigned int nEntry, int length,
    int nError, int nPartition) {
  /*	sequence part: 3 ints for every 32 entries, expressed in MB */
  const unsigned int seqMB = nEntry / 32 * 3 * sizeof(int) / 1024 / 1024;

  const int tableCount = computeNumIndex(nError, nPartition);
  const unsigned int tableMB = estimateHashtableSpace(nEntry, length, nError,
      nPartition);

  const unsigned int totalMB = seqMB + tableMB * tableCount;

  /*	report at least 1 MB */
  return totalMB == 0 ? 1 : totalMB;
}

/*
 *	AlignerModel::estimateHashtableSpace(unsigned int nEntry, int length,
 *	                                     int nError, int nPartition)
 *	Estimate the size of a single hash table in MB. The result is never
 *	less than 1.
 */
unsigned int AlignerModel::estimateHashtableSpace(unsigned int nEntry,
    int length, int nError, int nPartition) {
  /*	number of buckets: 4^(...) capped by the number of entries */
  const double nSpace = pow(4.0,
      length / nPartition * (nPartition - nError));
  const unsigned int nBucket =
      (double) nEntry < nSpace ? nEntry : (unsigned int) nSpace;

  /*
   * With no buckets (no entries, or a key space below 1) the formula
   * below divides by zero and converts a non-finite value to an
   * integer. Report the 1 MB floor instead.
   */
  if (nBucket == 0)
    return 1;

  /*	probability of non-overflow bucket */
  const double p1 = nEntry * (double(1) / nBucket)
      * pow(1 - double(1) / nBucket, nEntry - 1);

  /*
   * (entries outside non-overflow buckets + buckets) ints, in MB.
   * The empty-bucket and overflow probabilities do not enter this
   * estimate, so they are not computed here.
   */
  unsigned int szHashtable = (unsigned int) ((nEntry - nEntry * p1 + nBucket)
      / 1024 / 1024 * sizeof(int));

  if (szHashtable == 0)
    szHashtable = 1;
  return szHashtable;
}