File: sequence.cpp

package info (click to toggle)
wham-align 0.1.5-8
links: PTS, VCS
area: main
in suites: bookworm, bullseye, sid, trixie
size: 892 kB
sloc: cpp: 8,769; sh: 76; makefile: 52
file content (896 lines) | stat: -rw-r--r-- 23,836 bytes
/**
 *    WHAM - high-throughput sequence aligner
 *    Copyright (C) 2011  WHAM Group, University of Wisconsin
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*	$Id: sequence.cpp 157 2012-07-25 05:58:09Z yinan $ */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <memory.h>
#include <assert.h>
#include "error.h"
#include "sequence.h"
#include "bitread.h"
#include "util.h"
#include "aligner.h"
#include "hash.h"

/*
 * Lookup table from a 3-bit base code to its character:
 * A = 0, C = 1, G = 2, T = 3; codes 4 through 7 all decode to 'N'.
 * If the value of A, C, G, T is changed, modify this array.
 */
const char code2Gene[8] = { 'A', 'C', 'G', 'T', 'N', 'N', 'N', 'N' };

/*
 * Default constructor: clear every byte of the object, then turn
 * skipMask on (preProcess/build then treat lower-case a/c/g/t as
 * unknown characters).
 */
CompactSequence::CompactSequence() {
  memset(this, 0, sizeof(*this));
  this->skipMask = true;
}

/*
 * Constructor with an explicit mask policy: clear every byte of the
 * object, then store the caller's choice in skipMask.
 */
CompactSequence::CompactSequence(bool skip) {
  memset(this, 0, sizeof(*this));
  this->skipMask = skip;
}

/*
 *  CompactSequence::skipLine
 *  Consume characters from the file up to and including the next
 *  line terminator (LF or CR), or until end of file.
 *
 *  Returns the number of fgetc calls made, i.e. the skipped
 *  characters plus one for the terminator (or for EOF).
 */
int CompactSequence::skipLine(FILE * file) {
  int numRead = 0;

  while (true) {
    /*
     * fgetc returns an int so that EOF can be told apart from every
     * valid byte. Storing it in a char would make the EOF test never
     * succeed where char is unsigned, and would mistake byte 0xFF
     * for EOF where char is signed.
     */
    const int c = fgetc(file);
    numRead++;
    if (c == '\n' || c == '\r' || c == EOF)
      break;
  }
  return numRead;
}

/*
 *  CompactSequence::getSeqName
 *  Read the remainder of the current line (up to LF, CR or EOF; the
 *  terminator is consumed) into str and keep only the text before the
 *  first blank as the sequence name.
 *
 *  str receives a NUL-terminated name. The return value is the number
 *  of characters on the line before the terminator, which can exceed
 *  the length of the stored name when the line contains a blank.
 *
 *  NOTE(review): the size of str is not known here, so a line longer
 *  than the caller's buffer still overflows it. The caller in build()
 *  passes MAX_LENGTH_PATH bytes per name.
 */
int CompactSequence::getSeqName(FILE * file, char * str) {
  int i = 0;

  while (true) {
    /* keep the result as int so EOF is distinguishable from any byte */
    const int ch = fgetc(file);
    if (ch == '\n' || ch == '\r' || ch == EOF)
      break;
    str[i] = static_cast<char>(ch);
    i++;
  }

  /*
   * Terminate the buffer BEFORE searching it: the destination is not
   * guaranteed to be zero-filled, so an unterminated search could run
   * past the name and cut the string at an unrelated blank.
   */
  str[i] = '\0';

  char * blank = strchr(str, ' ');
  if (blank != NULL)
    *blank = '\0';

  return i;
}

/*
 *  CompactSequence::extractFileName
 *  Copy the base name of src into dest: everything up to and including
 *  the last '/' or '\\' is dropped, and a trailing ".fq", ".fa",
 *  ".fastq" or ".mfa" extension is removed. dest is NUL-terminated.
 */
void CompactSequence::extractFileName(char * dest, char * src) {
  const int srcLen = strlen(src);
  int first = 0;
  int last = srcLen - 1;

  /* walk backwards so the scan stops at the last path separator */
  for (int pos = srcLen - 1; pos >= 0; --pos) {
    const char ch = src[pos];

    if (ch == '/' || ch == '\\') {
      first = pos + 1;
      break;
    }

    if (ch != '.')
      continue;

    /* a dot: cut here only if what follows is a known extension */
    const char * ext = src + pos + 1;
    const bool known = !strcmp(ext, "fq") || !strcmp(ext, "fa")
        || !strcmp(ext, "fastq") || !strcmp(ext, "mfa");
    if (known)
      last = pos - 1;
  }

  const int count = last - first + 1;
  strncpy(dest, src + first, count);
  dest[count] = '\0';
}

/*
 *  CompactSequence::preProcess
 *  Scan one reference file and collect the statistics needed by the
 *  building phase:
 *  1) num: the number of effective characters. Effective characters
 *     are all A, C, G and T characters (plus a, c, g, t when skipMask
 *     is off) and the first numError+1 unknown characters of every
 *     run of unknown characters.
 *  2) numNSegment: the number of runs of more than numError+1 unknown
 *     characters that are followed by a valid character.
 *  3) numFileSeq: the number of '>' header lines in the file.
 *
 *  Returns SUCCESS, ERR_FILE if the file cannot be opened, or the
 *  non-zero result of fclose if closing fails.
 */
int CompactSequence::preProcess(char * fname, uint32 numError, int64 & num,
    int64 & numNSegment, int64 & numFileSeq) {
  FILE * file = fopen(fname, "rb");
  if (file == NULL)
  {
    printf("File does not exist.\n");
    return ERR_FILE;
  }

  num = 0;
  numNSegment = 0;
  numFileSeq = 0;

  unsigned int numContinuousN = 0;

  /*
   * c must be an int: fgetc reports end of file with EOF, which a
   * char cannot represent reliably (the loop would never end where
   * char is unsigned).
   */
  int c;
  while ((c = fgetc(file)) != EOF) {
    /* a header line: count the sequence and skip the rest of the line */
    if (c == '>') {
      skipLine(file);
      numFileSeq++;
      numContinuousN = 0;
      continue;
    }

    /* line breaks carry no sequence data */
    if (c == 10 || c == 13)
      continue;

    /* everything except the valid bases is an unknown character */
    bool isUnknownChar = true;
    if (c == 'A' || c == 'C' || c == 'G' || c == 'T') {
      isUnknownChar = false;
    } else if ((c == 'a' || c == 'c' || c == 'g' || c == 't') && !skipMask) {
      isUnknownChar = false;
    }

    if (!isUnknownChar) {
      /*
       * a run of more than (numError + 1) unknown characters that
       * just ended is counted as one N segment.
       */
      if (numContinuousN > numError + 1)
        numNSegment++;
      num++;
      numContinuousN = 0;
    } else {
      numContinuousN++;

      /*
       * only the first numError + 1 unknown characters of a run are
       * counted as effective characters.
       */
      if (numContinuousN <= numError + 1)
        num++;
    }
  }

  const int ret = fclose(file);
  if (ret != 0)
    return ret;

  return SUCCESS;
}

/*
 *	CompactSequence::build
 *	This function is used to build the compact sequence. The compact 
 *	sequence contains all effective characters, each of which is 
 *	represented by three bits. Effective characters include all A, C, 
 *	G and T characters, and the first numError+1 unknown characters 
 *	in each N segment, which is the segment that contains continugous 
 *	numError+1 Ns. The interval tree is built to transfer the location 
 *	in the original sequnce and compact sequence.
 *	1) invoke preProcess to collect sequence infos.
 *	2) allocate the sequence space.
 *	3) load effective characters of the original sequences into the 
 *	compact sequence.
 */
int CompactSequence::build(char ** fname, int numFile, int length,
    int numError) {
  unsigned int i, j, ret;
  int curSeq;
  FILE * file;
  char c;
  unsigned int offsetInCmptSeq, offsetInOrgSeq;
  unsigned int lenSegment, lenSegmentN;
  unsigned int step, nextstep;
  int64 numFileChar, numFileNSeg, numFileSeq, nChar, nNSegment;
  int64 word, code;

  len = length;
  nError = numError;

  /*
   *	scan all sequences to accumulate the number of effective
   *	characters and N segments.
   */
  nChar = 0;
  nSeq = 0;
  nNSegment = 0;
  elog(INFO, "Preprocessing reference sequences...\n");
  for (i = 0; i < numFile; i++) {
    elog(INFO, "preprocessing %s...\n", fname[i]);

    ret = preProcess(fname[i], numError, numFileChar, numFileNSeg, numFileSeq);
    if (ret != SUCCESS
      )
      return ret;

    /* we add numError+1 Ns between two adjacent sequences */
    nChar += numFileChar + (numError + 1) * numFileSeq;

    /* accumulate the number of sequences in all files */
    nSeq += numFileSeq;

    /*
     * we use 32-bit entry in hash tables, and the MSB is reservered.
     * So we support up to 2^31 non-N characters in the reference
     * sequences.
     */
    if (nChar > MAX_NUM_CHAR)
    {
      //	elog(ERROR, "#characters: %lld\n", nChar);
      //	elog(ERROR, "ERROR: the number of non-N characters in the reference sequences exceeds the maximum value %d\n", MAX_NUM_CHAR);
      //	return ERR_SEQ;
    }

    /* we add a N segment in the begining of each sequence in the file */
    nNSegment += numFileNSeg + numFileSeq;
  }
  /* we add a N segment in the end of all sequences */
  nNSegment++;
  nChar += numError + 1;

  elog(INFO, "\n");

  /*
   * we use 32-bit entry in hash tables, and the MSB is reservered.
   * So we support up to 2^31 non-N characters in the reference
   * sequences.
   */
  if (nChar > MAX_NUM_CHAR)
  {
    elog(ERROR, "#characters: %lld\n", nChar);
    elog(
        ERROR,
        "ERROR: the number of non-N characters in the reference sequences exceeds the maximum value %d\n",
        MAX_NUM_CHAR);
    return ERR_SEQ;
  }

  numChar = (uint32) nChar;
  numNSegment = (uint32) nNSegment;

  /*
   *	allocate the sequence space. SEQUENCE_HEAD_WORDS integers
   *	should be left in the begining of the sequence. The left
   *	space is used to avoid the memory overflow when get the
   *	the subsequence starting from the first several characters.
   */
  size = NUM_LONGWORD_BASE(numChar + length) + SEQUENCE_HEAD_WORDS;
  pool = (int64 *) malloc(size * sizeof(int64));
  sequence = pool + SEQUENCE_HEAD_WORDS;

  memset(pool, 0, SEQUENCE_HEAD_WORDS * sizeof(int64));

  /* build an empty interval tree */
  itree = new IntervalTree(numNSegment, nError + 1);

  /* allocate space for seuqence names */
  seqNames = new char *[nSeq];
  seqNamepool = new char[nSeq * MAX_LENGTH_PATH];for
(  i = 0; i < nSeq; i++)
  seqNames[i] = &seqNamepool[i * MAX_LENGTH_PATH];

  /* allocate space for sequence lens */
  seqLens = new uint32[nSeq];

  word = 0;
  offsetInCmptSeq = 0;
  lenSegment = 0;
  lenSegmentN = 0;
  curSeq = -1;

  ProgressBar bar(numChar - 1, PROGRESS_BAR_WIDTH);

  elog(INFO, "loading reference sequences...\n");
  for (i = 0; i < numFile; i++) {
    /*	open the sequence file */
    file = fopen(fname[i], "rb");
    if (file == NULL)
    {
      return ERR_PARA;
    }

    while ((c = fgetc(file)) != EOF) {
      /* update the progress bar*/
      bar.update(offsetInCmptSeq);

      /* the begining of a new sequence */
      if (c == '>') {
        /*
         *	we need to insert numError+1 Ns in the beginning of each sequence.
         *	Otherwise, a substring span over the boundary of two sequences
         *	may introduce wrong alignment.
         */
        code = BASE_N;
        for (j = 0; j < numError + 1; j++) {
          if (lenSegment + BITS_PER_BASE >= BITS_PER_LONGWORD)
          {
            assert(offsetInCmptSeq <= numChar);
            assert((offsetInCmptSeq * BITS_PER_BASE) % BITS_PER_LONGWORD != 0);

            /* append the high bits into the current word. */
            sequence[offsetInCmptSeq * BITS_PER_BASE_LL / BITS_PER_LONGWORD] =
                (word << (BITS_PER_LONGWORD - lenSegment))
                    | (code >> (lenSegment + BITS_PER_BASE - BITS_PER_LONGWORD));
            lenSegment = lenSegment + BITS_PER_BASE - BITS_PER_LONGWORD;

            /* the current 64-bit word is updated to the low bits */
            word = ~((~code) | (-1LL << lenSegment));
          } else {
            /* append the current character into the current word */
            lenSegment += BITS_PER_BASE;
            word = (word << BITS_PER_BASE) | code;
          }
          offsetInCmptSeq++;
        }

        /*	save sequence length */
        if (curSeq >= 0)
          seqLens[curSeq] = offsetInOrgSeq;

        curSeq++;

        /*	save sequence names */
        getSeqName(file, seqNames[curSeq]);

        /*	initialization for each sequence */
        offsetInOrgSeq = 0;

        itree->append(offsetInCmptSeq, 0, curSeq, offsetInOrgSeq);

        lenSegmentN = 0;
      }

      /*	skip the line break characters */
      else if (c == 10 || c == 13)
        continue;
      else {
        bool isUnknownChar = true;

        if (c == 'A' || (c == 'a' && !skipMask)) {
          code = BASE_A;
          isUnknownChar = false;
        } else if (c == 'C' || (c == 'c' && !skipMask)) {
          code = BASE_C;
          isUnknownChar = false;
        } else if (c == 'G' || (c == 'g' && !skipMask)) {
          code = BASE_G;
          isUnknownChar = false;
        } else if (c == 'T' || (c == 't' && !skipMask)) {
          code = BASE_T;
          isUnknownChar = false;
        }

        if (!isUnknownChar) {
          /*
           *	insert the last N segment into the interval tree.
           *	The insertted point is at the end of the N segment.
           *	We insert the offsets in original sequence and
           *	compact sequence as a pair.
           */
          if (lenSegmentN > numError + 1) {
            itree->append(offsetInCmptSeq, 0, curSeq, offsetInOrgSeq);
          }
          lenSegmentN = 0;
        } else {
          code = BASE_N;

          /*
           *	all other characters are handled similar to the unknown
           *	character 'N'.
           */

          /* update the current N segment length */
          lenSegmentN++;

          /*
           *	if the current N segment exceed numError+1 characters,
           *	discard the character.
           */
          if (lenSegmentN > numError + 1) {
            offsetInOrgSeq++;
            continue;
          }
        }

        /*
         *	We use three bits to represent a character. The characters
         *	are packed into 64-bit words. Here we check if the current
         *	position is on the boundary of 64-bit word.
         */
        if (lenSegment + BITS_PER_BASE >= BITS_PER_LONGWORD)
        {
          assert(offsetInCmptSeq <= numChar);
          assert((offsetInCmptSeq * BITS_PER_BASE) % BITS_PER_LONGWORD != 0);

          /* append the high bits into the current word. */
          sequence[offsetInCmptSeq * BITS_PER_BASE_LL / BITS_PER_LONGWORD] =
              (word << (BITS_PER_LONGWORD - lenSegment))
                  | (code >> (lenSegment + BITS_PER_BASE - BITS_PER_LONGWORD));
          lenSegment = lenSegment + BITS_PER_BASE - BITS_PER_LONGWORD;

          /* the current 64-bit word is updated to the low bits */
          word = ~((~code) | (-1LL << lenSegment));
        } else {
          /* append the current character into the current word */
          lenSegment += BITS_PER_BASE;
          word = (word << BITS_PER_BASE) | code;
        }
        offsetInCmptSeq++;
        offsetInOrgSeq++;
      }
    }

    ret = fclose(file);
    if (ret != 0)
      return ERR_FILE;

  }

  assert(offsetInCmptSeq + numError + 1 == numChar);

  /* save the last sequence length */
  if (curSeq >= 0)
    seqLens[curSeq] = offsetInOrgSeq;

  /* add the last N-segment */
  code = BASE_N;
  for (j = offsetInCmptSeq; j < numChar; j++) {
    /* update the progress bar */
    bar.update(j);

    if (lenSegment + BITS_PER_BASE >= BITS_PER_LONGWORD)
    {
      assert(offsetInCmptSeq <= numChar);
      assert((offsetInCmptSeq * BITS_PER_BASE) % BITS_PER_LONGWORD != 0);

      /* append the high bits into the current word. */
      sequence[offsetInCmptSeq * BITS_PER_BASE_LL / BITS_PER_LONGWORD] = (word
          << (BITS_PER_LONGWORD - lenSegment))
          | (code >> (lenSegment + BITS_PER_BASE - BITS_PER_LONGWORD));
      lenSegment = lenSegment + BITS_PER_BASE - BITS_PER_LONGWORD;

      /* the current 64-bit word is updated to the low bits */
      word = ~((~code) | (-1LL << lenSegment));
    } else {
      /* append the current character into the current word */
      lenSegment += BITS_PER_BASE;
      word = (word << BITS_PER_BASE) | code;
    }
    offsetInCmptSeq++;
  }

  /* flush the current word */
  word = word << (BITS_PER_LONGWORD - lenSegment);
  sequence[offsetInCmptSeq * BITS_PER_BASE_LL / BITS_PER_LONGWORD] = word;

  itree->flush(offsetInCmptSeq, 0, curSeq, offsetInOrgSeq);

  elog(DEBUG1, "#characters in reference sequences: %u\n", numChar);

  return SUCCESS;
}

/*
 *  CompactSequence::filter
 *  Re-read the reference files and write a copy of them to path in
 *  which selected bases are replaced by 'N'. For every A/C/G/T base
 *  (either case) the len-base key starting at the current compact
 *  offset is extracted and looked up in aligner->hashTables[0]:
 *  - lookup succeeds: the base is copied, unless nSkipChar is still
 *    positive, in which case 'N' is written;
 *  - lookup fails: 'N' is written and nSkipChar is reset to len, so
 *    that following bases are written as 'N' while it stays positive.
 *  Header lines, line breaks and unknown characters are copied through.
 *
 *  Returns SUCCESS, ERR_PARA when a file cannot be opened, or ERR_FILE
 *  when closing a file fails.
 */
int CompactSequence::filter(Aligner * aligner, char ** fname, int numFile,
    char * path) {
  int ret;
  FILE * file;
  int c; /* int, not char: must be able to hold EOF */
  unsigned int offsetInCmptSeq = 0;
  unsigned int lenSegmentN = 0;
  int64 space[16];
  int nSkipChar = 0;
  /*
   * the key sits in the middle of the scratch buffer; presumably this
   * leaves headroom on both sides for BitRead::extract (TODO confirm)
   */
  int64 * key = &space[8];

  FILE * outfile = fopen(path, "w");
  if (outfile == NULL)
    return ERR_PARA;

  ProgressBar bar(numChar - 1, PROGRESS_BAR_WIDTH);

  elog(INFO, "loading reference sequences...\n");
  for (int i = 0; i < numFile; i++) {
    /*  open the sequence file */
    file = fopen(fname[i], "rb");
    if (file == NULL)
    {
      fclose(outfile);
      return ERR_PARA;
    }

    while ((c = fgetc(file)) != EOF) {
      /* update the progress bar */
      bar.update(offsetInCmptSeq);

      /* the beginning of a new sequence */
      if (c == '>') {
        fputc(c, outfile);

        /*
         * copy the header line straight through (the terminator is
         * consumed and replaced by '\n'). Streaming it avoids a
         * fixed-size buffer that a long header could overflow.
         */
        int h;
        while ((h = fgetc(file)) != EOF && h != 10 && h != 13)
          fputc(h, outfile);
        fputc('\n', outfile);

        /* account for the nError+1 N codes build() puts before a sequence */
        offsetInCmptSeq += nError + 1;

        lenSegmentN = 0;
      } else if (c == 10 || c == 13) {
        fputc(c, outfile);
        continue;
      } else {
        nSkipChar--;

        if (c == 'A' || c == 'a' || c == 'C' || c == 'c' || c == 'G' || c == 'g'
            || c == 'T' || c == 't') {
          lenSegmentN = 0;

          /* extract the len-base key starting at this position */
          BitRead::extract(sequence, key, offsetInCmptSeq * BITS_PER_BASE_LL,
              len * BITS_PER_BASE);

          if (aligner->hashTables[0].lookup(key, offsetInCmptSeq)) {
            if (nSkipChar > 0)
              fputc('N', outfile);
            else
              fputc(c, outfile);
          } else {
            fputc('N', outfile);
            nSkipChar = len;
          }
        } else {
          fputc(c, outfile);

          /* update the current N segment length */
          lenSegmentN++;

          /*
           *  if the current N segment exceeds nError+1 characters,
           *  the character has no slot in the compact sequence.
           */
          if (lenSegmentN > nError + 1)
            continue;
        }

        offsetInCmptSeq++;
      }
    }

    fputc('\n', outfile);

    ret = fclose(file);
    if (ret != 0) {
      fclose(outfile);
      return ERR_FILE;
    }

  }

  ret = fclose(outfile);
  if (ret != 0)
    return ERR_FILE;

  return SUCCESS;
}

/**
 * CompactSequence::compose
 * Compose a character-based sequence into bit-vector format, using
 * BITS_PER_BASE bits per character. The packed read is right-aligned
 * in the WORDS_PER_READ 64-bit words of `words`.
 *
 * str is modified: it is NUL-terminated at position `length`.
 * Accepted characters are 'A', 'C', 'G', 'T' and 'N'; any other
 * character is reported through elog and encoded as N.
 *
 * NOTE(review): only words[0] and words[1] are cleared up front.
 */
void CompactSequence::compose(char * str, int length, int64 * words) {
  int j;
  int forward_offset, offset;
  int64 forward_word, code;

  /* cut the sequence if necessary */
  str[length] = '\0';

  /* initialize the values */
  words[0] = 0;
  words[1] = 0;

  /* index of the first word that receives bits of the read */
  offset = (WORDS_PER_READ * BITS_PER_LONGWORD - length * BITS_PER_BASE)
      / BITS_PER_LONGWORD;

  /* the word currently being filled */
  forward_word = 0;

  /* number of leading padding bits inside that first word */
  forward_offset = (WORDS_PER_READ * BITS_PER_LONGWORD - length * BITS_PER_BASE)
      % BITS_PER_LONGWORD;

  /* scan the sequence and generate the compact representation */
  for (j = 0; j < length; j++) {
    if (str[j] == 'A')
      code = BASE_A;
    else if (str[j] == 'C')
      code = BASE_C;
    else if (str[j] == 'G')
      code = BASE_G;
    else if (str[j] == 'T')
      code = BASE_T;
    else if (str[j] == 'N')
      code = BASE_N;
    else {
      elog(ERROR, "ERROR: unknown character in short read files.\n");
      /*
       * give code a defined value: without this, the previous
       * character's code (or an uninitialised value for the first
       * character) would be packed.
       */
      code = BASE_N;
    }

    if (forward_offset + BITS_PER_BASE >= BITS_PER_LONGWORD)
    {
      /* on the boundary of 64-bit word: store it and keep the low bits */
      words[offset++] = (forward_word << (BITS_PER_LONGWORD - forward_offset))
          | (code >> (forward_offset + BITS_PER_BASE - BITS_PER_LONGWORD));
      forward_offset = forward_offset + BITS_PER_BASE - BITS_PER_LONGWORD;
      forward_word = ~((~code) | (-1LL << forward_offset));
    } else {
      forward_word = (forward_word << BITS_PER_BASE) | code;
      forward_offset += BITS_PER_BASE;
    }
  }
}

/*
 *  CompactSequence::decompose
 *  Decode `length` characters from the packed words back into str.
 *  Decoding starts at the least significant bits of the last word
 *  (index WORDS_PER_READ - 1) and fills str from its last character
 *  to its first. str is NUL-terminated at position `length`.
 */
void CompactSequence::decompose(char * str, int length, int64 * words) {
  int bitPos = 0; /* bit offset of the next code inside words[k] */
  int k = WORDS_PER_READ - 1;

  str[length] = '\0';
  for (int i = length - 1; i >= 0; i--) {
    unsigned long long code;

    /*
     * All shifts are done on unsigned values. If int64 is a signed
     * type, a right shift of a word whose top bit is set would
     * sign-extend and corrupt a code that straddles two words.
     */
    if (bitPos + BITS_PER_BASE > BITS_PER_LONGWORD)
    {
      if (bitPos < BITS_PER_LONGWORD)
        /* the code straddles words[k] (low part) and words[k - 1] */
        code = ((static_cast<unsigned long long>(words[k]) >> bitPos)
            | (static_cast<unsigned long long>(words[k - 1])
                << (BITS_PER_LONGWORD - bitPos))) & 0x7;
      else
        /* words[k] is exhausted: the code is the low bits of words[k - 1] */
        code = static_cast<unsigned long long>(words[k - 1]) & 0x7;
      bitPos = bitPos + BITS_PER_BASE - BITS_PER_LONGWORD;
      k--;
    } else {
      code = (static_cast<unsigned long long>(words[k]) >> bitPos) & 0x7;
      bitPos += BITS_PER_BASE;
    }
    str[i] = code2Gene[code];
  }
}

/*
 *  CompactSequence::save
 *  Save the in-memory sequence to disk. The file <path>.sequence.whm
 *  receives, in this order: the raw CompactSequence object, the word
 *  pool (size words), the sequence name pool (nSeq slots of
 *  MAX_LENGTH_PATH bytes) and the sequence lengths (nSeq entries).
 *  The interval tree is then saved through itree->save(path).
 *
 *  Returns SUCCESS, ERR_PARA when path is longer than 240 characters
 *  or the file cannot be created, ERR_FILE on a write/flush/close
 *  failure, or the error returned by itree->save.
 */
int CompactSequence::save(char * path) {
  char fname[MAX_LENGTH_PATH];

  if (strlen(path) > 240)
    return ERR_PARA;

  snprintf(fname, sizeof(fname), "%s.sequence.whm", path);
  FILE * file = fopen(fname, "wb");
  if (file == NULL)
  {
    elog(ERROR, "ERROR:failed to open file: %s\n", fname);
    return ERR_PARA;
  }

  /*
   * The sections are written in order and the chain stops at the
   * first failure. fwrite returns a size_t element count, so the
   * comparison is done in size_t rather than through an int.
   */
  const bool written =
      fwrite(this, sizeof(CompactSequence), 1, file) == 1
      && fwrite(pool, sizeof(int64), size, file) == static_cast<size_t>(size)
      && fwrite(seqNamepool, sizeof(char) * MAX_LENGTH_PATH, nSeq, file)
          == static_cast<size_t>(nSeq)
      && fwrite(seqLens, sizeof(uint32), nSeq, file)
          == static_cast<size_t>(nSeq)
      && fflush(file) == 0;

  if (!written) {
    elog(ERROR, "ERROR: write head data file.\n");
    /* do not leak the stream on the error path */
    fclose(file);
    return ERR_FILE;
  }

  if (fclose(file) != 0)
    return ERR_FILE;

  const int ret = itree->save(path);
  if (ret != SUCCESS)
    return ret;

  return SUCCESS;
}

/*
 *  CompactSequence::load
 *  Load the on-disk sequence structure into main memory. The file
 *  <path>.sequence.whm is read in the order written by save(): the raw
 *  CompactSequence object, the word pool, the sequence name pool and
 *  the sequence lengths. The interval tree is then loaded through
 *  itree->load(path).
 *
 *  Returns SUCCESS, ERR_PARA when path is longer than 240 characters
 *  or the file cannot be opened, ERR_FILE on a read/close failure, or
 *  the error returned by itree->load.
 *
 *  NOTE(review): the results of malloc are not checked. The buffers
 *  are allocated with malloc here but seqNames, seqNamepool and
 *  seqLens are allocated with new[] in build(); confirm that whatever
 *  releases them handles both cases.
 */
int CompactSequence::load(char * path) {
  char fname[256];

  if (strlen(path) > 240)
    return ERR_PARA;

  snprintf(fname, sizeof(fname), "%s.sequence.whm", path);
  FILE * file = fopen(fname, "rb");
  if (file == NULL)
    return ERR_PARA;

  if (fread(this, sizeof(CompactSequence), 1, file) != 1) {
    elog(ERROR, "ERROR: read sequence structure data file.\n");
    fclose(file);
    return ERR_FILE;
  }

  /*
   * The object image just read contains the pointer values of the
   * process that saved it. Clear them so that an early error return
   * does not leave dangling pointers behind.
   */
  pool = NULL;
  sequence = NULL;
  seqNamepool = NULL;
  seqNames = NULL;
  seqLens = NULL;
  itree = NULL;

  pool = (int64 *) malloc(size * sizeof(int64));
  sequence = pool + SEQUENCE_HEAD_WORDS;

  if (fread(pool, sizeof(int64), size, file) != static_cast<size_t>(size)) {
    elog(ERROR, "ERROR: read sequence data file.\n");
    fclose(file);
    return ERR_FILE;
  }

  seqNamepool = (char *) malloc(nSeq * MAX_LENGTH_PATH);
  if (fread(seqNamepool, sizeof(char) * MAX_LENGTH_PATH, nSeq, file)
      != static_cast<size_t>(nSeq)) {
    elog(ERROR, "ERROR: read sequence data file.\n");
    fclose(file);
    return ERR_FILE;
  }

  /* rebuild the per-sequence pointers into the name pool */
  seqNames = (char **) malloc(nSeq * sizeof(char *));
  for (int i = 0; i < nSeq; i++)
    seqNames[i] = &seqNamepool[i * MAX_LENGTH_PATH];

  seqLens = (uint32 *) malloc(nSeq * sizeof(uint32));
  if (fread(seqLens, sizeof(uint32), nSeq, file)
      != static_cast<size_t>(nSeq)) {
    elog(ERROR, "ERROR: read sequence data file.\n");
    fclose(file);
    return ERR_FILE;
  }

  if (fclose(file) != 0)
    return ERR_FILE;

  itree = new IntervalTree;
  const int ret = itree->load(path);
  if (ret != SUCCESS)
    return ret;

  return SUCCESS;
}

/*
 *  CompactSequence::valid
 *  Check that this sequence is compatible with the specified
 *  parameters: returns SUCCESS only when length equals len and
 *  numError equals nError, and ERR_PARA otherwise.
 */
int CompactSequence::valid(int length, int numError) {
  const bool sameLength = (length == len);
  const bool sameErrors = (numError == nError);

  return (sameLength && sameErrors) ? SUCCESS : ERR_PARA;
}

/*
 int CompactSequence::loadRead(char * path)
 {
 int ret;
 char fname[256];
 FILE * file;

 sprintf(fname, "%s//short.dat", path);
 file = fopen(fname, "rb");
 if (file == NULL)
 return ;

 ret = fseek(file, 0, SEEK_END);
 if (ret != 0)
 return 0;

 long int size;
 size = ftell(file);
 numRead = size / (sizeof(int64) * 3 + sizeof(uint32));

 ret = fseek(file, 0, SEEK_SET);
 if (ret != 0)
 return 0;

 keys = new int64[numRead * 3];
 offsets = new uint32[numRead];

 for (uint32 i = 0; i < numRead; i++)
 {
 ret = fread(keys + i * 3, sizeof(int64), 3, file);
 if (ret != 3)
 return 0;
 ret = fread(offsets + i, sizeof(uint32), 1, file);
 if (ret != 1)
 return 0;
 }

 fclose(file);

 printf("load reads succesfully.\n");
 return 1;
 }
 */