File: cdbfasta.cpp

package info (click to toggle)
cdbfasta 0.99-20100722-5
links: PTS, VCS
area: main
in suites: buster
size: 388 kB
sloc: cpp: 6,168; makefile: 67; sh: 6
file content (797 lines) | stat: -rw-r--r-- 25,746 bytes
parent folder | download | duplicates (3)
#include <ctype.h>
#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
#include "GBase.h"
#include "GArgs.h"
#include "GHash.hh"
#include "gcdb.h"
#ifdef ENABLE_COMPRESSION
#include "gcdbz.h"
#endif
#ifdef __WIN32__
#define VERSION "cdbfasta version 0.99w"
#else
#define VERSION "cdbfasta version 0.99"
#endif

#define USAGE "Usage:\n\
  cdbfasta <fastafile> [-o <index_file>] [-r <record_delimiter>]\n\
   [-z <compressed_db>] [-i] [-m|-n <numkeys>|-f<LIST>]|-c|-C]\n\
    [-w <stopwords_list>] [-s <stripendchars>] [-v]\n\
   \n\
   Creates an index file for records from a multi-fasta file.\n\
   By default (without -m/-n/-c/-C option), only the first \n\
   space-delimited token from the defline is used as a key.\n\
  \n\
   <fastafile> is the multi-fasta file to index; \n\
   -o the index file will be named <index_file>; if not given,\n\
      the index filename is database name plus the suffix '.cidx'\n\
   -r <record_delimiter> a string of characters at the beginning of line\n\
      marking the start of a record (default: '>')\n\
   -Q treat input as fastq format, i.e. with '@' as record delimiter\n\
      and with records expected to have at least 4 lines\n\
   -z database is compressed into the file <compressed_db>\n\
      before indexing (<fastafile> can be \"-\" or \"stdin\" \n\
      in order to get the input records from stdin)\n\
   -s strip extraneous characters from *around* the space delimited\n\
      tokens, for the multikey options below (-m,-n,-f);\n\
      Default <stripendchars> set is: '\",`.(){}/[]!:;~|><+-\n\
   -m (\"multi-key\" option) create hash entries pointing to \n\
      the same record for all tokens found in\n\
      the defline\n\
   -n <numkeys> same as -m, but only takes the first <numkeys>\n\
      tokens from the defline\n\
   -f indexes *space* delimited tokens (fields) in the defline as given\n\
      by LIST of fields or fields ranges (the same syntax as UNIX 'cut')\n\
   -w <stopwordslist> exclude from indexing all the words found\n\
      in the file <stopwordslist> (for options -m, -n and -k)\n\
   -i do case insensitive indexing (i.e. create additional keys for \n\
      all-lowercase tokens used for indexing from the defline \n\
   -c for deflines in the format: db1|accession1|db2|accession2|...,\n\
      only the first db-accession pair ('db1|accession1') is taken as key\n\
   -C like -c, but also subsequent db|accession constructs are indexed,\n\
      along with the full (default) token; additionally,\n\
      all nrdb concatenated accessions found in the defline \n\
      are parsed and stored (assuming 0x01 or '^|^' as separators)\n\
   -a accession mode: like -C option, but indexes the 'accession'\n\
      part for all 'db|accession' constructs found\n\
   -A like -a and -C together (both accessions and 'db|accession'\n\
      constructs are used as keys\n\
   -v show program version and exit\n"


#define ERR_TOOMANYFIELDS "Error: too many fields for -f option\n"
//16K initial defline buffer
#define KBUFSIZE 0x4000
#ifndef O_BINARY
 #define O_BINARY 0x0000
#endif
#define MAX_KEYLEN 1024
//64K input buffer
#define GREADBUF_SIZE 0x010000


typedef void (*addFuncType)(char*, off_t, uint32);

// for passing around index data:
struct IdxData32 {
   uint32 fpos;
   uint32 reclen;
   };

struct IdxData {
   off_t fpos; //64bit value on Linux
   uint32 reclen;
   };

int IdxDataSIZE=offsetof(IdxData, reclen)+sizeof(uint32);
int IdxDataSIZE32=offsetof(IdxData32, reclen)+sizeof(uint32);

char ftmp[365];
char fztmp[365];
char record_marker[127]; //record delimiter
int  record_marker_len=1;
int num_recs;
int num_keys;

off_t last_cdbfpos=0;

int compact_plus; //shortcut key and
bool acc_mode=false;
bool acc_only=false;
bool do_compress=false; // compression used
bool fastq=false;

FILE* zf=NULL; //compressed file handle

//store just offset and record length
const char* defWordJunk="'\",`.(){}/[]!:;~|><+-";
char* wordJunk=NULL;
bool caseInsensitive=false; //case insensitive storage
bool useStopWords=false;
unsigned int numFields=0;
// we have fields[numFields-1]=MAX_UINT as defined in gcdb.h -- 
// as an indicator of taking every single token in the defline, 
// or for open ended ranges (e.g. -f5- )
unsigned int fields[255]; //array of numFields field indices (1-based)
GHash<int> stopList;
//static int datalen=sizeof(uint32)+sizeof(off_t);

char lastKey[MAX_KEYLEN]; //keep a copy of the last valid written key

GCdbWrite* cdbidx;
addFuncType addKeyFunc;

#define ERR_W_DBSTAT "Error writing the database statististics!\n"


void die_write(const char* fname) {
  GError("Error: cdbhash was unable to write into file %s\n",
      fname);

 }

void die_read(const char* infile) {
  GError("Error: cdbhash was unable to read the input file %s\n",infile);
}


void die_readformat(const char* infile) {
  GError("Error: bad format for file %s; is it a fastA file?\n",
       infile);
}


bool add_cdbkey(char* key, off_t fpos, uint32 reclen) {
 
 unsigned int klen=strlen(key);
 if (fpos==last_cdbfpos && strcmp(key, lastKey)==0) return true;
 if (klen<1) {
    GMessage("Warning: zero length key found following key '%s'\n",
              lastKey);
    return false;
    }
  //------------ adding record -----------------
 num_keys++;
 strncpy(lastKey, key, MAX_KEYLEN-1);
 lastKey[MAX_KEYLEN-1]='\0';
 if ((uint64)fpos>(uint64)MAX_UINT) { //64 bit file offset
  IdxData recdata;
  uint64 v= (uint64) fpos; //needed for Solaris' off_t issues with gcc/32
  recdata.fpos=gcvt_offt(&v);
  recdata.reclen=gcvt_uint(&reclen);  
  if (cdbidx->addrec(key,klen,(char*)&recdata,IdxDataSIZE)==-1)
    GError("Error adding cdb record with key '%s'\n",key);
  }
 else {//32 bit file offset
  IdxData32 recdata;
  uint32 v=(uint32) fpos;
  recdata.fpos=gcvt_uint(&v);
  recdata.reclen=gcvt_uint(&reclen);
  //GMessage("Adding 32bit: '%s' reclen=%d\n", key, recdata.reclen);
  if (cdbidx->addrec(key,klen,(char*)&recdata,IdxDataSIZE32)==-1)
    GError("Error adding cdb record with key '%s'\n",key);
  }
 last_cdbfpos=fpos;
 return true;
}

//default indexing: key directly passed -- 
// as the first space delimited token
void addKey(char* key, off_t fpos,
             uint32 reclen) {
 num_recs++;
 add_cdbkey(key, fpos, reclen);
 if (caseInsensitive) {
   char* lckey=loCase(key);
   if (strcmp(lckey, key)!=0) 
      add_cdbkey(lckey, fpos, reclen);
   GFREE(lckey);
   }
}

//the whole defline is passed 
void addKeyMulti(char* defline,
                 off_t fpos, uint32 reclen) {
 char* p=defline;
 unsigned int fieldno=0;
 char* pn;
 num_recs++;
 bool stillParsing=true;
 unsigned int fidx=0; //index in fields[] array
 while (stillParsing) {
       while ((*p)==' ' || (*p)=='\t') p++;
       if (*p == '\0') break;
       //skip any extraneous characters at the beginning of the token
       while (chrInStr(*p, wordJunk)) p++;
       //skip any padding spaces or other extraneous characters 
       //at the beginning of the word
       if (*p == '\0') break;
       pn=p;
       while (*pn!='\0' && !isspace(*pn)) pn++; 
       //found next space or end of string
       fieldno++;
       while (fields[fidx]<fieldno && fidx<numFields-1) fidx++;
       //GMessage("> p=>'%s' [%d, %d, %d] (next='%s')\n",p, numFields, 
        //      fieldno, fields[numFields-1], pn+1);
       stillParsing = (((*pn)!='\0') && (fieldno+1<=fields[numFields-1]));
       char* pend = pn-1; //pend is on the last non-space in the current token
       //--strip the ending junk, if any
       while (chrInStr(*pend, wordJunk) && pend>p) pend--;
       if (pend<pn-1) *(pend+1)='\0';
                 else *pn='\0';
       
       if (strlen(p)>0) {
         if (fields[fidx]==MAX_UINT || fields[fidx]==fieldno) {
           if (useStopWords && stopList.hasKey(p)) {
             p=pn+1;
             continue;
             }
           //--- store this key with the same current record data:
           add_cdbkey(p, fpos, reclen);
           //---storage code ends here
           if (caseInsensitive) {
              char* lcp=loCase(p);
              if (strcmp(lcp,p)!=0)
                  add_cdbkey(lcp, fpos, reclen);
              GFREE(lcp);
              }
           }
         //if (isEnd) break; //if all the token were stored
         }
       p=pn+1;//p is now on the next token's start
       } //token parsing loop
}


int qcmpInt(const void *p1, const void *p2) {
 //int n1=*((int*)p1);
 //int n2=*((int*)p2);
 const unsigned int *a = (unsigned int *) p1;
 const unsigned int *b = (unsigned int *) p2;
 if (*a < *b) return -1;
      else if (*a > *b) return 1;
                    else return 0;
 }


char* parse_dbacc(char* pstart, char*& end_acc, char*& accst) {
 if (pstart==NULL || *pstart==0) return NULL;
 bool hasDigits=false;
 char* pend=pstart;
 end_acc=NULL; //end of first accession
 accst=NULL;
 while (*pstart=='|') pstart++;
 for(char* p=pstart;;p++) {
   if (hasDigits==false && *p>='0' && *p<='9') hasDigits=true;
   /* if (*p==0) { //end of seq_name
      pend=p; //doesn't matter if it's accession or "db"-like
      break;
      }*/
   if (*p=='|' || *p==0) {
      int curlen=p-pstart;
      if (*p==0 || (hasDigits && curlen>3) || curlen>7 || accst!=NULL) {//previous token was an accession
        pend=p; //advance pend
        if (end_acc==NULL) end_acc=p;
        if (accst==NULL) accst=pstart;
        break;
        }
      else { //first pipe char or no digits
        accst=p+1;
        }
      hasDigits=false;//reset flag
      } // | separator
   }
 if (pend!=pstart) return pend;
              else return NULL;
}




char* parseSpToken(char* str) {
 if (str==NULL) return NULL;
 char* p=str;
 while (*p!=' ' && *p!='\t' && *p!='\v' && *p!=0) p++;
 *p=0;
 return p;
}

#define NRDB_CHARSEP "\1\2\3\4"
#define NRDB_STRSEP "^|^"

//nrdbp is positioned at the end of current nrdb concatenated
//defline, or NULL if there is no more
inline void NRDB_Rec(char* &nrdbp, char* defline) {
 nrdbp=strpbrk(defline, NRDB_CHARSEP);
 if (nrdbp==NULL) {
   nrdbp=strstr(defline, NRDB_STRSEP);
   if (nrdbp!=NULL) {
     *nrdbp='\0';
     nrdbp+=2;
     }
   }
  else *nrdbp='\0';
}

//-c/-C/-a/-A indexing: key up to the first space or second '|'
//receives the full defline
void addKeyCompact(char* defline,
              off_t fpos, uint32 reclen) {
 //we got the first token found on the defline
 num_recs++;
 char* nrdb_end;
 //breaks defline at the next nrdb concatenation point
 NRDB_Rec(nrdb_end, defline);
 //isolate the first token in this nrdb concatenation
 char* token_end=parseSpToken(defline);
 if (!compact_plus) { //shortcut key
   //only the first db|accession construct will be indexed, if found
   char* end_acc1=NULL; //end of first accession
   char* acc1st=NULL;
   char* dbacc_end=parse_dbacc(defline, end_acc1, acc1st);
   if (end_acc1!=NULL) { //has acceptable shortcut
     *end_acc1=0;
     add_cdbkey(defline, fpos, reclen);
     return;
     }
   if (dbacc_end!=NULL) {
      *dbacc_end=0;
      add_cdbkey(defline, fpos, reclen);
      return;
      }
   //store this whole non-space token as key:
   add_cdbkey(defline, fpos, reclen);
   return;
   }
 //from now on only -C/-a/-A treatment:
 for(;;) {
    //defline is on the first token    
    if (strlen(defline)>0) //add whole non-space token as the "full key"
       add_cdbkey(defline, fpos, reclen);
    //add the db|accession constructs as keys
    char* dbacc_start=defline;
    char* firstacc_end=NULL;
    char* accst=NULL;
    char* dbacc_end=parse_dbacc(dbacc_start, firstacc_end, accst);
    while (dbacc_end!=NULL) {
      if (firstacc_end!=NULL && firstacc_end<dbacc_end) {
        char c=*firstacc_end;
        *firstacc_end=0;
        if (!acc_only)
          add_cdbkey(dbacc_start, fpos, reclen);
        if (acc_mode) {
           add_cdbkey(accst, fpos, reclen);
           }
        *firstacc_end=c;
        }
      if (dbacc_start==defline && dbacc_end==token_end) {
           if (acc_mode && accst!=NULL)
              add_cdbkey(accst, fpos, reclen);
           break; //the whole seq_name was only one db entry
           }
      *dbacc_end=0; //end key here
      if (!acc_only)
        add_cdbkey(dbacc_start, fpos, reclen);
      if (acc_mode)
        add_cdbkey(accst, fpos, reclen);
      if (dbacc_end==token_end) 
           break; //reached the end of this whole seq_name (non-space token)
      dbacc_start=dbacc_end+1;
      firstacc_end=NULL;
      dbacc_end=parse_dbacc(dbacc_start, firstacc_end, accst);
      }
    // -- get to next concatenated defline, if any:
    if (compact_plus && nrdb_end!=NULL) { 
      defline=nrdb_end+1; //look for the next nrdb concatenation
      NRDB_Rec(nrdb_end, defline);
      //isolate the first token in this nrdb record
      token_end=parseSpToken(defline);
      }
     else break;
   } //for
 }

int readWords(FILE* f, GHash<int>& xhash) {
  int c;
  int count=0;
  char name[256];
  int len=0;
  while ((c=getc(f))!=EOF) {
    if (isspace(c) || c==',' || c==';') {
      if (len>0) {
        name[len]='\0';
        xhash.Add(name, new int(1));
        count++;
        len=0;
        }
      continue;
      }
    //a non-space
    name[len]=(char) c;
    len++;
    if (len>255) {
      name[len-1]='\0';
      GError("Error reading words file: token too long ('%s') !\n",name);
      }
    }
 if (len>0) {
   name[len]='\0';
   xhash.Add(name, new int(1));
   count++;
   }
 return count;
}



//========================== MAIN ===============================
int main(int argc, char **argv) {
  FILE* f_read=NULL;
  off_t fdbsize;
  int ch;
  char* zfilename;
  char* fname;
  char* marker; //record marker
  int maxkeys=0;
  int multikey;
  record_marker[0]='>';
  record_marker[1]=0;
  GArgs args(argc, argv, "mn:o:r:z:w:f:s:icvQCaA");
  int e=args.isError();
  if  (e>0)
     GError("%s Invalid argument: %s\n", USAGE, argv[e] );
  if (args.getOpt('v')!=NULL) {
    printf("%s\n",VERSION);
    return 0;
    }
  fastq = (args.getOpt('Q')!=NULL);
  
  multikey = (args.getOpt('m')!=NULL);
  if (multikey) {
    fields[numFields]=1;
    numFields++;
    fields[numFields]=MAX_UINT;
    numFields++;  
    }
  caseInsensitive = (args.getOpt('i')!=NULL);
  acc_only=(args.getOpt('a')!=NULL);
  acc_mode=(acc_only || args.getOpt('A')!=NULL);
  compact_plus=(args.getOpt('C')!=NULL || acc_mode);
  wordJunk = (char *)args.getOpt('s');
  if (wordJunk==NULL) wordJunk=(char*)defWordJunk;
  int compact=(args.getOpt('c')!=NULL || compact_plus);
  if (compact && multikey) {
    GError("%s Error: invalid flags combination.\n", USAGE);
    }
  char* s = (char*)args.getOpt('n');
  if (s!=NULL) {
    maxkeys = atoi(s);
    if (maxkeys<=1 || compact || multikey)
        GError("%s Error: invalid options (-m, -c/C, -n and -f are exclusive)\n", USAGE);
    multikey=1;
    numFields=maxkeys;
    if (numFields>254) GError(ERR_TOOMANYFIELDS);
    for (unsigned int i=1;i<=numFields;i++) fields[i-1]=i;
    }
  char* argfields = (char*)args.getOpt('f');
  if (argfields!=NULL) { //parse all the field #s
    char* pbrk;
    int prevnum=0;
    char prevsep='\0';
    numFields=0;
    char sep;
    char *p=argfields;
    do {
     pbrk=strpbrk(p,",-");
     if (pbrk==NULL) {
         sep='\0';
         pbrk=p+strlen(p);
         if (prevsep == '-' && *p=='\0' && prevnum>0) {
           //open ended range -- ending with '-'
           fields[numFields]=prevnum;
           numFields++;
           if (numFields>253) GError(ERR_TOOMANYFIELDS);
           fields[numFields]=MAX_UINT;
           numFields++;
           //GMessage("--- stored %d, %d\n",prevnum, MAX_UINT);
           break;
           }// ending with '-'
         } // '\0'
      else { sep=*pbrk; *pbrk = '\0'; }
     int num = atoi(p);
     if (num<=0 || num>254 )
              GError("%s Error: invalid syntax for -f option.\n", USAGE);
     if (prevsep == '-') { //store a range
       for (int i=prevnum;i<=num;i++) {
          fields[numFields]=i;
          numFields++;
          if (numFields>254) GError(ERR_TOOMANYFIELDS);
          }
       }
      else if (sep!='-') {
         fields[numFields]=num;
         numFields++;
         if (numFields>254) GError(ERR_TOOMANYFIELDS);
         }
      
     prevsep=sep;
     prevnum=num;
     p=pbrk+1;
     } while (sep != '\0'); //range parsing loop
    if (numFields<=0 || numFields>254 )
              GError("%s Error at parsing -f option.\n", USAGE);
    //GMessage("[%d] Fields parsed (%d values):\n", sizeof(fields[0]), numFields);
    qsort(fields, numFields, sizeof(fields[0]), &qcmpInt);    
    multikey=1;
    /*-- --------debug:
    for (unsigned int i=0;i<numFields-1;i++) {
      GMessage("%d,", fields[i]);
      }
    GMessage("%d\n",fields[numFields-1]);
    exit(0); */ 
    } //fields
  if (fastq) {
    record_marker[0]='@';
    record_marker_len=1;
    } 
  if (args.getOpt('r')!=NULL) {//non-FASTA record delimiter?
   if (fastq) {
     GMessage("Option -r ignored because -Q was given (-Q sets delimiter to '@')\n");
     }
   else {  
   marker=(char*)args.getOpt('r'); //
   int v=0;
   if (strlen(marker)>126) 
      GError("Error: the specified record delimiter is too long. "
        "Maximum accepted is 126\n");
   //special case: hex (0xXX) and octal codes (\XXX) are accepted, only if by themselves
   if (strlen(marker)==4 && marker[0]=='\\' || (marker[0]=='0' && (toupper(marker[1])=='X'))) {
       if (marker[0]=='\\') {
           marker++;
           v=strtol(marker, NULL, 8);
           }
          else v=strtol(marker, NULL, 16);
       if (v==0 || v>255)
         GError("Invalid record delimiter: should be only one character,\n"
                "'\\NNN' (octal value), '0xNN' (hexadecimal value)");
       record_marker[0]=v;
       record_marker_len=1;
       }
     else {
      strcpy(record_marker, marker);
      record_marker_len=strlen(record_marker);
      }
    }
   }
 char* stopwords=(char*)args.getOpt('w'); //stop words filename given?
 if (stopwords!=NULL) {
  FILE* fstopwords=NULL;
  if ((fstopwords=fopen(stopwords, "r"))==NULL)
       GError("Cannot open stop words file '%s'!\n", stopwords);
  int c=readWords(fstopwords, stopList);
  GMessage("Loaded %d stop words.\n", c);
  fclose(fstopwords);
  useStopWords=(c>0);
  }
  if ((zfilename=(char*)args.getOpt('z')) !=NULL) {
    do_compress=true;
    #ifndef ENABLE_COMPRESSION
      GError("Error: compression requested but not enabled when cdbfasta was compiled\n")
    #endif    
    strcpy(fztmp,zfilename);
    strcat(fztmp,"_ztmp");
    zf=fopen(fztmp,"wb");
    if (zf==NULL)
      GError("Error creating file '%s'\n'", fztmp);
    }
  char* outfile=(char*) args.getOpt('o');
  int numfiles = args.startNonOpt();
  if (numfiles==0)
    GError("%sError: no fasta file given.\n", USAGE);
  fname=(char*) args.nextNonOpt(); //first fasta file given
  if (do_compress)  { //-------- compression case -------------------
     if (strcmp(fname, "-")==0 || strcmp(fname, "stdin")==0)
           f_read=stdin;
       else f_read= fopen(fname, "rb");
     if (f_read == NULL) die_read(fname);
     fname=zfilename; //forget the input file name, keep the output
     }
  else {//
    int fdread= open(fname, O_RDONLY|O_BINARY);
    if (fdread == -1) die_read(fname);
    struct stat dbstat;
    fstat(fdread, &dbstat);
    fdbsize=dbstat.st_size;
    close(fdread);
    f_read= fopen(fname, "rb");
    if (f_read == NULL) die_read(fname);
    }

  char idxfile[365];
  if (outfile==NULL) {
    if (do_compress) {
      strcpy(ftmp, zfilename);
      strcat(ftmp, ".cidx");
      strcpy(idxfile, ftmp);
      strcat(ftmp, "_tmp");
      }
    else {
      strcpy(ftmp, fname);
      strcat(ftmp, ".cidx");
      strcpy(idxfile, ftmp);
      strcat(ftmp, "_tmp");
      }
    //should add the process ID, time and user to make this unique?
    }
  else {
    strcpy(ftmp, outfile);
    strcpy(idxfile, outfile);
    strcat(ftmp, "_tmp");
    }

  cdbidx=new GCdbWrite(ftmp); //test if this was successful?

  if (compact)
         addKeyFunc=&addKeyCompact;
    else if (multikey)
               addKeyFunc = &addKeyMulti;
          else addKeyFunc = &addKey;

  off_t recpos=0;
  off_t r=0;
  unsigned int recsize=0;
  char* key=NULL;
  bool fullDefline=(multikey || compact_plus);
  GReadBuf *readbuf = new GReadBuf(f_read, GREADBUF_SIZE);
  if (do_compress) { //---------------- compression case -------------
     if (fastq) GError("Error: sorry, compression is not supported with fastq format\n");
     fdbsize=0;
     GCdbz cdbz(zf); // zlib interface
     recpos=cdbz.getZRecPos();
     while ((key=cdbz.compress(readbuf,record_marker))!=NULL) {
       recsize=cdbz.getZRecSize();
       if (!fullDefline) {
         //find first space after the record_marker and place a '\0' there
         for (int i=record_marker_len; key[i]!='\0';i++) {
           if (isspace(key[i])) { key[i]='\0';break; }
           }
         }
       addKeyFunc(key, recpos, recsize);
       recpos = cdbz.getZRecPos();
       }
     remove(zfilename);
     cdbz.compress_end();
     fclose(zf);
     //determine the size of this file:
     int ftmp= open(fztmp, O_RDONLY|O_BINARY);
     if (ftmp == -1) die_read(fztmp);
     struct stat dbstat;
     fstat(ftmp, &dbstat);
     fdbsize=dbstat.st_size;
     //rename it to the intended file name
     if (rename(fztmp,zfilename) != 0) {
       GMessage("Error: unable to rename '%s' to '%s'\n",fztmp,zfilename);
       perror("rename");
       }
    }  //compression requested  
  else { // not compressed -- buffered file access
     bool defline=false;
     int kbufsize=KBUFSIZE;
     if (fullDefline) { GMALLOC(key, KBUFSIZE); }//large defline storage buffer, just in case
                 else { GMALLOC(key, 1024); }
     int kidx=-1;
     num_recs=0;
     num_keys=0;
     char lastchar=0;
     //first iteration -- for the beginning of file case
     if (readbuf->peekCmp(record_marker, record_marker_len)==0) {
           //new record start found (defline)
           recpos=readbuf->getPos(); //new record pos
           defline=true; //we're in defline
           readbuf->skip(record_marker_len);
           kidx=0;
           }//new record start
     int linecounter=0;
     bool checkNewRec=true;
     while ((ch=readbuf->getch())>0) {
      if (defline && kidx>=0) { //on the defline here, still parsing possible keys
          key[kidx]=(char)ch;
          kidx++;
          if (kidx>=kbufsize) {
             kbufsize+=KBUFSIZE;
             GREALLOC(key, kbufsize);
             }
          if (((isspace(ch) || ch<31) && fullDefline==false) || ch=='\n' || ch=='\r') {
               //end key here, don't care about the rest
               
               key[kidx-1]='\0';
               kidx=-1;
               }
        }
      if (ch=='\n') { // newline!        
         //TODO: should this be '\r' on MacOS ?!
         linecounter++;
         //check ahead if a record delimiter follows
        checkNewRec = (!fastq || linecounter>3);
        if (checkNewRec && readbuf->peekCmp(record_marker, record_marker_len)==0) {
           //new record start (defline)
           recsize = readbuf->getPos()-recpos-1; //previous recsize
           if (recsize>off_t(record_marker_len+1) && key[0]!='\0') {
             //add previous record, if there
             addKeyFunc(key, recpos, recsize);
             linecounter=0;
             //GMessage("adding key=%s\n",key);
             }
           recpos=readbuf->getPos(); //new record pos
           defline=true; //we're in defline
           readbuf->skip(record_marker_len);
           //if (r<0) die_readformat(fname);
           kidx=0;
           } //new record start
         else { //after newline but not a new record start
            if (defline) { //we just finished a defline
                if (fullDefline) { //close the defline string
                  if (kidx>0) key[kidx-1]='\0';
                  kidx=-1;
                  }
               defline=false;
               }
           } //after newline but not a new record
        } // was newline 
      lastchar=ch;
      }//while char
     recsize=readbuf->getPos()-recpos;
     if (recsize>0) {//add last record, if there
         if (lastchar=='\n') recsize--;
         if (fullDefline && kidx>0) {//close the defline string
               if (lastchar!='\n') kidx++;
               key[kidx-1]='\0';
               }
         addKeyFunc(key, recpos, recsize);
         linecounter=0;
         //GMessage("adding key=%s\n",key);
         }
   delete readbuf;
   }
  if (f_read!=stdin) fclose(f_read);
  if (cdbidx->finish() == -1) die_write("");

  // === add some statistics at the end of the cdb index file!
  r=lseek(cdbidx->getfd(), 0, SEEK_END);
  cdbInfo info;
  memcpy((void*)info.tag, (void*)"CDBX", 4);
  info.idxflags=0;
  if (multikey) info.idxflags |= CDBMSK_OPT_MULTI;
  if (do_compress) {
      info.idxflags |= CDBMSK_OPT_COMPRESS;
      GMessage("Input data were compressed into file '%s'\n",fname);
      }
  if (compact) {
      if (compact_plus)
            info.idxflags |= CDBMSK_OPT_CADD;
         else
            info.idxflags |= CDBMSK_OPT_C;
     }
  info.num_records=gcvt_uint(&num_recs);
  info.num_keys=gcvt_uint(&num_keys);
  info.dbsize=gcvt_offt(&fdbsize);
  info.idxflags=gcvt_uint(&info.idxflags);
  int nlen=strlen(fname);
  info.dbnamelen=gcvt_uint(&nlen);
  r=write(cdbidx->getfd(), fname, nlen);
  if (r!=nlen)
        GError(ERR_W_DBSTAT);
  r=write(cdbidx->getfd(), &info, cdbInfoSIZE);
  if (r!=cdbInfoSIZE)
        GError(ERR_W_DBSTAT);
  delete cdbidx;
  GFREE(key);
  remove(idxfile);
  if (rename(ftmp,idxfile) == -1)
    GError("Error: unable to rename %s to %s",ftmp,idxfile);
  GMessage("%d entries from file %s were indexed in file %s\n",
      num_recs, fname, idxfile);
  return 0;
}