File: spideymain.c

package info (click to toggle)
ncbi-tools6 6.1.20120620-8
links: PTS, VCS
area: main
in suites: jessie, jessie-kfreebsd
size: 241,628 kB
ctags: 101,236
sloc: ansic: 1,431,713; cpp: 6,248; pascal: 3,949; xml: 3,390; sh: 3,090; perl: 1,077; csh: 488; makefile: 449; ruby: 93; lisp: 81
file content (799 lines) | stat: -rw-r--r-- 27,743 bytes
parent folder | download | duplicates (11)
/* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*            National Center for Biotechnology Information (NCBI)
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government do not place any restriction on its use or reproduction.
*  We would, however, appreciate having the NCBI and the author cited in
*  any work or product based on this material.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
* ===========================================================================
*
* File Name:  spideymain.c
*
* Author:  Sarah Wheelan
*
* Version Creation Date:   5/01
*
* $Revision: 6.12 $
*
* File Description: main functions for running Spidey as a standalone 
*
* Modifications:
* --------------------------------------------------------------------------
* $Log: spideymain.c,v $
* Revision 6.12  2005/11/17 17:13:07  kskatz
* Fixed initializations and removed non-used functions to get rid of warnings
*
* Revision 6.11  2005/11/17 16:41:54  kskatz
* Fixed the code the uses user-supplied matrices:  SPI_GetSpliceInfoclear(); also commented out  ErrSetMessageLevel(SEV_MAX) (line 418) so that warnings could be posted
*
* Revision 6.10  2004/03/25 21:20:03  kskatz
* All SPI_is_acceptor_* functions have been corrected: 'N' no longer contributes to nor subtracts from the score, log odds are calculated and the scores added; they are however all antilogged because there are too many places in the code where the score is expected to be between 0 and 1.  Also, corrected sequence frequency determination in SPI_is_acceptor_user and SPI_is_donor_user, as well as correcting for 'N'. Finally, and this all began with, I added matrices for Dictyostelium - command line -r -m
*
* Revision 6.9  2003/04/04 19:42:56  kskatz
* Added a new command line option (-R) to allow external users to point spidey to a repeat database that it can pass on to blast for filtering repeats
*
* Revision 6.8  2002/10/02 16:47:11  kskatz
* clarifying the explanation of the -L option
*
* Revision 6.7  2002/10/02 16:03:16  kskatz
* added new command line option (-L) for user to use in conjunction with -X (big intron bool); -L allows user to supply the max size (bp) of the intron to use (default 220000) and sets the SPI_Option bigintron_size to this value
*
* Revision 6.6  2002/05/07 18:43:46  wheelan
* changes to support user-defined splice matrices
*
* Revision 6.5  2001/11/05 16:13:55  wheelan
* bug fix in multiple alignment code; changed call to SPI_PrintMult..
*
* Revision 6.4  2001/10/04 12:36:45  wheelan
* added bigintron option
*
* Revision 6.3  2001/08/24 13:45:48  wheelan
* changed printing options
*
* Revision 6.2  2001/07/10 16:45:53  wheelan
* changes to produce multiple alignments upon request
*
* Revision 6.1  2001/05/24 16:27:41  wheelan
* initial checkin
*
*
* ==========================================================================
*/


#include <ncbi.h>
#include <spidey.h>
#include <accid1.h>
#include <lsqfetch.h>

#define MYARGGENFILE   0
#define MYARGMRNAFILE  1
#define MYARGPRALIGN   2
#define MYARGOUTFILE   3
#define MYARGALNFILE   4
#define MYARGGILIST    5
#define MYARGNUMMOD    6
#define MYARGORG       7
#define MYARG1STEVAL   8
#define MYARG2NDEVAL   9
#define MYARG3RDEVAL   10
#define MYARGIDCUT     11
#define MYARGLENCUT    12
#define MYARGSPEC      13
#define MYARGASN       14
#define MYARGASNFILE   15
/*#define MYARGDRAFTFILE 16*/
#define MYARGMASKED    16
#define MYARGGETCDS    17
#define MYARGTABLE     18
/*#define MYARGACEDB     19*/
#define MYARGFROM      19 
#define MYARGTO        20
#define MYARGMULT      21
#define MYARGXL        22
#define MYARGXL_SIZE   23 /* added by KSK */
#define MYARGSTRAND    24
#define MYARGDSPLICE   25
#define MYARGASPLICE   26
#define MYARGREPDB     27

#define NUMARGS        28

Args myargs[NUMARGS] = {
   {"Input file -- genomic sequence(s)", NULL, NULL, NULL, FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
   {"Input file -- mRNA sequence(s)", NULL, NULL, NULL, FALSE, 'm', ARG_FILE_IN, 0.0, 0, NULL},
   {"Print alignment? 0=summary+alignments, 1=summary, 2=alignments, 3=summary&alignments in different files", "0", NULL, NULL, TRUE, 'p', ARG_INT, 0.0, 0, NULL},
   {"Output file 1 (summary or summary+aln)", "stdout", NULL, NULL, TRUE, 'o', ARG_STRING, 0.0, 0, NULL},
   {"Output file 2 (alignments)", "spidey.aln", NULL, NULL, TRUE, 'a', ARG_STRING, 0.0, 0, NULL},
   {"Input file is a GI list", "F", NULL, NULL, TRUE, 'G', ARG_BOOLEAN, 0.0, 0, NULL},
   {"Number of gene models", "1", NULL, NULL, TRUE, 'n', ARG_INT, 0.0, 0, NULL},
   {"Organism (genomic sequence) v=vertebrate,\nd = drosophila, p = plant, c = C. elegans, m = Dictyostelium discoideum", "v", NULL, NULL, TRUE, 'r', ARG_STRING, 0.0, 0, NULL},
   {"First-pass e-value", "0.0000000001", NULL, NULL, TRUE, 'e', ARG_FLOAT, 0.0, 0, NULL},
   {"Second-pass e-value", "0.001", NULL, NULL, TRUE, 'f', ARG_FLOAT, 0.0, 0, NULL},
   {"Third-pass e-value", "10", NULL, NULL, TRUE, 'g', ARG_FLOAT, 0.0, 0, NULL},
   {"% identity cutoff", "0", NULL, NULL, TRUE, 'c', ARG_INT, 0.0, 0, NULL},
   {"% length coverage cutoff", "0", NULL, NULL, TRUE, 'l', ARG_INT, 0.0, 0, NULL},
   {"interspecies alignment", "F", NULL, NULL, TRUE, 's', ARG_BOOLEAN, 0.0, 0, NULL},
   {"Print ASN.1 alignment?", "F", NULL, NULL, TRUE, 'j', ARG_BOOLEAN, 0.0, 0, NULL},
   {"File for asn.1", "spidey.asn", NULL, NULL, TRUE, 'k', ARG_STRING, 0.0, 0, NULL},
/*   {"File of draft boundary information", NULL, NULL, NULL, TRUE, 'b', ARG_FILE_IN, 0.0, 0, NULL},*/
   {"Is the input mRNA masked (lowercase)?", "F", NULL, NULL, TRUE, 'w', ARG_BOOLEAN, 0.0, 0, NULL},
   {"Fetch the CDS and compute its results also?", "F", NULL, NULL, TRUE, 'd', ARG_BOOLEAN, 0.0, 0, NULL},
   {"File with feature table", NULL, NULL, NULL, TRUE, 't', ARG_FILE_IN, 0.0, 0, NULL},
   /*{"ACEDB format", "F", NULL, NULL, TRUE, 'A', ARG_BOOLEAN, 0.0, 0, NULL},*/
   {"Start of genomic interval desired (from)", "0", NULL, NULL, TRUE, 'F', ARG_INT, 0.0, 0, NULL},
   {"Stop of genomic interval desired (to)", "0", NULL, NULL, TRUE, 'T', ARG_INT, 0.0, 0, NULL},
   {"Make a multiple alignment of all input mRNAs", "F", NULL, NULL, TRUE, 'u', ARG_BOOLEAN, 0.0, 0, NULL},
   {"Use extra-large intron sizes", "F", NULL, NULL, TRUE, 'X', ARG_BOOLEAN, 0.0, 0, NULL},
   {"The extra-large intron size to use", "220000", NULL, NULL, TRUE, 'L', ARG_INT, 0.0, 0, NULL},
   {"Restrict to plus (p) or minus (p) strand of genomic seq?", NULL, NULL, NULL, TRUE, 'S', ARG_STRING, 0.0, 0, NULL},
   {"File with donor splice matrix", NULL, NULL, NULL, TRUE, 'M', ARG_FILE_IN, 0.0, 0, NULL},
   {"File with acceptor splice matrix", NULL, NULL, NULL, TRUE, 'N', ARG_FILE_IN, 0.0, 0, NULL},
      /* KSK added */
   {"File (including path) to repeat blast database for filtering", NULL, NULL, NULL, TRUE, 'R', ARG_STRING, 0.0, 0, NULL},
};

static void SPI_FindAllNuc(SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent);
static CharPtr ReadALine (CharPtr str, size_t size, FILE *fp);
static BioseqPtr SPI_GetBspFromGIOrAcc(CharPtr str);
static void SPI_GetSpliceInfo(SPI_OptionsPtr spot, FILE *sfp, Boolean donor);
static void SPI_ReadFeatureTable(FILE *ifp, SPI_bsinfoPtr spim_head);

Int2 Main()
{
   AsnIoPtr           aip;
   BioseqPtr          bsp;
   Pointer            dataptr;
   Uint2              datatype;
   Boolean            found;
   SPI_mRNAToHerdPtr  h_head;
   SPI_mRNAToHerdPtr  h_prev;
   SPI_mRNAToHerdPtr  hptr;
   FILE               *ifp;
   Boolean            isGIlist;
   Char               line[60];
   Boolean            lowercase;
   SeqLocPtr          lcaseloc;
   FILE               *ofp;
   FILE               *ofp2;
   SeqAlignPtr        sap;
   SeqAnnotPtr        sanp;
   SeqEntryPtr        sep;
   FILE               *sfp;
   SeqIdPtr           sip;
   SeqLocPtr          slp;
   SPI_bsinfoPtr      spig;
   SPI_bsinfoPtr      spig_head;
   SPI_bsinfoPtr      spig_prev;
   SPI_bsinfoPtr      spim;
   SPI_bsinfoPtr      spim_head;
   SPI_bsinfoPtr      spim_prev;
   SPI_OptionsPtr     spot;
   SPI_RegionInfoPtr  srip = NULL;
   SPI_RegionInfoPtr  srip_head;
   SPI_RegionInfoPtr  srip_prev;
   CharPtr            str;
   CharPtr            txt;

   ID1BioseqFetchEnable("spidey", FALSE);
   LocalSeqFetchInit(FALSE);
   /* standard setup */
   ErrSetFatalLevel (SEV_MAX);
   ErrClearOptFlags (EO_SHOW_USERSTR);
   UseLocalAsnloadDataAndErrMsg ();
   ErrPathReset ();
   if (! AllObjLoad ())
   {
      Message (MSG_FATAL, "AllObjLoad failed");
      return 1;
   }
   if (! SubmitAsnLoad ())
   {
      Message (MSG_FATAL, "SubmitAsnLoad failed");
      return 1;
   }
   if (! FeatDefSetLoad ())
   {
      Message (MSG_FATAL, "FeatDefSetLoad failed");
      return 1;
   }
   if (! SeqCodeSetLoad ())
   {
      Message (MSG_FATAL, "SeqCodeSetLoad failed");
      return 1;
   }
   if (! GeneticCodeTableLoad ())
   {
      Message (MSG_FATAL, "GeneticCodeTableLoad failed");
      return 1;
   }
   if (!GetArgs("SPIDEY", NUMARGS, myargs))
      return 0;
   /* set the error message level high to suppress warnings from BLAST */
   isGIlist = (Boolean)myargs[MYARGGILIST].intvalue;
   txt = myargs[MYARGGENFILE].strvalue;
   ifp = FileOpen(txt, "r");
   spig_head = NULL;
   if (ifp == NULL)
   {
      bsp = SPI_GetBspFromGIOrAcc(txt);
      if (bsp == NULL)
      {
         ErrPostEx(SEV_ERROR, 0, 0, "Can't open genomic input file\n");
         return -1;
      } else
      {
         spig_head = (SPI_bsinfoPtr)MemNew(sizeof(SPI_bsinfo));
         spig_head->bsp = bsp;
      }
   }
   if (spig_head == NULL)
   {
      spig_prev = NULL;
      /* read in the genomic sequence(s) first and put them into bsinfo structures */
      while ((dataptr = ReadAsnFastaOrFlatFile (ifp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE)) != NULL)
      {
         if (datatype == OBJ_BIOSEQ)
         {
            spig = (SPI_bsinfoPtr)MemNew(sizeof(SPI_bsinfo));
            spig->bsp = (BioseqPtr)dataptr;
            if (spig_head == NULL)
               spig_head = spig_prev = spig;
            else
            {
               spig_prev->next = spig;
               spig_prev = spig;
            }
         } else if (datatype == OBJ_SEQENTRY)
         {
            sep = (SeqEntryPtr)dataptr;
            SeqEntryExplore(sep, &spig_head, SPI_FindAllNuc);
         }
      }
      FileClose(ifp);
   }
   if (spig_head == NULL)
   {
      ErrPostEx(SEV_ERROR, 0, 0, "No valid bioseqs in genomic file\n");
      return -1;
   } else if (ISA_aa(spig_head->bsp->mol))
   {
      ErrPostEx(SEV_ERROR, 0, 0, "At least one of the genomic sequences appears to be a protein.\n");
      return -1;
   }
   if (spig_head->next != NULL)
   {
      ErrPostEx(SEV_ERROR, 0, 0, "This version can only process one genomic sequence at a time.  Only the first sequence in this file will be used.\n");
      spig_head->next = NULL;
   }
   spim_head = spim_prev = NULL;
   txt = myargs[MYARGMRNAFILE].strvalue;
   ifp = FileOpen(txt, "r");
   if (ifp == NULL)
   {
      bsp = SPI_GetBspFromGIOrAcc(txt);
      if (bsp == NULL)
      {
         ErrPostEx(SEV_ERROR, 0, 0, "Can't open mRNA input file\n");
         return -1;
      } else
      {
         spim_head = (SPI_bsinfoPtr)MemNew(sizeof(SPI_bsinfo));
         spim_head->bsp = bsp;
      }
   }
   if (spim_head == NULL)
   {
      lowercase = (Boolean)myargs[MYARGMASKED].intvalue;
      lcaseloc = NULL;
      /* if the mRNA has lowercase masking, read it in carefully to record the masking */
      if (lowercase == TRUE)
      {
         while ((sep = FastaToSeqEntryForDb(ifp, TRUE, NULL, TRUE, NULL, NULL, &lcaseloc)) != NULL)
         {
            SeqEntryExplore(sep, &spim_head, SPI_FindAllNuc);
            if (lcaseloc != NULL)  /* put masking info into the bsinfo structure */
            {
               spim = spim_head;
               sip = SeqLocId(lcaseloc);
               found = FALSE;
               while (spim != NULL && !found)
               {
                  if (SeqIdComp(sip, spim->bsp->id) == SIC_YES)
                  {
                     found = TRUE;
                     spim->lcaseloc = lcaseloc;
                  }
                  spim = spim->next;
               }
               lcaseloc = NULL;
            }
         }
      } else if (isGIlist) /* mRNA file is a list of GIs, must fetch the bioseqs */
      {
         str = ReadALine(line, sizeof(line), ifp);
         while (str != NULL)
         {
            bsp = SPI_GetBspFromGIOrAcc(str);
            if (bsp != NULL)
            {
               spim = (SPI_bsinfoPtr)MemNew(sizeof(SPI_bsinfo));
               spim->bsp = bsp;
               if (spim_head == NULL)
                  spim_head = spim_prev = spim;
               else
               {
                  spim_prev->next = spim;
                  spim_prev = spim;
               }
            }
            str = ReadALine(line, sizeof(line), ifp);
         }
      } else /* mRNAs are FASTA or ASN.1, read them all in */
      {
         while ((dataptr = ReadAsnFastaOrFlatFile (ifp, &datatype, NULL, FALSE, FALSE, TRUE, FALSE)) != NULL)
         {
            if (datatype == OBJ_BIOSEQ)
            {
               spim = (SPI_bsinfoPtr)MemNew(sizeof(SPI_bsinfo));
               spim->bsp = (BioseqPtr)dataptr;
               if (spim_head == NULL)
                  spim_head = spim_prev = spim;
               else
               {
                  spim_prev->next = spim;
                  spim_prev = spim;
               }
            } else if (datatype == OBJ_SEQENTRY)
            {
               sep = (SeqEntryPtr)dataptr;
               SeqEntryExplore(sep, &spim_head, SPI_FindAllNuc);
            }
         }
      }
      FileClose(ifp);
   }
   if (spim_head == NULL)
   {
      ErrPostEx(SEV_ERROR, 0, 0, "No valid bioseqs in mRNA file\n");
      return -1;
   } else if (ISA_aa(spim_head->bsp->mol))
   {
      ErrPostEx(SEV_ERROR, 0, 0, "At least one of the mRNA sequences appears to be a protein\n");
      return -1;
   }
   txt = myargs[MYARGTABLE].strvalue;
   if (txt != NULL)
   {
      ifp = FileOpen(txt, "r");
      if (ifp == NULL)
      {
         ErrPostEx(SEV_ERROR, 0, 0, "Unable to open table file\n");
         return -1;
      }
      SPI_ReadFeatureTable(ifp, spim_head);
      spim = spim_head;
      while (spim != NULL)
      {
         if (spim->lcaseloc != NULL)
         {
            slp = (SeqLocPtr)ValNodeNew(NULL);
            slp->choice = SEQLOC_MIX;
            slp->data.ptrvalue = (Pointer)spim->lcaseloc;
            spim->lcaseloc = slp;
         }
         spim = spim->next;
      }
   }
   spim = spim_head;
   spot = (SPI_OptionsPtr)MemNew(sizeof(SPI_Options));
   spot->printaln = myargs[MYARGPRALIGN].intvalue;
   txt = myargs[MYARGOUTFILE].strvalue;
   ofp = FileOpen(txt, "w");
   if (ofp == NULL && spot->printaln != 3)
   {
      ErrPostEx(SEV_ERROR, 0, 0, "Unable to open output file\n");
      return -1;
   }
   if (spot->printaln >= 2)
   {
      txt = myargs[MYARGALNFILE].strvalue;
      ofp2 = FileOpen(txt, "a");
      if (ofp2 == NULL)
      {
         ErrPostEx(SEV_ERROR, 0, 0, "Unable to open output file 2\n");
         return -1;
      }
   } else
      ofp2 = NULL;
   /** ErrSetMessageLevel(SEV_MAX); **/
   spot->firstpasseval = myargs[MYARG1STEVAL].floatvalue;
   spot->secpasseval = myargs[MYARG2NDEVAL].floatvalue;
   spot->thirdpasseval = myargs[MYARG3RDEVAL].floatvalue;
   spot->numreturns = myargs[MYARGNUMMOD].intvalue;
   spot->idcutoff = myargs[MYARGIDCUT].intvalue;
   spot->lencutoff = myargs[MYARGLENCUT].intvalue;
   spot->interspecies = (Boolean)myargs[MYARGSPEC].intvalue;
   spot->printasn = (Boolean)myargs[MYARGASN].intvalue;
   spot->fetchcds = (Boolean)myargs[MYARGGETCDS].intvalue;
   /*spot->ace = (Boolean)myargs[MYARGACEDB].intvalue;*/
   spot->from = myargs[MYARGFROM].intvalue;
   spot->to = myargs[MYARGTO].intvalue;
   spot->makemult = (Boolean)myargs[MYARGMULT].intvalue;
   /*KSK*/
   spot->bigintron = (Boolean)myargs[MYARGXL].intvalue; 
   spot->bigintron_size = myargs[MYARGXL_SIZE].intvalue;
   spot->repeat_db_file = myargs[MYARGREPDB].strvalue;
   txt = myargs[MYARGORG].strvalue;
   if (!StringICmp(txt, "d") || !StringICmp(txt, "D")){
       spot->organism = SPI_FLY;
   }
   else if (!StringICmp(txt, "p") || !StringICmp(txt, "P")){
       spot->organism = SPI_PLANT;
   }
   else if (!StringICmp(txt, "c") || !StringICmp(txt, "C")){
       spot->organism = SPI_CELEGANS;
   }
   else if (!StringICmp(txt, "m") || !StringICmp(txt, "M")){
       spot->organism = SPI_DICTY;
   }
   else {
       spot->organism = SPI_VERTEBRATE;
   }
   sap = NULL;
   if (spot->printasn)
      spot->sap_head = &sap;
   txt = myargs[MYARGSTRAND].strvalue;
   if (txt != NULL)
   {
      if (StrChr(txt, 'p') || StrChr(txt, 'P'))
         spot->strand = Seq_strand_plus;
      else
         spot->strand = Seq_strand_minus;
   } else
      spot->strand = Seq_strand_both;
   /*txt = myargs[MYARGDRAFTFILE].strvalue;
   if (txt != NULL)
      spot->draftfile = StringSave(txt);*/
   txt = myargs[MYARGDSPLICE].strvalue;
   if (txt != NULL)
   {
      sfp = FileOpen(txt, "r");
      SPI_GetSpliceInfo(spot, sfp, TRUE);
      FileClose(sfp);
   }
   txt = myargs[MYARGASPLICE].strvalue;
   if (txt != NULL)
   {
      sfp = FileOpen(txt, "r");
      SPI_GetSpliceInfo(spot, sfp, FALSE);
      FileClose(sfp);
   }
   h_head = h_prev = NULL;
   srip_head = srip_prev = NULL;
   while (spim != NULL)
   {
      spot->lcaseloc = spim->lcaseloc;
      if (spot->draftfile == NULL)
         srip = SPI_AlnSinglemRNAToGen(spig_head, spim, ofp, ofp2, spot);
      else
      {
         hptr = SPI_AlnSinglemRNAToPieces(spig_head, spim, ofp, ofp2, spot);
         if (h_head != NULL)
         {
            h_prev->next = hptr;
            h_prev = hptr;
         } else
            h_head = h_prev = hptr;
      }
      if (srip != NULL)
      {
         if (srip_head != NULL)
         {
            srip_prev->next = srip;
            srip_prev = srip;
         } else
            srip_head = srip_prev = srip;
      }
      spim = spim->next;
   }
   if (spot->makemult)
   {
      SPI_MakeMultipleAlignment(srip_head);
      SPI_PrintMultipleAlignment(srip_head, FALSE, spig_head->bsp, ofp);
      SPI_RegionListFree(srip_head);
   } else
      SPI_RegionListFree(srip_head);
   /* create the ASN.1 output, if requested; need to use the continuous alignment */
   /* that was generated */
   if (spot->printasn && *(spot->sap_head) != NULL && spot->draftfile == NULL)
   {
      sanp = SeqAnnotForSeqAlign(*(spot->sap_head));
      txt = myargs[MYARGASNFILE].strvalue;
      aip = AsnIoOpen(txt, "w");
      SeqAnnotAsnWrite(sanp, aip, NULL);
      AsnIoClose(aip);
      SeqAlignSetFree(*(spot->sap_head));
   }
   FileClose(ofp);
   FileClose(ofp2);
   SPI_OptionsFree(spot);
   SPI_bsinfoFreeList(spim_head);
   SPI_bsinfoFreeList(spig_head);
   LocalSeqFetchDisable();
   ID1BioseqFetchDisable();
   return 0;
}

/***************************************************************************
*
*  SPI_FindAllNuc is a SeqEntryExplore callback function that puts all
*  the nucleotide bioseqs in a seqentry into a linked list of SPI_bsinfoPtrs.
*
***************************************************************************/
static void SPI_FindAllNuc(SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
{
   BioseqPtr      bsp;
   SPI_bsinfoPtr  spi;
   SPI_bsinfoPtr  PNTR spim_head;

   spim_head = (SPI_bsinfoPtr PNTR)data;
   spi = *spim_head;
   if (IS_Bioseq(sep))
   {
      bsp = (BioseqPtr)sep->data.ptrvalue;
      if (ISA_na(bsp->mol))
      {
         if (spi != NULL)
         {
            while (spi->next != NULL)
            {
               spi = spi->next;
            }
            spi->next = (SPI_bsinfoPtr)MemNew(sizeof(SPI_bsinfo));
            spi = spi->next;
         } else
            spi = *spim_head = (SPI_bsinfoPtr)MemNew(sizeof(SPI_bsinfo));
         spi->bsp = bsp;
      }
   }
}

/***************************************************************************
*
*  ReadALine is copied from Jonathan's code -- it simply reads 
*  consecutive lines from a file.
*
***************************************************************************/
static CharPtr ReadALine (CharPtr str, size_t size, FILE *fp)
{
   Char     ch;
   CharPtr  ptr;
   CharPtr  rsult;

   if (str == NULL || size < 1 || fp == NULL) return NULL;
   *str = '\0';
   rsult = fgets (str, size, fp);
   if (rsult != NULL)
   {
      ptr = str;
      ch = *ptr;
      while (ch != '\0' && ch != '\n' && ch != '\r')
      {
         ptr++;
         ch = *ptr;
      }
      *ptr = '\0';
   }
   return rsult;
}

/***************************************************************************
*
*  SPI_GetBspFromGIOrAcc takes a string, decides whether it's probably
*  a GI or an accession, converts it to an appropriate seqid, and then
*  attempts to fetch and return the bioseq referred to by that id.
*
***************************************************************************/
static BioseqPtr SPI_GetBspFromGIOrAcc(CharPtr str)
{
   BioseqPtr  bsp;
   Int4       gi;
   Char       ptr;
   SeqIdPtr   sip;
   ValNode    vn;

   str = TrimSpacesAroundString(str);
   ptr = *str;
   if (IS_ALPHA(ptr))  /* accession */
   {
      sip = SeqIdFromAccessionDotVersion(str);
      bsp = BioseqLockById(sip);
   } else  /* it's a GI */
   {
      gi = atoi(str);
      vn.choice = SEQID_GI;
      vn.data.intvalue = gi;
      vn.next = NULL;
      bsp = BioseqLockById(&vn);
   }
   return bsp;
}

/***************************************************************************
*
*  SPI_ReadFeatureTable reads in a tab-delimited file and converts it
*  to feature information:
*
*  sequence id	name of feature		start	stop
*  NM_004377.1	repetitive_region	12	40
*
*  Masking and other feature information for all mRNAs should be in the
*  same file; ids not in the mRNA list or unknown feature names will be
*  ignored.
*
***************************************************************************/
static void SPI_ReadFeatureTable(FILE *ifp, SPI_bsinfoPtr spim_head)
{
   Char           c;
   CharPtr        fields[5];
   Boolean        found;
   Char           line[60];
   Int4           i;
   CharPtr        ptr;
   SeqIdPtr       sip;
   SeqLocPtr      slp;
   SPI_bsinfoPtr  spim;
   CharPtr        str;
   ValNode        vn;

   str = ReadALine(line, sizeof(line), ifp);
   while (str != NULL)
   {
      ptr = strtok(str, " \t");
      for (i=0; i<4; i++)
      {
         fields[i] = StringSave(ptr);
         ptr = strtok(NULL, " \t");
      }
      if (fields[1] != NULL && !StrCmp(fields[1], "repetitive_region"))
      {
         c = *fields[0];
         if (IS_ALPHA(c))
            sip = SeqIdFromAccessionDotVersion(fields[0]);
         else
         {
            vn.choice = SEQID_GI;
            vn.data.intvalue = atoi(fields[0]);
            vn.next = NULL;
            sip = &vn;
         }
         spim = spim_head;
         found = FALSE;
         while (!found && spim != NULL)
         {
            if (SeqIdIn(sip, spim->bsp->id) == TRUE)
               found = TRUE;
            else
               spim = spim->next;
         }
         if (found)
         {
            slp = SeqLocIntNew(atoi(fields[2]), atoi(fields[3]), Seq_strand_both, spim->bsp->id);
            slp->next = spim->lcaseloc;
            spim->lcaseloc = slp;
         }
      }
      str = ReadALine(line, sizeof(line), ifp);
   }
}

static void SPI_GetSpliceInfo(SPI_OptionsPtr spot, FILE *sfp, Boolean donor)
{
   Char               line[1000];
   CharPtr            ptr = NULL;
   SPI_SpliceInfoPtr  ssp = NULL;
   SPI_SpliceInfoPtr  ssp_head = NULL;
   SPI_SpliceInfoPtr  ssp_prev = NULL;
   CharPtr            str = NULL;
   enum { _a = 0, _c = 1, _g = 2, _t = 3};
   int base = 0;

   if (sfp == NULL){
       printf("Matrix file not found\n");
       return;
   }
   /* read the first line which is the column dividing donor / acceptor
      sequencs ('GT', 'AG') should just be a single number */   
   str = ReadALine(line, sizeof(line), sfp);
   if (donor){
       spot->dsplicejunc = atoi(str);
   }
   else {
       spot->asplicejunc = atoi(str);
   }
   
   for (base = 0; base < (_t + 1); ++base){       
       str = ReadALine(line, sizeof(line), sfp);
       switch (base){
       case _a:
           for (ptr = strtok(str, "\t"); ptr != NULL; 
                ptr = strtok(NULL, "\t")){
               ssp = (SPI_SpliceInfoPtr)MemNew(sizeof(SPI_SpliceInfo));
               ssp->a = atof(ptr);
               if (ssp_prev != NULL){
                   ssp_prev->next = ssp;
                   ssp_prev = ssp;
               }
               else {
                   ssp_head = ssp_prev = ssp;
                   if (donor){
                       spot->dssp_head = ssp_head;
                   }
                   else {
                       spot->assp_head = ssp_head; 
                   }
               }
           };
           break;
       case _c:
       case _t:
       case _g:
           for (ssp = ssp_head, ptr = strtok(str, "\t"); ptr != NULL && ssp != NULL;
                ptr = strtok(NULL, "\t"), ssp = ssp->next){
               switch (base){
               case _c:
                   ssp->c = atof(ptr);
                   break;
               case _g:
                   ssp->g = atof(ptr);
                   break;
               case _t:
                   ssp->t = atof(ptr);
                   break;
               }
           }
           if ((ptr != NULL && ssp == NULL) || 
               (ssp != NULL && ptr == NULL)){
               switch (base){
               case _c:
                   donor 
                       ? ErrPostEx(SEV_ERROR, 0, 0,
                                   "Donor atrix file error -- C matrix column number is different than preceding.")
                       : ErrPostEx(SEV_ERROR, 0, 0,
                                   "Acceptor matrix file error -- C matrix column number is different than preceding.");
                       break;
               case _g:
                   donor 
                       ? ErrPostEx(SEV_ERROR, 0, 0,
                                   "Donor matrix file error -- G matrix column number is different than preceding.")
                       : ErrPostEx(SEV_ERROR, 0, 0,
                                   "Acceptor matrix file error -- G matrix column number is different than preceding.");
                       break;
               case _t:
                   donor
                       ? ErrPostEx(SEV_ERROR, 0, 0,
                                   "Donor matrix file error -- T matrix column number is different than preceding.")
                       : ErrPostEx(SEV_ERROR, 0, 0,
                                   "Acceptor matrix file error -- T matrix column number is different than preceding.");
                       break;
               }
           }
       }
   }
}