/* $Id: seed.h,v 6.20 2005/07/28 14:57:10 coulouri Exp $
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*/
 
/*****************************************************************************
 
File name: seed.h
Author: Alejandro Schaffer
 
Contents: header file for PHI-BLAST and pseed3.
$Revision: 6.20 $
$Log: seed.h,v $
Revision 6.20  2005/07/28 14:57:10  coulouri
remove dead code
Revision 6.19  2004/04/01 13:43:08  lavr
Spell "occurred", "occurrence", and "occurring"
Revision 6.18  2002/08/28 13:37:09  madden
Lower MAX_HIT to 20000 again (for LINUX)
Revision 6.17  2002/08/09 17:32:09  madden
Raise MAX_HIT to 20000
Revision 6.16  2000/08/01 17:21:13  shavirin
Added protection for using C++ compiler.
Revision 6.15  1999/10/18 19:54:43  shavirin
Removed unused definition.
Revision 6.14  1999/10/05 19:36:54  shavirin
Changed to use functions from blast.c: BlastGetDbChunk and BlastTickProc.
Removed unused functions.
Revision 6.13  1999/09/22 17:50:18  shavirin
Now functions will collect messages in ValNodePtr before printing out.
 
 
*****************************************************************************/
#if !defined(SEED__H)
#define SEED__H
#ifdef __cplusplus
extern "C" {
#endif
/* ---- alphabet and character-table sizes ---- */
#define ALPHABET_SIZE           25
#define DNA_ALPHABET_SIZE       4
#define ASCII_SIZE              256

/* ---- sizes and limits ---- */
#define MAX_HIT                 20000
#define PATTERN_SPACE_SIZE      1000
#define BUF_SIZE                100
#define PATTERN_BUF_SIZE        2000
#define PATTERN_NAME_SIZE       200
#define SeqIdBufferSize         64      /* buffer size for SeqIdWrite */
#define MAXDNA                  200000  /* limit on length of DNA sequence */

/* ---- bit-packed pattern representation ---- */
#define BITS_PACKED_PER_WORD    30
#define OVERFLOW1               (1 << BITS_PACKED_PER_WORD)
#define allone                  ((1 << ALPHABET_SIZE) - 1)
#define MaxW                    11
#define MaxP                    (BITS_PACKED_PER_WORD * MaxW) /* threshold pattern length */
#if defined(WIN16)
#  define MAX_WORDS_IN_PATTERN  16
#else
#  define MAX_WORDS_IN_PATTERN  100
#endif

/* ---- scores and e-values ---- */
#define SEED_INFINITY           1000000 /* large score for array sentinel */
#define MAX_EVALUE              1000    /* maximum e-value allowed as threshold */
#define DEFAULT_PARAM_C         0.6
#define DEFAULT_PARAM_LAMBDA    0.270

/* ---- flags selecting the option the program is run with
        (four values are defined) ---- */
#define SEED_FLAG               1
#define PATTERN_FLAG            2
#define PAT_SEED_FLAG           3
#define PAT_MATCH_FLAG          4

/* ---- pattern-length classification ---- */
#define ONE_WORD_PATTERN        0
#define MULTI_WORD_PATTERN      1
#define PATTERN_TOO_LONG        2

/* ---- thresholds ---- */
#define PAT_PROB_THRESH         0.002   /* upper threshold for pattern probabilities */
#define EXPECT_MATCH_THRESH     20000   /* upper threshold for number of occurrences */
#define WILDCARD_THRESH         30      /* threshold for product of variable-length wildcards */

/* ---- band amounts for banded alignment ---- */
#define BAND_LOW                (-5)
#define BAND_HIGH               5

/*
 * ---- integer codes used for trace back in align ----
 *
 * Each node must implicitly store three pointers to decide where a CC, DD,
 * or e value comes from.  For example, the CC value can come from a sub,
 * del, or ins edge; a DD can come from the extension of a gap or from the
 * initiation of a new gap.  So three flags are needed: one flag has three
 * states, the other two have two states each.
 *
 * The flags are packed into one integer.  Let the flags be s1, s2, s3; the
 * integer state is then s3*20 + s2*10 + s1, where s1 = {0,1,2} and
 * s2 = s3 = {0,1}.  The numbers below come from this packing.
 */
#define DELETE_CODE             20
#define INSERT_CODE             10
#define DIAGONAL_DELETE         2
#define DIAGONAL_INSERT         1

/* Disabled definitions, kept for reference:
   #define BLAST_DB_CHUNK_SIZE 500
   #define BLAST_NTICKS 50 */
/* One hit, stored as a node of a singly linked list (see `next`). */
typedef struct hit_str {
    Int4            score;
    Int4            l_score;
    Nlm_FloatHi     mul;      /* multiplier for scores of characters */
    Int4            hit_pos;
    Int4            hit_end;
    /* bi, bj, ei, ej: beginning and end of the pattern occurrence in the
       query sequence and the database sequence, respectively.
       NOTE(review): presumably b = begin, e = end, i = query, j = database;
       confirm against the code that fills these in. */
    Int4            bi;
    Int4            bj;
    Int4            ei;
    Int4            ej;
    struct hit_str *next;     /* next hit in linked list */
} hit_node, *hit_ptr;
/* Per-sequence record: a sequence, its header and number, and the list of
   hits attached to it.  Records chain through `next`. */
typedef struct store_str {
    Int4              l_score;
    Uint1Ptr          seq;       /* the sequence data */
    Char             *header;
    Int4              seqno;     /* NOTE(review): presumably the sequence's
                                    number in the database; confirm */
    hit_ptr           hit_list;  /* head of the hit_node list for this record */
    struct store_str *next;      /* next record in the chain */
} *store_ptr, store_node;
/* Three sequence pointers with their three lengths.
   NOTE(review): used elsewhere in this header as the "multi-piece
   representation of query sequence"; the l/r/s prefixes presumably name the
   pieces (left, right, seed?) -- confirm against split_target_seq. */
typedef struct qseq {
     Uint1Ptr lseq;
     Uint1Ptr rseq;
     Uint1Ptr sseq;
     Int4     llen;
     Int4     rlen;
     Int4     slen;
} *qseq_ptr, query_seq;
/* Search-wide parameters and lookup tables, plus the name and text of a
   pattern. */
typedef struct seedSearchItems {
    /* one entry per alphabet letter */
    Nlm_FloatHi  charMultiple[ALPHABET_SIZE];

    Nlm_FloatHi  paramC;        /* used in e-value computation */
    Nlm_FloatHi  paramLambda;   /* used in e-value computation */
    Nlm_FloatHi  paramK;        /* used in the bit score computation */
    Int4         cutoffScore;   /* lower bound for what is a hit */

    /* probability of each letter */
    Nlm_FloatHi  standardProb[ALPHABET_SIZE];

    /* one entry per character code */
    Char         order[ASCII_SIZE];
    /* ALPHABET_SIZE characters plus one extra slot
       (NOTE(review): presumably a string terminator; confirm) */
    Char         pchars[ALPHABET_SIZE+1];

    /* name of a pattern */
    Char         name_space[BUF_SIZE];
    /* string description of pattern */
    Char         pat_space[PATTERN_SPACE_SIZE];
} seedSearchItems;
/* Search results: the head of the store_node chain of matching sequences. */
typedef struct seedResultItems {
    store_ptr listOfMatchingSequences;
} seedResultItems;
/* Bit-mask representation of an input pattern and the lookup tables derived
   from it, for both one-word and many-word patterns and for DNA. */
typedef struct patternSearchItems {

   /* number of words needed to hold the bit representation of the pattern */
   Int4         numWords;

   /* bit mask representation of input pattern,
      for patterns that fit in a word */
   Int4         match_mask;

   /* bit mask representation of input pattern, for long patterns */
   Int4         match_maskL[BUF_SIZE];

   /* which positions can a character occur in, for long patterns */
   Int4         bitPatternByLetter[ASCII_SIZE][MaxW];

   /* used to pass a piece (a row) of the arrays */
   Int4        *whichPositionPtr;

   /* similar, for DNA patterns */
   Uint4       *DNAwhichPrefixPosPtr;
   Uint4       *DNAwhichSuffixPosPtr;

   /* which positions can a character occur in, for short patterns */
   Int4         whichPositionsByCharacter[ASCII_SIZE];

   /* for DNA sequence, where the prefix of a DNA 4-mer matches the pattern */
   Uint4        DNAwhichPrefixPositions[ASCII_SIZE];

   /* similar to the above, for suffixes */
   Uint4        DNAwhichSuffixPositions[ASCII_SIZE];

   /* for each letter in the alphabet and each word in the masked pattern
      representation, holds a bit pattern saying for which positions the
      letter will match; similar to whichPositionsByCharacter, for many-word
      patterns */
   Int4         SLL[MAX_WORDS_IN_PATTERN][ASCII_SIZE];

   /* similar to DNAwhichPrefixPositions, for many-word patterns */
   Uint4        DNAprefixSLL[MAX_WORDS_IN_PATTERN][ASCII_SIZE];

   /* similar to DNAwhichSuffixPositions, for many-word patterns */
   Uint4        DNAsuffixSLL[MAX_WORDS_IN_PATTERN][ASCII_SIZE];

   /* indicates if pattern fits in 1 word, some words, or is too long
      (NOTE(review): presumably holds ONE_WORD_PATTERN, MULTI_WORD_PATTERN
      or PATTERN_TOO_LONG; confirm) */
   Char         flagPatternLength;

   /* probability of this letter combination */
   Nlm_FloatHi  patternProbability;

   /* which word in an extra long pattern has the lowest probability
      of a match */
   Int4         whichMostSpecific;

   /* when pattern has more than 7 words, keep track of how many places of
      the pattern are in each word of the representation;
      was called lening */
   Int4         numPlacesInWord[MAX_WORDS_IN_PATTERN];

   /* spaces until next word due to wildcard */
   Int4         spacing[MAX_WORDS_IN_PATTERN];

   Int4         inputPatternMasked[MaxP];

   /* number of places in pattern representation as computed in
      input_pattern; was called num */
   Int4         highestPlace;

   /* minimum length of string to match this pattern */
   Int4         minPatternMatchLength;

   /* product of wildcard lengths */
   Int4         wildcardProduct;
} patternSearchItems;
/* Scoring inputs for alignment: the score matrix and the gap penalties. */
typedef struct alignSearchItems {
   Int4 **matrix;     /* score matrix */
   Int4   gapOpen;    /* penalty to open a gap */
   Int4   gapExtend;  /* penalty to extend a gap one position */
   Int4   gapCost;    /* gapOpen + gapExtend */
} alignSearchItems;
/* Per-thread search state: the inputs a thread works on, plus its own
   results and counters. */
typedef struct seedParallelItems {
    /* pointer to database */
    ReadDBFILEPtr       rdpt;

    /* multi-piece representation of query sequence */
    qseq_ptr            query_seq;

    /* number of characters in the pattern occurrence */
    Int4                lenPatMatch;

    /* structure for description of the gapped alignment */
    GapAlignBlkPtr      gap_align;

    /* is this DNA or protein data */
    Boolean             is_dna;

    /* holds items about the pattern */
    patternSearchItems *patternSearch;

    /* holds the results for this thread */
    seedResultItems    *seedResults;

    /* holds preprocessing info about the search */
    seedSearchItems    *seedSearch;

    /* total number of pattern occurrences found in this thread */
    Int4                totalOccurrences;

    /* total number of matches with reportable score in this thread;
       match ===> occurrence, but occurrence !===> match */
    Int4                matchIndex;

    /* disabled member, kept for reference:
       threadInfoItems *threadInfo; */
    BlastThrInfoPtr     thr_info;
} seedParallelItems;
/*
 * Message output and string helper.
 * NOTE(review): the definitions are not in this header.  Judging by the
 * names and parameters only, PGPOutTextMessages presumably writes the
 * messages collected in info_vnp to fd, and strsave presumably returns a
 * saved copy of s -- confirm both, including who owns the returned string.
 */
void PGPOutTextMessages(
    ValNodePtr info_vnp,
    FILE *fd);

Char * LIBCALL strsave PROTO((
    Char *s));

/*
 * Core of the seed search engine.
 * NOTE(review): the role is inferred from the name; the meaning of the
 * returned ValNodePtr and of the individual parameters must be checked
 * against the definition.
 */
ValNodePtr LIBCALL  seedEngineCore PROTO((
    BlastSearchBlkPtr search,
    BLAST_OptionsBlkPtr options,
    Uint1Ptr query,
    Uint1Ptr unfilter_query,
    CharPtr database,
    CharPtr patfile,
    Int4 program_flag,
    FILE * patfp,
    Boolean is_dna,
    Boolean reverseDb,
    seedSearchItems *seedSearch,
    Nlm_FloatHi posEThresh,
    Nlm_FloatLo searchSpEff,
    posSearchItems * posSearch,
    SeqLocPtr *seed_seq_loc,
    Boolean showDiagnostics,
    ValNodePtr PNTR info_vnp));

/*
 * Set-up routines.
 * NOTE(review): presumably init_order and initProbs fill tables inside
 * seedSearch, and convertProgramToFlag maps a program name to one of the
 * *_FLAG values while reporting is_dna -- confirm against the definitions.
 */
void LIBCALL init_order PROTO((
    Int4 **matrix,
    Int4 program_flag,
    Boolean is_dna,
    seedSearchItems *seedSearch));

Int4 LIBCALL convertProgramToFlag PROTO((
    Char * program,
    Boolean * is_dna));

void LIBCALL initProbs PROTO((
    seedSearchItems * seedSearch));
/*
 * Pattern set-up, matching and alignment.
 * NOTE(review): the definitions are not in this header; what each function
 * returns and which arguments are outputs must be confirmed against the
 * implementation.
 */
Int4 LIBCALL find_hits PROTO((
    Int4 *hitArray,
    Uint1Ptr seq,
    Int4 len,
    Boolean is_dna,
    patternSearchItems * patternSearch));

Int4 LIBCALL init_pattern PROTO((
    Uint1 *pattern,
    Boolean is_dna,
    patternSearchItems * patternSearch,
    seedSearchItems *seedSearch,
    ValNodePtr * error_return));

Int4 LIBCALL align_of_pattern PROTO((
    Uint1 *querySeq,
    Uint1 *dbSeq,
    Int4 lenQuerySeq,
    Int4 lenDbSeq,
    Int4 *alignScript,
    Int4 **tback,
    GapAlignBlkPtr gap_align,
    Int4 *useful_score,
    Nlm_FloatHi *multiple,
    patternSearchItems *patternSearch,
    seedSearchItems * seedSearch));

void LIBCALL pat_output PROTO((
    Uint1 *seq,
    Int4 begin,
    Int4 end,
    patternSearchItems *patternSearch,
    ValNodePtr PNTR info_vnp));

/* Returns a qseq_ptr, i.e. the three-piece sequence representation. */
qseq_ptr LIBCALL split_target_seq PROTO((
    Uint1 *seq,
    Int4 seed,
    Int4 len_pat,
    Int4 len_query));

/* Returns a hit_ptr, i.e. a pointer to a hit_node (linked through `next`). */
hit_ptr LIBCALL get_hits PROTO((
    qseq_ptr qp,
    Int4 len_of_pat,
    Uint1Ptr seq_db,
    Int4 len_seq_db,
    GapAlignBlkPtr gap_align,
    Boolean is_dna,
    patternSearchItems * patternSearch,
    seedSearchItems * seedSearch,
    Int4 * newOccurrences));
/*
 * Pattern-file search, hit output and pattern reading.
 * NOTE(review): the definitions are not in this header; the notes below are
 * inferred from names and parameter lists only -- confirm before relying on
 * them.
 */

/* Presumably searches the database rdpt for the pattern(s) in
   patternFileName. */
void LIBCALL search_pat PROTO((
    ReadDBFILEPtr rdpt,
    Char *patternFileName,
    Boolean is_dna,
    seedSearchItems *seedSearch,
    patternSearchItems *patternSearch,
    ValNodePtr * error_return,
    ValNodePtr PNTR info_vnp));

/* Returns a SeqAlignPtr; presumably built from the hits held in
   seedResults. */
SeqAlignPtr LIBCALL output_hits PROTO((
    ReadDBFILEPtr rdpt,
    Boolean score_only,
    Uint1 *seq1,
    qseq_ptr qp,
    Int4 len,
    Nlm_FloatHi dbLength,
    GapAlignBlkPtr gap_align,
    Boolean is_dna,
    Int4 effectiveOccurrences,
    seedSearchItems *seedSearch,
    seedResultItems *seedResults,
    patternSearchItems * patternSearch,
    Boolean reverse,
    Int4 numOccurrences,
    Nlm_FloatHi eThresh,
    SeqIdPtr query_id,
    Nlm_FloatHi posEthresh,
    posSearchItems *posSearch,
    Int4 numMatches,
    Int4 *totalBelowEThresh,
    Boolean showDiagnostics,
    ValNodePtr PNTR info_vnp));

/* Presumably reads the next pattern from fp and reports its name through
   *name. */
Char*  LIBCALL get_a_pat PROTO((
    FILE *fp,
    Char **name,
    Int4Ptr hitArray,
    Int4Ptr fullHitArray,
    Int4 * numPatOccur,
    Int4 *numEffectiveOccurrences,
    Int4 program_flag,
    Uint1Ptr unfilter_seq,
    Uint1Ptr seq,
    Int4 len,
    Boolean is_dna,
    patternSearchItems *patternSearch,
    seedSearchItems * seedSearch,
    Boolean showDiagnostics,
    ValNodePtr * error_return,
    ValNodePtr PNTR info_vnp));
/*
 * Result bookkeeping.
 * NOTE(review): the definitions are not in this header.  Judging by names
 * only: quicksort_hits sorts the stored results, eValueFit derives a score
 * from an e-value threshold, storeOneMatch adds one sequence with its hit
 * list to seedResults, seed_free_all releases seedResults, and
 * SeedPruneHitsFromSeedReturn trims a seedEngineCore result to
 * number_of_descriptions entries -- confirm each against its definition.
 */
void LIBCALL quicksort_hits PROTO((
    Int4 no_of_seq,
    seedResultItems *seedResults));

Int4 LIBCALL eValueFit PROTO((
    Nlm_FloatHi eThresh,
    Nlm_FloatHi dbLength,
    seedSearchItems *seedSearch,
    Int4 numOccurrences,
    Nlm_FloatHi patternProbability));

void LIBCALL storeOneMatch PROTO((
    hit_ptr hit_list,
    Int4 seqno,
    Uint1Ptr seq,
    seedResultItems *seedResults));

void LIBCALL seed_free_all PROTO((
    seedResultItems *seedResults));

ValNodePtr  LIBCALL SeedPruneHitsFromSeedReturn PROTO((
    ValNodePtr seedReturn,
    Int4 number_of_descriptions));
#ifdef __cplusplus
}
#endif
#endif /*define SEED__H*/