File: AlignerOptions.h

package info (click to toggle)
snap-aligner 1.0.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 4,988 kB
  • sloc: cpp: 36,500; ansic: 5,239; python: 227; makefile: 85; sh: 28
file content (153 lines) | stat: -rw-r--r-- 6,006 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/*++

Module Name:

    AlignerOptions.h

Abstract:

    Common parameters for running single & paired alignment.

Authors:

    Ravi Pandya, May, 2012

Environment:

    User mode service.

Revision History:

    Integrated from SingleAligner.cpp & PairedAligner.cpp

--*/

#pragma once

#include "stdafx.h"
#include "options.h"
#include "Genome.h"
#include "Read.h"

#define INSTRUMENTATION_FOR_PAPER 0 // Turn this on to generate raw data about hit sets and their intersections for the paper

#if INSTRUMENTATION_FOR_PAPER
#define MAX_HIT_SIZE_LOG_2  15  // This is for the instrumentation
extern _int64 g_alignmentTimeByHitCountsOfEachSeed[MAX_HIT_SIZE_LOG_2+1][MAX_HIT_SIZE_LOG_2+1];  // In the paired-end aligner, if you have seeds A and B with hit set sizes |A| and |B| then the total time in ns gets added into g_alignmentTimeByHitCountsOfEachSeed[log2(|A|)][log2(|B|)]
extern _int64 g_alignmentCountByHitCountsOfEachSeed[MAX_HIT_SIZE_LOG_2 + 1][MAX_HIT_SIZE_LOG_2 + 1];  // Same as above, but just add one per time.
#endif // INSTRUMENTATION_FOR_PAPER

#define MAPQ_LIMIT_FOR_SINGLE_HIT 10

struct AbstractOptions
{
    virtual void usageMessage() = 0;

    virtual bool parse(const char** argv, int argc, int& n, bool *done) = 0;
};

enum FileType {UnknownFileType, SAMFile, FASTQFile, BAMFile, InterleavedFASTQFile, CRAMFile};  // Add more as needed

struct SNAPFile {
	SNAPFile() : fileName(NULL), secondFileName(NULL), fileType(UnknownFileType), isStdio(false), omitSQLines(false) {}
    const char          *fileName;
    const char          *secondFileName;
    FileType             fileType;
    bool                 isCompressed;
    bool                 isStdio;           // Only applies to the first file for two-file inputs
	bool				 omitSQLines;		// Special (formerly) undocumented option for Charles Chiu's group.  Mostly a bad idea.

    PairedReadSupplierGenerator *createPairedReadSupplierGenerator(int numThreads, bool quicklyDropUnpairedReads, const ReaderContext& context);
    ReadSupplierGenerator *createReadSupplierGenerator(int numThreads, const ReaderContext& context);
    static bool generateFromCommandLine(const char **args, int nArgs, int *argsConsumed, SNAPFile *snapFile, bool paired, bool isInput);
};

struct AlignerOptions : public AbstractOptions
{
    AlignerOptions(const char* i_commandLine, bool forPairedEnd = false);

    const char         *commandLine;
    const char         *indexDir;
    const char         *similarityMapFile;
    int                 numThreads;
    unsigned            maxDist;
    float               maxDistFraction;
    unsigned            numSeedsFromCommandLine;
    double              seedCoverage;       // Exclusive with numSeeds; this is readSize/seedSize
    bool                seedCountSpecified; // Has either -n or -sc been specified?  This bool is used to make sure they're not both specified on the command line
    unsigned            maxHits;
    int                 minWeightToCheck;
    bool                bindToProcessors;
    bool                ignoreMismatchedIDs;
    SNAPFile            outputFile;
    int                 nInputs;
    SNAPFile           *inputs;
    ReadClippingType    clipping;
    bool                sortOutput;
    bool                noIndex;
    bool                noDuplicateMarking;
    bool                noQualityCalibration;   // This doesn't appear to be used.  
    unsigned            sortMemory; // total output sorting buffer size in Gb
    unsigned            filterFlags;
    bool                explorePopularSeeds;
    bool                stopOnFirstHit;
	bool				useM;	// Should we generate CIGAR strings using = and X, or using the old-style M?
    unsigned            gapPenalty; // if non-zero use gap penalty aligner
    AbstractOptions    *extra; // extra options
    const char         *rgLineContents;
    const char         *perfFileName;
    bool                useTimingBarrier;
    unsigned            extraSearchDepth;
    const char         *defaultReadGroup; // if not specified in input
    bool                ignoreSecondaryAlignments; // on input, default true
    int                 maxSecondaryAlignmentAdditionalEditDistance;
	int					maxSecondaryAlignments;
    int                 maxSecondaryAlignmentsPerContig;
    bool                preserveClipping;
    float               expansionFactor;
    bool                noUkkonen;
    bool                noOrderedEvaluation;
	bool				noTruncation;
    bool                useAffineGap;
    unsigned            matchReward;
    unsigned            subPenalty;
    unsigned            gapOpenPenalty;
    unsigned            gapExtendPenalty;
	unsigned			minReadLength;
	bool				mapIndex;
	bool				prefetchIndex;
    size_t              writeBufferSize;
    bool                dropIndexBeforeSort;
    bool                killIfTooSlow;
    const char *        sortIntermediateDirectory;
    bool                profile;
    bool                profileAffineGap;
    bool                ignoreAlignmentAdjustmentsForOm;
    bool                emitInternalScore;
    char                internalScoreTag[3];
	bool				altAwareness;
    int                 maxScoreGapToPreferNonALTAlignment;
    bool                emitALTAlignments;
    
    static bool         useHadoopErrorMessages; // This is static because it's global (and I didn't want to push the options object to every place in the code)
    static bool         outputToStdout;         // Likewise

    void usage();

    virtual void usageMessage();

    virtual bool parse(const char** argv, int argc, int& n, bool *done);

    enum FilterFlags
    {
        FilterUnaligned =           0x0001,
        FilterSingleHit =           0x0002,
        FilterMultipleHits =        0x0004,
        FilterBothMatesMatch =      0x0008,
		FilterTooShort =            0x0010
    };

    bool passFilter(Read* read, AlignmentResult result, bool tooShort, bool secondaryAlignment);
    
    virtual bool isPaired() { return false; }
};