File: HitDepth.cpp

package info (click to toggle)
snap-aligner 2.0.3%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 6,652 kB
  • sloc: cpp: 41,051; ansic: 5,239; python: 227; makefile: 85; sh: 28
file content (139 lines) | stat: -rw-r--r-- 4,183 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
/*++

Module Name:

    HitDepth.cpp

Abstract:

    Special function to find the seed with the minimum hit depth to align each
    locus in a set of contigs.

Authors:

    Bill Bolosky, March, 2022

Environment:

    User mode service.

Revision History:

--*/

#include "stdafx.h"
#include "Compat.h"
#include "Histogram.h"
#include "exit.h"
#include "AlignerOptions.h"
#include "GenomeIndex.h"

#if HIT_DEPTH_COUNTING
void CountHitDepthUsage()
{
    fprintf(stderr, "This is a special function to compute data for a paper.\n");
    fprintf(stderr, "It looks at every locus in a set of contigs and finds the seed\n");
    fprintf(stderr, "with the fewest hits that contains the correct alignment across\n");
    fprintf(stderr, "a range of seed sizes.  It is intended to show some concept of\n");
    fprintf(stderr, "'difficulty' of aligning different portions of the genome.\n\n");
    fprintf(stderr, "usage:\n");
    fprintf(stderr, "  depth index-filename-base minSeedSize maxSeedSize seedSizeForBaseAlignment outputFile {contigFile}\n");
    fprintf(stderr, "        index-filename-base is the base pathname for a set of SNAP indices of various seed sizes.\n");
    fprintf(stderr, "                            the individual index names are the base concatenated with the seed size.\n");
    fprintf(stderr, "        minSeedSize and maxSeedSize are the min and max of the range of seed sizes to test\n");
    fprintf(stderr, "        seedSizeForBaseAlignment is the seed size used to align the reads to determine the 'correct' alignment\n");
    fprintf(stderr, "        ouputFile is the output filename\n");
    fprintf(stderr, "        contigFile is a file with a list of contigs to compute (one per line).  If it is not specified then\n");
    fprintf(stderr, "                   the standard list of primary contigs from hg38 are used (i.e., chr1-chr22, chrX, chrY, and chrM)\n");

    soft_exit(1);
} // CountHitDepthUsage

char* contigsToUse[] = {
    "chr1",
    "chr2",
    "chr3",
    "chr4",
    "chr5",
    "chr6",
    "chr7",
    "chr8",
    "chr9",
    "chr10",
    "chr11",
    "chr12",
    "chr13",
    "chr14",
    "chr15",
    "chr16",
    "chr17",
    "chr18",
    "chr19",
    "chr20",
    "chr21",
    "chr22",
    "chrX",
    "chrY",
    "chrM"
};

GenomeIndex *g_HitDepthGenomeIndex = NULL;

void CountHitDepthLoadIndex(const char* indexFileNameBase, int seedSize) 
{
    printf("Loading index for seed size %d...", seedSize);

    _int64 startLoadTime = timeInMillis();

    size_t indexFileNameBufferSize = strlen(indexFileNameBase) + 20;// 20 is more than the largest int + 1 for \0, so plenty of space
    char* indexFileName = new char[indexFileNameBufferSize];

    snprintf(indexFileName, indexFileNameBufferSize, "%s%d", indexFileNameBase, seedSize);

    g_HitDepthGenomeIndex = GenomeIndex::loadFromDirectory(indexFileName, true, true);

    if (g_HitDepthGenomeIndex == NULL) {
        fprintf(stderr, "\nIndex load failed\n");
        soft_exit(1);
    }

    printf("%llds\n", (timeInMillis() - startLoadTime + 500) / 1000);
}

void CountHitDepth(int argc, const char** argv)
{
    if (argc != 5 && argc != 6) {
        CountHitDepthUsage();
    }

    const char* indexFileNameBase = argv[0];

    int minSeedSize = atoi(argv[1]);
    int maxSeedSize = atoi(argv[2]);

    if (minSeedSize <= 0 || maxSeedSize < minSeedSize) {
        fprintf(stderr, "Min seed size must be strictly positive and no greater than max seed size\n");
        soft_exit(1);
    }

    int seedSizeForBaseAlignment = atoi(argv[3]);
    if (seedSizeForBaseAlignment < minSeedSize || seedSizeForBaseAlignment > maxSeedSize) {
        fprintf(stderr, "Seed size for base alignment must be in the range of min seed size - max seed size\n");
        soft_exit(1);
    }

    if (argc == 6) {
        fprintf(stderr, "Contig file not yet implemented\n");
        soft_exit(1);
    }

    FILE* outputFile = fopen(argv[4], "w");
    if (NULL == outputFile) {
        fprintf(stderr, "Unable to open output file\n");
        soft_exit(1);
    }

    CountHitDepthLoadIndex(indexFileNameBase, seedSizeForBaseAlignment);

} // CountHitDepth
#endif // HIT_DEPTH_COUNTING