File: getparams.cpp

package info (click to toggle)
pilercr 1.06%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 844 kB
  • sloc: cpp: 14,339; makefile: 67; sh: 36
file content (156 lines) | stat: -rwxr-xr-x 4,481 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#include "pilercr.h"

/***
Generate default aligner parameters given:
	minimum hit length.
	minimum fractional identity.
	sequence lengths.
	maximum memory.
***/

const int DEFAULT_LENGTH = 400;
const double DEFAULT_MIN_ID = 0.94;
const double RAM_FRACT = 0.80;

/***
For minimum word length, choose k=4 arbitrarily.
For max, k=16 definitely won't work with 32-bit ints
because 4^16 = 2^32 = 4GB.
k=14 might be OK, but would have to look carefully at
boundary cases, which I haven't done.
k=13 is definitely safe, so set this as upper bound.
***/
const int MIN_WORD_LENGTH = 4;
const int MAX_WORD_LENGTH = 13;
const int MAX_AVG_INDEX_LIST_LENGTH = 10;
const int TUBE_OFFSET_DELTA = 32;

static double FilterMemRequired(int SeqLengthQ, const FilterParams &FP)
	{
	const double Index = 2*sizeof(INDEX_ENTRY)*pow4d(FP.WordSize);

	const int TubeWidth = FP.TubeOffset + FP.SeedDiffs;
	const double MaxActiveTubes = (SeqLengthQ + TubeWidth - 1)/FP.TubeOffset + 1;
	const double Tubes = MaxActiveTubes*sizeof(TubeState);
	return Index + Tubes;
	}

double AvgIndexListLength(int SeqLengthQ, const FilterParams &FP)
	{
	return SeqLengthQ / pow4d(FP.WordSize);
	}

double TotalMemRequired(int SeqLengthQ, const FilterParams &FP)
	{
	const double Filter = FilterMemRequired(SeqLengthQ, FP);
	const double Seq = SeqLengthQ;
	return Filter + Seq;
	}

static void CalcParams(int g_MinHitLength, double MinId, int SeqLengthQ,
  int g_Diameter, double MaxMem, FilterParams *ptrFP, DPParams *ptrDP)
	{
	if (MinId < 0 || MinId > 1.0)
		Quit("CalcParams: bad MinId=%g", MinId);
	if (g_MinHitLength <= 4)
		Quit("CalcParams: bad g_MinHitLength=%d", g_MinHitLength);

// Lower bound on word length k by requiring manageable index.
// Given kmer occurs once every 4^k positions.
// Hence average number of index entries is i = N/(4^k) for random
// string of length N.
// Require i <= I, then k > log_4(N/i).
	const double dSeqLengthA = (double) SeqLengthQ;
	const int MinWordSize = (int) (log4(g_Diameter) - log4(MAX_AVG_INDEX_LIST_LENGTH) + 0.5);

// First choice is that filter criteria are same as DP criteria,
// but this may not be possible.
	int SeedLength = g_MinHitLength;
	int SeedDiffs = (int) (g_MinHitLength*(1.0 - MinId) + 0.5);

// Find filter valid filter parameters,
// starting from preferred case.
	int WordSize = -1;
	for (;;)
		{
		int MinWords = -1;
		for (WordSize = MAX_WORD_LENGTH; WordSize >= MinWordSize; --WordSize)
			{
			ptrFP->WordSize = WordSize;
			ptrFP->SeedLength = SeedLength;
			ptrFP->SeedDiffs = SeedDiffs;
			ptrFP->TubeOffset = ptrFP->SeedDiffs + TUBE_OFFSET_DELTA;

			double Mem = TotalMemRequired(SeqLengthQ, *ptrFP);
			if (MaxMem > 0 && Mem > MaxMem)
				{
				Log("Parameters seedlength=%3d k=%2d seeddiffs=%2d, mem=%.0f Mb > maxmem=%.0f Mb\n",
				  ptrFP->SeedLength,
				  ptrFP->WordSize,
				  ptrFP->SeedDiffs,
				  Mem/1e6,
				  MaxMem/1e6);
				MinWords = -1;
				continue;
				}

			MinWords = MinWordsPerFilterHit(SeedLength, WordSize, SeedDiffs);
			if (MinWords <= 0)
				{
				Log("Parameters seedlength=%3d k=%2d seeddiffs=%2d, B=%d\n",
				  ptrFP->SeedLength,
				  ptrFP->WordSize,
				  ptrFP->SeedDiffs,
				  MinWords);
				MinWords = -1;
				continue;
				}

			const double Len = AvgIndexListLength(g_Diameter, *ptrFP);
			if (Len > MAX_AVG_INDEX_LIST_LENGTH)
				{
				Log("Parameters n=%d k=%d e=%d, B=%d avgixlen=%g > max = %d\n",
				  ptrFP->SeedLength,
				  ptrFP->WordSize,
				  ptrFP->SeedDiffs,
				  MinWords,
				  Len,
				  MAX_AVG_INDEX_LIST_LENGTH);
				MinWords = -1;
				continue;
				}
			break;
			}
		if (MinWords > 0)
			break;

	// Failed to find filter parameters, try
	// fewer errors and shorter seed.
		if (SeedLength >= g_MinHitLength/4)
			{
			SeedLength -= 4;
			continue;
			}

		if (SeedDiffs > 0)
			{
			--SeedDiffs;
			continue;
			}
		
		Quit("Failed to find filter parameters");
		}

	ptrDP->g_MinHitLength = g_MinHitLength;
	ptrDP->MinId = MinId;
	}

// Alignment parameters are specified by length and pctid.
void GetParams(int SeqLengthQ, int g_Diameter, int Length, double MinId,
  FilterParams *ptrFP, DPParams *ptrDP)
	{
	const char *strMaxMem = ValueOpt("maxmem");
	const double MaxMem = (0 == strMaxMem) ? GetRAMSize()*RAM_FRACT : atof(strMaxMem);

	CalcParams(Length, MinId, SeqLengthQ, g_Diameter, MaxMem, ptrFP, ptrDP);
	}