File: minimap.h

package info (click to toggle)
minimap2 2.17%2Bdfsg-12
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, sid
  • size: 1,204 kB
  • sloc: ansic: 8,653; javascript: 2,301; makefile: 130; python: 91; sh: 42; perl: 29
file content (382 lines) | stat: -rw-r--r-- 13,005 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
#ifndef MINIMAP2_H
#define MINIMAP2_H

#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>

#define MM_F_NO_DIAG       0x001 // no exact diagonal hit
#define MM_F_NO_DUAL       0x002 // skip pairs where query name is lexicographically larger than target name
#define MM_F_CIGAR         0x004
#define MM_F_OUT_SAM       0x008
#define MM_F_NO_QUAL       0x010
#define MM_F_OUT_CG        0x020
#define MM_F_OUT_CS        0x040
#define MM_F_SPLICE        0x080 // splice mode
#define MM_F_SPLICE_FOR    0x100 // match GT-AG
#define MM_F_SPLICE_REV    0x200 // match CT-AC, the reverse complement of GT-AG
#define MM_F_NO_LJOIN      0x400
#define MM_F_OUT_CS_LONG   0x800
#define MM_F_SR            0x1000
#define MM_F_FRAG_MODE     0x2000
#define MM_F_NO_PRINT_2ND  0x4000
#define MM_F_2_IO_THREADS  0x8000
#define MM_F_LONG_CIGAR    0x10000
#define MM_F_INDEPEND_SEG  0x20000
#define MM_F_SPLICE_FLANK  0x40000
#define MM_F_SOFTCLIP      0x80000
#define MM_F_FOR_ONLY      0x100000
#define MM_F_REV_ONLY      0x200000
#define MM_F_HEAP_SORT     0x400000
#define MM_F_ALL_CHAINS    0x800000
#define MM_F_OUT_MD        0x1000000
#define MM_F_COPY_COMMENT  0x2000000
#define MM_F_EQX           0x4000000 // use =/X instead of M
#define MM_F_PAF_NO_HIT    0x8000000 // output unmapped reads to PAF
#define MM_F_NO_END_FLT    0x10000000
#define MM_F_HARD_MLEVEL   0x20000000
#define MM_F_SAM_HIT_ONLY  0x40000000

#define MM_I_HPC          0x1
#define MM_I_NO_SEQ       0x2
#define MM_I_NO_NAME      0x4

#define MM_IDX_MAGIC   "MMI\2"

#define MM_MAX_SEG       255

#ifdef __cplusplus
extern "C" {
#endif

// emulate 128-bit integers and arrays
typedef struct { uint64_t x, y; } mm128_t;
typedef struct { size_t n, m; mm128_t *a; } mm128_v;

// minimap2 index
typedef struct {
	char *name;      // name of the db sequence
	uint64_t offset; // offset in mm_idx_t::S
	uint32_t len;    // length
} mm_idx_seq_t;

typedef struct {
	int32_t b, w, k, flag;
	uint32_t n_seq;            // number of reference sequences
	int32_t index;
	mm_idx_seq_t *seq;         // sequence name, length and offset
	uint32_t *S;               // 4-bit packed sequence
	struct mm_idx_bucket_s *B; // index (hidden)
	struct mm_idx_intv_s *I;   // intervals (hidden)
	void *km, *h;
} mm_idx_t;

// minimap2 alignment
typedef struct {
	uint32_t capacity;                  // the capacity of cigar[]
	int32_t dp_score, dp_max, dp_max2;  // DP score; score of the max-scoring segment; score of the best alternate mappings
	uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
	uint32_t n_cigar;                   // number of cigar operations in cigar[]
	uint32_t cigar[];
} mm_extra_t;

typedef struct {
	int32_t id;             // ID for internal uses (see also parent below)
	int32_t cnt;            // number of minimizers; if on the reverse strand
	int32_t rid;            // reference index; if this is an alignment from inversion rescue
	int32_t score;          // DP alignment score
	int32_t qs, qe, rs, re; // query start and end; reference start and end
	int32_t parent, subsc;  // parent==id if primary; best alternate mapping score
	int32_t as;             // offset in the a[] array (for internal uses only)
	int32_t mlen, blen;     // seeded exact match length; seeded alignment block length
	int32_t n_sub;          // number of suboptimal mappings
	int32_t score0;         // initial chaining score (before chain merging/spliting)
	uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, dummy:7;
	uint32_t hash;
	float div;
	mm_extra_t *p;
} mm_reg1_t;

// indexing and mapping options
typedef struct {
	short k, w, flag, bucket_bits;
	int mini_batch_size;
	uint64_t batch_size;
} mm_idxopt_t;

typedef struct {
	int64_t flag;    // see MM_F_* macros
	int seed;
	int sdust_thres; // score threshold for SDUST; 0 to disable

	int max_qlen;    // max query length

	int bw;          // bandwidth
	int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
	int max_frag_len;
	int max_chain_skip, max_chain_iter;
	int min_cnt;         // min number of minimizers on each chain
	int min_chain_score; // min chaining score

	float mask_level;
	float pri_ratio;
	int best_n;      // top best_n chains are subjected to DP alignment

	int max_join_long, max_join_short;
	int min_join_flank_sc;
	float min_join_flank_ratio;

	int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
	int sc_ambi; // score when one or both bases are "N"
	int noncan;      // cost of non-canonical splicing sites
	int junc_bonus;
	int zdrop, zdrop_inv;   // break alignment if alignment score drops too fast along the diagonal
	int end_bonus;
	int min_dp_max;  // drop an alignment if the score of the max scoring segment is below this threshold
	int min_ksw_len;
	int anchor_ext_len, anchor_ext_shift;
	float max_clip_ratio; // drop an alignment if BOTH ends are clipped above this ratio

	int pe_ori, pe_bonus;

	float mid_occ_frac;  // only used by mm_mapopt_update(); see below
	int32_t min_mid_occ;
	int32_t mid_occ;     // ignore seeds with occurrences above this threshold
	int32_t max_occ;
	int mini_batch_size; // size of a batch of query bases to process in parallel
	int64_t max_sw_mat;

	const char *split_prefix;
} mm_mapopt_t;

// index reader
typedef struct {
	int is_idx, n_parts;
	int64_t idx_size;
	mm_idxopt_t opt;
	FILE *fp_out;
	union {
		struct mm_bseq_file_s *seq;
		FILE *idx;
	} fp;
} mm_idx_reader_t;

// memory buffer for thread-local storage during mapping
typedef struct mm_tbuf_s mm_tbuf_t;

// global variables
extern int mm_verbose, mm_dbg_flag; // verbose level: 0 for no info, 1 for error, 2 for warning, 3 for message (default); debugging flag
extern double mm_realtime0; // wall-clock timer

/**
 * Set default or preset parameters
 *
 * @param preset     NULL to set all parameters as default; otherwise apply preset to affected parameters
 * @param io         pointer to indexing parameters
 * @param mo         pointer to mapping parameters
 *
 * @return 0 if success; -1 if _present_ unknown
 */
int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo);
int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo);

/**
 * Update mm_mapopt_t::mid_occ via mm_mapopt_t::mid_occ_frac
 *
 * If mm_mapopt_t::mid_occ is 0, this function sets it to a number such that no
 * more than mm_mapopt_t::mid_occ_frac of minimizers in the index have a higher
 * occurrence.
 *
 * @param opt        mapping parameters
 * @param mi         minimap2 index
 */
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi);

void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len);

/**
 * Initialize an index reader
 *
 * @param fn         index or fasta/fastq file name (this function tests the file type)
 * @param opt        indexing parameters
 * @param fn_out     if not NULL, write built index to this file
 *
 * @return an index reader on success; NULL if fail to open _fn_
 */
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out);

/**
 * Read/build an index
 *
 * If the input file is an index file, this function reads one part of the
 * index and returns. If the input file is a sequence file (fasta or fastq),
 * this function constructs the index for about mm_idxopt_t::batch_size bases.
 * Importantly, for a huge collection of sequences, this function may only
 * return an index for part of sequences. It needs to be repeatedly called
 * to traverse the entire index/sequence file.
 *
 * @param r          index reader
 * @param n_threads  number of threads for constructing index
 *
 * @return an index on success; NULL if reaching the end of the input file
 */
mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads);

/**
 * Destroy/deallocate an index reader
 *
 * @param r          index reader
 */
void mm_idx_reader_close(mm_idx_reader_t *r);

int mm_idx_reader_eof(const mm_idx_reader_t *r);

/**
 * Check whether the file contains a minimap2 index
 *
 * @param fn         file name
 *
 * @return the file size if fn is an index file; 0 if fn is not.
 */
int64_t mm_idx_is_idx(const char *fn);

/**
 * Load a part of an index
 *
 * Given a uni-part index, this function loads the entire index into memory.
 * Given a multi-part index, it loads one part only and places the file pointer
 * at the end of that part.
 *
 * @param fp         pointer to FILE object
 *
 * @return minimap2 index read from fp
 */
mm_idx_t *mm_idx_load(FILE *fp);

/**
 * Append an index (or one part of a full index) to file
 *
 * @param fp         pointer to FILE object
 * @param mi         minimap2 index
 */
void mm_idx_dump(FILE *fp, const mm_idx_t *mi);

/**
 * Create an index from strings in memory
 *
 * @param w            minimizer window size
 * @param k            minimizer k-mer size
 * @param is_hpc       use HPC k-mer if true
 * @param bucket_bits  number of bits for the first level of the hash table
 * @param n            number of sequences
 * @param seq          sequences in A/C/G/T
 * @param name         sequence names; could be NULL
 *
 * @return minimap2 index
 */
mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const char **seq, const char **name);

/**
 * Print index statistics to stderr
 *
 * @param mi         minimap2 index
 */
void mm_idx_stat(const mm_idx_t *idx);

/**
 * Destroy/deallocate an index
 *
 * @param r          minimap2 index
 */
void mm_idx_destroy(mm_idx_t *mi);

/**
 * Initialize a thread-local buffer for mapping
 *
 * Each mapping thread requires a buffer specific to the thread (see mm_map()
 * below). The primary purpose of this buffer is to reduce frequent heap
 * allocations across threads. A buffer shall not be used by two or more
 * threads.
 *
 * @return pointer to a thread-local buffer
 */
mm_tbuf_t *mm_tbuf_init(void);

/**
 * Destroy/deallocate a thread-local buffer for mapping
 *
 * @param b          the buffer
 */
void mm_tbuf_destroy(mm_tbuf_t *b);

void *mm_tbuf_get_km(mm_tbuf_t *b);

/**
 * Align a query sequence against an index
 *
 * This function possibly finds multiple alignments of the query sequence.
 * The returned array and the mm_reg1_t::p field of each element are allocated
 * with malloc().
 *
 * @param mi         minimap2 index
 * @param l_seq      length of the query sequence
 * @param seq        the query sequence
 * @param n_regs     number of hits (out)
 * @param b          thread-local buffer; two mm_map() calls shall not use one buffer at the same time!
 * @param opt        mapping parameters
 * @param name       query name, used for all-vs-all overlapping and debugging
 *
 * @return an array of hits which need to be deallocated with free() together
 *         with mm_reg1_t::p of each element. The size is written to _n_regs_.
 */
mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name);

void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname);

/**
 * Align a fasta/fastq file and print alignments to stdout
 *
 * @param idx        minimap2 index
 * @param fn         fasta/fastq file name
 * @param opt        mapping parameters
 * @param n_threads  number of threads
 *
 * @return 0 on success; -1 if _fn_ can't be read
 */
int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads);

int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads);

/**
 * Generate the cs tag (new in 2.12)
 *
 * @param km         memory blocks; set to NULL if unsure
 * @param buf        buffer to write the cs/MD tag; typicall NULL on the first call
 * @param max_len    max length of the buffer; typically set to 0 on the first call
 * @param mi         index
 * @param r          alignment
 * @param seq        query sequence
 * @param no_iden    true to use : instead of =
 *
 * @return the length of cs
 */
int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden);
int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq);

// query sequence name and sequence in the minimap2 index
int mm_idx_index_name(mm_idx_t *mi);
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);

int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s);

// deprecated APIs for backward compatibility
void mm_mapopt_init(mm_mapopt_t *opt);
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);

#ifdef __cplusplus
}
#endif

#endif // MINIMAP2_H