1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422
|
#ifndef MINIMAP2_H
#define MINIMAP2_H
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>
#define MM_VERSION "2.27-r1193"
#define MM_F_NO_DIAG (0x001LL) // no exact diagonal hit
#define MM_F_NO_DUAL (0x002LL) // skip pairs where query name is lexicographically larger than target name
#define MM_F_CIGAR (0x004LL)
#define MM_F_OUT_SAM (0x008LL)
#define MM_F_NO_QUAL (0x010LL)
#define MM_F_OUT_CG (0x020LL)
#define MM_F_OUT_CS (0x040LL)
#define MM_F_SPLICE (0x080LL) // splice mode
#define MM_F_SPLICE_FOR (0x100LL) // match GT-AG
#define MM_F_SPLICE_REV (0x200LL) // match CT-AC, the reverse complement of GT-AG
#define MM_F_NO_LJOIN (0x400LL)
#define MM_F_OUT_CS_LONG (0x800LL)
#define MM_F_SR (0x1000LL)
#define MM_F_FRAG_MODE (0x2000LL)
#define MM_F_NO_PRINT_2ND (0x4000LL)
#define MM_F_2_IO_THREADS (0x8000LL)
#define MM_F_LONG_CIGAR (0x10000LL)
#define MM_F_INDEPEND_SEG (0x20000LL)
#define MM_F_SPLICE_FLANK (0x40000LL)
#define MM_F_SOFTCLIP (0x80000LL)
#define MM_F_FOR_ONLY (0x100000LL)
#define MM_F_REV_ONLY (0x200000LL)
#define MM_F_HEAP_SORT (0x400000LL)
#define MM_F_ALL_CHAINS (0x800000LL)
#define MM_F_OUT_MD (0x1000000LL)
#define MM_F_COPY_COMMENT (0x2000000LL)
#define MM_F_EQX (0x4000000LL) // use =/X instead of M
#define MM_F_PAF_NO_HIT (0x8000000LL) // output unmapped reads to PAF
#define MM_F_NO_END_FLT (0x10000000LL)
#define MM_F_HARD_MLEVEL (0x20000000LL)
#define MM_F_SAM_HIT_ONLY (0x40000000LL)
#define MM_F_RMQ (0x80000000LL)
#define MM_F_QSTRAND (0x100000000LL)
#define MM_F_NO_INV (0x200000000LL)
#define MM_F_NO_HASH_NAME (0x400000000LL)
#define MM_F_SPLICE_OLD (0x800000000LL)
#define MM_F_SECONDARY_SEQ (0x1000000000LL) //output SEQ field for seqondary alignments using hard clipping
#define MM_F_OUT_DS (0x2000000000LL)
#define MM_I_HPC 0x1
#define MM_I_NO_SEQ 0x2
#define MM_I_NO_NAME 0x4
#define MM_IDX_MAGIC "MMI\2"
#define MM_MAX_SEG 255
#define MM_CIGAR_MATCH 0
#define MM_CIGAR_INS 1
#define MM_CIGAR_DEL 2
#define MM_CIGAR_N_SKIP 3
#define MM_CIGAR_SOFTCLIP 4
#define MM_CIGAR_HARDCLIP 5
#define MM_CIGAR_PADDING 6
#define MM_CIGAR_EQ_MATCH 7
#define MM_CIGAR_X_MISMATCH 8
#define MM_CIGAR_STR "MIDNSHP=XB"
#ifdef __cplusplus
extern "C" {
#endif
// emulate 128-bit integers and arrays
typedef struct { uint64_t x, y; } mm128_t;
typedef struct { size_t n, m; mm128_t *a; } mm128_v;
// minimap2 index
typedef struct {
char *name; // name of the db sequence
uint64_t offset; // offset in mm_idx_t::S
uint32_t len; // length
uint32_t is_alt;
} mm_idx_seq_t;
typedef struct {
int32_t b, w, k, flag;
uint32_t n_seq; // number of reference sequences
int32_t index;
int32_t n_alt;
mm_idx_seq_t *seq; // sequence name, length and offset
uint32_t *S; // 4-bit packed sequence
struct mm_idx_bucket_s *B; // index (hidden)
struct mm_idx_intv_s *I; // intervals (hidden)
void *km, *h;
} mm_idx_t;
// minimap2 alignment
typedef struct {
uint32_t capacity; // the capacity of cigar[]
int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings
int32_t dp_max0; // DP score before mm_update_dp_max() adjustment
uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
uint32_t n_cigar; // number of cigar operations in cigar[]
uint32_t cigar[];
} mm_extra_t;
typedef struct {
int32_t id; // ID for internal uses (see also parent below)
int32_t cnt; // number of minimizers; if on the reverse strand
int32_t rid; // reference index; if this is an alignment from inversion rescue
int32_t score; // DP alignment score
int32_t qs, qe, rs, re; // query start and end; reference start and end
int32_t parent, subsc; // parent==id if primary; best alternate mapping score
int32_t as; // offset in the a[] array (for internal uses only)
int32_t mlen, blen; // seeded exact match length; seeded alignment block length
int32_t n_sub; // number of suboptimal mappings
int32_t score0; // initial chaining score (before chain merging/spliting)
uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, is_alt:1, strand_retained:1, dummy:5;
uint32_t hash;
float div;
mm_extra_t *p;
} mm_reg1_t;
// indexing and mapping options
typedef struct {
short k, w, flag, bucket_bits;
int64_t mini_batch_size;
uint64_t batch_size;
} mm_idxopt_t;
typedef struct {
int64_t flag; // see MM_F_* macros
int seed;
int sdust_thres; // score threshold for SDUST; 0 to disable
int max_qlen; // max query length
int bw, bw_long; // bandwidth
int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
int max_frag_len;
int max_chain_skip, max_chain_iter;
int min_cnt; // min number of minimizers on each chain
int min_chain_score; // min chaining score
float chain_gap_scale;
float chain_skip_scale;
int rmq_size_cap, rmq_inner_dist;
int rmq_rescue_size;
float rmq_rescue_ratio;
float mask_level;
int mask_len;
float pri_ratio;
int best_n; // top best_n chains are subjected to DP alignment
float alt_drop;
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
int transition; // transition mismatch score (A:G, C:T)
int sc_ambi; // score when one or both bases are "N"
int noncan; // cost of non-canonical splicing sites
int junc_bonus;
int zdrop, zdrop_inv; // break alignment if alignment score drops too fast along the diagonal
int end_bonus;
int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold
int min_ksw_len;
int anchor_ext_len, anchor_ext_shift;
float max_clip_ratio; // drop an alignment if BOTH ends are clipped above this ratio
int rank_min_len;
float rank_frac;
int pe_ori, pe_bonus;
float mid_occ_frac; // only used by mm_mapopt_update(); see below
float q_occ_frac;
int32_t min_mid_occ, max_mid_occ;
int32_t mid_occ; // ignore seeds with occurrences above this threshold
int32_t max_occ, max_max_occ, occ_dist;
int64_t mini_batch_size; // size of a batch of query bases to process in parallel
int64_t max_sw_mat;
int64_t cap_kalloc;
const char *split_prefix;
} mm_mapopt_t;
// index reader
typedef struct {
int is_idx, n_parts;
int64_t idx_size;
mm_idxopt_t opt;
FILE *fp_out;
union {
struct mm_bseq_file_s *seq;
FILE *idx;
} fp;
} mm_idx_reader_t;
// memory buffer for thread-local storage during mapping
struct mm_tbuf_s {
void *km;
int rep_len, frag_gap;
};
typedef struct mm_tbuf_s mm_tbuf_t;
// global variables
extern int mm_verbose, mm_dbg_flag; // verbose level: 0 for no info, 1 for error, 2 for warning, 3 for message (default); debugging flag
extern double mm_realtime0; // wall-clock timer
/**
* Set default or preset parameters
*
* @param preset NULL to set all parameters as default; otherwise apply preset to affected parameters
* @param io pointer to indexing parameters
* @param mo pointer to mapping parameters
*
* @return 0 if success; -1 if _present_ unknown
*/
int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo);
int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo);
/**
* Update mm_mapopt_t::mid_occ via mm_mapopt_t::mid_occ_frac
*
* If mm_mapopt_t::mid_occ is 0, this function sets it to a number such that no
* more than mm_mapopt_t::mid_occ_frac of minimizers in the index have a higher
* occurrence.
*
* @param opt mapping parameters
* @param mi minimap2 index
*/
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi);
void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len);
/**
* Initialize an index reader
*
* @param fn index or fasta/fastq file name (this function tests the file type)
* @param opt indexing parameters
* @param fn_out if not NULL, write built index to this file
*
* @return an index reader on success; NULL if fail to open _fn_
*/
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out);
/**
* Read/build an index
*
* If the input file is an index file, this function reads one part of the
* index and returns. If the input file is a sequence file (fasta or fastq),
* this function constructs the index for about mm_idxopt_t::batch_size bases.
* Importantly, for a huge collection of sequences, this function may only
* return an index for part of sequences. It needs to be repeatedly called
* to traverse the entire index/sequence file.
*
* @param r index reader
* @param n_threads number of threads for constructing index
*
* @return an index on success; NULL if reaching the end of the input file
*/
mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads);
/**
* Destroy/deallocate an index reader
*
* @param r index reader
*/
void mm_idx_reader_close(mm_idx_reader_t *r);
int mm_idx_reader_eof(const mm_idx_reader_t *r);
/**
* Check whether the file contains a minimap2 index
*
* @param fn file name
*
* @return the file size if fn is an index file; 0 if fn is not.
*/
int64_t mm_idx_is_idx(const char *fn);
/**
* Load a part of an index
*
* Given a uni-part index, this function loads the entire index into memory.
* Given a multi-part index, it loads one part only and places the file pointer
* at the end of that part.
*
* @param fp pointer to FILE object
*
* @return minimap2 index read from fp
*/
mm_idx_t *mm_idx_load(FILE *fp);
/**
* Append an index (or one part of a full index) to file
*
* @param fp pointer to FILE object
* @param mi minimap2 index
*/
void mm_idx_dump(FILE *fp, const mm_idx_t *mi);
/**
* Create an index from strings in memory
*
* @param w minimizer window size
* @param k minimizer k-mer size
* @param is_hpc use HPC k-mer if true
* @param bucket_bits number of bits for the first level of the hash table
* @param n number of sequences
* @param seq sequences in A/C/G/T
* @param name sequence names; could be NULL
*
* @return minimap2 index
*/
mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const char **seq, const char **name);
/**
* Print index statistics to stderr
*
* @param mi minimap2 index
*/
void mm_idx_stat(const mm_idx_t *idx);
/**
* Destroy/deallocate an index
*
* @param r minimap2 index
*/
void mm_idx_destroy(mm_idx_t *mi);
/**
* Initialize a thread-local buffer for mapping
*
* Each mapping thread requires a buffer specific to the thread (see mm_map()
* below). The primary purpose of this buffer is to reduce frequent heap
* allocations across threads. A buffer shall not be used by two or more
* threads.
*
* @return pointer to a thread-local buffer
*/
mm_tbuf_t *mm_tbuf_init(void);
/**
* Destroy/deallocate a thread-local buffer for mapping
*
* @param b the buffer
*/
void mm_tbuf_destroy(mm_tbuf_t *b);
void *mm_tbuf_get_km(mm_tbuf_t *b);
/**
* Align a query sequence against an index
*
* This function possibly finds multiple alignments of the query sequence.
* The returned array and the mm_reg1_t::p field of each element are allocated
* with malloc().
*
* @param mi minimap2 index
* @param l_seq length of the query sequence
* @param seq the query sequence
* @param n_regs number of hits (out)
* @param b thread-local buffer; two mm_map() calls shall not use one buffer at the same time!
* @param opt mapping parameters
* @param name query name, used for all-vs-all overlapping and debugging
*
* @return an array of hits which need to be deallocated with free() together
* with mm_reg1_t::p of each element. The size is written to _n_regs_.
*/
mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name);
void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname);
/**
* Align a fasta/fastq file and print alignments to stdout
*
* @param idx minimap2 index
* @param fn fasta/fastq file name
* @param opt mapping parameters
* @param n_threads number of threads
*
* @return 0 on success; -1 if _fn_ can't be read
*/
int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads);
int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads);
/**
* Generate the cs tag (new in 2.12)
*
* @param km memory blocks; set to NULL if unsure
* @param buf buffer to write the cs/MD tag; typicall NULL on the first call
* @param max_len max length of the buffer; typically set to 0 on the first call
* @param mi index
* @param r alignment
* @param seq query sequence
* @param no_iden true to use : instead of =
*
* @return the length of cs
*/
int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden);
int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq);
// query sequence name and sequence in the minimap2 index
int mm_idx_index_name(mm_idx_t *mi);
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
int mm_idx_alt_read(mm_idx_t *mi, const char *fn);
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s);
// deprecated APIs for backward compatibility
void mm_mapopt_init(mm_mapopt_t *opt);
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);
#ifdef __cplusplus
}
#endif
#endif // MINIMAP2_H
|