1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382
|
#ifndef MINIMAP2_H
#define MINIMAP2_H
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>
#define MM_F_NO_DIAG 0x001 // no exact diagonal hit
#define MM_F_NO_DUAL 0x002 // skip pairs where query name is lexicographically larger than target name
#define MM_F_CIGAR 0x004
#define MM_F_OUT_SAM 0x008
#define MM_F_NO_QUAL 0x010
#define MM_F_OUT_CG 0x020
#define MM_F_OUT_CS 0x040
#define MM_F_SPLICE 0x080 // splice mode
#define MM_F_SPLICE_FOR 0x100 // match GT-AG
#define MM_F_SPLICE_REV 0x200 // match CT-AC, the reverse complement of GT-AG
#define MM_F_NO_LJOIN 0x400
#define MM_F_OUT_CS_LONG 0x800
#define MM_F_SR 0x1000
#define MM_F_FRAG_MODE 0x2000
#define MM_F_NO_PRINT_2ND 0x4000
#define MM_F_2_IO_THREADS 0x8000
#define MM_F_LONG_CIGAR 0x10000
#define MM_F_INDEPEND_SEG 0x20000
#define MM_F_SPLICE_FLANK 0x40000
#define MM_F_SOFTCLIP 0x80000
#define MM_F_FOR_ONLY 0x100000
#define MM_F_REV_ONLY 0x200000
#define MM_F_HEAP_SORT 0x400000
#define MM_F_ALL_CHAINS 0x800000
#define MM_F_OUT_MD 0x1000000
#define MM_F_COPY_COMMENT 0x2000000
#define MM_F_EQX 0x4000000 // use =/X instead of M
#define MM_F_PAF_NO_HIT 0x8000000 // output unmapped reads to PAF
#define MM_F_NO_END_FLT 0x10000000
#define MM_F_HARD_MLEVEL 0x20000000
#define MM_F_SAM_HIT_ONLY 0x40000000
#define MM_I_HPC 0x1
#define MM_I_NO_SEQ 0x2
#define MM_I_NO_NAME 0x4
#define MM_IDX_MAGIC "MMI\2"
#define MM_MAX_SEG 255
#ifdef __cplusplus
extern "C" {
#endif
// emulate 128-bit integers and arrays
typedef struct { uint64_t x, y; } mm128_t;
typedef struct { size_t n, m; mm128_t *a; } mm128_v;
// minimap2 index
typedef struct {
char *name; // name of the db sequence
uint64_t offset; // offset in mm_idx_t::S
uint32_t len; // length
} mm_idx_seq_t;
typedef struct {
int32_t b, w, k, flag;
uint32_t n_seq; // number of reference sequences
int32_t index;
mm_idx_seq_t *seq; // sequence name, length and offset
uint32_t *S; // 4-bit packed sequence
struct mm_idx_bucket_s *B; // index (hidden)
struct mm_idx_intv_s *I; // intervals (hidden)
void *km, *h;
} mm_idx_t;
// minimap2 alignment
typedef struct {
uint32_t capacity; // the capacity of cigar[]
int32_t dp_score, dp_max, dp_max2; // DP score; score of the max-scoring segment; score of the best alternate mappings
uint32_t n_ambi:30, trans_strand:2; // number of ambiguous bases; transcript strand: 0 for unknown, 1 for +, 2 for -
uint32_t n_cigar; // number of cigar operations in cigar[]
uint32_t cigar[];
} mm_extra_t;
typedef struct {
int32_t id; // ID for internal uses (see also parent below)
int32_t cnt; // number of minimizers; if on the reverse strand
int32_t rid; // reference index; if this is an alignment from inversion rescue
int32_t score; // DP alignment score
int32_t qs, qe, rs, re; // query start and end; reference start and end
int32_t parent, subsc; // parent==id if primary; best alternate mapping score
int32_t as; // offset in the a[] array (for internal uses only)
int32_t mlen, blen; // seeded exact match length; seeded alignment block length
int32_t n_sub; // number of suboptimal mappings
int32_t score0; // initial chaining score (before chain merging/spliting)
uint32_t mapq:8, split:2, rev:1, inv:1, sam_pri:1, proper_frag:1, pe_thru:1, seg_split:1, seg_id:8, split_inv:1, dummy:7;
uint32_t hash;
float div;
mm_extra_t *p;
} mm_reg1_t;
// indexing and mapping options
typedef struct {
short k, w, flag, bucket_bits;
int mini_batch_size;
uint64_t batch_size;
} mm_idxopt_t;
typedef struct {
int64_t flag; // see MM_F_* macros
int seed;
int sdust_thres; // score threshold for SDUST; 0 to disable
int max_qlen; // max query length
int bw; // bandwidth
int max_gap, max_gap_ref; // break a chain if there are no minimizers in a max_gap window
int max_frag_len;
int max_chain_skip, max_chain_iter;
int min_cnt; // min number of minimizers on each chain
int min_chain_score; // min chaining score
float mask_level;
float pri_ratio;
int best_n; // top best_n chains are subjected to DP alignment
int max_join_long, max_join_short;
int min_join_flank_sc;
float min_join_flank_ratio;
int a, b, q, e, q2, e2; // matching score, mismatch, gap-open and gap-ext penalties
int sc_ambi; // score when one or both bases are "N"
int noncan; // cost of non-canonical splicing sites
int junc_bonus;
int zdrop, zdrop_inv; // break alignment if alignment score drops too fast along the diagonal
int end_bonus;
int min_dp_max; // drop an alignment if the score of the max scoring segment is below this threshold
int min_ksw_len;
int anchor_ext_len, anchor_ext_shift;
float max_clip_ratio; // drop an alignment if BOTH ends are clipped above this ratio
int pe_ori, pe_bonus;
float mid_occ_frac; // only used by mm_mapopt_update(); see below
int32_t min_mid_occ;
int32_t mid_occ; // ignore seeds with occurrences above this threshold
int32_t max_occ;
int mini_batch_size; // size of a batch of query bases to process in parallel
int64_t max_sw_mat;
const char *split_prefix;
} mm_mapopt_t;
// index reader
typedef struct {
int is_idx, n_parts;
int64_t idx_size;
mm_idxopt_t opt;
FILE *fp_out;
union {
struct mm_bseq_file_s *seq;
FILE *idx;
} fp;
} mm_idx_reader_t;
// memory buffer for thread-local storage during mapping
typedef struct mm_tbuf_s mm_tbuf_t;
// global variables
extern int mm_verbose, mm_dbg_flag; // verbose level: 0 for no info, 1 for error, 2 for warning, 3 for message (default); debugging flag
extern double mm_realtime0; // wall-clock timer
/**
* Set default or preset parameters
*
* @param preset NULL to set all parameters as default; otherwise apply preset to affected parameters
* @param io pointer to indexing parameters
* @param mo pointer to mapping parameters
*
* @return 0 if success; -1 if _present_ unknown
*/
int mm_set_opt(const char *preset, mm_idxopt_t *io, mm_mapopt_t *mo);
int mm_check_opt(const mm_idxopt_t *io, const mm_mapopt_t *mo);
/**
* Update mm_mapopt_t::mid_occ via mm_mapopt_t::mid_occ_frac
*
* If mm_mapopt_t::mid_occ is 0, this function sets it to a number such that no
* more than mm_mapopt_t::mid_occ_frac of minimizers in the index have a higher
* occurrence.
*
* @param opt mapping parameters
* @param mi minimap2 index
*/
void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi);
void mm_mapopt_max_intron_len(mm_mapopt_t *opt, int max_intron_len);
/**
* Initialize an index reader
*
* @param fn index or fasta/fastq file name (this function tests the file type)
* @param opt indexing parameters
* @param fn_out if not NULL, write built index to this file
*
* @return an index reader on success; NULL if fail to open _fn_
*/
mm_idx_reader_t *mm_idx_reader_open(const char *fn, const mm_idxopt_t *opt, const char *fn_out);
/**
* Read/build an index
*
* If the input file is an index file, this function reads one part of the
* index and returns. If the input file is a sequence file (fasta or fastq),
* this function constructs the index for about mm_idxopt_t::batch_size bases.
* Importantly, for a huge collection of sequences, this function may only
* return an index for part of sequences. It needs to be repeatedly called
* to traverse the entire index/sequence file.
*
* @param r index reader
* @param n_threads number of threads for constructing index
*
* @return an index on success; NULL if reaching the end of the input file
*/
mm_idx_t *mm_idx_reader_read(mm_idx_reader_t *r, int n_threads);
/**
* Destroy/deallocate an index reader
*
* @param r index reader
*/
void mm_idx_reader_close(mm_idx_reader_t *r);
int mm_idx_reader_eof(const mm_idx_reader_t *r);
/**
* Check whether the file contains a minimap2 index
*
* @param fn file name
*
* @return the file size if fn is an index file; 0 if fn is not.
*/
int64_t mm_idx_is_idx(const char *fn);
/**
* Load a part of an index
*
* Given a uni-part index, this function loads the entire index into memory.
* Given a multi-part index, it loads one part only and places the file pointer
* at the end of that part.
*
* @param fp pointer to FILE object
*
* @return minimap2 index read from fp
*/
mm_idx_t *mm_idx_load(FILE *fp);
/**
* Append an index (or one part of a full index) to file
*
* @param fp pointer to FILE object
* @param mi minimap2 index
*/
void mm_idx_dump(FILE *fp, const mm_idx_t *mi);
/**
* Create an index from strings in memory
*
* @param w minimizer window size
* @param k minimizer k-mer size
* @param is_hpc use HPC k-mer if true
* @param bucket_bits number of bits for the first level of the hash table
* @param n number of sequences
* @param seq sequences in A/C/G/T
* @param name sequence names; could be NULL
*
* @return minimap2 index
*/
mm_idx_t *mm_idx_str(int w, int k, int is_hpc, int bucket_bits, int n, const char **seq, const char **name);
/**
* Print index statistics to stderr
*
* @param mi minimap2 index
*/
void mm_idx_stat(const mm_idx_t *idx);
/**
* Destroy/deallocate an index
*
* @param r minimap2 index
*/
void mm_idx_destroy(mm_idx_t *mi);
/**
* Initialize a thread-local buffer for mapping
*
* Each mapping thread requires a buffer specific to the thread (see mm_map()
* below). The primary purpose of this buffer is to reduce frequent heap
* allocations across threads. A buffer shall not be used by two or more
* threads.
*
* @return pointer to a thread-local buffer
*/
mm_tbuf_t *mm_tbuf_init(void);
/**
* Destroy/deallocate a thread-local buffer for mapping
*
* @param b the buffer
*/
void mm_tbuf_destroy(mm_tbuf_t *b);
void *mm_tbuf_get_km(mm_tbuf_t *b);
/**
* Align a query sequence against an index
*
* This function possibly finds multiple alignments of the query sequence.
* The returned array and the mm_reg1_t::p field of each element are allocated
* with malloc().
*
* @param mi minimap2 index
* @param l_seq length of the query sequence
* @param seq the query sequence
* @param n_regs number of hits (out)
* @param b thread-local buffer; two mm_map() calls shall not use one buffer at the same time!
* @param opt mapping parameters
* @param name query name, used for all-vs-all overlapping and debugging
*
* @return an array of hits which need to be deallocated with free() together
* with mm_reg1_t::p of each element. The size is written to _n_regs_.
*/
mm_reg1_t *mm_map(const mm_idx_t *mi, int l_seq, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *name);
void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname);
/**
* Align a fasta/fastq file and print alignments to stdout
*
* @param idx minimap2 index
* @param fn fasta/fastq file name
* @param opt mapping parameters
* @param n_threads number of threads
*
* @return 0 on success; -1 if _fn_ can't be read
*/
int mm_map_file(const mm_idx_t *idx, const char *fn, const mm_mapopt_t *opt, int n_threads);
int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_mapopt_t *opt, int n_threads);
/**
* Generate the cs tag (new in 2.12)
*
* @param km memory blocks; set to NULL if unsure
* @param buf buffer to write the cs/MD tag; typicall NULL on the first call
* @param max_len max length of the buffer; typically set to 0 on the first call
* @param mi index
* @param r alignment
* @param seq query sequence
* @param no_iden true to use : instead of =
*
* @return the length of cs
*/
int mm_gen_cs(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq, int no_iden);
int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_reg1_t *r, const char *seq);
// query sequence name and sequence in the minimap2 index
int mm_idx_index_name(mm_idx_t *mi);
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
int mm_idx_bed_read(mm_idx_t *mi, const char *fn, int read_junc);
int mm_idx_bed_junc(const mm_idx_t *mi, int32_t ctg, int32_t st, int32_t en, uint8_t *s);
// deprecated APIs for backward compatibility
void mm_mapopt_init(mm_mapopt_t *opt);
mm_idx_t *mm_idx_build(const char *fn, int w, int k, int flag, int n_threads);
#ifdef __cplusplus
}
#endif
#endif // MINIMAP2_H
|