1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211
|
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "kvec.h"
#include "sys.h"
#include "paf.h"
#include "sdict.h"
#include "miniasm.h"
#define MA_VERSION "0.3-r179"
static void print_subs(const sdict_t *d, const ma_sub_t *sub)
{
uint32_t i;
for (i = 0; i < d->n_seq; ++i)
if (!d->seq[i].del && sub[i].s != sub[i].e)
printf("%s\t%d\t%d\n", d->seq[i].name, sub[i].s, sub[i].e);
}
static void print_hits(size_t n_hits, const ma_hit_t *hit, const sdict_t *d, const ma_sub_t *sub)
{
size_t i;
for (i = 0; i < n_hits; ++i) {
const ma_hit_t *p = &hit[i];
const ma_sub_t *rq = &sub[p->qns>>32], *rt = &sub[p->tn];
printf("%s:%d-%d\t%d\t%d\t%d\t%c\t%s:%d-%d\t%d\t%d\t%d\t%d\t%d\t255\n", d->seq[p->qns>>32].name, rq->s + 1, rq->e, rq->e - rq->s, (uint32_t)p->qns, p->qe,
"+-"[p->rev], d->seq[p->tn].name, rt->s + 1, rt->e, rt->e - rt->s, p->ts, p->te, p->ml, p->bl);
}
}
int main(int argc, char *argv[])
{
ma_opt_t opt;
int i, c, stage = 100, no_first = 0, no_second = 0, bi_dir = 1, o_set = 0, no_cont = 0;
sdict_t *d, *excl = 0;
ma_sub_t *sub = 0;
ma_hit_t *hit;
size_t n_hits;
float cov = 40.0;
char *fn_reads = 0, *outfmt = "ug";
ma_opt_init(&opt);
while ((c = getopt(argc, argv, "n:m:s:c:S:i:d:g:o:h:I:r:f:e:p:12VBRbF:")) >= 0) {
if (c == 'm') opt.min_match = atoi(optarg);
else if (c == 'i') opt.min_iden = atof(optarg);
else if (c == 's') opt.min_span = atoi(optarg);
else if (c == 'c') opt.min_dp = atoi(optarg);
else if (c == 'o') opt.min_ovlp = atoi(optarg), o_set = 1;
else if (c == 'S') stage = atoi(optarg);
else if (c == 'd') opt.bub_dist = atoi(optarg);
else if (c == 'g') opt.gap_fuzz = atoi(optarg);
else if (c == 'h') opt.max_hang = atoi(optarg);
else if (c == 'I') opt.int_frac = atof(optarg);
else if (c == 'e') opt.max_ext = atoi(optarg);
else if (c == 'f') fn_reads = optarg;
else if (c == 'p') outfmt = optarg;
else if (c == '1') no_first = 1;
else if (c == '2') no_second = 1;
else if (c == 'n') opt.n_rounds = atoi(optarg) - 1;
else if (c == 'B') bi_dir = 1;
else if (c == 'b') bi_dir = 0;
else if (c == 'R') no_cont = 1;
else if (c == 'F') opt.final_ovlp_drop_ratio = atof(optarg);
else if (c == 'V') {
printf("%s\n", MA_VERSION);
return 0;
} else if (c == 'r') {
char *s;
opt.max_ovlp_drop_ratio = strtod(optarg, &s);
if (*s == ',') opt.min_ovlp_drop_ratio = strtod(s + 1, &s);
}
}
if (o_set == 0) opt.min_ovlp = opt.min_span;
if (argc == optind) {
fprintf(stderr, "Usage: miniasm [options] <in.paf>\n");
fprintf(stderr, "Options:\n");
fprintf(stderr, " Pre-selection:\n");
fprintf(stderr, " -R prefilter clearly contained reads (2-pass required)\n");
fprintf(stderr, " -m INT min match length [%d]\n", opt.min_match);
fprintf(stderr, " -i FLOAT min identity [%.2g]\n", opt.min_iden);
fprintf(stderr, " -s INT min span [%d]\n", opt.min_span);
fprintf(stderr, " -c INT min coverage [%d]\n", opt.min_dp);
fprintf(stderr, " Overlap:\n");
fprintf(stderr, " -o INT min overlap [same as -s]\n");
fprintf(stderr, " -h INT max over hang length [%d]\n", opt.max_hang);
fprintf(stderr, " -I FLOAT min end-to-end match ratio [%.2g]\n", opt.int_frac);
fprintf(stderr, " Layout:\n");
fprintf(stderr, " -g INT max gap differences between reads for trans-reduction [%d]\n", opt.gap_fuzz);
fprintf(stderr, " -d INT max distance for bubble popping [%d]\n", opt.bub_dist);
fprintf(stderr, " -e INT small unitig threshold [%d]\n", opt.max_ext);
fprintf(stderr, " -f FILE read sequences []\n");
fprintf(stderr, " -n INT rounds of short overlap removal [%d]\n", opt.n_rounds + 1);
fprintf(stderr, " -r FLOAT[,FLOAT]\n");
fprintf(stderr, " max and min overlap drop ratio [%.2g,%.2g]\n", opt.max_ovlp_drop_ratio, opt.min_ovlp_drop_ratio);
fprintf(stderr, " -F FLOAT aggressive overlap drop ratio in the end [%.2g]\n", opt.final_ovlp_drop_ratio);
fprintf(stderr, " Miscellaneous:\n");
fprintf(stderr, " -p STR output information: bed, paf, sg or ug [%s]\n", outfmt);
// fprintf(stderr, " -B only one direction of an arc is present in input PAF\n"); // deprecated; for backward compatibility
fprintf(stderr, " -b both directions of an arc are present in input\n");
fprintf(stderr, " -1 skip 1-pass read selection\n");
fprintf(stderr, " -2 skip 2-pass read selection\n");
fprintf(stderr, " -V print version number\n");
fprintf(stderr, "\nSee miniasm.1 for detailed description of the command-line options.\n");
return 1;
}
sys_init();
d = sd_init();
if (no_cont) {
fprintf(stderr, "[M::%s] ===> Step 0: removing contained reads <===\n", __func__);
excl = ma_hit_no_cont(argv[optind], opt.min_span, opt.min_match, opt.max_hang, opt.int_frac);
}
fprintf(stderr, "[M::%s] ===> Step 1: reading read mappings <===\n", __func__);
hit = ma_hit_read(argv[optind], opt.min_span, opt.min_match, d, &n_hits, bi_dir, excl);
if (!no_first) {
fprintf(stderr, "[M::%s] ===> Step 2: 1-pass (crude) read selection <===\n", __func__);
if (stage >= 2) {
sub = ma_hit_sub(opt.min_dp, opt.min_iden, 0, n_hits, hit, d->n_seq);
n_hits = ma_hit_cut(sub, opt.min_span, n_hits, hit);
}
if (stage >= 3) n_hits = ma_hit_flt(sub, opt.max_hang * 1.5, opt.min_ovlp * .5, n_hits, hit, &cov);
}
if (!no_second) {
fprintf(stderr, "[M::%s] ===> Step 3: 2-pass (fine) read selection <===\n", __func__);
if (stage >= 4) {
ma_sub_t *sub2;
sub2 = ma_hit_sub(opt.min_dp, opt.min_iden, opt.min_span/2, n_hits, hit, d->n_seq);
n_hits = ma_hit_cut(sub2, opt.min_span, n_hits, hit);
if (!no_first) {
ma_sub_merge(d->n_seq, sub, sub2);
free(sub2);
} else {
sub = sub2;
}
}
if (stage >= 5) n_hits = ma_hit_contained(&opt, d, sub, n_hits, hit);
}
hit = (ma_hit_t*)realloc(hit, n_hits * sizeof(ma_hit_t));
if (strcmp(outfmt, "bed") == 0) {
print_subs(d, sub);
} else if (strcmp(outfmt, "paf") == 0) {
print_hits(n_hits, hit, d, sub);
} else if (strcmp(outfmt, "ug") == 0 || strcmp(outfmt, "sg") == 0) {
asg_t *sg = 0;
ma_ug_t *ug = 0;
fprintf(stderr, "[M::%s] ===> Step 4: graph cleaning <===\n", __func__);
sg = ma_sg_gen(&opt, d, sub, n_hits, hit);
if (stage >= 6) {
fprintf(stderr, "[M::%s] ===> Step 4.1: transitive reduction <===\n", __func__);
asg_arc_del_trans(sg, opt.gap_fuzz);
}
if (stage >= 7) {
fprintf(stderr, "[M::%s] ===> Step 4.2: initial tip cutting and bubble popping <===\n", __func__);
asg_cut_tip(sg, opt.max_ext);
asg_pop_bubble(sg, opt.bub_dist);
}
if (stage >= 9) {
fprintf(stderr, "[M::%s] ===> Step 4.3: cutting short overlaps (%d rounds in total) <===\n", __func__, opt.n_rounds + 1);
for (i = 0; i <= opt.n_rounds; ++i) {
float r = opt.min_ovlp_drop_ratio + (opt.max_ovlp_drop_ratio - opt.min_ovlp_drop_ratio) / opt.n_rounds * i;
if (asg_arc_del_short(sg, r) != 0) {
asg_cut_tip(sg, opt.max_ext);
asg_pop_bubble(sg, opt.bub_dist);
}
}
}
if (stage >= 10) {
fprintf(stderr, "[M::%s] ===> Step 4.4: removing short internal sequences and bi-loops <===\n", __func__);
asg_cut_internal(sg, 1);
asg_cut_biloop(sg, opt.max_ext);
asg_cut_tip(sg, opt.max_ext);
asg_pop_bubble(sg, opt.bub_dist);
}
if (stage >= 11) {
fprintf(stderr, "[M::%s] ===> Step 4.5: aggressively cutting short overlaps <===\n", __func__);
if (asg_arc_del_short(sg, opt.final_ovlp_drop_ratio) != 0) {
asg_cut_tip(sg, opt.max_ext);
asg_pop_bubble(sg, opt.bub_dist);
}
}
if (strcmp(outfmt, "ug") == 0) {
fprintf(stderr, "[M::%s] ===> Step 5: generating unitigs <===\n", __func__);
ug = ma_ug_gen(sg);
if (fn_reads) ma_ug_seq(ug, d, sub, fn_reads);
ma_ug_print(ug, d, sub, stdout);
} else ma_sg_print(sg, d, sub, stdout);
asg_destroy(sg);
ma_ug_destroy(ug);
}
free(sub); free(hit);
sd_destroy(d);
if (excl) sd_destroy(excl);
fprintf(stderr, "[M::%s] Version: %s\n", __func__, MA_VERSION);
fprintf(stderr, "[M::%s] CMD:", __func__);
for (i = 0; i < argc; ++i)
fprintf(stderr, " %s", argv[i]);
fprintf(stderr, "\n[M::%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, sys_realtime(), sys_cputime());
return 0;
}
|