Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multipart test #203

Closed
wants to merge 30 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
68b0444
attemp1
hasindu2008 Apr 6, 2018
2e06e03
added multi-part functionality
hasindu2008 Apr 8, 2018
a741e7c
restructuring
hasindu2008 Apr 8, 2018
25d7ed4
use fopen instead of open
hasindu2008 Apr 8, 2018
7b704ce
added functionality to write the structure inside the structure
hasindu2008 Apr 8, 2018
c18bbce
now prints the sam header and corrects the chromosome id
hasindu2008 Apr 9, 2018
5ccba43
mapq and the rest of merging
hasindu2008 Apr 10, 2018
b2dd3f4
memory freeing
hasindu2008 Apr 12, 2018
5bf32d2
some cleaning
hasindu2008 Apr 24, 2018
ebeacee
print err fix
hasindu2008 Apr 24, 2018
ca5e737
sort by score - improve accuracy for paf
hasindu2008 Apr 25, 2018
ea88546
fix memory leaks
hasindu2008 Apr 25, 2018
0312dc9
some cleans
hasindu2008 May 1, 2018
00c15ee
some comments
hasindu2008 May 15, 2018
f76d90b
remove temp files after finished
hasindu2008 Jun 8, 2018
9613240
fixed a bug in multi-prefix error message for multi-segment reads
hasindu2008 Jun 11, 2018
8bc2a83
added support for 64 bit ARM architectures
hasindu2008 Jun 11, 2018
66f4bfb
Merge branch 'aarch64'
hasindu2008 Jun 11, 2018
38016e6
MD string added to mappy
philres Jun 12, 2018
ed8d8c3
arm64 support for mappy
philres Jun 12, 2018
05b268f
Merge branch 'md-string-mappy'
philres Jun 12, 2018
15781ae
Merge remote-tracking branch 'upstream/master'
hasindu2008 Jun 20, 2018
47369af
corrections
hasindu2008 Jun 21, 2018
d173323
Merge remote-tracking branch 'upstream/master'
hasindu2008 Jun 21, 2018
8172acc
fixed a bug due to the longopt number multi-prefix
hasindu2008 Jun 21, 2018
5b75d97
man page entry
hasindu2008 Jun 22, 2018
8d57f49
clean up temp file
hasindu2008 Jun 22, 2018
6b471f9
Change md string decode
philres Jul 3, 2018
7e1e676
Merge remote-tracking branch 'upstream/master'
philres Jul 3, 2018
f50d1da
Merge remote-tracking branch 'multipart/master' into multipart-test
philres Jul 23, 2018
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
CFLAGS= -g -Wall -O2 -Wc++-compat #-Wextra
CPPFLAGS= -DHAVE_KALLOC
INCLUDES=
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o ksw2_ll_sse.o
OBJS= kthread.o kalloc.o misc.o bseq.o sketch.o sdust.o options.o index.o chain.o align.o hit.o map.o format.o pe.o esterr.o ksw2_ll_sse.o merge.o
PROG= minimap2
PROG_EXTRA= sdust minimap2-lite
LIBS= -lm -lz -lpthread
Expand Down Expand Up @@ -116,3 +116,4 @@ options.o: mmpriv.h minimap.h bseq.h
pe.o: mmpriv.h minimap.h bseq.h kvec.h kalloc.h ksort.h
sdust.o: kalloc.h kdq.h kvec.h sdust.h
sketch.o: kvec.h kalloc.h mmpriv.h minimap.h bseq.h
merge.o: merge.h
42 changes: 41 additions & 1 deletion format.c
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,36 @@ static void write_cs_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
}

static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp)
//static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp)
//{
// int i, q_off, t_off, l_MD = 0;
// mm_sprintf_lite(s, "\tMD:Z:");
// for (i = q_off = t_off = 0; i < r->p->n_cigar; ++i) {
// int j, op = r->p->cigar[i]&0xf, len = r->p->cigar[i]>>4;
// assert(op >= 0 && op <= 2); // introns (aka reference skips) are not supported
// if (op == 0) { // match
// for (j = 0; j < len; ++j) {
// if (qseq[q_off + j] != tseq[t_off + j]) {
// mm_sprintf_lite(s, "%d%c", l_MD, "ACGTN"[tseq[t_off + j]]);
// l_MD = 0;
// } else ++l_MD;
// }
// q_off += len, t_off += len;
// } else if (op == 1) { // insertion to ref
// q_off += len;
// } else if (op == 2) { // deletion from ref
// for (j = 0, tmp[len] = 0; j < len; ++j)
// tmp[j] = "ACGTN"[tseq[t_off + j]];
// mm_sprintf_lite(s, "%d^%s", l_MD, tmp);
// l_MD = 0;
// t_off += len;
// }
// }
// if (l_MD > 0) mm_sprintf_lite(s, "%d", l_MD);
// assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
//}

static void write_MD_string(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp)
{
int i, q_off, t_off, l_MD = 0;
mm_sprintf_lite(s, "\tMD:Z:");
Expand Down Expand Up @@ -210,6 +239,12 @@ static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq
assert(t_off == r->re - r->rs && q_off == r->qe - r->qs);
}

static void write_MD_core(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp)
{
mm_sprintf_lite(s, "\tMD:Z:");
write_MD_string(s, tseq, qseq, r, tmp);
}

static void write_cs_or_MD(void *km, kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const mm_reg1_t *r, int no_iden, int is_MD)
{
extern unsigned char seq_nt4_table[256];
Expand Down Expand Up @@ -490,3 +525,8 @@ void mm_write_sam(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
if (r == &regs[i]) break;
mm_write_sam2(s, mi, t, 0, i, 1, &n_regs, &regs, NULL, 0);
}

void write_MD(kstring_t *s, const uint8_t *tseq, const uint8_t *qseq, const mm_reg1_t *r, char *tmp)
{
write_MD_string(s, tseq, qseq, r, tmp);
}
16 changes: 15 additions & 1 deletion main.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "bseq.h"
#include "minimap.h"
#include "mmpriv.h"
#include "merge.h"
#ifdef HAVE_GETOPT
#include <getopt.h>
#else
Expand Down Expand Up @@ -60,6 +61,7 @@ static struct option long_options[] = {
{ "lj-min-ratio", required_argument, 0, 0 }, // 30
{ "score-N", required_argument, 0, 0 }, // 31
{ "eqx", no_argument, 0, 0 }, // 32
{ "multi-prefix", required_argument, 0, 0 }, // 33
{ "help", no_argument, 0, 'h' },
{ "max-intron-len", required_argument, 0, 'G' },
{ "version", no_argument, 0, 'V' },
Expand Down Expand Up @@ -105,6 +107,7 @@ int main(int argc, char *argv[])
FILE *fp_help = stderr;
mm_idx_reader_t *idx_rdr;
mm_idx_t *mi;
int32_t idx_id=0;

mm_verbose = 3;
liftrlimit();
Expand Down Expand Up @@ -232,7 +235,11 @@ int main(int argc, char *argv[])
} else if (c == 'E') {
opt.e = opt.e2 = strtol(optarg, &s, 10);
if (*s == ',') opt.e2 = strtol(s + 1, &s, 10);
} else if (c==0 && long_idx == 33) { //multi-part
fprintf(stderr, "[WARNING]\033[1;31m option --multi-prefix is experimental. Currently works only with uni-segment reads.\033[0m\n");
opt.multi_prefix=optarg;
}

}
if ((opt.flag & MM_F_SPLICE) && (opt.flag & MM_F_FRAG_MODE)) {
fprintf(stderr, "[ERROR]\033[1;31m --splice and --frag should not be specified at the same time.\033[0m\n");
Expand Down Expand Up @@ -313,14 +320,18 @@ int main(int argc, char *argv[])
}
if (opt.best_n == 0 && (opt.flag&MM_F_CIGAR) && mm_verbose >= 2)
fprintf(stderr, "[WARNING]\033[1;31m `-N 0' reduces alignment accuracy. Please use --secondary=no to suppress secondary alignments.\033[0m\n");
if ((opt.multi_prefix!=NULL) && (argc - (optind + 1) > 1)) {
fprintf(stderr,"[ERROR]\033[1;31m --multi-prefix is not yet implemented for multi-segment reads\033[0m\n");
return 1;
}
while ((mi = mm_idx_reader_read(idx_rdr, n_threads)) != 0) {
if ((opt.flag & MM_F_CIGAR) && (mi->flag & MM_I_NO_SEQ)) {
fprintf(stderr, "[ERROR] the prebuilt index doesn't contain sequences.\n");
mm_idx_destroy(mi);
mm_idx_reader_close(idx_rdr);
return 1;
}
if ((opt.flag & MM_F_OUT_SAM) && idx_rdr->n_parts == 1) {
if ((opt.flag & MM_F_OUT_SAM) && (idx_rdr->n_parts == 1) && (opt.multi_prefix==NULL)) {
if (mm_idx_reader_eof(idx_rdr)) {
mm_write_sam_hdr(mi, rg, MM_VERSION, argc, argv);
} else {
Expand All @@ -334,16 +345,19 @@ int main(int argc, char *argv[])
__func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), mi->n_seq);
if (argc != optind + 1) mm_mapopt_update(&opt, mi);
if (mm_verbose >= 3) mm_idx_stat(mi);
mi->idx_id=idx_id;
if (!(opt.flag & MM_F_FRAG_MODE)) {
for (i = optind + 1; i < argc; ++i)
mm_map_file(mi, argv[i], &opt, n_threads);
} else {
mm_map_file_frag(mi, argc - (optind + 1), (const char**)&argv[optind + 1], &opt, n_threads);
}
mm_idx_destroy(mi);
idx_id++;
}
mm_idx_reader_close(idx_rdr);

if(opt.multi_prefix!=NULL) merge(&opt,&ipt,idx_id,(const char**)&argv[optind + 1], argc, argv, rg);
if (fflush(stdout) == EOF) {
fprintf(stderr, "[ERROR] failed to write the results\n");
exit(EXIT_FAILURE);
Expand Down
45 changes: 35 additions & 10 deletions map.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "mmpriv.h"
#include "bseq.h"
#include "khash.h"
#include "merge.h"

struct mm_tbuf_s {
void *km;
Expand Down Expand Up @@ -262,7 +263,7 @@ static mm_reg1_t *align_regs(const mm_mapopt_t *opt, const mm_idx_t *mi, void *k
return regs;
}

void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
int mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **seqs, int *n_regs, mm_reg1_t **regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
{
int i, j, rep_len, qlen_sum, n_regs0, n_mini_pos;
int max_chain_gap_qry, max_chain_gap_ref, is_splice = !!(opt->flag & MM_F_SPLICE), is_sr = !!(opt->flag & MM_F_SR);
Expand All @@ -277,7 +278,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
for (i = 0, qlen_sum = 0; i < n_segs; ++i)
qlen_sum += qlens[i], n_regs[i] = 0, regs[i] = 0;

if (qlen_sum == 0 || n_segs <= 0 || n_segs > MM_MAX_SEG) return;
if (qlen_sum == 0 || n_segs <= 0 || n_segs > MM_MAX_SEG) return 0;

hash = qname? __ac_X31_hash_string(qname) : 0;
hash ^= __ac_Wang_hash(qlen_sum) + __ac_Wang_hash(opt->seed);
Expand Down Expand Up @@ -375,6 +376,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char **
b->km = km_init();
}
}
return rep_len;
}

mm_reg1_t *mm_map(const mm_idx_t *mi, int qlen, const char *seq, int *n_regs, mm_tbuf_t *b, const mm_mapopt_t *opt, const char *qname)
Expand All @@ -394,6 +396,7 @@ typedef struct {
mm_bseq_file_t **fp;
const mm_idx_t *mi;
kstring_t str;
FILE* multipart_fd;
} pipeline_t;

typedef struct {
Expand All @@ -403,6 +406,7 @@ typedef struct {
int *n_reg, *seg_off, *n_seg;
mm_reg1_t **reg;
mm_tbuf_t **buf;
int *replen;
} step_t;

static void worker_for(void *_data, long i, int tid) // kt_for() callback
Expand All @@ -422,9 +426,9 @@ static void worker_for(void *_data, long i, int tid) // kt_for() callback
}
if (s->p->opt->flag & MM_F_INDEPEND_SEG) {
for (j = 0; j < s->n_seg[i]; ++j)
mm_map_frag(s->p->mi, 1, &qlens[j], &qseqs[j], &s->n_reg[off+j], &s->reg[off+j], b, s->p->opt, s->seq[off+j].name);
s->replen[i]=mm_map_frag(s->p->mi, 1, &qlens[j], &qseqs[j], &s->n_reg[off+j], &s->reg[off+j], b, s->p->opt, s->seq[off+j].name);
} else {
mm_map_frag(s->p->mi, s->n_seg[i], qlens, qseqs, &s->n_reg[off], &s->reg[off], b, s->p->opt, s->seq[off].name);
s->replen[i]=mm_map_frag(s->p->mi, s->n_seg[i], qlens, qseqs, &s->n_reg[off], &s->reg[off], b, s->p->opt, s->seq[off].name);
}
for (j = 0; j < s->n_seg[i]; ++j) // flip the query strand and coordinate to the original read strand
if (s->n_seg[i] == 2 && ((j == 0 && (pe_ori>>1&1)) || (j == 1 && (pe_ori&1)))) {
Expand Down Expand Up @@ -463,6 +467,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
s->seg_off = s->n_reg + s->n_seq; // seg_off and n_seg are allocated together with n_reg
s->n_seg = s->seg_off + s->n_seq;
s->reg = (mm_reg1_t**)calloc(s->n_seq, sizeof(mm_reg1_t*));
s->replen = (int *)calloc(s->n_seq, sizeof(int));
for (i = 1, j = 0; i <= s->n_seq; ++i)
if (i == s->n_seq || !frag_mode || !mm_qname_same(s->seq[i-1].name, s->seq[i].name)) {
s->n_seg[s->n_frag] = i - j;
Expand All @@ -485,18 +490,35 @@ static void *worker_pipeline(void *shared, int step, void *in)
int seg_st = s->seg_off[k], seg_en = s->seg_off[k] + s->n_seg[k];
for (i = seg_st; i < seg_en; ++i) {
mm_bseq1_t *t = &s->seq[i];
if(p->opt->multi_prefix!=NULL){
multipart_write(p->multipart_fd,&(s->n_reg[i]),sizeof(s->n_reg[i]),1);
multipart_write(p->multipart_fd,&(s->replen[i]),sizeof(int),1);
//fprintf(stderr,"n regs %d\treplen %d\n",s->n_reg[i],s->replen[i]);
//fprintf(stderr,"replen original %d\n",s->replen[i]);
}
for (j = 0; j < s->n_reg[i]; ++j) {
mm_reg1_t *r = &s->reg[i][j];
if(p->opt->multi_prefix!=NULL) {
multipart_write(p->multipart_fd,r,sizeof(mm_reg1_t),1);
if(p->opt->flag & MM_F_CIGAR){
multipart_write(p->multipart_fd,&(r->p->capacity),sizeof(uint32_t),1);
multipart_write(p->multipart_fd,r->p,sizeof(mm_extra_t)+sizeof(uint32_t)*r->p->capacity,1);
}
//fprintf(stderr,"REGDETAIL id %d,cnt %d,rid %d,score %d,qs %d, qe %d, rs %d, re %d, parent %d, subsc %d, as %d, mlen %d, blen %d, n_sub %d, score0 %d, hash %u, div %f,",r->id,r->cnt,r->rid,r->score,r->qs,r->qe, r->rs, r->re,r->parent, r->subsc,r->as,r->mlen, r->blen,r->n_sub,r->score0,r->hash,r->div);
//fprintf(stderr,"dp_score %d, dp_max %d, dp_max2 %d \n",r->p->dp_score, r->p->dp_max, r->p->dp_max2);
}
assert(!r->sam_pri || r->id == r->parent);
if ((p->opt->flag & MM_F_NO_PRINT_2ND) && r->id != r->parent)
continue;
if (p->opt->flag & MM_F_OUT_SAM)
mm_write_sam2(&p->str, mi, t, i - seg_st, j, s->n_seg[k], &s->n_reg[seg_st], (const mm_reg1_t*const*)&s->reg[seg_st], km, p->opt->flag);
else
mm_write_paf(&p->str, mi, t, r, km, p->opt->flag);
mm_err_puts(p->str.s);
if (p->opt->multi_prefix==NULL){
if (p->opt->flag & MM_F_OUT_SAM)
mm_write_sam2(&p->str, mi, t, i - seg_st, j, s->n_seg[k], &s->n_reg[seg_st], (const mm_reg1_t*const*)&s->reg[seg_st], km, p->opt->flag);
else
mm_write_paf(&p->str, mi, t, r, km, p->opt->flag);
mm_err_puts(p->str.s);
}
}
if (s->n_reg[i] == 0 && (p->opt->flag & MM_F_OUT_SAM)) { // write an unmapped record
if (s->n_reg[i] == 0 && (p->opt->flag & MM_F_OUT_SAM) && (p->opt->multi_prefix==NULL)) { // write an unmapped record
mm_write_sam2(&p->str, mi, t, i - seg_st, -1, s->n_seg[k], &s->n_reg[seg_st], (const mm_reg1_t*const*)&s->reg[seg_st], km, p->opt->flag);
mm_err_puts(p->str.s);
}
Expand All @@ -509,6 +531,7 @@ static void *worker_pipeline(void *shared, int step, void *in)
}
}
free(s->reg); free(s->n_reg); free(s->seq); // seg_off and n_seg were allocated with reg; no memory leak here
free(s->replen);
km_destroy(km);
if (mm_verbose >= 3)
fprintf(stderr, "[M::%s::%.3f*%.2f] mapped %d sequences\n", __func__, realtime() - mm_realtime0, cputime() / (realtime() - mm_realtime0), s->n_seq);
Expand Down Expand Up @@ -536,6 +559,7 @@ int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_
return -1;
}
}
if(opt->multi_prefix!=NULL) pl.multipart_fd=multipart_init(opt,idx);
pl.opt = opt, pl.mi = idx;
pl.n_threads = n_threads > 1? n_threads : 1;
pl.mini_batch_size = opt->mini_batch_size;
Expand All @@ -544,6 +568,7 @@ int mm_map_file_frag(const mm_idx_t *idx, int n_segs, const char **fn, const mm_
free(pl.str.s);
for (i = 0; i < n_segs; ++i)
mm_bseq_close(pl.fp[i]);
if(opt->multi_prefix!=NULL) multipart_close(pl.multipart_fd);
free(pl.fp);
return 0;
}
Expand Down
Loading