Skip to content

Commit

Permalink
r752: option to copy comments to output (#136)
Browse files Browse the repository at this point in the history
  • Loading branch information
lh3 committed Mar 23, 2018
1 parent 8766d28 commit 08bd212
Show file tree
Hide file tree
Showing 8 changed files with 37 additions and 12 deletions.
23 changes: 17 additions & 6 deletions bseq.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ static inline char *kstrdup(const kstring_t *s)
return t;
}

static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual)
static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual, int with_comment)
{
int i;
s->name = kstrdup(&ks->name);
Expand All @@ -71,10 +71,11 @@ static inline void kseq2bseq(kseq_t *ks, mm_bseq1_t *s, int with_qual)
if (s->seq[i] == 'u' || s->seq[i] == 'U')
--s->seq[i];
s->qual = with_qual && ks->qual.l? kstrdup(&ks->qual) : 0;
s->comment = with_comment && ks->comment.l? kstrdup(&ks->comment) : 0;
s->l_seq = ks->seq.l;
}

mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int chunk_size, int with_qual, int frag_mode, int *n_)
mm_bseq1_t *mm_bseq_read3(mm_bseq_file_t *fp, int chunk_size, int with_qual, int with_comment, int frag_mode, int *n_)
{
int64_t size = 0;
kvec_t(mm_bseq1_t) a = {0,0,0};
Expand All @@ -91,12 +92,12 @@ mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int chunk_size, int with_qual, int
assert(ks->seq.l <= INT32_MAX);
if (a.m == 0) kv_resize(mm_bseq1_t, 0, a, 256);
kv_pushp(mm_bseq1_t, 0, a, &s);
kseq2bseq(ks, s, with_qual);
kseq2bseq(ks, s, with_qual, with_comment);
size += s->l_seq;
if (size >= chunk_size) {
if (frag_mode && a.a[a.n-1].l_seq < CHECK_PAIR_THRES) {
while (kseq_read(ks) >= 0) {
kseq2bseq(ks, &fp->s, with_qual);
kseq2bseq(ks, &fp->s, with_qual, with_comment);
if (mm_qname_same(fp->s.name, a.a[a.n-1].name)) {
kv_push(mm_bseq1_t, 0, a, fp->s);
memset(&fp->s, 0, sizeof(mm_bseq1_t));
Expand All @@ -110,12 +111,17 @@ mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int chunk_size, int with_qual, int
return a.a;
}

mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int chunk_size, int with_qual, int frag_mode, int *n_)
{
return mm_bseq_read3(fp, chunk_size, with_qual, 0, frag_mode, n_);
}

mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int chunk_size, int with_qual, int *n_)
{
return mm_bseq_read2(fp, chunk_size, with_qual, 0, n_);
}

mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int chunk_size, int with_qual, int *n_)
mm_bseq1_t *mm_bseq_read_frag2(int n_fp, mm_bseq_file_t **fp, int chunk_size, int with_qual, int with_comment, int *n_)
{
int i;
int64_t size = 0;
Expand All @@ -136,7 +142,7 @@ mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int chunk_size, int
for (i = 0; i < n_fp; ++i) {
mm_bseq1_t *s;
kv_pushp(mm_bseq1_t, 0, a, &s);
kseq2bseq(fp[i]->ks, s, with_qual);
kseq2bseq(fp[i]->ks, s, with_qual, with_comment);
size += s->l_seq;
}
if (size >= chunk_size) break;
Expand All @@ -145,6 +151,11 @@ mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int chunk_size, int
return a.a;
}

mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int chunk_size, int with_qual, int *n_)
{
return mm_bseq_read_frag2(n_fp, fp, chunk_size, with_qual, 0, n_);
}

int mm_bseq_eof(mm_bseq_file_t *fp)
{
return (ks_eof(fp->ks->f) && fp->s.seq == 0);
Expand Down
4 changes: 3 additions & 1 deletion bseq.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@ typedef struct mm_bseq_file_s mm_bseq_file_t;

typedef struct {
int l_seq, rid;
char *name, *seq, *qual;
char *name, *seq, *qual, *comment;
} mm_bseq1_t;

mm_bseq_file_t *mm_bseq_open(const char *fn);
void mm_bseq_close(mm_bseq_file_t *fp);
mm_bseq1_t *mm_bseq_read3(mm_bseq_file_t *fp, int chunk_size, int with_qual, int with_comment, int frag_mode, int *n_);
mm_bseq1_t *mm_bseq_read2(mm_bseq_file_t *fp, int chunk_size, int with_qual, int frag_mode, int *n_);
mm_bseq1_t *mm_bseq_read(mm_bseq_file_t *fp, int chunk_size, int with_qual, int *n_);
mm_bseq1_t *mm_bseq_read_frag2(int n_fp, mm_bseq_file_t **fp, int chunk_size, int with_qual, int with_comment, int *n_);
mm_bseq1_t *mm_bseq_read_frag(int n_fp, mm_bseq_file_t **fp, int chunk_size, int with_qual, int *n_);
int mm_bseq_eof(mm_bseq_file_t *fp);

Expand Down
5 changes: 5 additions & 0 deletions format.c
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,8 @@ void mm_write_paf(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, const m
}
if (r->p && (opt_flag & (MM_F_OUT_CS|MM_F_OUT_MD)))
write_cs_or_MD(km, s, mi, t, r, !(opt_flag&MM_F_OUT_CS_LONG), opt_flag&MM_F_OUT_MD);
if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
mm_sprintf_lite(s, "\t%s", t->comment);
}

static void sam_write_sq(kstring_t *s, char *seq, int l, int rev, int comp)
Expand Down Expand Up @@ -475,6 +477,9 @@ void mm_write_sam2(kstring_t *s, const mm_idx_t *mi, const mm_bseq1_t *t, int se
write_sam_cigar(s, flag, 1, t->l_seq, r, opt_flag);
}

if ((opt_flag & MM_F_COPY_COMMENT) && t->comment)
mm_sprintf_lite(s, "\t%s", t->comment);

s->s[s->l] = 0; // we always have room for an extra byte (see str_enlarge)
}

Expand Down
6 changes: 4 additions & 2 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include "getopt.h"
#endif

#define MM_VERSION "2.9-r751-dirty"
#define MM_VERSION "2.9-r752-dirty"

#ifdef __linux__
#include <sys/resource.h>
Expand Down Expand Up @@ -94,7 +94,7 @@ static inline void yes_or_no(mm_mapopt_t *opt, int flag, int long_idx, const cha

int main(int argc, char *argv[])
{
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:";
const char *opt_str = "2aSDw:k:K:t:r:f:Vv:g:G:I:d:XT:s:x:Hcp:M:n:z:A:B:O:E:m:N:Qu:R:hF:LC:y";
mm_mapopt_t opt;
mm_idxopt_t ipt;
int i, c, n_threads = 3, long_idx;
Expand Down Expand Up @@ -140,6 +140,7 @@ int main(int argc, char *argv[])
else if (c == 'Q') opt.flag |= MM_F_NO_QUAL;
else if (c == 'Y') opt.flag |= MM_F_SOFTCLIP;
else if (c == 'L') opt.flag |= MM_F_LONG_CIGAR;
else if (c == 'y') opt.flag |= MM_F_COPY_COMMENT;
else if (c == 'T') opt.sdust_thres = atoi(optarg);
else if (c == 'n') opt.min_cnt = atoi(optarg);
else if (c == 'm') opt.min_chain_score = atoi(optarg);
Expand Down Expand Up @@ -272,6 +273,7 @@ int main(int argc, char *argv[])
fprintf(fp_help, " -R STR SAM read group line in a format like '@RG\\tID:foo\\tSM:bar' []\n");
fprintf(fp_help, " -c output CIGAR in PAF\n");
fprintf(fp_help, " --cs[=STR] output the cs tag; STR is 'short' (if absent) or 'long' [none]\n");
fprintf(fp_help, " --MD output the MD tag\n");
fprintf(fp_help, " -Y use soft clipping for supplementary alignments\n");
fprintf(fp_help, " -t INT number of threads [%d]\n", n_threads);
fprintf(fp_help, " -K NUM minibatch size for mapping [500M]\n");
Expand Down
5 changes: 3 additions & 2 deletions map.c
Original file line number Diff line number Diff line change
Expand Up @@ -442,11 +442,12 @@ static void *worker_pipeline(void *shared, int step, void *in)
pipeline_t *p = (pipeline_t*)shared;
if (step == 0) { // step 0: read sequences
int with_qual = (!!(p->opt->flag & MM_F_OUT_SAM) && !(p->opt->flag & MM_F_NO_QUAL));
int with_comment = !!(p->opt->flag & MM_F_COPY_COMMENT);
int frag_mode = (p->n_fp > 1 || !!(p->opt->flag & MM_F_FRAG_MODE));
step_t *s;
s = (step_t*)calloc(1, sizeof(step_t));
if (p->n_fp > 1) s->seq = mm_bseq_read_frag(p->n_fp, p->fp, p->mini_batch_size, with_qual, &s->n_seq);
else s->seq = mm_bseq_read2(p->fp[0], p->mini_batch_size, with_qual, frag_mode, &s->n_seq);
if (p->n_fp > 1) s->seq = mm_bseq_read_frag2(p->n_fp, p->fp, p->mini_batch_size, with_qual, with_comment, &s->n_seq);
else s->seq = mm_bseq_read3(p->fp[0], p->mini_batch_size, with_qual, with_comment, frag_mode, &s->n_seq);
if (s->seq) {
s->p = p;
for (i = 0; i < s->n_seq; ++i)
Expand Down
1 change: 1 addition & 0 deletions minimap.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
#define MM_F_HEAP_SORT 0x400000
#define MM_F_ALL_CHAINS 0x800000
#define MM_F_OUT_MD 0x1000000
#define MM_F_COPY_COMMENT 0x2000000

#define MM_I_HPC 0x1
#define MM_I_NO_SEQ 0x2
Expand Down
3 changes: 3 additions & 0 deletions minimap2.1
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,9 @@ SAM read group line in a format like
.B @RG\\\\tID:foo\\\\tSM:bar
[].
.TP
.B -y
Copy input FASTA/Q comments to output.
.TP
.B -c
Generate CIGAR. In PAF, the CIGAR is written to the `cg' custom tag.
.TP
Expand Down
2 changes: 1 addition & 1 deletion test/MT-orang.fa
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
>MT_orang
>MT_orang co:Z:comment
GTTTATGTAGCTTATTCTATCCAAAGCAATGCACTGAAAATGTCTCGACGGGCCCACACG
CCCCATAAACAAATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTGAGGTTACACAT
GCAAGCATCCCCGCCCCAGTGAGTCGCCCTCCAAGTCACTCTGACTAAGAGGAGCAAGCA
Expand Down

0 comments on commit 08bd212

Please sign in to comment.