Skip to content

Commit

Permalink
Merge branch 'ar/dmr-dU' into 'master'
Browse files Browse the repository at this point in the history
[dmr] Add `--assign-code`

See merge request machine-learning/modkit!194
  • Loading branch information
ArtRand committed Jun 22, 2024
2 parents e344ce4 + ea1ead2 commit 67a2eea
Show file tree
Hide file tree
Showing 10 changed files with 196 additions and 64 deletions.
27 changes: 13 additions & 14 deletions src/dmr/bedmethyl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,7 @@ use rustc_hash::{FxHashMap, FxHashSet};

use crate::dmr::llr_model::AggregatedCounts;
use crate::genome_positions::StrandedPosition;
use crate::mod_base_code::{
DnaBase, ModCodeRepr, MOD_CODE_TO_DNA_BASE, SUPPORTED_CODES,
};
use crate::mod_base_code::{DnaBase, ModCodeRepr};
use crate::parsing_utils::{
consume_char, consume_digit, consume_float, consume_string,
consume_string_from_list,
Expand Down Expand Up @@ -85,10 +83,6 @@ impl BedMethylLine {
self.interval.stop
}

pub fn check_mod_code_supported(&self) -> bool {
SUPPORTED_CODES.contains(&self.raw_mod_code)
}

pub fn check_base(
&self,
dna_base: DnaBase,
Expand All @@ -108,17 +102,19 @@ impl BedMethylLine {
}
}

/// panics if mod code isn't in
/// [`crate::mod_base_code::MOD_CODE_TO_DNA_BASE`]
pub(crate) fn get_stranded_position(&self) -> StrandedPosition<DnaBase> {
/// panics if mod code isn't in `code_lookup`
pub(crate) fn get_stranded_position(
&self,
code_lookup: &FxHashMap<ModCodeRepr, DnaBase>,
) -> StrandedPosition<DnaBase> {
let strand = match self.strand {
StrandRule::Positive | StrandRule::Both => Strand::Positive,
StrandRule::Negative => Strand::Negative,
};
let dna_base = if strand == Strand::Negative {
MOD_CODE_TO_DNA_BASE.get(&self.raw_mod_code).unwrap().complement()
code_lookup.get(&self.raw_mod_code).unwrap().complement()
} else {
*MOD_CODE_TO_DNA_BASE.get(&self.raw_mod_code).unwrap()
*code_lookup.get(&self.raw_mod_code).unwrap()
};

StrandedPosition { position: self.start(), strand, value: dna_base }
Expand Down Expand Up @@ -244,7 +240,7 @@ mod bedmethylline_tests {

use crate::dmr::bedmethyl::{aggregate_counts2, BedMethylLine};
use crate::genome_positions::GenomePositions;
use crate::mod_base_code::{DnaBase, ModCodeRepr};
use crate::mod_base_code::{DnaBase, ModCodeRepr, MOD_CODE_TO_DNA_BASE};
use crate::position_filter::Iv;

#[test]
Expand Down Expand Up @@ -333,7 +329,10 @@ mod bedmethylline_tests {
let bedmethyl_lines = BufReader::new(fh)
.lines()
.map(|l| BedMethylLine::parse(&l.unwrap()).unwrap())
.filter(|l| positions.contains(&l.get_stranded_position()))
.filter(|l| {
positions
.contains(&l.get_stranded_position(&MOD_CODE_TO_DNA_BASE))
})
.collect::<Vec<BedMethylLine>>();
let counts = aggregate_counts2(&bedmethyl_lines).unwrap();
assert_eq!(&counts.string_counts(), "h:2,m:4");
Expand Down
16 changes: 10 additions & 6 deletions src/dmr/pairwise.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,11 @@ pub(super) fn get_modification_counts(
lines
.iter()
.filter(|l| {
region_of_interest
.positions
.contains(&l.get_stranded_position())
region_of_interest.positions.contains(
&l.get_stranded_position(
&sample_index.code_lookup,
),
)
})
.collect::<Vec<&BedMethylLine>>()
})
Expand All @@ -44,9 +46,11 @@ pub(super) fn get_modification_counts(
lines
.iter()
.filter(|l| {
region_of_interest
.positions
.contains(&l.get_stranded_position())
region_of_interest.positions.contains(
&l.get_stranded_position(
&sample_index.code_lookup,
),
)
})
.collect::<Vec<&BedMethylLine>>()
})
Expand Down
7 changes: 3 additions & 4 deletions src/dmr/single_site.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,6 @@ impl SingleSiteDmrAnalysis {
};

let (scores_snd, scores_rcv) = crossbeam::channel::bounded(1000);
// let (segment_snd, segment_rcv) = crossbeam::channel::bounded(1000);
let processed_batches = multi_progress_bar.add(get_ticker());
let failure_counter = multi_progress_bar.add(get_ticker());
let success_counter = multi_progress_bar.add(get_ticker());
Expand Down Expand Up @@ -206,8 +205,8 @@ impl SingleSiteDmrAnalysis {
Ok(chrom_to_scores) => {
Some(chrom_to_scores)
}
Err(_e) => {
// error!("batch failed, {e}");
Err(e) => {
debug!("batch failed, {e}");
None
}
}
Expand Down Expand Up @@ -1007,7 +1006,7 @@ impl DmrSegmenter for HmmDmrSegmenter {
);
self.writer.write(row.as_bytes())?;
}
debug!("wrote {} segments", integrated_path.len());
debug!("wrote {} segment(s)", integrated_path.len());

// reset everything
self.curr_region_positions = Vec::new();
Expand Down
129 changes: 113 additions & 16 deletions src/dmr/subcommands.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,20 @@ use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf};
use std::sync::Arc;

use anyhow::{bail, Context};
use anyhow::{anyhow, bail, Context};
use clap::{Args, Subcommand};
use indicatif::{MultiProgress, ProgressIterator};
use itertools::Itertools;
use log::{debug, error, info};
use rustc_hash::FxHashMap;

use crate::dmr::pairwise::run_pairwise_dmr;
use crate::dmr::single_site::SingleSiteDmrAnalysis;
use crate::dmr::tabix::{IndexHandler, MultiSampleIndex};
use crate::dmr::util::{parse_roi_bed, HandleMissing, RoiIter};
use crate::genome_positions::GenomePositions;
use crate::logging::init_logging;
use crate::mod_base_code::DnaBase;
use crate::mod_base_code::{DnaBase, ModCodeRepr, MOD_CODE_TO_DNA_BASE};
use crate::util::{
create_out_directory, get_master_progress_bar, get_subroutine_progress_bar,
get_ticker,
Expand Down Expand Up @@ -146,8 +147,22 @@ pub struct PairwiseDmr {
/// Bases to use to calculate DMR, may be multiple. For example, to
/// calculate differentially methylated regions using only cytosine
/// modifications use --base C.
#[arg(short, alias = "base")]
#[arg(short, long="base", alias = "modified-bases", action=clap::ArgAction::Append)]
modified_bases: Vec<char>,
/// Extra assignments of modification codes to their respective primary
/// bases. In general, modkit dmr will use the SAM specification to
/// know which modification codes are appropriate to use for a given
/// primary base. For example "h" is the code for 5hmC, so is appropriate
/// for cytosine bases, but not adenine bases. However, if your
/// bedMethyl file contains custom codes or codes that are not part of
/// the specification, you can specify which primary base they
/// belong to here with --assign-code x:C meaning associate modification
/// code "x" with cytosine (C) primary sequence bases. If a code is
/// encountered that is not part of the specification, the bedMethyl
/// record will not be used, this will be logged.
#[arg(long="assign-code", action=clap::ArgAction::Append)]
mod_code_assignments: Option<Vec<String>>,

/// File to write logs to, it's recommended to use this option.
#[arg(long, alias = "log")]
log_filepath: Option<PathBuf>,
Expand Down Expand Up @@ -243,26 +258,86 @@ pub struct PairwiseDmr {
}

impl PairwiseDmr {
fn check_modified_bases(&self) -> anyhow::Result<()> {
Self::validate_modified_bases(&self.modified_bases)
fn check_modified_bases(
&self,
) -> anyhow::Result<FxHashMap<ModCodeRepr, DnaBase>> {
Self::validate_modified_bases(
&self.modified_bases,
self.mod_code_assignments.as_ref(),
)
}

fn is_single_site(&self) -> bool {
self.regions_bed.is_none()
}

fn validate_modified_bases(bases: &[char]) -> anyhow::Result<()> {
fn parse_raw_assignments(
raw_mod_code_assignments: Option<&Vec<String>>,
) -> anyhow::Result<FxHashMap<ModCodeRepr, DnaBase>> {
if let Some(raw_assignments) = raw_mod_code_assignments {
let user_assignments = raw_assignments.iter().try_fold(
FxHashMap::default(),
|mut acc, next| {
if next.contains(':') {
let parts = next.split(':').collect::<Vec<&str>>();
if parts.len() != 2 {
bail!(
"invalid assignment {next}, should be \
<code>:<DNA>, such as m:C"
)
} else {
let dna_base = parts[1]
.parse::<char>()
.map_err(|e| {
anyhow!(
"invalid DNA base, should be single \
letter, {e}"
)
})
.and_then(|raw| DnaBase::parse(raw))?;
let mod_code = ModCodeRepr::parse(parts[0])?;
debug!(
"assigning modification code {mod_code:?} to \
{dna_base:?}"
);
acc.insert(mod_code, dna_base);
Ok(acc)
}
} else {
bail!(
"invalid assignment {next}, should be \
<code>:<DNA>, such as m:C"
)
}
},
)?;
Ok(MOD_CODE_TO_DNA_BASE
.clone()
.into_iter()
.chain(user_assignments.into_iter())
.collect())
} else {
Ok(MOD_CODE_TO_DNA_BASE.clone())
}
}

fn validate_modified_bases(
bases: &[char],
raw_mod_code_assignments: Option<&Vec<String>>,
) -> anyhow::Result<FxHashMap<ModCodeRepr, DnaBase>> {
if bases.is_empty() {
bail!("need to specify at least 1 modified base")
}
for b in bases.iter() {
match *b {
'A' | 'C' | 'G' | 'T' => {}
'A' | 'C' | 'G' | 'T' => {
debug!("using primary sequence base {b}");
}
_ => bail!("modified base needs to be A, C, G, or T."),
}
}

Ok(())
Self::parse_raw_assignments(raw_mod_code_assignments)
}

pub fn run(&self) -> anyhow::Result<()> {
Expand All @@ -274,13 +349,13 @@ impl PairwiseDmr {
{
bail!("need to provide at least 1 'a' sample and 'b' sample")
}
let code_lookup = self.check_modified_bases()?;

let mpb = MultiProgress::new();
if self.suppress_progress {
mpb.set_draw_target(indicatif::ProgressDrawTarget::hidden());
}

self.check_modified_bases()?;
let modified_bases = self
.modified_bases
.iter()
Expand Down Expand Up @@ -322,8 +397,11 @@ impl PairwiseDmr {
.chain(b_handlers)
.collect::<Vec<IndexHandler>>();

let sample_index =
MultiSampleIndex::new(handlers, self.min_valid_coverage);
let sample_index = MultiSampleIndex::new(
handlers,
code_lookup,
self.min_valid_coverage,
);

let writer: Box<dyn Write> = {
match self.out_path.as_ref() {
Expand Down Expand Up @@ -481,8 +559,21 @@ pub struct MultiSampleDmr {
/// Bases to use to calculate DMR, may be multiple. For example, to
/// calculate differentially methylated regions using only cytosine
/// modifications use --base C.
#[arg(short, alias = "base")]
#[arg(short, long="base", alias = "modified-bases", action=clap::ArgAction::Append)]
modified_bases: Vec<char>,
/// Extra assignments of modification codes to their respective primary
/// bases. In general, modkit dmr will use the SAM specification to
/// know which modification codes are appropriate to use for a given
/// primary base. For example "h" is the code for 5hmC, so is appropriate
/// for cytosine bases, but not adenine bases. However, if your
/// bedMethyl file contains custom codes or codes that are not part of
/// the specification, you can specify which primary base they
/// belong to here with --assign-code x:C meaning associate modification
/// code "x" with cytosine (C) primary sequence bases. If a code is
/// encountered that is not part of the specification, the bedMethyl
/// record will not be used, this will be logged.
#[arg(long="assign-code", action=clap::ArgAction::Append)]
mod_code_assignments: Option<Vec<String>>,
/// File to write logs to, it's recommended to use this option.
#[arg(long, alias = "log")]
log_filepath: Option<PathBuf>,
Expand Down Expand Up @@ -541,8 +632,10 @@ impl MultiSampleDmr {
info!("creating directory at {:?}", &self.out_dir);
std::fs::create_dir_all(&self.out_dir)?;
}

PairwiseDmr::validate_modified_bases(&self.modified_bases)?;
let code_lookup = PairwiseDmr::validate_modified_bases(
&self.modified_bases,
self.mod_code_assignments.as_ref(),
)?;
let index_fps = self
.indices
.chunks(2)
Expand Down Expand Up @@ -618,8 +711,12 @@ impl MultiSampleDmr {
(names, handlers)
},
);
let sample_index =
MultiSampleIndex::new(handlers, self.min_valid_coverage);

let sample_index = MultiSampleIndex::new(
handlers,
code_lookup,
self.min_valid_coverage,
);

let genome_positions = GenomePositions::new_from_sequences(
&motifs,
Expand Down
Loading

0 comments on commit 67a2eea

Please sign in to comment.