-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: adding support for clinvar-genes (#202)
- Loading branch information
Showing
11 changed files
with
539 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,205 @@ | ||
//! Import of minimal ClinVar data. | ||
|
||
use std::{collections::HashSet, io::BufRead, sync::Arc}; | ||
|
||
use clap::Parser; | ||
use prost::Message; | ||
|
||
use crate::{ | ||
clinvar_genes::{ | ||
self, cli::reading::gene_impact::Impact, pbs::GeneImpactRecordCounts, | ||
pbs::Impact as PbImpact, | ||
}, | ||
common, | ||
}; | ||
|
||
/// Command line arguments for `tsv import` sub command. | ||
#[derive(Parser, Debug, Clone)] | ||
#[command(about = "import ClinVar per-gene data into RocksDB", long_about = None)] | ||
pub struct Args { | ||
/// Genome build to use in the build. | ||
#[arg(long, value_enum)] | ||
pub genome_release: common::cli::GenomeRelease, | ||
/// Path to input per-impact JSONL file(s). | ||
#[arg(long, required = true)] | ||
pub path_per_impact_jsonl: String, | ||
/// Path to output RocksDB directory. | ||
#[arg(long)] | ||
pub path_out_rocksdb: String, | ||
|
||
/// Name of the column family to import into. | ||
#[arg(long, default_value = "clinvar-genes")] | ||
pub cf_name: String, | ||
/// Optional path to RocksDB WAL directory. | ||
#[arg(long)] | ||
pub path_wal_dir: Option<String>, | ||
} | ||
|
||
/// Load per-impact JSONL file. | ||
fn load_per_impact_jsonl( | ||
path_per_impact_jsonl: &str, | ||
) -> Result<indexmap::IndexMap<String, Vec<GeneImpactRecordCounts>>, anyhow::Error> { | ||
// Open reader, possibly decompressing gziped files. | ||
let reader: Box<dyn std::io::Read> = if path_per_impact_jsonl.ends_with(".gz") { | ||
Box::new(flate2::read::GzDecoder::new(std::fs::File::open( | ||
path_per_impact_jsonl, | ||
)?)) | ||
} else { | ||
Box::new(std::fs::File::open(path_per_impact_jsonl)?) | ||
}; | ||
|
||
let mut result = indexmap::IndexMap::new(); | ||
|
||
let reader = std::io::BufReader::new(reader); | ||
for line in reader.lines() { | ||
let line = line?; | ||
let record = | ||
serde_json::from_str::<clinvar_genes::cli::reading::gene_impact::Record>(&line)?; | ||
|
||
let mut count_out = Vec::new(); | ||
for (impact, counts) in record.counts { | ||
let impact = match impact { | ||
Impact::ThreePrimeUtrVariant => PbImpact::ThreePrimeUtrVariant, | ||
Impact::FivePrimeUtrVariant => PbImpact::FivePrimeUtrVariant, | ||
Impact::DownstreamGeneVariant => PbImpact::DownstreamTranscriptVariant, | ||
Impact::FrameshiftVariant => PbImpact::FrameshiftVariant, | ||
Impact::InframeIndel => PbImpact::InframeIndel, | ||
Impact::StartLost => PbImpact::StartLost, | ||
Impact::IntronVariant => PbImpact::IntronVariant, | ||
Impact::MissenseVariant => PbImpact::MissenseVariant, | ||
Impact::NonCodingTranscriptVariant => PbImpact::NonCodingTranscriptVariant, | ||
Impact::StopGained => PbImpact::StopGained, | ||
Impact::NoSequenceAlteration => PbImpact::NoSequenceAlteration, | ||
Impact::SpliceAcceptorVariant => PbImpact::SpliceAcceptorVariant, | ||
Impact::SpliceDonorVariant => PbImpact::SpliceDonorVariant, | ||
Impact::StopLost => PbImpact::StopLost, | ||
Impact::SyonymousVariant => PbImpact::SynonymousVariant, | ||
Impact::UpstreamGeneVariant => PbImpact::UpstreamTranscriptVariant, | ||
}; | ||
count_out.push(GeneImpactRecordCounts { | ||
impact: impact as i32, | ||
counts, | ||
}); | ||
} | ||
result.insert(record.hgnc.clone(), count_out); | ||
} | ||
|
||
Ok(result) | ||
} | ||
|
||
/// Perform import of the JSONL files. | ||
fn jsonl_import( | ||
db: &rocksdb::DBWithThreadMode<rocksdb::MultiThreaded>, | ||
args: &Args, | ||
) -> Result<(), anyhow::Error> { | ||
let cf_data = db.cf_handle(&args.cf_name).unwrap(); | ||
|
||
tracing::info!("Loading impact per gene ..."); | ||
let before_per_gene = std::time::Instant::now(); | ||
let impact_per_gene = load_per_impact_jsonl(&args.path_per_impact_jsonl)?; | ||
tracing::info!( | ||
"... done loading impact per gene in {:?}", | ||
&before_per_gene.elapsed() | ||
); | ||
|
||
tracing::info!("Writing to database ..."); | ||
let before_write_to_db = std::time::Instant::now(); | ||
let hgnc_ids = impact_per_gene.keys().cloned().collect::<HashSet<_>>(); | ||
|
||
// Read through all records and insert each into the database. | ||
for hgnc_id in hgnc_ids.iter() { | ||
let record = clinvar_genes::pbs::ClinvarPerGeneRecord { | ||
per_impact_counts: impact_per_gene.get(hgnc_id).cloned().unwrap_or_default(), | ||
}; | ||
let buf = record.encode_to_vec(); | ||
|
||
db.put_cf(&cf_data, hgnc_id, buf)?; | ||
} | ||
tracing::info!( | ||
"... done writing to database in {:?}", | ||
&before_write_to_db.elapsed() | ||
); | ||
|
||
Ok(()) | ||
} | ||
|
||
/// Implementation of `clinvar-minimal import` sub command. | ||
pub fn run(common: &common::cli::Args, args: &Args) -> Result<(), anyhow::Error> { | ||
tracing::info!("Starting 'clinvar-minimal import' command"); | ||
tracing::info!("common = {:#?}", &common); | ||
tracing::info!("args = {:#?}", &args); | ||
|
||
// Open the RocksDB for writing. | ||
tracing::info!("Opening RocksDB for writing ..."); | ||
let before_opening_rocksdb = std::time::Instant::now(); | ||
let options = rocksdb_utils_lookup::tune_options( | ||
rocksdb::Options::default(), | ||
args.path_wal_dir.as_ref().map(|s| s.as_ref()), | ||
); | ||
let cf_names = &["meta", &args.cf_name]; | ||
let db = Arc::new(rocksdb::DB::open_cf_with_opts( | ||
&options, | ||
&args.path_out_rocksdb, | ||
cf_names | ||
.iter() | ||
.map(|name| (name.to_string(), options.clone())) | ||
.collect::<Vec<_>>(), | ||
)?); | ||
tracing::info!(" writing meta information"); | ||
let cf_meta = db.cf_handle("meta").unwrap(); | ||
db.put_cf(&cf_meta, "annonars-version", crate::VERSION)?; | ||
db.put_cf( | ||
&cf_meta, | ||
"genome-release", | ||
format!("{}", args.genome_release), | ||
)?; | ||
db.put_cf(&cf_meta, "db-name", "clinvar-minimal")?; | ||
tracing::info!( | ||
"... done opening RocksDB for writing in {:?}", | ||
before_opening_rocksdb.elapsed() | ||
); | ||
|
||
tracing::info!("Importing TSV files ..."); | ||
let before_import = std::time::Instant::now(); | ||
jsonl_import(&db, args)?; | ||
tracing::info!( | ||
"... done importing TSV files in {:?}", | ||
before_import.elapsed() | ||
); | ||
|
||
tracing::info!("Running RocksDB compaction ..."); | ||
let before_compaction = std::time::Instant::now(); | ||
rocksdb_utils_lookup::force_compaction_cf(&db, cf_names, Some(" "), true)?; | ||
tracing::info!( | ||
"... done compacting RocksDB in {:?}", | ||
before_compaction.elapsed() | ||
); | ||
|
||
tracing::info!("All done. Have a nice day!"); | ||
Ok(()) | ||
} | ||
|
||
#[cfg(test)] | ||
mod test { | ||
use super::*; | ||
|
||
use clap_verbosity_flag::Verbosity; | ||
use temp_testdir::TempDir; | ||
|
||
#[test] | ||
fn smoke_test_import_tsv() { | ||
let tmp_dir = TempDir::default(); | ||
let common = common::cli::Args { | ||
verbose: Verbosity::new(1, 0), | ||
}; | ||
let args = Args { | ||
genome_release: common::cli::GenomeRelease::Grch37, | ||
path_per_impact_jsonl: String::from("tests/clinvar-genes/gene-variant-report.jsonl"), | ||
path_out_rocksdb: format!("{}", tmp_dir.join("out-rocksdb").display()), | ||
cf_name: String::from("clinvar"), | ||
path_wal_dir: None, | ||
}; | ||
|
||
run(&common, &args).unwrap(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
//! Command line interface for minimal ClinVar data (for Mehari). | ||
|
||
pub mod import; | ||
pub mod query; | ||
pub mod reading; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
//! Querying for gene annotation data. | ||
|
||
use std::sync::Arc; | ||
|
||
use prost::Message; | ||
|
||
use crate::{common, genes::pbs}; | ||
|
||
/// Command line arguments for `clinvar-gene query` sub command. | ||
#[derive(clap::Parser, Debug, Clone)] | ||
#[command(about = "query gene information data from RocksDB", long_about = None)] | ||
pub struct Args { | ||
/// Path to RocksDB directory with data. | ||
#[arg(long)] | ||
pub path_rocksdb: String, | ||
/// Name of the column family to import into. | ||
#[arg(long, default_value = "clinvar-genes")] | ||
pub cf_name: String, | ||
/// Output file (default is stdout == "-"). | ||
#[arg(long, default_value = "-")] | ||
pub out_file: String, | ||
/// Output format. | ||
#[arg(long, default_value = "jsonl")] | ||
pub out_format: common::cli::OutputFormat, | ||
|
||
/// HGNC gene identifier to query for. | ||
#[arg(long)] | ||
pub hgnc_id: String, | ||
} | ||
|
||
/// Open RocksDB database. | ||
fn open_rocksdb( | ||
args: &Args, | ||
) -> Result<Arc<rocksdb::DBWithThreadMode<rocksdb::MultiThreaded>>, anyhow::Error> { | ||
tracing::info!("Opening RocksDB database ..."); | ||
let before_open = std::time::Instant::now(); | ||
let cf_names = &["meta", &args.cf_name]; | ||
let db = Arc::new(rocksdb::DB::open_cf_for_read_only( | ||
&rocksdb::Options::default(), | ||
&args.path_rocksdb, | ||
cf_names, | ||
true, | ||
)?); | ||
|
||
tracing::info!( | ||
"... opening RocksDB database took {:?}", | ||
before_open.elapsed() | ||
); | ||
|
||
Ok(db) | ||
} | ||
|
||
/// Print values to `out_writer`. | ||
fn print_record( | ||
out_writer: &mut Box<dyn std::io::Write>, | ||
output_format: common::cli::OutputFormat, | ||
value: &pbs::Record, | ||
) -> Result<(), anyhow::Error> { | ||
match output_format { | ||
common::cli::OutputFormat::Jsonl => { | ||
writeln!(out_writer, "{}", serde_json::to_string(value)?)?; | ||
} | ||
} | ||
|
||
Ok(()) | ||
} | ||
|
||
/// Implementation of `gene query` sub command. | ||
pub fn run(common: &common::cli::Args, args: &Args) -> Result<(), anyhow::Error> { | ||
tracing::info!("Starting 'gene query' command"); | ||
tracing::info!("common = {:#?}", &common); | ||
tracing::info!("args = {:#?}", &args); | ||
|
||
// Open the RocksDB database. | ||
let db = open_rocksdb(args)?; | ||
let cf_data = db.cf_handle(&args.cf_name).unwrap(); | ||
|
||
// Obtain writer to output. | ||
let mut out_writer = match args.out_file.as_ref() { | ||
"-" => Box::new(std::io::stdout()) as Box<dyn std::io::Write>, | ||
out_file => { | ||
let path = std::path::Path::new(out_file); | ||
Box::new(std::fs::File::create(path).unwrap()) as Box<dyn std::io::Write> | ||
} | ||
}; | ||
|
||
tracing::info!("Running query..."); | ||
let raw_value = db.get_cf(&cf_data, args.hgnc_id.as_bytes())?; | ||
if let Some(raw_value) = raw_value { | ||
print_record( | ||
&mut out_writer, | ||
args.out_format, | ||
&pbs::Record::decode(&mut std::io::Cursor::new(&raw_value))?, | ||
)?; | ||
} else { | ||
tracing::info!("No data found for HGNC ID {}", args.hgnc_id); | ||
} | ||
|
||
tracing::info!("All done. Have a nice day!"); | ||
Ok(()) | ||
} |
Oops, something went wrong.