Skip to content

Commit

Permalink
feat: adding support for clinvar-genes (#202)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Sep 11, 2023
1 parent 0e17128 commit 32e9f66
Show file tree
Hide file tree
Showing 11 changed files with 539 additions and 3 deletions.
13 changes: 12 additions & 1 deletion build.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
// The custom build script, needed as we compile the protobuf (`.proto`) definitions at build time.

fn main() {
println!("cargo:rerun-if-changed=annonars/clinvar/v1/minimal.proto");
println!("cargo:rerun-if-changed=annonars/clinvar/v1/per_gene.proto");
println!("cargo:rerun-if-changed=annonars/cons/v1/base.proto");
println!("cargo:rerun-if-changed=annonars/dbsnp/v1/base.proto");
println!("cargo:rerun-if-changed=annonars/gnomad/v1/mtdna.proto");
println!("cargo:rerun-if-changed=annonars/gnomad/v1/nuclear.proto");
println!("cargo:rerun-if-changed=annonars/gnomad/v1/vep_common.proto");
println!("cargo:rerun-if-changed=annonars/gnomad/v1/vep_gnomad2.proto");
println!("cargo:rerun-if-changed=annonars/gnomad/v1/vep_gnomad3.proto");
println!("cargo:rerun-if-changed=annonars/helixmtdb/v1/base.proto");
println!("cargo:rerun-if-changed=src/proto/annonars/clinvar/v1/minimal.proto");
println!("cargo:rerun-if-changed=src/proto/annonars/cons/v1/base.proto");
println!("cargo:rerun-if-changed=src/proto/annonars/dbsnp/v1/base.proto");
println!("cargo:rerun-if-changed=src/proto/annonars/gene/v1/base.proto");
println!("cargo:rerun-if-changed=src/proto/annonars/gnomad/v1/mtdna.proto");
println!("cargo:rerun-if-changed=src/proto/annonars/gnomad/v1/gnomad2.proto");
println!("cargo:rerun-if-changed=src/proto/annonars/gnomad/v1/gnomad3.proto");
println!("cargo:rerun-if-changed=src/proto/annonars/gnomad/v1/mtdna.proto");
println!("cargo:rerun-if-changed=src/proto/annonars/gnomad/v1/vep_common.proto");
println!("cargo:rerun-if-changed=src/proto/annonars/gnomad/v1/vep_gnomad2.proto");
println!("cargo:rerun-if-changed=src/proto/annonars/gnomad/v1/vep_gnomad3.proto");
Expand Down Expand Up @@ -36,6 +46,7 @@ fn main() {
.compile_protos(
&[
"annonars/clinvar/v1/minimal.proto",
"annonars/clinvar/v1/per_gene.proto",
"annonars/cons/v1/base.proto",
"annonars/dbsnp/v1/base.proto",
"annonars/gene/v1/base.proto",
Expand Down
205 changes: 205 additions & 0 deletions src/clinvar_genes/cli/import.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
//! Import of ClinVar per-gene data.

use std::{collections::HashSet, io::BufRead, sync::Arc};

use clap::Parser;
use prost::Message;

use crate::{
clinvar_genes::{
self, cli::reading::gene_impact::Impact, pbs::GeneImpactRecordCounts,
pbs::Impact as PbImpact,
},
common,
};

/// Command line arguments for the ClinVar per-gene import sub command.
///
/// The values are consumed by `run` and `jsonl_import` in this module.
// NOTE(review): the `///` comments on the fields below presumably double as clap's `--help`
// text, so they are runtime-visible strings and are kept verbatim here.
#[derive(Parser, Debug, Clone)]
#[command(about = "import ClinVar per-gene data into RocksDB", long_about = None)]
pub struct Args {
/// Genome build to use in the build.
#[arg(long, value_enum)]
pub genome_release: common::cli::GenomeRelease,
/// Path to input per-impact JSONL file(s).
// A single path; a `.gz` suffix makes the loader decompress the file while reading.
#[arg(long, required = true)]
pub path_per_impact_jsonl: String,
/// Path to output RocksDB directory.
#[arg(long)]
pub path_out_rocksdb: String,

/// Name of the column family to import into.
// Opened alongside the fixed `meta` column family by `run`.
#[arg(long, default_value = "clinvar-genes")]
pub cf_name: String,
/// Optional path to RocksDB WAL directory.
// Forwarded to `rocksdb_utils_lookup::tune_options` when the database is opened.
#[arg(long)]
pub path_wal_dir: Option<String>,
}

/// Load per-impact JSONL file.
fn load_per_impact_jsonl(
path_per_impact_jsonl: &str,
) -> Result<indexmap::IndexMap<String, Vec<GeneImpactRecordCounts>>, anyhow::Error> {
// Open reader, possibly decompressing gziped files.
let reader: Box<dyn std::io::Read> = if path_per_impact_jsonl.ends_with(".gz") {
Box::new(flate2::read::GzDecoder::new(std::fs::File::open(

Check warning on line 44 in src/clinvar_genes/cli/import.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/import.rs#L44

Added line #L44 was not covered by tests
path_per_impact_jsonl,
)?))
} else {
Box::new(std::fs::File::open(path_per_impact_jsonl)?)
};

let mut result = indexmap::IndexMap::new();

let reader = std::io::BufReader::new(reader);
for line in reader.lines() {
let line = line?;
let record =
serde_json::from_str::<clinvar_genes::cli::reading::gene_impact::Record>(&line)?;

let mut count_out = Vec::new();
for (impact, counts) in record.counts {
let impact = match impact {
Impact::ThreePrimeUtrVariant => PbImpact::ThreePrimeUtrVariant,
Impact::FivePrimeUtrVariant => PbImpact::FivePrimeUtrVariant,
Impact::DownstreamGeneVariant => PbImpact::DownstreamTranscriptVariant,

Check warning on line 64 in src/clinvar_genes/cli/import.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/import.rs#L64

Added line #L64 was not covered by tests
Impact::FrameshiftVariant => PbImpact::FrameshiftVariant,
Impact::InframeIndel => PbImpact::InframeIndel,
Impact::StartLost => PbImpact::StartLost,

Check warning on line 67 in src/clinvar_genes/cli/import.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/import.rs#L67

Added line #L67 was not covered by tests
Impact::IntronVariant => PbImpact::IntronVariant,
Impact::MissenseVariant => PbImpact::MissenseVariant,
Impact::NonCodingTranscriptVariant => PbImpact::NonCodingTranscriptVariant,
Impact::StopGained => PbImpact::StopGained,
Impact::NoSequenceAlteration => PbImpact::NoSequenceAlteration,
Impact::SpliceAcceptorVariant => PbImpact::SpliceAcceptorVariant,
Impact::SpliceDonorVariant => PbImpact::SpliceDonorVariant,
Impact::StopLost => PbImpact::StopLost,
Impact::SyonymousVariant => PbImpact::SynonymousVariant,
Impact::UpstreamGeneVariant => PbImpact::UpstreamTranscriptVariant,

Check warning on line 77 in src/clinvar_genes/cli/import.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/import.rs#L77

Added line #L77 was not covered by tests
};
count_out.push(GeneImpactRecordCounts {
impact: impact as i32,
counts,
});
}
result.insert(record.hgnc.clone(), count_out);
}

Ok(result)
}

/// Perform import of the per-impact JSONL file into the data column family.
///
/// Loads `args.path_per_impact_jsonl` and writes one protobuf-encoded
/// `ClinvarPerGeneRecord` per gene into the column family `args.cf_name`, using the HGNC ID
/// as the key.
///
/// # Errors
///
/// Returns an error if the column family does not exist in `db`, if loading the JSONL file
/// fails, or if a write to RocksDB fails.
fn jsonl_import(
    db: &rocksdb::DBWithThreadMode<rocksdb::MultiThreaded>,
    args: &Args,
) -> Result<(), anyhow::Error> {
    let cf_data = db.cf_handle(&args.cf_name).ok_or_else(|| {
        anyhow::anyhow!("column family {:?} not found in database", &args.cf_name)
    })?;

    tracing::info!("Loading impact per gene ...");
    let before_per_gene = std::time::Instant::now();
    let impact_per_gene = load_per_impact_jsonl(&args.path_per_impact_jsonl)?;
    tracing::info!(
        "... done loading impact per gene in {:?}",
        &before_per_gene.elapsed()
    );

    tracing::info!("Writing to database ...");
    let before_write_to_db = std::time::Instant::now();

    // The map's keys are already unique, so consume it directly: no intermediate key set
    // and no per-gene clone of the counts are needed.
    // NOTE(review): this leaves the file-level `HashSet` import without a user in this
    // function; drop it from the `use` list if nothing else in the file needs it.
    for (hgnc_id, per_impact_counts) in impact_per_gene {
        let record = clinvar_genes::pbs::ClinvarPerGeneRecord { per_impact_counts };
        db.put_cf(&cf_data, &hgnc_id, record.encode_to_vec())?;
    }
    tracing::info!(
        "... done writing to database in {:?}",
        &before_write_to_db.elapsed()
    );

    Ok(())
}

/// Implementation of the ClinVar per-gene import sub command.
///
/// Opens (creating column families as configured by `tune_options`) the RocksDB at
/// `args.path_out_rocksdb` with the `meta` and `args.cf_name` column families, writes the
/// meta information, imports the JSONL file via `jsonl_import`, and finally forces a
/// compaction of both column families.
///
/// # Errors
///
/// Returns an error if the database cannot be opened, if a column family handle is
/// missing, or if the import, a write, or the compaction fails.
pub fn run(common: &common::cli::Args, args: &Args) -> Result<(), anyhow::Error> {
    tracing::info!("Starting 'clinvar-gene import' command");
    tracing::info!("common = {:#?}", &common);
    tracing::info!("args = {:#?}", &args);

    // Open the RocksDB for writing.
    tracing::info!("Opening RocksDB for writing ...");
    let before_opening_rocksdb = std::time::Instant::now();
    let options = rocksdb_utils_lookup::tune_options(
        rocksdb::Options::default(),
        args.path_wal_dir.as_ref().map(|s| s.as_ref()),
    );
    let cf_names = &["meta", &args.cf_name];
    let db = Arc::new(rocksdb::DB::open_cf_with_opts(
        &options,
        &args.path_out_rocksdb,
        cf_names
            .iter()
            .map(|name| (name.to_string(), options.clone()))
            .collect::<Vec<_>>(),
    )?);
    tracing::info!("  writing meta information");
    let cf_meta = db
        .cf_handle("meta")
        .ok_or_else(|| anyhow::anyhow!("column family \"meta\" not found in database"))?;
    db.put_cf(&cf_meta, "annonars-version", crate::VERSION)?;
    db.put_cf(&cf_meta, "genome-release", args.genome_release.to_string())?;
    // NOTE(review): "clinvar-minimal" looks like a copy-paste leftover for a per-gene
    // database.  It is persisted metadata that readers may check, so it is kept as-is here;
    // confirm with the consumers before renaming it.
    db.put_cf(&cf_meta, "db-name", "clinvar-minimal")?;
    tracing::info!(
        "... done opening RocksDB for writing in {:?}",
        before_opening_rocksdb.elapsed()
    );

    tracing::info!("Importing JSONL file ...");
    let before_import = std::time::Instant::now();
    jsonl_import(&db, args)?;
    tracing::info!(
        "... done importing JSONL file in {:?}",
        before_import.elapsed()
    );

    tracing::info!("Running RocksDB compaction ...");
    let before_compaction = std::time::Instant::now();
    rocksdb_utils_lookup::force_compaction_cf(&db, cf_names, Some("  "), true)?;
    tracing::info!(
        "... done compacting RocksDB in {:?}",
        before_compaction.elapsed()
    );

    tracing::info!("All done. Have a nice day!");
    Ok(())
}

#[cfg(test)]
mod test {
    use super::*;

    use clap_verbosity_flag::Verbosity;
    use temp_testdir::TempDir;

    /// Smoke test: run the whole import on the JSONL fixture, writing into a RocksDB
    /// located inside a temporary directory.
    #[test]
    fn smoke_test_import_tsv() {
        // Keep the temporary directory binding alive until the import has finished.
        let scratch = TempDir::default();
        let rocksdb_dir = scratch.join("out-rocksdb");

        let args = Args {
            genome_release: common::cli::GenomeRelease::Grch37,
            path_per_impact_jsonl: String::from("tests/clinvar-genes/gene-variant-report.jsonl"),
            path_out_rocksdb: rocksdb_dir.display().to_string(),
            cf_name: String::from("clinvar"),
            path_wal_dir: None,
        };
        let common = common::cli::Args {
            verbose: Verbosity::new(1, 0),
        };

        run(&common, &args).unwrap();
    }
}
5 changes: 5 additions & 0 deletions src/clinvar_genes/cli/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
//! Command line interface for per-gene ClinVar data (for Mehari).

pub mod import;
pub mod query;
pub mod reading;
101 changes: 101 additions & 0 deletions src/clinvar_genes/cli/query.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
//! Querying for ClinVar per-gene data.

use std::sync::Arc;

use prost::Message;

use crate::{common, genes::pbs};

/// Command line arguments for `clinvar-gene query` sub command.
///
/// The values are consumed by `open_rocksdb` and `run` in this module.
#[derive(clap::Parser, Debug, Clone)]
#[command(about = "query gene information data from RocksDB", long_about = None)]
pub struct Args {
    /// Path to RocksDB directory with data.
    #[arg(long)]
    pub path_rocksdb: String,
    /// Name of the column family to read from.
    #[arg(long, default_value = "clinvar-genes")]
    pub cf_name: String,
    /// Output file (default is stdout == "-").
    #[arg(long, default_value = "-")]
    pub out_file: String,
    /// Output format.
    #[arg(long, default_value = "jsonl")]
    pub out_format: common::cli::OutputFormat,

    /// HGNC gene identifier to query for.
    #[arg(long)]
    pub hgnc_id: String,
}

/// Open RocksDB database.
fn open_rocksdb(

Check warning on line 32 in src/clinvar_genes/cli/query.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/query.rs#L32

Added line #L32 was not covered by tests
args: &Args,
) -> Result<Arc<rocksdb::DBWithThreadMode<rocksdb::MultiThreaded>>, anyhow::Error> {
tracing::info!("Opening RocksDB database ...");
let before_open = std::time::Instant::now();
let cf_names = &["meta", &args.cf_name];
let db = Arc::new(rocksdb::DB::open_cf_for_read_only(
&rocksdb::Options::default(),

Check warning on line 39 in src/clinvar_genes/cli/query.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/query.rs#L35-L39

Added lines #L35 - L39 were not covered by tests
&args.path_rocksdb,
cf_names,
true,
)?);

tracing::info!(

Check warning on line 45 in src/clinvar_genes/cli/query.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/query.rs#L45

Added line #L45 was not covered by tests
"... opening RocksDB database took {:?}",
before_open.elapsed()

Check warning on line 47 in src/clinvar_genes/cli/query.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/query.rs#L47

Added line #L47 was not covered by tests
);

Ok(db)

Check warning on line 50 in src/clinvar_genes/cli/query.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/query.rs#L50

Added line #L50 was not covered by tests
}

/// Print values to `out_writer`.
fn print_record(

Check warning on line 54 in src/clinvar_genes/cli/query.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/query.rs#L54

Added line #L54 was not covered by tests
out_writer: &mut Box<dyn std::io::Write>,
output_format: common::cli::OutputFormat,
value: &pbs::Record,
) -> Result<(), anyhow::Error> {
match output_format {
common::cli::OutputFormat::Jsonl => {
writeln!(out_writer, "{}", serde_json::to_string(value)?)?;

Check warning on line 61 in src/clinvar_genes/cli/query.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/query.rs#L61

Added line #L61 was not covered by tests
}
}

Ok(())

Check warning on line 65 in src/clinvar_genes/cli/query.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/query.rs#L65

Added line #L65 was not covered by tests
}

/// Implementation of `gene query` sub command.
pub fn run(common: &common::cli::Args, args: &Args) -> Result<(), anyhow::Error> {
tracing::info!("Starting 'gene query' command");
tracing::info!("common = {:#?}", &common);
tracing::info!("args = {:#?}", &args);

Check warning on line 72 in src/clinvar_genes/cli/query.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/query.rs#L69-L72

Added lines #L69 - L72 were not covered by tests

// Open the RocksDB database.
let db = open_rocksdb(args)?;
let cf_data = db.cf_handle(&args.cf_name).unwrap();

Check warning on line 76 in src/clinvar_genes/cli/query.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/query.rs#L75-L76

Added lines #L75 - L76 were not covered by tests

// Obtain writer to output.
let mut out_writer = match args.out_file.as_ref() {
"-" => Box::new(std::io::stdout()) as Box<dyn std::io::Write>,
out_file => {
let path = std::path::Path::new(out_file);
Box::new(std::fs::File::create(path).unwrap()) as Box<dyn std::io::Write>

Check warning on line 83 in src/clinvar_genes/cli/query.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/query.rs#L79-L83

Added lines #L79 - L83 were not covered by tests
}
};

tracing::info!("Running query...");
let raw_value = db.get_cf(&cf_data, args.hgnc_id.as_bytes())?;
if let Some(raw_value) = raw_value {

Check warning on line 89 in src/clinvar_genes/cli/query.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/query.rs#L87-L89

Added lines #L87 - L89 were not covered by tests
print_record(
&mut out_writer,
args.out_format,
&pbs::Record::decode(&mut std::io::Cursor::new(&raw_value))?,

Check warning on line 93 in src/clinvar_genes/cli/query.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/query.rs#L93

Added line #L93 was not covered by tests
)?;
} else {
tracing::info!("No data found for HGNC ID {}", args.hgnc_id);

Check warning on line 96 in src/clinvar_genes/cli/query.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/query.rs#L96

Added line #L96 was not covered by tests
}

tracing::info!("All done. Have a nice day!");
Ok(())

Check warning on line 100 in src/clinvar_genes/cli/query.rs

View check run for this annotation

Codecov / codecov/patch

src/clinvar_genes/cli/query.rs#L99-L100

Added lines #L99 - L100 were not covered by tests
}
Loading

0 comments on commit 32e9f66

Please sign in to comment.