Commit: kr2r
eric committed Jan 22, 2024
1 parent 9aa91ad commit a268ee8
Showing 16 changed files with 702 additions and 25 deletions.
23 changes: 19 additions & 4 deletions .github/workflows/rust.yml
@@ -9,6 +9,7 @@ on:
# On each push to the `release` branch it will create or update a GitHub release, build your app, and upload the artifacts to the release.
env:
CARGO_TERM_COLOR: always
BINARIES_LIST: 'ncbi estimate_capacity'

jobs:
build-and-release:
@@ -56,10 +57,18 @@ jobs:
# shell: bash

- name: Prepare asset name
id: prep
run: |
PLATFORM_TAG=$(echo ${{ matrix.platform }} | sed -e 's/-latest//' -e 's/ubuntu-20.04/linux-x86_64/' -e 's/macos/macos-x86_64/' -e 's/windows/windows-x86_64/')
echo "ASSET_NAME=ncbi-$PLATFORM_TAG$(if [ ${{ runner.os }} = 'Windows' ]; then echo '.exe'; fi)" >> $GITHUB_ENV
echo "ASSET_NAME=${PLATFORM_TAG}.tar.gz" >> $GITHUB_ENV
shell: bash

- name: Create tar.gz archive
run: |
mkdir -p ./target/release/packaged
for binary in ${{ env.BINARIES_LIST }}; do
cp "./target/release/$binary" "./target/release/packaged/"
done
tar czvf "./target/release/${{ env.ASSET_NAME }}" -C "./target/release/packaged/" .
shell: bash

- name: Upload Release Asset
@@ -79,14 +88,20 @@ jobs:
docker run --name centos7-container -v $GITHUB_WORKSPACE:/github/workspace -w /github/workspace centos:7 \
/bin/bash -c "yum update -y && yum install -y gcc make openssl openssl-devel && curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y && export PATH=\$HOME/.cargo/bin:\$PATH && cd /github/workspace && cargo build --release"
docker cp centos7-container:/github/workspace/target/release/ncbi ./ncbi-centos7
mkdir -p ./target/release/packaged_centos7
for binary in $BINARIES_LIST; do
docker cp centos7-container:/github/workspace/target/release/$binary ./target/release/packaged_centos7/
done
tar czvf ./kraken_rust_centos7.tar.gz -C ./target/release/packaged_centos7 .
docker rm centos7-container
- name: Upload Release Asset for CentOS 7
if: matrix.platform == 'ubuntu-20.04'
run: |
gh release upload ${{ github.ref_name }} \
./ncbi-centos7 \
./kraken_rust_centos7.tar.gz \
--clobber
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
5 changes: 5 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,5 @@
{
"rust-analyzer.linkedProjects": [
"./kr2r/Cargo.toml"
]
}
3 changes: 2 additions & 1 deletion Cargo.toml
@@ -1,6 +1,7 @@
[workspace]
members = [
"ncbi","kr2r"
"ncbi",
"kr2r"
]

resolver = "2"
7 changes: 7 additions & 0 deletions README.md
@@ -47,3 +47,10 @@ Options:
-h, --help
Print help (see a summary with '-h')
```



## Bloom Filter

A Bloom filter is a space-efficient probabilistic data structure for testing whether an element is a member of a set.
It could be used here to check first whether a hash_value is present in the hash_table at all, filtering out roughly 60% of invalid reads before the full lookup. A sketch of this idea follows below.
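As a rough illustration of that pre-filtering idea (not part of this commit), the sketch below implements a tiny Bloom filter in plain Rust. The bit-array size, the number of probes, and the `DefaultHasher`-based seeded hashing are illustrative assumptions, not the planned design; the key property is that a `false` answer is always a true negative, so such reads can be skipped without touching the hash_table.

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

/// Minimal Bloom filter: an m-bit table probed by k seeded hash functions.
struct BloomFilter {
    bits: Vec<bool>,
    k: u64,
}

impl BloomFilter {
    fn new(m: usize, k: u64) -> Self {
        BloomFilter { bits: vec![false; m], k }
    }

    /// Derive k bit positions for an item by seeding the hasher differently each time.
    fn positions<T: Hash>(&self, item: &T) -> Vec<usize> {
        (0..self.k)
            .map(|seed| {
                let mut h = DefaultHasher::new();
                seed.hash(&mut h);
                item.hash(&mut h);
                (h.finish() as usize) % self.bits.len()
            })
            .collect()
    }

    fn insert<T: Hash>(&mut self, item: &T) {
        for i in self.positions(item) {
            self.bits[i] = true;
        }
    }

    /// May return a false positive, but never a false negative.
    fn maybe_contains<T: Hash>(&self, item: &T) -> bool {
        self.positions(item).into_iter().all(|i| self.bits[i])
    }
}

fn main() {
    let mut filter = BloomFilter::new(1 << 20, 3);
    let known_hash: u64 = 0x1234_5678; // stands in for a hash_value present in the hash_table
    filter.insert(&known_hash);
    assert!(filter.maybe_contains(&known_hash));

    let unknown_hash: u64 = 0x9999_9999;
    if !filter.maybe_contains(&unknown_hash) {
        // Definitely absent from the hash_table: skip the expensive lookup for this read.
        println!("skip hash_value {:#x}", unknown_hash);
    }
}
```

The false-positive rate is tuned by the bit-array size and the number of hash probes; a filter that rejects ~60% of lookups with few false positives stays far smaller than the hash_table itself.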
10 changes: 10 additions & 0 deletions kr2r/Cargo.toml
@@ -5,13 +5,23 @@ edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[[bin]]
name = "estimate_capacity"
path = "src/bin/estimate_capacity.rs"


[features]
default = ["dna"]
dna = []
protein = []


[dependencies]
clap = { version = "4.4.10", features = ["derive"] }
seq_io = "0.3.2"
hyperloglogplus = { version = "*", features = ["const-loop"] }
# mur3 = "0.1.0"
walkdir = "2"


[dev-dependencies]
109 changes: 109 additions & 0 deletions kr2r/src/bin/estimate_capacity.rs
@@ -0,0 +1,109 @@
use clap::{error::ErrorKind, Error, Parser};
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
use kr2r::mmscanner::{MinimizerScanner, BITS_PER_CHAR, DEFAULT_SPACED_SEED_MASK};
use kr2r::utils::{expand_spaced_seed_mask, find_library_fna_files};
use kr2r::KBuildHasher;
use seq_io::fasta::{Reader, Record};
use seq_io::parallel::read_parallel;
use std::path::PathBuf;
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc, Mutex,
};

#[derive(Parser, Debug)]
#[clap(
version,
about = "estimate capacity",
long_about = "Estimates the size of the Kraken 2 hash table."
)]
struct Args {
/// Directory from which the database is built
#[arg(long = "db", default_value = "lib")]
database: PathBuf,

/// Set length of k-mers; k must be a positive integer (e.g. k=35) and cannot be less than l
#[clap(short, long, value_parser = clap::value_parser!(u64).range(1..), required = true)]
k_mer: u64,

/// Set length of minimizers, 1 <= l <= 31
#[clap(short, long, value_parser = clap::value_parser!(u8).range(1..=31), required = true)]
l_mer: u8,

/// Set maximum qualifying hash code
#[clap(short, long, default_value = "4")]
n: usize,

/// Spaced seed mask
#[clap(short = 'S', long, default_value= "0", value_parser = parse_binary)]
spaced_seed_mask: u64,

/// Minimizer ordering toggle mask
#[clap(short = 'T', long, value_parser = parse_binary)]
toggle_mask: Option<u64>,

/// Read block size
#[clap(short = 'B', long, default_value = "31457280")]
block_size: usize,

/// Number of threads
#[clap(short = 'p', long, default_value = "4")]
threads: usize,
}

fn parse_binary(src: &str) -> Result<u64, std::num::ParseIntError> {
u64::from_str_radix(src, 2)
}

fn main() {
let mut args = Args::parse();
if args.k_mer < args.l_mer as u64 {
let err = Error::raw(ErrorKind::ValueValidation, "k cannot be less than l");
err.exit();
}
if args.spaced_seed_mask != DEFAULT_SPACED_SEED_MASK {
args.spaced_seed_mask =
expand_spaced_seed_mask(args.spaced_seed_mask, BITS_PER_CHAR as u64);
}
let fna_files = find_library_fna_files(args.database);
let hllp: HyperLogLogPlus<u64, _> = HyperLogLogPlus::new(16, KBuildHasher::default()).unwrap();

let hllp = Arc::new(Mutex::new(hllp));
let counter = Arc::new(AtomicUsize::new(0)); // initialize the atomic counter

for fna_file in fna_files {
println!("fna_file {:?}", fna_file);
let reader = Reader::from_path(fna_file).unwrap();
let counter_clone = counter.clone();
read_parallel(
reader,
4,
2,
|record_set| {
for record in record_set.into_iter() {
let k_mer = args.k_mer as usize;
let l_mer = args.l_mer as usize;
let mut scanner =
MinimizerScanner::default(record.seq().to_vec(), k_mer, l_mer);
scanner.set_spaced_seed_mask(args.spaced_seed_mask);
if let Some(toggle_mask) = args.toggle_mask {
scanner.set_toggle_mask(toggle_mask);
}
while let Some(minimizer) = scanner.next_minimizer() {
let mut hllp_clone = hllp.lock().unwrap();
hllp_clone.insert(&minimizer);
counter_clone.fetch_add(1, Ordering::SeqCst); // increment the counter
}
}
},
|_| {},
);
}

let final_count = counter.load(Ordering::SeqCst); // read the counter's final value

let mut hllp_clone = hllp.lock().unwrap();
let hllp_count = hllp_clone.count();
println!("Final count: {:?}", final_count);
println!("HLLP count: {:?}", hllp_count);
}
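For context on the approach in estimate_capacity.rs: HyperLogLog++ estimates the number of distinct minimizers, and hence the required hash-table capacity, in a small fixed amount of memory. The standalone sketch below (not part of this commit) mirrors the `new(16, ...)` / `insert` / `count` calls used above, but swaps in the standard-library `RandomState` hasher instead of `KBuildHasher`, purely for illustration.

```rust
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
use std::collections::hash_map::RandomState;

fn main() {
    // Precision 16 gives roughly 1.04/sqrt(2^16) ≈ 0.4% standard error, and the
    // sketch stays a fixed few tens of kilobytes no matter how many values are inserted.
    let mut hllp: HyperLogLogPlus<u64, _> =
        HyperLogLogPlus::new(16, RandomState::new()).unwrap();
    for key in 0u64..1_000_000 {
        hllp.insert(&key);
    }
    // count() reports an estimate of the number of *distinct* inserted values.
    println!("exact: 1000000, estimated: {:.0}", hllp.count());
}
```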