Running SemiBin2 with abundance information from strobealign-aemb
psj1997 committed Feb 26, 2024
2 parents 97f6122 + 02eb2c8 commit 2f8d572
Showing 5 changed files with 63 additions and 7 deletions.
1 change: 1 addition & 0 deletions ChangeLog
@@ -3,6 +3,7 @@ Unreleased
 * SemiBin1: Introduce separate SemiBin1 command
 * internal: Code simplification and refactor
 * deprecation: Deprecate --orf-finder=fraggenescan option
+* SemiBin: do not use more processes than can be taken advantage of (#155)
 
 Version 2.0.2 Oct 31 2023 by BigDataBiology
 * multi_easy_bin: Fix multi_easy_bin with --write-pre-recluster (#128)
4 changes: 2 additions & 2 deletions SemiBin/cluster.py
@@ -17,8 +17,8 @@ def run_infomap(g, edge_weights, vertex_weights, num_process):
     '''Run infomap, using multiple processors (if available)'''
     if num_process == 1:
         return g.community_infomap(edge_weights=edge_weights, vertex_weights=vertex_weights, trials=NR_INFOMAP_TRIALS)
-    import multiprocessing
-    with multiprocessing.Pool(num_process) as p:
+    import multiprocessing as mp
+    with mp.Pool(min(num_process, NR_INFOMAP_TRIALS)) as p:
         rs = [p.apply_async(run_infomap1, (g, edge_weights, vertex_weights, 1))
               for _ in range(NR_INFOMAP_TRIALS)]
         p.close()
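Editor's note: the cluster.py change caps the pool size at NR_INFOMAP_TRIALS because only that many tasks are ever submitted, so any extra workers would sit idle. A minimal sketch of the capping pattern, with a placeholder trial function standing in for SemiBin's run_infomap1:

    import multiprocessing as mp

    N_TRIALS = 10  # stands in for NR_INFOMAP_TRIALS

    def one_trial(seed):
        return seed * seed  # placeholder for a single infomap run

    def run_trials(num_process):
        # Capping the pool at the task count means requesting, say, 32
        # processes still creates only N_TRIALS workers.
        with mp.Pool(min(num_process, N_TRIALS)) as p:
            rs = [p.apply_async(one_trial, (s,)) for s in range(N_TRIALS)]
            p.close()
            p.join()
            return [r.get() for r in rs]

    if __name__ == '__main__':
        print(run_trials(num_process=32))  # still only 10 workers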
39 changes: 39 additions & 0 deletions SemiBin/main.py
@@ -773,6 +773,44 @@ def generate_sequence_features_single(logger, contig_fasta,
         logger.info('We will only calculate k-mer features.')
 
     if not only_kmer:
+        n_sample = len(bams)
+        is_combined = n_sample >= 5
+        bam_list = bams
+
+        logger.info('Calculating coverage for every sample.')
+
+        with Pool(min(num_process, len(bams))) as pool:
+            results = [
+                pool.apply_async(
+                    generate_cov,
+                    args=(
+                        bam_file,
+                        bam_index,
+                        output,
+                        must_link_threshold,
+                        is_combined,
+                        binned_length,
+                        logger,
+                        None
+                    ))
+                for bam_index, bam_file in enumerate(bams)]
+            for r in results:
+                s = r.get()
+                logger.info(f'Processed: {s}')
+
+        for bam_index, bam_file in enumerate(bams):
+            if not os.path.exists(os.path.join(output, '{}_data_cov.csv'.format(
+                    os.path.split(bam_file)[-1] + '_{}'.format(bam_index)))):
+                sys.stderr.write(
+                    "Error: Generating coverage file failed\n")
+                sys.exit(1)
+            if is_combined:
+                if not os.path.exists(os.path.join(output, '{}_data_split_cov.csv'.format(
+                        os.path.split(bam_file)[-1] + '_{}'.format(bam_index)))):
+                    sys.stderr.write(
+                        "Error: Generating coverage file failed\n")
+                    sys.exit(1)
+
     logger.debug('Start generating kmer features from fasta file.')
     kmer_whole = generate_kmer_features_from_fasta(
         contig_fasta, binned_length, 4)
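Editor's note: the coverage step above fans out one generate_cov task per BAM file and caps the pool at len(bams); calling r.get() on each AsyncResult both collects the result and re-raises any exception thrown inside a worker, so failures are not silently lost. A hedged sketch of this fan-out/collect pattern (compute_coverage is a placeholder for generate_cov, not SemiBin's API):

    from multiprocessing import Pool

    def compute_coverage(bam_file, bam_index):
        # placeholder: would parse the BAM and write its *_data_cov.csv
        return f'{bam_file}_{bam_index}'

    def coverage_for_all(bams, num_process):
        # one task per file, so a pool larger than len(bams) gains nothing
        with Pool(min(num_process, len(bams))) as pool:
            results = [pool.apply_async(compute_coverage, args=(b, i))
                       for i, b in enumerate(bams)]
            # get() blocks until each task finishes and propagates
            # any worker exception to the caller
            return [r.get() for r in results]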
@@ -922,6 +960,7 @@ def fasta_sample_iter(fn):
                 os.path.join(args.output, f'samples/{sample}.fa'),
                 args.ratio)
 
+
     if args.bams:
         with Pool(args.num_process if args.num_process != 0 else None) as pool:
             results = [
8 changes: 7 additions & 1 deletion SemiBin/naive_orffinder.py
@@ -9,6 +9,12 @@
 STOP_CODONS = ['TAA', 'TAG', 'TGA']
 MIN_LEN = 90
 
+# See https://github.com/BigDataBiology/SemiBin/issues/155
+# A bit of testing showed that scaling tops out at 10~12 processes (on a 16-core
+# machine). This leaves a little leeway for a few more processes, but still
+# caps it to avoid #155
+MAX_PROCESS_ORFS = 24
+
 def reverse_complement(seq):
     return seq[::-1].translate(str.maketrans('ATGC', 'TACG'))
 
@@ -123,7 +129,7 @@ def run_naiveorf(fasta_path, num_process, output):
                     out.write(f'{translate(extract(seq, orf))}\n')
     else:
         ctx = mp.get_context('spawn')
-        with ctx.Pool(processes=num_process) as p:
+        with ctx.Pool(processes=min(num_process, MAX_PROCESS_ORFS)) as p:
             outs = p.imap(get_orfs,
                           fasta.fasta_iter(fasta_path),
                           chunksize=8)
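Editor's note: the comment in the hunk above records the measurement behind the cap — throughput stopped improving at roughly 10~12 processes on a 16-core machine, so MAX_PROCESS_ORFS = 24 leaves headroom while still bounding the worker count. A sketch of the spawn-context pool with batched imap, under the assumption that find_orfs stands in for SemiBin's get_orfs:

    import multiprocessing as mp

    MAX_PROCS = 24  # mirrors MAX_PROCESS_ORFS

    def find_orfs(record):
        header, seq = record
        return header, len(seq)  # placeholder for real ORF calling

    def run_all(records, num_process):
        # 'spawn' starts fresh interpreters, so workers do not inherit
        # the parent's (possibly large) in-memory state
        ctx = mp.get_context('spawn')
        with ctx.Pool(processes=min(num_process, MAX_PROCS)) as p:
            # chunksize=8 ships records in batches of eight per IPC
            # round-trip, cutting overhead for cheap per-record work
            return list(p.imap(find_orfs, records, chunksize=8))

    if __name__ == '__main__':
        # the __main__ guard is required with the 'spawn' start method
        print(run_all([('contig1', 'ATGAAATGA')], num_process=4))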
18 changes: 14 additions & 4 deletions test/test_bin.py
@@ -94,6 +94,14 @@ def test_cluster():
         res,
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
 
+def _renumber_cluster(xs):
+    ys = np.zeros_like(xs)
+    renumber = {}
+    for ix, x in enumerate(xs):
+        if x not in renumber:
+            renumber[x] = len(renumber)
+        ys[ix] = renumber[x]
+    return ys
 
 def test_recluster():
     contig_dict = {h: seq for h, seq in fasta_iter('test/bin_data/input.fasta')}
@@ -116,10 +124,12 @@ def test_recluster():
                           random_seed=123)
 
     # Computed with a previous version
-    np.testing.assert_array_equal(
-        reclustered,
-        np.array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 29, 21, 22, 23, 24, 25, 26,
+    expected = np.array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 29, 21, 22, 23, 24, 25, 26,
        26, 28, 27, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 10, 11, 12, 13,
-       14, 15, 16, 17, 18, 19]))
+       14, 15, 16, 17, 18, 19])
+
+    np.testing.assert_array_equal(
+        _renumber_cluster(reclustered),
+        _renumber_cluster(expected))
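
Editor's note: the test change makes the comparison robust to label permutations — cluster IDs coming out of reclustering are arbitrary, and only the grouping of contigs matters, so both sides are first renumbered in order of first appearance. A small illustration using the _renumber_cluster helper added above (the arrays are editor's examples, not test data):

    import numpy as np

    a = np.array([5, 5, 2, 2, 7])
    b = np.array([1, 1, 0, 0, 9])
    # both renumber to [0, 0, 1, 1, 2], so the assertion passes even
    # though the raw labels differ
    np.testing.assert_array_equal(_renumber_cluster(a), _renumber_cluster(b))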

