Skip to content

Commit

Permalink
extract then merge
Browse files Browse the repository at this point in the history
  • Loading branch information
alienzj committed Apr 25, 2024
1 parent 59c6e35 commit eb52cde
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 18 deletions.
62 changes: 62 additions & 0 deletions SemiBin/generate_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,68 @@ def combine_cov(cov_dir : str, bam_list, is_combined : bool): # bam_list : list[
data_split_cov = None
return data_cov, data_split_cov


def _read_sample_cov(csv_path, sample, separator):
    """
    Load one coverage CSV and keep only the rows that belong to `sample`.

    Parameters
    ----------
    csv_path : path of a coverage CSV whose first column holds contig names
               of the form ``<sample><separator><contig>``
    sample : sample name
    separator : string separating the sample name from the contig name

    Returns
    -------
    DataFrame restricted to the contigs of `sample`, indexed by the contig
    name with the ``<sample><separator>`` prefix removed.
    """
    import pandas as pd

    data = pd.read_csv(csv_path, index_col=0)
    prefix = sample + separator
    contig_names = data.index.astype(str)
    # Prefix match, not substring match: a substring test would make sample
    # "S1" also select the contigs of a sample called "XS1".
    part_data = data.loc[contig_names.str.startswith(prefix)].copy()
    # Drop only the leading "<sample><separator>" so that contig names which
    # themselves contain the separator are kept intact.
    part_data.index = [name[len(prefix):] for name in part_data.index.astype(str)]
    return part_data


def combine_sample_cov(sample: str, cov_dir: str, bam_list, is_combined: bool, separator):
    """
    generate cov/cov_split for specific sample in one file

    For every BAM file, the per-BAM coverage CSV found in `cov_dir` is read,
    restricted to the contigs of `sample`, and the per-BAM tables are then
    concatenated column-wise.

    Parameters
    ----------
    sample : sample name
    cov_dir : where coverage files are stored
    bam_list : list of BAM files
    is_combined : whether to process split files
    separator : separator between sample name and contig name

    Returns
    -------
    sample_cov : DataFrame
    sample_cov_split : DataFrame (if is_combined) or None (otherwise)
    """
    import pandas as pd
    import numpy as np

    covs = []
    split_covs = []
    for bam_index, bam_file in enumerate(bam_list):
        # Coverage files are named after the BAM basename and its position
        # in bam_list.
        bam_fname = os.path.split(bam_file)[-1]
        covs.append(_read_sample_cov(
            f'{cov_dir}/{bam_fname}_{bam_index}_data_cov.csv', sample, separator))
        if is_combined:
            split_covs.append(_read_sample_cov(
                f'{cov_dir}/{bam_fname}_{bam_index}_data_split_cov.csv', sample, separator))

    sample_cov = pd.concat(covs, axis=1)
    sample_cov.index = sample_cov.index.astype(str)
    if not is_combined:
        return sample_cov, None

    sample_cov_split = pd.concat(split_covs, axis=1)
    sample_cov_split.index = sample_cov_split.index.astype(str)
    # Scale each column by its mean rounded up to the next multiple of 100.
    abun_scale = (sample_cov_split.mean() / 100).apply(np.ceil) * 100
    sample_cov_split = sample_cov_split.div(abun_scale)
    return sample_cov, sample_cov_split


def generate_cov_from_abundances(abundances, output, contig_path, contig_threshold=1000, sep=None, contig_threshold_dict=None):
import pandas as pd
import numpy as np
Expand Down
24 changes: 6 additions & 18 deletions SemiBin/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from . import utils
from .utils import validate_normalize_args, get_must_link_threshold, generate_cannot_link, \
set_random_seed, process_fasta, split_data, get_model_path, extract_bams
from .generate_coverage import generate_cov, combine_cov, generate_cov_from_abundances
from .generate_coverage import generate_cov, combine_cov, combine_sample_cov, generate_cov_from_abundances
from .generate_kmer import generate_kmer_features_from_fasta
from .fasta import fasta_iter

Expand Down Expand Up @@ -978,29 +978,17 @@ def fasta_sample_iter(fn):
sys.exit(1)

# Generate cov features for every sample
data_cov, data_split_cov = combine_cov(os.path.join(args.output, 'samples'), args.bams, is_combined)
if is_combined:
data_split_cov = data_split_cov.reset_index()
columns_list = list(data_split_cov.columns)
columns_list[0] = 'contig_name'
data_split_cov.columns = columns_list

data_cov = data_cov.reset_index()
columns_list = list(data_cov.columns)
columns_list[0] = 'contig_name'
data_cov.columns = columns_list

for sample in sample_list:
output_path = os.path.join(args.output, 'samples', sample)
os.makedirs(output_path, exist_ok=True)

part_data = split_data(data_cov, sample, args.separator, is_combined)
part_data.to_csv(os.path.join(output_path, 'data_cov.csv'))
sample_cov, sample_cov_split = combine_sample_cov(
sample, os.path.join(args.output, "samples"), args.bams, is_combined, args.separator)

sample_cov.to_csv(os.path.join(output_path, 'data_cov.csv'))

if is_combined:
part_data = split_data(data_split_cov, sample, args.separator, is_combined)
part_data.to_csv(os.path.join(
output_path, 'data_split_cov.csv'))
sample_cov_split.to_csv(os.path.join(output_path, 'data_split_cov.csv'))

sample_contig_fasta = os.path.join(
args.output, f'samples/{sample}.fa')
Expand Down

0 comments on commit eb52cde

Please sign in to comment.