reduce initial flight and fix some params for refining
rhysnewell committed Nov 3, 2023
1 parent c232fc4 · commit 3ff27f5
Showing 6 changed files with 38 additions and 22 deletions.
flight.yml: 2 changes (1 addition, 1 deletion)
@@ -3,7 +3,7 @@ channels:
- bioconda
- defaults
dependencies:
- python>=3.9
- python>=3.8,<=3.10
- joblib>=1,<1.2 # <1.2 because of https://github.com/scikit-learn-contrib/hdbscan/pull/563
- tbb
- hdbscan
flight/flight.py: 7 changes (7 additions, 0 deletions)
@@ -332,6 +332,13 @@ def main():
''')

## Main input array. Coverages from CoverM contig
refine_options.add_argument(
'--output_prefix',
help='The prefix for the output files',
dest='output_prefix',
required=False,
default="refined_bins"
)

refine_options.add_argument(
'--checkm_file',
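The new --output_prefix option added above is a plain argparse flag with a default of "refined_bins". A minimal, standalone sketch of how such an option behaves; the parser here is illustrative and stands in for flight's actual refine argument group, not its real CLI wiring:

import argparse

# Illustrative parser standing in for flight's refine argument group.
parser = argparse.ArgumentParser(prog="flight-refine-sketch")
parser.add_argument(
    '--output_prefix',
    help='The prefix for the output files',
    dest='output_prefix',
    required=False,
    default="refined_bins"
)

# With the flag omitted, the default prefix is used.
print(parser.parse_args([]).output_prefix)                              # refined_bins
# An explicit prefix overrides the default.
print(parser.parse_args(['--output_prefix', 'my_bins']).output_prefix)  # my_bins

Leaving the flag off keeps the previous behaviour, while an explicit prefix is passed straight through to the bin-writing step changed further down in rosella.py.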
flight/rosella/binning.py: 6 changes (4 additions, 2 deletions)
@@ -795,16 +795,18 @@ def rescue_contigs(self, min_bin_size=200000):
max_bin_id += 1


def write_bins(self, min_bin_size=200000):
def write_bins(self, min_bin_size=200000, output_prefix=None):
# self.bins = {k.item():v if isinstance(k, np.int64) else k:v for k,v in self.bins.items()}
writing_bins = {}
for key, value in self.bins.items():
if isinstance(key, int64):
writing_bins[key.item()] = value
else:
writing_bins[key] = value
if output_prefix is None:
output_prefix = "rosella_bins"

with open(self.path + '/rosella_bins.json', 'w') as fp:
with open(self.path + f"/{output_prefix}.json", 'w') as fp:
json.dump(writing_bins, fp, cls=NpEncoder)


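The write_bins change falls back to the old file name when no prefix is supplied, so existing callers keep producing rosella_bins.json. A rough, self-contained sketch of that fallback; NpEncoder below is a simplified stand-in for the encoder used in binning.py, and the function name is illustrative:

import json
import numpy as np

class NpEncoder(json.JSONEncoder):
    # Simplified stand-in: coerce numpy scalars to plain Python types for JSON.
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return super().default(obj)

def write_bins_sketch(bins, path, output_prefix=None):
    # Same fallback as the diff: keep the historical file name by default.
    if output_prefix is None:
        output_prefix = "rosella_bins"
    with open(path + f"/{output_prefix}.json", "w") as fp:
        json.dump(bins, fp, cls=NpEncoder)

# write_bins_sketch({1: ["contig_1", "contig_2"]}, ".", output_prefix="refined_bins")
# writes ./refined_bins.json

The loop above the fallback in the diff converts numpy int64 dictionary keys to plain ints before dumping, since json.dump rejects non-native key types and the encoder's default hook is only consulted for values, not keys.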
flight/rosella/rosella.py: 40 changes (24 additions, 16 deletions)
@@ -263,12 +263,12 @@ def perform_binning(self, args):
y_max, 0))
self.bin_contigs()

self.findem = [
'RL|S1|C13963', 'RL|S1|C11210', 'RL|S1|C12411', 'RL|S1|C13372', 'RL|S1|C14115', 'RL|S1|C16600', 'RL|S1|C17450',
'contig_810_pilon', 'scaffold_1358_pilon', # Ret
# 'contig_3_pilon'
'contig_17512_pilon' # AalE
]
# self.findem = [
# 'RL|S1|C13963', 'RL|S1|C11210', 'RL|S1|C12411', 'RL|S1|C13372', 'RL|S1|C14115', 'RL|S1|C16600', 'RL|S1|C17450',
# 'contig_810_pilon', 'scaffold_1358_pilon', # Ret
# # 'contig_3_pilon'
# 'contig_17512_pilon' # AalE
# ]
self.plot(
None,
self.findem
@@ -284,8 +284,8 @@ def perform_binning(self, args):
# self.embed_unbinned("unbinned_1")
# logging.info("Refining bins...")
# self.quick_filter(plots, 0, 1, x_min, x_max, y_min, y_max)
self.slow_refine(plots, 0, 5, x_min, x_max, y_min, y_max)
self.big_contig_filter(plots, 0, 3, x_min, x_max, y_min, y_max)
# self.slow_refine(plots, 0, 5, x_min, x_max, y_min, y_max)
# self.big_contig_filter(plots, 0, 3, x_min, x_max, y_min, y_max)
# self.quick_filter(plots, 0, 1, x_min, x_max, y_min, y_max)

logging.info("Third embedding.")
@@ -295,9 +295,9 @@ def perform_binning(self, args):
# self.dissolve_bins(5e5)
# self.embed_unbinned("unbinned_2")
# self.slow_refine(plots, 0, 2, x_min, x_max, y_min, y_max)
self.dissolve_bins(1e6)
self.embed_unbinned(self.findem, "unbinned_2", switches)
self.embed_unbinned(self.findem, "unbinned_3", switches)
# self.dissolve_bins(1e6)
# self.embed_unbinned(self.findem, "unbinned_2", switches)
# self.embed_unbinned(self.findem, "unbinned_3", switches)
# self.slow_refine(plots, 0, 0, x_min, x_max, y_min, y_max)
# self.big_contig_filter(plots, 0, 2, x_min, x_max, y_min, y_max)
# self.dissolve_bins(1e6)
@@ -419,6 +419,7 @@ def perform_refining(self, args):
args.bin_extension,
self.checkm_file
)
logging.info(f"Bin stats: {input_bin_stats}")
self.disconnected = np.array([False for _ in range(self.large_contigs.shape[0])])
self.disconnected_intersected = np.array([False for _ in range(self.large_contigs.shape[0])])
self.embeddings = np.random.rand(self.large_contigs.shape[0], 2)
@@ -435,7 +436,7 @@ def perform_refining(self, args):
self.big_contig_filter(plots, 0, 3, x_min, x_max, y_min, y_max)
self.bin_filtered(int(args.min_bin_size), keep_unbinned=False, unbinned_only=False)
logging.info(f"Writing bins... {len(self.bins.keys())}")
self.write_bins(int(args.min_bin_size))
self.write_bins(int(args.min_bin_size), args.output_prefix)
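The refining path appears to skip the disconnection and embedding stages that a full binning run performs, so the hunk above seeds their outputs with placeholders: all-False disconnect masks and random 2D coordinates, one row per large contig. A small numpy illustration of that initialization pattern, with a made-up contig count:

import numpy as np

n_contigs = 10  # stand-in for self.large_contigs.shape[0]

# No contig is marked as disconnected when refining pre-existing bins.
disconnected = np.array([False for _ in range(n_contigs)])
disconnected_intersected = np.array([False for _ in range(n_contigs)])

# Placeholder 2D embedding coordinates so downstream code that expects them still runs.
embeddings = np.random.rand(n_contigs, 2)

print(disconnected.shape, embeddings.shape)  # (10,) (10, 2)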



@@ -463,8 +464,8 @@ def retrieve_stats(coverage_file, bin_paths=None, bin_folder=None, bin_extension

for bin_index, fasta_path in enumerate(bin_paths):
bin_index += 1
bin_id = fasta_path.split("/")[-1]
bin_id = os.path.splitext(bin_id)[0]
bin_id_ext = fasta_path.split("/")[-1]
bin_id = os.path.splitext(bin_id_ext)[0]

contig_ids = []
for sequence in SeqIO.parse(open(fasta_path), "fasta"):
@@ -483,8 +484,15 @@
# checkm1 uses Bin Id
checkm_stats = checkm_file[checkm_file["Bin Id"] == bin_id]
except KeyError:
# checkm2 uses Name
checkm_stats = checkm_file[checkm_file["Name"] == bin_id]
try:
# checkm2 uses Name
checkm_stats = checkm_file[checkm_file["Name"] == bin_id]
except KeyError:
# amber uses BINID and contains full path from when it was run
# so match on endswith
checkm_stats = checkm_file[checkm_file["BINID"].str.endswith(bin_id_ext)]
checkm_stats["Contamination"] = (1 - checkm_stats["precision_bp"]) * 100
checkm_stats["Completeness"] = checkm_stats["recall_bp"] * 100

output_dict["completeness"].append(checkm_stats["Completeness"].values[0])
output_dict["contamination"].append(checkm_stats["Contamination"].values[0])
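The nested try/except added above handles three quality-report formats that label bins differently: CheckM1 uses "Bin Id", CheckM2 uses "Name", and AMBER uses "BINID" holding the full path (hence the endswith match on the file name with its extension) and reports precision/recall rather than contamination/completeness. A standalone pandas sketch of the same lookup; the column names follow the diff, while the function, frame, and values are made up for illustration:

import pandas as pd

def lookup_bin_stats(report: pd.DataFrame, bin_id: str, bin_id_ext: str) -> pd.DataFrame:
    # Return a bin's completeness/contamination regardless of report flavour.
    try:
        # CheckM1 labels bins with "Bin Id"
        stats = report[report["Bin Id"] == bin_id]
    except KeyError:
        try:
            # CheckM2 labels bins with "Name"
            stats = report[report["Name"] == bin_id]
        except KeyError:
            # AMBER stores the full path in "BINID", so match on the file name,
            # then derive the CheckM-style percentages from precision/recall.
            stats = report[report["BINID"].str.endswith(bin_id_ext)].copy()
            stats["Contamination"] = (1 - stats["precision_bp"]) * 100
            stats["Completeness"] = stats["recall_bp"] * 100
    return stats[["Completeness", "Contamination"]]

# Made-up AMBER-style rows, for illustration only.
amber = pd.DataFrame({
    "BINID": ["/runs/assembly/bin_1.fna", "/runs/assembly/bin_2.fna"],
    "precision_bp": [0.98, 0.90],
    "recall_bp": [0.95, 0.80],
})
print(lookup_bin_stats(amber, "bin_1", "bin_1.fna"))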
flight/rosella/validating.py: 3 changes (1 addition, 2 deletions)
@@ -1,7 +1,6 @@
#!/usr/bin/env python
###############################################################################
# binning.py - A binning algorithm spinning off of the methodology of
# Lorikeet
# validating.py
###############################################################################
# #
# This program is free software: you can redistribute it and/or modify #
setup.py: 2 changes (1 addition, 1 deletion)
@@ -39,7 +39,7 @@ def get_version(relpath):
include_package_data=True,
install_requires= [
"umap-learn >= 0.5.3",
"numpy <= 1.21",
"numpy <= 1.23",
"scikit-learn == 1.0.2",
"scipy == 1.8.1",
"scikit-bio >= 0.5.7",
