diff --git a/doc/command-line.md b/doc/command-line.md index 00a89018bb..9f5e5eec4e 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -620,6 +620,8 @@ with fields: If `--outdir` is specified, all of the signatures are placed in outdir. +Note: `split` only saves files in the JSON `.sig` format. + ### `sourmash signature merge` - merge two or more signatures into one Merge two (or more) signatures. @@ -637,6 +639,9 @@ then the merged signature will have the sum of all abundances across the individual signatures. The `--flatten` flag will override this behavior and allow merging of mixtures by removing all abundances. +Note: `merge` only creates one output file, with one signature in it, +in the JSON `.sig` format. + ### `sourmash signature rename` - rename a signature Rename the display name for one or more signatures - this is the name @@ -666,6 +671,9 @@ will subtract all of the hashes in `file2.sig` and `file3.sig` from To use `subtract` on signatures calculated with `-p abund`, you must specify `--flatten`. +Note: `subtract` only creates one output file, with one signature in it, +in the JSON `.sig` format. + ### `sourmash signature intersect` - intersect two (or more) signatures Output the intersection of the hash values in multiple signature files. @@ -682,6 +690,9 @@ The `intersect` command flattens all signatures, i.e. the abundances in any signatures will be ignored and the output signature will have `track_abundance` turned off. +Note: `intersect` only creates one output file, with one signature in it, +in the JSON `.sig` format. + ### `sourmash signature downsample` - decrease the size of a signature Downsample one or more signatures. @@ -773,6 +784,9 @@ sourmash signature import filename.msh.json -o imported.sig ``` will import the contents of `filename.msh.json` into `imported.sig`. +Note: `import` only creates one output file, with one signature in it, +in the JSON `.sig` format. + ### `sourmash signature export` - export signatures to mash. 
Export signatures from sourmash format. Currently only supports @@ -860,6 +874,25 @@ signatures from zip files. You can create a compressed collection of signatures using `zip -r collection.zip *.sig` and then specify `collections.zip` on the command line. +### Saving signatures, more generally + +As of sourmash 4.1, most signature saving arguments (`--save-matches` +for `search` and `gather`, `-o` for `sourmash sketch`, and most of the +`sourmash signature` commands) support flexible saving of collections of +signatures into JSON text, Zip files, and/or directories. + +This behavior is triggered by the requested output filename: + +* to save to JSON signature files, use `.sig`; `-` will send JSON to stdout. +* to save to gzipped JSON signature files, use `.sig.gz`. +* to save to a Zip file collection, use `.zip`. +* to save signature files to a directory, use a name ending in `/`; the directory will be created if it doesn't exist. + +If none of these file extensions is detected, output will be written in the JSON `.sig` format, either to the provided output filename or to stdout. + +All of these save formats can be loaded by sourmash commands, too. 
+ + ### Loading all signatures under a directory All of the `sourmash` commands support loading signatures from diff --git a/src/sourmash/cli/sig/cat.py b/src/sourmash/cli/sig/cat.py index 72840402bc..99d53090d7 100644 --- a/src/sourmash/cli/sig/cat.py +++ b/src/sourmash/cli/sig/cat.py @@ -12,7 +12,7 @@ def subparser(subparsers): help='suppress non-error output' ) subparser.add_argument( - '-o', '--output', metavar='FILE', + '-o', '--output', metavar='FILE', default='-', help='output signature to this file (default stdout)' ) subparser.add_argument( diff --git a/src/sourmash/cli/sig/downsample.py b/src/sourmash/cli/sig/downsample.py index b21d36a766..f9e94fd3f6 100644 --- a/src/sourmash/cli/sig/downsample.py +++ b/src/sourmash/cli/sig/downsample.py @@ -22,7 +22,8 @@ def subparser(subparsers): ) subparser.add_argument( '-o', '--output', metavar='FILE', - help='output signature to this file (default stdout)' + help='output signature to this file (default stdout)', + default='-', ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) diff --git a/src/sourmash/cli/sig/extract.py b/src/sourmash/cli/sig/extract.py index 3b9a7635de..d2066e8bcc 100644 --- a/src/sourmash/cli/sig/extract.py +++ b/src/sourmash/cli/sig/extract.py @@ -14,7 +14,8 @@ def subparser(subparsers): ) subparser.add_argument( '-o', '--output', metavar='FILE', - help='output signature to this file (default stdout)' + help='output signature to this file (default stdout)', + default='-', ) subparser.add_argument( '--md5', default=None, diff --git a/src/sourmash/cli/sig/filter.py b/src/sourmash/cli/sig/filter.py index 38442662ed..41c3ec0bce 100644 --- a/src/sourmash/cli/sig/filter.py +++ b/src/sourmash/cli/sig/filter.py @@ -14,7 +14,8 @@ def subparser(subparsers): ) subparser.add_argument( '-o', '--output', metavar='FILE', - help='output signature to this file (default stdout)' + help='output signature to this file (default stdout)', + default='-' ) subparser.add_argument( '--md5', type=str, 
default=None, diff --git a/src/sourmash/cli/sig/flatten.py b/src/sourmash/cli/sig/flatten.py index b01af8bd58..6bc5538bcf 100644 --- a/src/sourmash/cli/sig/flatten.py +++ b/src/sourmash/cli/sig/flatten.py @@ -14,7 +14,8 @@ def subparser(subparsers): ) subparser.add_argument( '-o', '--output', metavar='FILE', - help='output signature to this file (default stdout)' + help='output signature to this file (default stdout)', + default='-', ) subparser.add_argument( '--md5', default=None, diff --git a/src/sourmash/cli/sig/rename.py b/src/sourmash/cli/sig/rename.py index 5bd910076c..ea60dceabd 100644 --- a/src/sourmash/cli/sig/rename.py +++ b/src/sourmash/cli/sig/rename.py @@ -16,7 +16,9 @@ def subparser(subparsers): help='print debugging output' ) subparser.add_argument( - '-o', '--output', metavar='FILE', help='output to this file' + '-o', '--output', metavar='FILE', + help='output renamed signature to this file (default stdout)', + default='-' ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) diff --git a/src/sourmash/command_compute.py b/src/sourmash/command_compute.py index cc52be1b3a..92507b24b0 100644 --- a/src/sourmash/command_compute.py +++ b/src/sourmash/command_compute.py @@ -9,7 +9,7 @@ import time from . import sourmash_args -from .signature import SourmashSignature, save_signatures +from .signature import SourmashSignature from .logging import notify, error, set_quiet from .utils import RustObject from ._lowlevel import ffi, lib @@ -267,9 +267,23 @@ def set_sig_name(sigs, filename, name=None): def save_siglist(siglist, sigfile_name): + import sourmash + # save! 
- with sourmash_args.FileOutput(sigfile_name, 'w') as fp: - save_signatures(siglist, fp) + with sourmash_args.SaveSignaturesToLocation(sigfile_name) as save_sig: + for ss in siglist: + try: + save_sig.add(ss) + except sourmash.exceptions.Panic: + # this deals with a disconnect between the way Rust + # and Python handle signatures; Python expects one + # minhash (and hence one md5sum) per signature, while + # Rust supports multiple. For now, go through serializing + # and deserializing the signature! See issue #1167 for more. + json_str = sourmash.save_signatures([ss]) + for ss in sourmash.load_signatures(json_str): + save_sig.add(ss) + notify('saved signature(s) to {}. Note: signature license is CC0.', sigfile_name) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 1b66ed2263..399fd4d2b6 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -15,7 +15,8 @@ from . import signature as sig from . import sourmash_args from .logging import notify, error, print_results, set_quiet -from .sourmash_args import DEFAULT_LOAD_K, FileOutput, FileOutputCSV +from .sourmash_args import (DEFAULT_LOAD_K, FileOutput, FileOutputCSV, + SaveSignaturesToLocation) WATERMARK_SIZE = 10000 @@ -524,8 +525,10 @@ def search(args): # save matching signatures upon request if args.save_matches: notify('saving all matched signatures to "{}"', args.save_matches) - with FileOutput(args.save_matches, 'wt') as fp: - sig.save_signatures([ sr.match for sr in results ], fp) + + with SaveSignaturesToLocation(args.save_matches) as save_sig: + for sr in results: + save_sig.add(sr.match) def categorize(args): @@ -714,8 +717,9 @@ def gather(args): # save matching signatures? 
if found and args.save_matches: notify(f"saving all matches to '{args.save_matches}'") - with FileOutput(args.save_matches, 'wt') as fp: - sig.save_signatures([ r.match for r in found ], fp) + with SaveSignaturesToLocation(args.save_matches) as save_sig: + for sr in found: + save_sig.add(sr.match) # save unassigned hashes? if args.output_unassigned: diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 09f8d8cdea..667e74cfcb 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -70,9 +70,10 @@ def cat(args): encountered_md5sums = defaultdict(int) # used by --unique progress = sourmash_args.SignatureLoadingProgress() - siglist = [] + save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) + save_sigs.open() + for sigfile in args.signatures: - this_siglist = [] try: loader = sourmash_args.load_file_as_signatures(sigfile, progress=progress) @@ -85,19 +86,18 @@ def cat(args): if args.unique and encountered_md5sums[md5] > 1: continue - siglist.append(sig) + save_sigs.add(sig) except Exception as exc: error(str(exc)) error('(continuing)') notify('loaded {} signatures from {}...', n_loaded, sigfile, end='\r') - notify('loaded {} signatures total.', len(siglist)) + notify('loaded {} signatures total.', len(save_sigs)) - with FileOutput(args.output, 'wt') as fp: - sourmash.save_signatures(siglist, fp=fp) + save_sigs.close() - notify('output {} signatures', len(siglist)) + notify('output {} signatures', len(save_sigs)) multiple_md5 = [ 1 for cnt in encountered_md5sums.values() if cnt > 1 ] if multiple_md5: @@ -523,7 +523,9 @@ def rename(args): progress = sourmash_args.SignatureLoadingProgress() - outlist = [] + save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) + save_sigs.open() + for filename in args.sigfiles: debug('loading {}', filename) siglist = sourmash_args.load_file_as_signatures(filename, @@ -533,12 +535,11 @@ def rename(args): for sigobj in siglist: sigobj._name = args.name - 
outlist.append(sigobj) + save_sigs.add(sigobj) - with FileOutput(args.output, 'wt') as fp: - sourmash.save_signatures(outlist, fp=fp) + save_sigs.close() - notify("set name to '{}' on {} signatures", args.name, len(outlist)) + notify("set name to '{}' on {} signatures", args.name, len(save_sigs)) def extract(args): @@ -550,7 +551,9 @@ def extract(args): progress = sourmash_args.SignatureLoadingProgress() - outlist = [] + save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) + save_sigs.open() + total_loaded = 0 for filename in args.signatures: siglist = sourmash_args.load_file_as_signatures(filename, @@ -567,18 +570,18 @@ def extract(args): if args.name is not None: siglist = [ ss for ss in siglist if args.name in str(ss) ] - outlist.extend(siglist) + for ss in siglist: + save_sigs.add(ss) notify("loaded {} total that matched ksize & molecule type", total_loaded) - if not outlist: + if not save_sigs: error("no matching signatures!") sys.exit(-1) - with FileOutput(args.output, 'wt') as fp: - sourmash.save_signatures(outlist, fp=fp) + save_sigs.close() - notify("extracted {} signatures from {} file(s)", len(outlist), + notify("extracted {} signatures from {} file(s)", len(save_sigs), len(args.signatures)) @@ -591,7 +594,9 @@ def filter(args): progress = sourmash_args.SignatureLoadingProgress() - outlist = [] + save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) + save_sigs.open() + total_loaded = 0 for filename in args.signatures: siglist = sourmash_args.load_file_as_signatures(filename, @@ -628,27 +633,28 @@ def filter(args): ss.minhash = filtered_mh - outlist.extend(siglist) + save_sigs.add(ss) - with FileOutput(args.output, 'wt') as fp: - sourmash.save_signatures(outlist, fp=fp) + save_sigs.close() notify("loaded {} total that matched ksize & molecule type", total_loaded) - notify("extracted {} signatures from {} file(s)", len(outlist), + notify("extracted {} signatures from {} file(s)", len(save_sigs), len(args.signatures)) def 
flatten(args): """ - flatten a signature, removing abundances. + flatten one or more signatures, removing abundances. """ set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) progress = sourmash_args.SignatureLoadingProgress() - outlist = [] + save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) + save_sigs.open() + total_loaded = 0 for filename in args.signatures: siglist = sourmash_args.load_file_as_signatures(filename, @@ -667,15 +673,13 @@ def flatten(args): for ss in siglist: ss.minhash = ss.minhash.flatten() + save_sigs.add(ss) - outlist.extend(siglist) - - with FileOutput(args.output, 'wt') as fp: - sourmash.save_signatures(outlist, fp=fp) + save_sigs.close() notify("loaded {} total that matched ksize & molecule type", total_loaded) - notify("extracted {} signatures from {} file(s)", len(outlist), + notify("extracted {} signatures from {} file(s)", len(save_sigs), len(args.signatures)) @@ -694,9 +698,11 @@ def downsample(args): error('cannot specify both --num and --scaled') sys.exit(-1) + save_sigs = sourmash_args.SaveSignaturesToLocation(args.output) + save_sigs.open() + progress = sourmash_args.SignatureLoadingProgress() - output_list = [] total_loaded = 0 for sigfile in args.signatures: siglist = sourmash_args.load_file_as_signatures(sigfile, @@ -734,10 +740,9 @@ def downsample(args): sigobj.minhash = mh_new - output_list.append(sigobj) + save_sigs.add(sigobj) - with FileOutput(args.output, 'wt') as fp: - sourmash.save_signatures(output_list, fp=fp) + save_sigs.close() notify("loaded and downsampled {} signatures", total_loaded) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 5ad52587a4..9cb31e4625 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -7,6 +7,8 @@ import itertools from enum import Enum import traceback +import gzip +import zipfile import screed @@ -423,16 +425,18 @@ class FileOutput(object): will properly handle no argument or '-' as sys.stdout. 
""" - def __init__(self, filename, mode='wt', newline=None): + def __init__(self, filename, mode='wt', *, newline=None, encoding='utf-8'): self.filename = filename self.mode = mode self.fp = None self.newline = newline + self.encoding = encoding def open(self): if self.filename == '-' or self.filename is None: return sys.stdout - self.fp = open(self.filename, self.mode, newline=self.newline) + self.fp = open(self.filename, self.mode, newline=self.newline, + encoding=self.encoding) return self.fp def __enter__(self): @@ -535,3 +539,204 @@ def start_file(self, filename, loader): self.n_sig += n_this self.short_notify("loaded {} sigs from '{}'", n_this, filename) + + +# +# enum and classes for saving signatures progressively +# + +class _BaseSaveSignaturesToLocation: + "Base signature saving class. Track location (if any) and count." + def __init__(self, location): + self.location = location + self.count = 0 + + def __repr__(self): + raise NotImplementedError + + def __len__(self): + return self.count + + def __enter__(self): + "provide context manager functionality" + self.open() + return self + + def __exit__(self, type, value, traceback): + "provide context manager functionality" + self.close() + + def add(self, ss): + self.count += 1 + + +class SaveSignatures_NoOutput(_BaseSaveSignaturesToLocation): + "Do not save signatures." + def __repr__(self): + return 'SaveSignatures_NoOutput()' + + def open(self): + pass + + def close(self): + pass + + +class SaveSignatures_Directory(_BaseSaveSignaturesToLocation): + "Save signatures within a directory, using md5sum names." 
+ def __init__(self, location): + super().__init__(location) + + def __repr__(self): + return f"SaveSignatures_Directory('{self.location}')" + + def close(self): + pass + + def open(self): + try: + os.mkdir(self.location) + except FileExistsError: + pass + except Exception: + notify("ERROR: cannot create signature output directory '{}'", + self.location) + sys.exit(-1) + + def add(self, ss): + super().add(ss) + md5 = ss.md5sum() + + # don't overwrite even if duplicate md5sum + outname = os.path.join(self.location, f"{md5}.sig.gz") + if os.path.exists(outname): + i = 0 + while 1: + outname = os.path.join(self.location, f"{md5}_{i}.sig.gz") + if not os.path.exists(outname): + break + i += 1 + + with gzip.open(outname, "wb") as fp: + sig.save_signatures([ss], fp, compression=1) + + +class SaveSignatures_SigFile(_BaseSaveSignaturesToLocation): + "Save signatures to one JSON .sig file (gzipped if the name ends in .gz), or to stdout for '-'." + def __init__(self, location): + super().__init__(location) + self.keep = [] + self.compress = 0 + if self.location.endswith('.gz'): + self.compress = 1 + + def __repr__(self): + return f"SaveSignatures_SigFile('{self.location}')" + + def open(self): + pass + + def close(self): + if self.location == '-': + sourmash.save_signatures(self.keep, sys.stdout) + else: + # text mode? encode in utf-8 + mode = "w" + encoding = 'utf-8' + + # compressed? bytes & binary. + if self.compress: + encoding = None + mode = "wb" + + with open(self.location, mode, encoding=encoding) as fp: + sourmash.save_signatures(self.keep, fp, + compression=self.compress) + + def add(self, ss): + super().add(ss) + self.keep.append(ss) + + +class SaveSignatures_ZipFile(_BaseSaveSignaturesToLocation): + "Save compressed signatures in an uncompressed Zip file." 
+ def __init__(self, location): + super().__init__(location) + self.zf = None + + def __repr__(self): + return f"SaveSignatures_ZipFile('{self.location}')" + + def close(self): + self.zf.close() + + def open(self): + self.zf = zipfile.ZipFile(self.location, 'w', zipfile.ZIP_STORED) + + def _exists(self, name): + try: + self.zf.getinfo(name) + return True + except KeyError: + return False + + def add(self, ss): + assert self.zf + super().add(ss) + + md5 = ss.md5sum() + outname = f"signatures/{md5}.sig.gz" + + # don't overwrite even if duplicate md5sum; keep duplicates under signatures/ too. + if self._exists(outname): + i = 0 + while 1: + outname = f"signatures/{md5}_{i}.sig.gz" + if not self._exists(outname): + break + i += 1 + + json_str = sourmash.save_signatures([ss], compression=1) + self.zf.writestr(outname, json_str) + + +class SigFileSaveType(Enum): + SIGFILE = 1 + SIGFILE_GZ = 2 + DIRECTORY = 3 + ZIPFILE = 4 + NO_OUTPUT = 5 + +_save_classes = { + SigFileSaveType.SIGFILE: SaveSignatures_SigFile, + SigFileSaveType.SIGFILE_GZ: SaveSignatures_SigFile, + SigFileSaveType.DIRECTORY: SaveSignatures_Directory, + SigFileSaveType.ZIPFILE: SaveSignatures_ZipFile, + SigFileSaveType.NO_OUTPUT: SaveSignatures_NoOutput +} + + +def SaveSignaturesToLocation(filename, *, force_type=None): + """Create and return an appropriate object for progressive saving of + signatures.""" + save_type = None + if not force_type: + if filename is None: + save_type = SigFileSaveType.NO_OUTPUT + elif filename.endswith('/'): + save_type = SigFileSaveType.DIRECTORY + elif filename.endswith('.gz'): + save_type = SigFileSaveType.SIGFILE_GZ + elif filename.endswith('.zip'): + save_type = SigFileSaveType.ZIPFILE + else: + # default to SIGFILE intentionally! 
+ save_type = SigFileSaveType.SIGFILE + else: + save_type = force_type + + cls = _save_classes.get(save_type) + if cls is None: + raise Exception("invalid save type; this should never happen!?") + + return cls(filename) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 135efae65b..bc6eef7334 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1944,10 +1944,7 @@ def test_search_metagenome_downsample_containment(): def test_search_metagenome_downsample_index(c): # does same search as search_metagenome_downsample_containment but # rescales during indexing - # - # for now, this test should fail; we need to clean up some internal - # stuff before we can properly implement this! - # + testdata_glob = utils.get_test_data('gather/GCF*.sig') testdata_sigs = glob.glob(testdata_glob) @@ -1970,6 +1967,38 @@ def test_search_metagenome_downsample_index(c): assert '12 matches; showing first 3:' in str(c) +def test_search_metagenome_downsample_save_matches(runtmp): + c = runtmp + + # does same search as search_metagenome_downsample_containment but + # rescales during indexing + + testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_sigs = glob.glob(testdata_glob) + + query_sig = utils.get_test_data('gather/combined.sig') + + output_matches = runtmp.output('out.zip') + + # downscale during indexing, rather than during search. 
+ c.run_sourmash('index', 'gcf_all', *testdata_sigs, '-k', '21', + '--scaled', '100000') + + assert os.path.exists(c.output('gcf_all.sbt.zip')) + + c.run_sourmash('search', query_sig, 'gcf_all', '-k', '21', + '--containment', '--save-matches', output_matches) + print(c) + + # is a zip file + with zipfile.ZipFile(output_matches, "r") as zf: + assert list(zf.infolist()) + + # ...with 12 signatures: + saved = list(sourmash.load_file_as_signatures(output_matches)) + assert len(saved) == 12 + + def test_mash_csv_to_sig(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa.msh.dump') diff --git a/tests/test_sourmash_args.py b/tests/test_sourmash_args.py new file mode 100644 index 0000000000..667d016958 --- /dev/null +++ b/tests/test_sourmash_args.py @@ -0,0 +1,204 @@ +""" +Tests for functions in sourmash_args module. +""" +import os +import csv +import pytest +import gzip +import zipfile +import io +import contextlib + +import sourmash_tst_utils as utils +import sourmash +from sourmash import sourmash_args + + +def test_save_signatures_api_none(): + # save to sigfile + sig2 = utils.get_test_data('2.fa.sig') + ss2 = sourmash.load_one_signature(sig2, ksize=31) + sig47 = utils.get_test_data('47.fa.sig') + ss47 = sourmash.load_one_signature(sig47, ksize=31) + + with sourmash_args.SaveSignaturesToLocation(None) as save_sig: + print(repr(save_sig)) + save_sig.add(ss2) + save_sig.add(ss47) + + # nothing to test - no output! 
+ + +def test_save_signatures_to_location_1_sig(runtmp): + # save to sigfile.sig + sig2 = utils.get_test_data('2.fa.sig') + ss2 = sourmash.load_one_signature(sig2, ksize=31) + sig47 = utils.get_test_data('47.fa.sig') + ss47 = sourmash.load_one_signature(sig47, ksize=31) + + outloc = runtmp.output('foo.sig') + with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: + print(save_sig) + save_sig.add(ss2) + save_sig.add(ss47) + + saved = list(sourmash.load_file_as_signatures(outloc)) + assert ss2 in saved + assert ss47 in saved + assert len(saved) == 2 + + +def test_save_signatures_to_location_1_stdout(): + # save to stdout + sig2 = utils.get_test_data('2.fa.sig') + ss2 = sourmash.load_one_signature(sig2, ksize=31) + sig47 = utils.get_test_data('47.fa.sig') + ss47 = sourmash.load_one_signature(sig47, ksize=31) + + output_capture = io.StringIO() + with contextlib.redirect_stdout(output_capture): + with sourmash_args.SaveSignaturesToLocation("-") as save_sig: + save_sig.add(ss2) + save_sig.add(ss47) + + output = output_capture.getvalue() + + saved = list(sourmash.signature.load_signatures(output)) + assert ss2 in saved + assert ss47 in saved + assert len(saved) == 2 + + +def test_save_signatures_to_location_1_sig_is_default(runtmp): + # save to sigfile.txt + sig2 = utils.get_test_data('2.fa.sig') + ss2 = sourmash.load_one_signature(sig2, ksize=31) + sig47 = utils.get_test_data('47.fa.sig') + ss47 = sourmash.load_one_signature(sig47, ksize=31) + + outloc = runtmp.output('foo.txt') + with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: + print(save_sig) + save_sig.add(ss2) + save_sig.add(ss47) + + saved = list(sourmash.signature.load_signatures(outloc)) + assert ss2 in saved + assert ss47 in saved + assert len(saved) == 2 + + +def test_save_signatures_to_location_1_sig_gz(runtmp): + # save to sigfile.gz + sig2 = utils.get_test_data('2.fa.sig') + ss2 = sourmash.load_one_signature(sig2, ksize=31) + sig47 = utils.get_test_data('47.fa.sig') + ss47 = 
sourmash.load_one_signature(sig47, ksize=31) + + outloc = runtmp.output('foo.sig.gz') + with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: + print(save_sig) + save_sig.add(ss2) + save_sig.add(ss47) + + # can we open as a .gz file? + with gzip.open(outloc, "r") as fp: + print(save_sig) + fp.read() + + saved = list(sourmash.load_file_as_signatures(outloc)) + assert ss2 in saved + assert ss47 in saved + assert len(saved) == 2 + + +def test_save_signatures_to_location_1_zip(runtmp): + # save to sigfile.zip + sig2 = utils.get_test_data('2.fa.sig') + ss2 = sourmash.load_one_signature(sig2, ksize=31) + sig47 = utils.get_test_data('47.fa.sig') + ss47 = sourmash.load_one_signature(sig47, ksize=31) + + outloc = runtmp.output('foo.zip') + with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: + print(save_sig) + save_sig.add(ss2) + save_sig.add(ss47) + + # can we open as a .zip file? + with zipfile.ZipFile(outloc, "r") as zf: + assert list(zf.infolist()) + + saved = list(sourmash.load_file_as_signatures(outloc)) + assert ss2 in saved + assert ss47 in saved + assert len(saved) == 2 + + +def test_save_signatures_to_location_1_zip_dup(runtmp): + # save to sigfile.zip + sig2 = utils.get_test_data('2.fa.sig') + ss2 = sourmash.load_one_signature(sig2, ksize=31) + sig47 = utils.get_test_data('47.fa.sig') + ss47 = sourmash.load_one_signature(sig47, ksize=31) + + outloc = runtmp.output('foo.zip') + with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: + print(save_sig) + save_sig.add(ss2) + save_sig.add(ss47) + save_sig.add(ss2) + save_sig.add(ss47) + + # can we open as a .zip file? 
+ with zipfile.ZipFile(outloc, "r") as zf: + assert list(zf.infolist()) + + saved = list(sourmash.load_file_as_signatures(outloc)) + assert ss2 in saved + assert ss47 in saved + assert len(saved) == 4 + + +def test_save_signatures_to_location_1_dirout(runtmp): + # save to sigout/ (directory) + sig2 = utils.get_test_data('2.fa.sig') + ss2 = sourmash.load_one_signature(sig2, ksize=31) + sig47 = utils.get_test_data('47.fa.sig') + ss47 = sourmash.load_one_signature(sig47, ksize=31) + + outloc = runtmp.output('sigout/') + with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: + print(save_sig) + save_sig.add(ss2) + save_sig.add(ss47) + + assert os.path.isdir(outloc) + + saved = list(sourmash.load_file_as_signatures(outloc)) + assert ss2 in saved + assert ss47 in saved + assert len(saved) == 2 + + +def test_save_signatures_to_location_1_dirout_duplicate(runtmp): + # save to sigout/ (directory) + sig2 = utils.get_test_data('2.fa.sig') + ss2 = sourmash.load_one_signature(sig2, ksize=31) + sig47 = utils.get_test_data('47.fa.sig') + ss47 = sourmash.load_one_signature(sig47, ksize=31) + + outloc = runtmp.output('sigout/') + with sourmash_args.SaveSignaturesToLocation(outloc) as save_sig: + print(save_sig) + save_sig.add(ss2) + save_sig.add(ss47) + save_sig.add(ss2) + save_sig.add(ss47) + + assert os.path.isdir(outloc) + + saved = list(sourmash.load_file_as_signatures(outloc)) + assert ss2 in saved + assert ss47 in saved + assert len(saved) == 4 diff --git a/tests/test_sourmash_sketch.py b/tests/test_sourmash_sketch.py index 31e3a18ab5..e95e2583f0 100644 --- a/tests/test_sourmash_sketch.py +++ b/tests/test_sourmash_sketch.py @@ -357,6 +357,27 @@ def test_do_sourmash_sketchdna_output_valid_file(): for testdata in (testdata1, testdata2, testdata3)) +def test_do_sourmash_sketchdna_output_zipfile(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('short.fa') + testdata2 = utils.get_test_data('short2.fa') + testdata3 = 
utils.get_test_data('short3.fa') + + outfile = os.path.join(location, 'shorts.zip') + + status, out, err = utils.runscript('sourmash', + ['sketch', 'dna', '-o', outfile, + testdata1, + testdata2, testdata3], + in_directory=location) + + assert os.path.exists(outfile) + assert not out # stdout should be empty + + sigs = list(sourmash.load_file_as_signatures(outfile)) + assert len(sigs) == 3 + + def test_do_sourmash_sketchdna_output_stdout_valid(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa')