From 25d308c9ea737e1d391dddef30974b3c204fa5d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Startek?= Date: Mon, 23 Sep 2024 19:45:25 +0200 Subject: [PATCH] Other formats --- scripts/opentims_extract_tdf.py | 114 ++++++++++++++++++++++++-------- 1 file changed, 86 insertions(+), 28 deletions(-) diff --git a/scripts/opentims_extract_tdf.py b/scripts/opentims_extract_tdf.py index 3bb1057..6077d3b 100755 --- a/scripts/opentims_extract_tdf.py +++ b/scripts/opentims_extract_tdf.py @@ -6,51 +6,109 @@ from opentimspy import OpenTIMS if opentimspy.bruker_bridge_present: - all_columns = ('frame','scan','tof','intensity','mz','inv_ion_mobility','retention_time') + all_columns = ( + "frame", + "scan", + "tof", + "intensity", + "mz", + "inv_ion_mobility", + "retention_time", + ) else: - all_columns = ('frame','scan','tof','intensity','retention_time') + all_columns = ("frame", "scan", "tof", "intensity", "retention_time") import argparse -parser = argparse.ArgumentParser(description='Export a set of frames in TSV format.') +parser = argparse.ArgumentParser(description="Export a set of frames in TSV format.") parser.add_argument("path", help="TDF dataset path", type=Path) -parser.add_argument("frames", help="Comma-separated list of frames, including ranges. Example: 314,320-330,435. Will output everything if omitted.", nargs='?', default="") +parser.add_argument( + "frames", + help="Comma-separated list of frames, including ranges. Example: 314,320-330,435. Will output everything if omitted.", + nargs="?", + default="", +) parser.add_argument("--no-hdr", help="Do not print the header.", action="store_true") -parser.add_argument("-o", "--output", help="File to output to. Will print to stdout if omitted", type=Path, default=None) -args=parser.parse_args() +parser.add_argument( + "-o", + "--output", + help="File to output to. Will print to stdout if omitted", + type=Path, + default=None, +) +parser.add_argument( + "--format", + type=str, + help="Output format, one of: mmapped_df (fastest), hdf5, csv (default).", + required=False, + default="csv", +) +args = parser.parse_args() + +import tqdm frames = set() if args.frames != "": - for frame_desc in args.frames.split(','): - if '-' in frame_desc: - start, end = frame_desc.split('-') - frames.update(range(int(start), int(end)+1)) + for frame_desc in args.frames.split(","): + if "-" in frame_desc: + start, end = frame_desc.split("-") + frames.update(range(int(start), int(end) + 1)) else: frames.add(int(frame_desc)) -out_file = sys.stdout if args.output is None else args.output.open(mode="wt") +if args.format == "mmapped_df": + try: + import mmapped_df + except ImportError: + print( + "mmapped_df is not installed. Please install it from https://github.com/michalsta/mmapped_df, or use a different format." + ) + sys.exit(1) + opener = lambda: mmapped_df.DatasetWriter(args.output) + writer = lambda f, df: f.append(**df) +elif args.format == "hdf5": + import pandas as pd -with OpenTIMS(args.path) as D: - if args.frames == "": - frames = set(D.frames['Id']) + opener = lambda: pd.HDFStore(args.output, mode="w", complevel=9, complib="blosc") + writer = lambda f, df: pd.DataFrame(df).to_hdf( + f, + key="df", + format="table", + data_columns=True, + complevel=9, + complib="blosc", + append=True, + ) +elif args.format == "csv": + import pandas as pd + + out_file = sys.stdout if args.output is None else args.output.open(mode="wt") + opener = lambda: out_file + hdr_written = False + + def writer(f, df): + global hdr_written + df = pd.DataFrame(df) + df.to_csv(f, header=not hdr_written) + hdr_written = True + +else: + raise ValueError( + f"Invalid format: {args.format}. Please specify one of: mmapped_df, hdf5, csv." + ) - # prepare and print the CSV header: - if not args.no_hdr: - header = '"' + '"\t"'.join(all_columns) + '"' - print(header, file=out_file) +progressbar = lambda x: x +if args.output != None: + progressbar = tqdm.tqdm + +with OpenTIMS(args.path) as D, opener() as f: + if args.frames == "": + frames = set(D.frames["Id"]) # Iterate over frames. This will store only one frame at a time in RAM, preventing out of memory errors. - for frame_id in sorted(frames): + for frame_id in progressbar(list(sorted(frames))): frame = D.query(frame_id) - peak_idx = 0 - # Frame is stored as a dict of column vectors - while peak_idx < len(frame['frame']): - row = [str(frame[colname][peak_idx]) for colname in all_columns] - print('\t'.join(row), file=out_file) - peak_idx += 1 - -if args.output is not None: - out_file.close() + writer(f, frame)