Skip to content

Commit

Permalink
Other formats
Browse files Browse the repository at this point in the history
  • Loading branch information
michalsta committed Sep 23, 2024
1 parent f0d55f0 commit 25d308c
Showing 1 changed file with 86 additions and 28 deletions.
114 changes: 86 additions & 28 deletions scripts/opentims_extract_tdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,51 +6,109 @@
from opentimspy import OpenTIMS

# Columns known to this script. "mz" and "inv_ion_mobility" are listed only
# when opentimspy reports that the Bruker bridge is present.
if opentimspy.bruker_bridge_present:
    all_columns = (
        "frame",
        "scan",
        "tof",
        "intensity",
        "mz",
        "inv_ion_mobility",
        "retention_time",
    )
else:
    all_columns = ("frame", "scan", "tof", "intensity", "retention_time")

import argparse

# Command-line interface: a dataset path, an optional frame selection, and
# output options (header suppression, destination file, output format).
parser = argparse.ArgumentParser(description="Export a set of frames in TSV format.")
parser.add_argument("path", help="TDF dataset path", type=Path)
parser.add_argument(
    "frames",
    help="Comma-separated list of frames, including ranges. Example: 314,320-330,435. Will output everything if omitted.",
    nargs="?",
    default="",
)
parser.add_argument("--no-hdr", help="Do not print the header.", action="store_true")
parser.add_argument(
    "-o",
    "--output",
    help="File to output to. Will print to stdout if omitted",
    type=Path,
    default=None,
)
parser.add_argument(
    "--format",
    type=str,
    help="Output format, one of: mmapped_df (fastest), hdf5, csv (default).",
    required=False,
    default="csv",
    # Reject unknown formats at parse time with a regular usage error.
    choices=("mmapped_df", "hdf5", "csv"),
)

args = parser.parse_args()

import tqdm

# Expand the frame selection ("314,320-330,435") into a set of frame ids.
# An empty selection leaves the set empty here.
frames = set()
for frame_desc in args.frames.split(","):
    frame_desc = frame_desc.strip()
    if not frame_desc:
        # Tolerate an omitted selection and stray/trailing commas.
        continue
    if "-" in frame_desc:
        # Ranges are inclusive on both ends: "320-330" covers 320..330.
        start, end = frame_desc.split("-", 1)
        frames.update(range(int(start), int(end) + 1))
    else:
        frames.add(int(frame_desc))


# Pick the output backend for the requested --format. Each branch defines:
#   opener()      -> context manager yielding the output handle
#   writer(f, df) -> appends one frame (a dict of column vectors) to `f`
if args.format in ("mmapped_df", "hdf5") and args.output is None:
    # Both of these backends are handed args.output as their destination,
    # so fail with a clear usage error instead of passing None to them.
    parser.error(f"--output is required for the {args.format} format.")

if args.format == "mmapped_df":
    try:
        import mmapped_df
    except ImportError:
        print(
            "mmapped_df is not installed. Please install it from https://github.com/michalsta/mmapped_df, or use a different format.",
            file=sys.stderr,
        )
        sys.exit(1)

    def opener():
        return mmapped_df.DatasetWriter(args.output)

    def writer(f, df):
        f.append(**df)

elif args.format == "hdf5":
    import pandas as pd

    def opener():
        return pd.HDFStore(args.output, mode="w", complevel=9, complib="blosc")

    def writer(f, df):
        # Every frame is appended to the same "df" table in the store.
        pd.DataFrame(df).to_hdf(
            f,
            key="df",
            format="table",
            data_columns=True,
            complevel=9,
            complib="blosc",
            append=True,
        )

elif args.format == "csv":
    import pandas as pd

    out_file = sys.stdout if args.output is None else args.output.open(mode="wt")

    def opener():
        return out_file

    # The header goes out with the first frame only. Starting in the
    # "already written" state is how --no-hdr suppresses it altogether.
    hdr_written = args.no_hdr

    def writer(f, df):
        global hdr_written
        df = pd.DataFrame(df)
        # NOTE(review): to_csv defaults are kept (comma separator, index
        # column included) - confirm this is the intended layout.
        df.to_csv(f, header=not hdr_written)
        hdr_written = True

else:
    raise ValueError(
        f"Invalid format: {args.format}. Please specify one of: mmapped_df, hdf5, csv."
    )

# A progress bar is shown only when an output file was given; otherwise the
# frame list is iterated as-is.
progressbar = tqdm.tqdm if args.output is not None else (lambda x: x)

# The output handle is managed by the `with` statement, so it is closed on
# exit without an explicit close() call.
with OpenTIMS(args.path) as D, opener() as f:
    if args.frames == "":
        # No selection given: export every frame listed in the dataset.
        frames = set(D.frames["Id"])

    # Iterate over frames. This will store only one frame at a time in RAM, preventing out of memory errors.
    for frame_id in progressbar(sorted(frames)):
        # Frame is stored as a dict of column vectors; the format-specific
        # writer (which also handles any header) appends it to the output.
        frame = D.query(frame_id)
        writer(f, frame)

0 comments on commit 25d308c

Please sign in to comment.