Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "ENH: Directly open compressed files" #3748

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 0 additions & 26 deletions doc/source/examining/loading_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,32 +36,6 @@ any arguments, and it will return a list of the names that can be supplied:

This will return a list of possible filenames; more information can be accessed on the data catalog.


.. _loading-archived-data:

Archived Data
-------------

If your data is stored as a (compressed) tar file, you can access the contained
dataset directly without extracting the tar file.
This can be achieved using the ``load_archive`` function:

.. code-block:: python

import yt

ds = yt.load_archive("IsolatedGalaxy.tar.gz", "IsolatedGalaxy/galaxy0030/galaxy0030")

The first argument is the path to the archive file, the second one is the path to the file to load
in the archive. Subsequent arguments are passed to ``yt.load``.

The functionality requires the package `ratarmount <https://github.com/mxmlnkn/ratarmount/>`_ to be installed.
Under the hood, yt will mount the archive as a (read-only) filesystem. Note that this requires the
entire archive to be read once to compute the location of each file in the archive; subsequent accesses
will be much faster.
All archive formats supported by `ratarmount <https://github.com/mxmlnkn/ratarmount>`_ should be loadable, provided
the dependencies are installed; this includes the ``tar``, ``tar.gz`` and ``tar.bz2`` formats.

.. _loading-amrvac-data:

AMRVAC Data
Expand Down
2 changes: 1 addition & 1 deletion nose_unit.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@ nologcapture=1
verbosity=2
where=yt
with-timer=1
ignore-files=(test_load_errors.py|test_load_sample.py|test_commons.py|test_ambiguous_fields.py|test_field_access_pytest.py|test_save.py|test_line_annotation_unit.py|test_eps_writer.py|test_registration.py|test_invalid_origin.py|test_outputs_pytest\.py|test_normal_plot_api\.py|test_load_archive\.py)
ignore-files=(test_load_errors.py|test_load_sample.py|test_commons.py|test_ambiguous_fields.py|test_field_access_pytest.py|test_save.py|test_line_annotation_unit.py|test_eps_writer.py|test_registration.py|test_invalid_origin.py|test_outputs_pytest\.py|test_normal_plot_api\.py)
exclude-test=yt.frontends.gdf.tests.test_outputs.TestGDF
1 change: 0 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,6 @@ full =
requests>=2.20.0
scipy>=1.5.0
xarray>=0.16.1
ratarmount~=0.8.1;platform_system!="Windows"
mapserver =
bottle
minimal =
Expand Down
5 changes: 2 additions & 3 deletions tests/ci_install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,13 @@ linux|Linux)
proj-data \
proj-bin \
libgeos-dev \
libopenmpi-dev \
libfuse2
libopenmpi-dev
;;
osx|macOS)
sudo mkdir -p /usr/local/man
sudo chown -R "${USER}:admin" /usr/local/man
brew update
HOMEBREW_NO_AUTO_UPDATE=1 brew install hdf5 proj geos open-mpi netcdf ccache osxfuse
HOMEBREW_NO_AUTO_UPDATE=1 brew install hdf5 proj geos open-mpi netcdf ccache
;;
esac

Expand Down
1 change: 0 additions & 1 deletion tests/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,6 @@ other_tests:
- "--ignore-files=test_save.py"
- "--ignore-files=test_registration.py"
- "--ignore-files=test_invalid_origin.py"
- "--ignore-files=test_load_archive\\.py"
- "--ignore-files=test_outputs_pytest\\.py"
- "--ignore-files=test_normal_plot_api\\.py"
- "--exclude-test=yt.frontends.gdf.tests.test_outputs.TestGDF"
Expand Down
1 change: 0 additions & 1 deletion yt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@
from yt.loaders import (
load,
load_amr_grids,
load_archive,
load_hexahedral_mesh,
load_octree,
load_particles,
Expand Down
138 changes: 2 additions & 136 deletions yt/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,20 @@
This module gathers all user-facing functions with a `load_` prefix.

"""
import atexit
import os
import sys
import tarfile
import time
import types
import warnings
from multiprocessing import Pipe, Process
from multiprocessing.connection import Connection
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
from typing import List, Optional, Tuple
from urllib.parse import urlsplit

import numpy as np
from more_itertools import always_iterable

from yt.data_objects.static_output import Dataset
from yt.funcs import levenshtein_distance
from yt.sample_data.api import lookup_on_disk_data
from yt.utilities.decompose import decompose_array, get_psize
from yt.utilities.exceptions import (
MountError,
YTAmbiguousDataType,
YTIllDefinedAMR,
YTSimulationNotIdentified,
Expand All @@ -36,7 +28,7 @@
output_type_registry,
simulation_time_series_registry,
)
from yt.utilities.on_demand_imports import _pooch as pooch, _ratarmount as ratarmount
from yt.utilities.on_demand_imports import _pooch as pooch

# --- Loaders for known data formats ---

Expand Down Expand Up @@ -1433,129 +1425,3 @@ def load_sample(
loadable_path = loadable_path.joinpath(load_name, specific_file)

return load(loadable_path, **kwargs)


def _mount_helper(
    archive: str, mountPoint: str, ratarmount_kwa: Dict, conn: Connection
):
    """Mount ``archive`` on ``mountPoint`` through ratarmount and serve it.

    Intended to be the target of a separate process: the outcome of building
    the ratarmount operations object is reported through ``conn`` (``True``
    on success, ``False`` on failure) before the FUSE loop is entered.

    Parameters
    ----------
    archive : str
        Path of the archive to mount.
    mountPoint : str
        Directory on which the archive is mounted.
    ratarmount_kwa : dict
        Extra keyword arguments forwarded to ``ratarmount.TarMount``.
    conn : multiprocessing.connection.Connection
        Pipe end used to send the success flag back to the caller.

    Raises
    ------
    Exception
        Whatever ``ratarmount.TarMount`` (or the setup around it) raised,
        re-raised after ``False`` has been sent through ``conn``.
    """
    try:
        operations = ratarmount.TarMount(
            pathToMount=archive,
            mountPoint=mountPoint,
            lazyMounting=True,
            **ratarmount_kwa,
        )
        operations.use_ns = True
        conn.send(True)
    except Exception:
        # Tell the waiting parent that mounting failed, then let the error
        # propagate so this process exits with the traceback.
        conn.send(False)
        raise
    else:
        # Only reached once the parent has been told that setup succeeded.
        fuse_options = {
            "operations": operations,
            "mountpoint": mountPoint,
            "foreground": True,
            "nothreads": True,
        }
        ratarmount.fuse.FUSE(**fuse_options)


# --- Loader for tar-based datasets ---
def load_archive(
    fn: Union[str, Path],
    path: str,
    ratarmount_kwa: Optional[Dict] = None,
    mount_timeout: float = 1.0,
    *args,
    **kwargs,
) -> Dataset:
    r"""
    Load archived data with yt.

    This is a wrapper around :func:`~yt.loaders.load` that mounts the archive
    as a read-only filesystem in a helper process and loads the dataset from
    the mount point.

    Parameters
    ----------

    fn: str
        The `filename` of the archive containing the dataset.

    path: str
        The path to the dataset in the archive.

    ratarmount_kwa: dict, optional
        Optional parameters to pass to ratarmount to mount the archive.

    mount_timeout: float, optional
        The timeout to wait for ratarmount to mount the archive. Default is 1s.

    Returns
    -------

    Dataset
        The loaded dataset, with an extra ``dismount()`` method that
        terminates the helper process holding the mount.

    Raises
    ------

    YTUnidentifiedDataType
        If ``fn`` cannot be read as a tar archive.

    MountError
        If the helper process reports a failure, or if the mount point does
        not appear within ``mount_timeout`` seconds.

    Notes
    -----

    - The function is experimental and may work or not depending on your setup.
    - Any additional keyword argument is passed down to :func:`~yt.loaders.load`.
    - This function requires ratarmount to be installed.
    - This function does not work on Windows systems.
    """

    warnings.warn(
        "The 'load_archive' function is still experimental and may be unstable."
    )

    fn = os.path.expanduser(fn)

    # This will raise FileNotFoundError if the path isn't matched
    # either in the current dir or yt.config.ytcfg['data_dir_directory']
    if not fn.startswith("http"):
        fn = str(lookup_on_disk_data(fn))

    if ratarmount_kwa is None:
        ratarmount_kwa = {}

    # Only probe that the file is a readable tar archive; close the handle
    # right away so it is not leaked for the lifetime of the interpreter.
    try:
        with tarfile.open(fn):
            pass
    except tarfile.ReadError as err:
        raise YTUnidentifiedDataType(fn, *args, **kwargs) from err

    # Note: the temporary directory will be created by ratarmount.
    # Pick the first "<archive>.mount[.N]" name that does not exist yet.
    tempdir_base = fn + ".mount"
    tempdir = tempdir_base
    i = 0
    while os.path.exists(tempdir):
        i += 1
        tempdir = f"{tempdir_base}.{i}"

    parent_conn, child_conn = Pipe()
    proc = Process(target=_mount_helper, args=(fn, tempdir, ratarmount_kwa, child_conn))
    proc.start()
    if not parent_conn.recv():
        raise MountError(f"An error occured while mounting {fn} in {tempdir}")

    # Note: the mounting needs to happen in another process which
    #       needs be run in the foreground (otherwise it may
    #       unmount). To prevent a race-condition here, we wait
    #       for the folder to be mounted within a reasonable time.
    # A monotonic deadline avoids accumulating float error and accounts for
    # the time actually spent, not only the nominal sleep durations.
    deadline = time.monotonic() + mount_timeout
    while not os.path.ismount(tempdir):
        if time.monotonic() >= deadline:
            # Do not leave the helper process behind when giving up.
            proc.terminate()
            raise MountError(f"Folder {tempdir} does not appear to be mounted")
        time.sleep(0.1)

    # We need to kill the process at exit (to force unmounting)
    def umount_callback():
        proc.terminate()

    atexit.register(umount_callback)

    # Alternatively, can dismount manually
    def del_callback(self):
        proc.terminate()
        atexit.unregister(umount_callback)

    try:
        ds = load(os.path.join(tempdir, path), *args, **kwargs)
    except Exception:
        # Nobody will receive a dataset to call ``dismount`` on, so release
        # the mount now instead of keeping it until interpreter exit.
        proc.terminate()
        atexit.unregister(umount_callback)
        raise

    ds.dismount = types.MethodType(del_callback, ds)

    return ds
120 changes: 0 additions & 120 deletions yt/tests/test_load_archive.py

This file was deleted.

5 changes: 0 additions & 5 deletions yt/utilities/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -922,8 +922,3 @@ class GenerationInProgress(Exception):
def __init__(self, fields):
self.fields = fields
super().__init__()


class MountError(Exception):
    """Error raised when an archive could not be mounted."""

    def __init__(self, message):
        # Forward the message to Exception so ``args``/``str()`` carry it,
        # and keep it available under the ``message`` attribute as well.
        super().__init__(message)
        self.message = message
Loading