Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoided infinite recursion when loading catalog files that recursively #370

Merged
merged 3 commits into from
Mar 2, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 27 additions & 6 deletions ontopy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import types
from typing import TYPE_CHECKING
import urllib.request
import urllib.parse
import warnings
import defusedxml.ElementTree as ET

Expand Down Expand Up @@ -220,12 +221,14 @@ class ReadCatalogError(IOError):
"""Error reading catalog file."""


def read_catalog( # pylint: disable=too-many-locals,too-many-statements
def read_catalog( # pylint: disable=too-many-locals,too-many-statements,too-many-arguments
uri,
catalog_file="catalog-v001.xml",
baseuri=None,
recursive=False,
return_paths=False,
visited_iris=None,
visited_paths=None,
):
"""Reads a Protègè catalog file and returns as a dict.

Expand All @@ -249,10 +252,19 @@ def read_catalog( # pylint: disable=too-many-locals,too-many-statements
If `return_paths` is true, a set of directory paths to source
files is returned in addition to the default dict.

The `visited_uris` and `visited_paths` arguments are only intended for
internal use to avoid infinite recursions.

A ReadCatalogError is raised if the catalog file cannot be found.
"""
# Protocols supported by urllib.request
web_protocols = "http://", "https://", "ftp://"

iris = visited_iris if visited_iris else {}
dirs = visited_paths if visited_paths else set()
if uri in iris:
return (iris, dirs) if return_paths else iris

if uri.startswith(web_protocols):
# Call read_catalog() recursively to ensure that the temporary
# file is properly cleaned up
Expand Down Expand Up @@ -282,6 +294,8 @@ def read_catalog( # pylint: disable=too-many-locals,too-many-statements
baseuri=baseuri if baseuri else base,
recursive=recursive,
return_paths=return_paths,
visited_iris=iris,
visited_paths=dirs,
)
raise ReadCatalogError(
"Cannot download catalog from URLs: " + ", ".join(uris)
Expand Down Expand Up @@ -335,9 +349,8 @@ def load_uri(uri, dirname):
if baseuri and baseuri.startswith(web_protocols):
url = f"{baseuri}/{uri_as_str}"
else:
url = os.path.normpath(
os.path.join(baseuri if baseuri else dirname, uri_as_str)
)
url = os.path.join(baseuri if baseuri else dirname, uri_as_str)
url = normalise_url(url)

iris.setdefault(uri.attrib["name"], url)
if recursive:
Expand All @@ -351,14 +364,14 @@ def load_uri(uri, dirname):
baseuri=None,
recursive=recursive,
return_paths=True,
visited_iris=iris,
visited_paths=dirs,
)
iris.update(iris_)
dirs.update(dirs_)
else:
load_catalog(catalog)

iris = {}
dirs = set()
load_catalog(filepath)
if return_paths:
return iris, dirs
Expand Down Expand Up @@ -672,3 +685,11 @@ def annotate_with_ontology(onto, imported=True):
setattr(entity, "ontology_name", onto.name)
if onto.base_iri not in getattr(entity, "ontology_iri"):
setattr(entity, "ontology_iri", onto.base_iri)


def normalise_url(url):
"""Returns `url` in a normalised form."""
splitted = urllib.parse.urlsplit(url)
components = list(splitted)
components[2] = os.path.normpath(splitted.path)
return urllib.parse.urlunsplit(components)