Skip to content

Commit

Permalink
Graph expansion using inoculation or SHACL TripleRules will now expand…
Browse files Browse the repository at this point in the history
…into a separate named graph if you are working with an RDFLib Dataset instead of a Graph.
  • Loading branch information
ashleysommer committed Nov 1, 2024
1 parent d7241ca commit 83684fe
Show file tree
Hide file tree
Showing 13 changed files with 351 additions and 139 deletions.
21 changes: 20 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,24 @@ and this project adheres to [Python PEP 440 Versioning](https://www.python.org/d
## [Unreleased]
- Nothing yet

## [0.29.0] - 2024-11-01

### Added
- When validating a Dataset instead of a bare Graph, PySHACL will now expand RDFS and OWL-RL inferences into
a separate named graph, to avoid polluting the datagraph.
- When using SHACL Triple Rules from SHACL-AF spec, PySHACL will now add the expressed triples into
a separate named graph. This allows you to more easily get the expanded triples back out again afterward.

### Changed
- PySHACL no longer supports older RDFLib versions
- PySHACL relies on the latest OWL-RL version, which in turn relies on the latest RDFLib version
- Therefore PySHACL now requires RDFLib v7.1.1 or newer
- Dropped Python 3.8 support.
  - The Python core developers discontinued support for Python 3.8 last month
- The next version of RDFLib and OWL-RL will not support Python 3.8
  - Removed Python 3.8 from the PySHACL test suite
- Python 3.9-specific typing changes will be incrementally introduced

## [0.28.1] - 2024-10-25

### Fixed
Expand Down Expand Up @@ -1182,7 +1200,8 @@ just leaves the files open. Now it is up to the command-line client to close the

- Initial version, limited functionality

[Unreleased]: https://github.com/RDFLib/pySHACL/compare/v0.28.1...HEAD
[Unreleased]: https://github.com/RDFLib/pySHACL/compare/v0.29.0...HEAD
[0.29.0]: https://github.com/RDFLib/pySHACL/compare/v0.28.1...v0.29.0
[0.28.1]: https://github.com/RDFLib/pySHACL/compare/v0.28.0...v0.28.1
[0.28.0]: https://github.com/RDFLib/pySHACL/compare/v0.27.0...v0.28.0
[0.27.0]: https://github.com/RDFLib/pySHACL/compare/v0.26.0...v0.27.0
Expand Down
11 changes: 8 additions & 3 deletions pyshacl/inference/custom_rdfs_closure.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# -*- coding: utf-8 -*-
from typing import Optional, TYPE_CHECKING

try:
from owlrl import OWL

Expand All @@ -25,6 +27,9 @@
from owlrl.OWLRL import OWLRL_Semantics
from owlrl.RDFSClosure import RDFS_Semantics as OrigRDFSSemantics

if TYPE_CHECKING:
from rdflib.graph import Graph


class CustomRDFSSemantics(OrigRDFSSemantics):
def one_time_rules(self):
Expand All @@ -49,9 +54,9 @@ class CustomRDFSOWLRLSemantics(CustomRDFSSemantics, OWLRL_Semantics):
(OWL.DataRange, OWL.equivalentClass, RDFS.Datatype),
]

def __init__(self, graph, axioms, daxioms, rdfs=True):
OWLRL_Semantics.__init__(self, graph, axioms, daxioms, rdfs)
CustomRDFSSemantics.__init__(self, graph, axioms, daxioms, rdfs)
def __init__(self, graph, axioms, daxioms, rdfs: bool = True, destination: Optional['Graph'] = None):
OWLRL_Semantics.__init__(self, graph, axioms, daxioms, rdfs=rdfs, destination=destination)
CustomRDFSSemantics.__init__(self, graph, axioms, daxioms, rdfs=rdfs, destination=destination)
self.rdfs = True

# noinspection PyMethodMayBeStatic
Expand Down
118 changes: 22 additions & 96 deletions pyshacl/rdfutil/inoculate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,20 @@
from typing import TYPE_CHECKING, Dict, Optional, Union

import rdflib
from rdflib.graph import DATASET_DEFAULT_GRAPH_ID
from rdflib.namespace import NamespaceManager

from .clone import clone_blank_node, clone_graph, clone_node
from .clone import clone_blank_node, clone_node, clone_dataset
from .consts import OWL, RDF, ConjunctiveLike, GraphLike, OWL_classes, OWL_properties, RDFS_classes, RDFS_properties

if TYPE_CHECKING:
from rdflib import BNode
from rdflib.term import IdentifiedNode
from rdflib.term import URIRef

from .consts import RDFNode

OWLNamedIndividual = OWL.NamedIndividual


def inoculate(data_graph: rdflib.Graph, ontology: rdflib.Graph) -> rdflib.Graph:
def inoculate(data_graph: rdflib.Graph, ontology: GraphLike) -> rdflib.Graph:
"""
Copies all RDFS and OWL axioms (classes, relationship definitions, and properties)
from the ontology graph into the data_graph.
Expand All @@ -33,6 +31,9 @@ def inoculate(data_graph: rdflib.Graph, ontology: rdflib.Graph) -> rdflib.Graph:
ontology_ns = ontology.namespace_manager
data_graph_ns = data_graph.namespace_manager

if isinstance(ontology, (rdflib.ConjunctiveGraph, rdflib.Dataset)):
# always set default context true on the ontology DS
ontology.default_context = True
# Bind any missing ontology namespaces in the DataGraph NS manager.
if ontology_ns is not data_graph_ns:
data_graph_prefixes = {p: n for (p, n) in data_graph_ns.namespaces()}
Expand Down Expand Up @@ -108,119 +109,44 @@ def inoculate(data_graph: rdflib.Graph, ontology: rdflib.Graph) -> rdflib.Graph:


def inoculate_dataset(
base_ds: ConjunctiveLike, ontology_ds: GraphLike, target_ds: Optional[Union[ConjunctiveLike, str]] = None
base_ds: ConjunctiveLike, ontology_ds: GraphLike, target_ds: Optional[Union[ConjunctiveLike, str]] = None,
target_graph_identifier: Optional['URIRef'] = None,
):
"""
Make a clone of base_ds (dataset) and add RDFS and OWL triples from ontology_ds
:param base_ds:
:type base_ds: rdflib.Dataset
:param ontology_ds:
:type ontology_ds: rdflib.Dataset
:type ontology_ds: rdflib.Dataset|rdflib.ConjunctiveGraph|rdflib.Graph
:param target_ds:
:type target_ds: rdflib.Dataset|str|NoneType
:param target_graph_identifier:
:type target_graph_identifier: rdflib.URIRef | None
:return: The cloned Dataset with ontology triples from ontology_ds
:rtype: rdflib.Dataset
"""

# TODO: Decide whether we need to clone base_ds before calling this,
# or we clone base_ds as part of this function
default_union: bool = base_ds.default_union
base_named_graphs = [
(
rdflib.Graph(base_ds.store, i, namespace_manager=base_ds.namespace_manager) # type: ignore[arg-type]
if not isinstance(i, rdflib.Graph)
else i
)
for i in base_ds.store.contexts(None)
]
if isinstance(base_ds, rdflib.Dataset) and len(base_named_graphs) < 1:
base_named_graphs = [
rdflib.Graph(base_ds.store, DATASET_DEFAULT_GRAPH_ID, namespace_manager=base_ds.namespace_manager)
]
base_default_context_id = base_ds.default_context.identifier
if target_ds is None:
target_ds = rdflib.Dataset(default_union=default_union)
target_ds.namespace_manager = NamespaceManager(target_ds, 'core')
target_ds.default_context.namespace_manager = target_ds.namespace_manager
target_ds = clone_dataset(base_ds)
elif target_ds is base_ds:
pass
elif target_ds == "inplace" or target_ds == "base":
target_ds = base_ds
elif isinstance(target_ds, str):
raise RuntimeError("target_ds cannot be a string (unless it is 'inplace' or 'base')")

if isinstance(target_ds, (rdflib.ConjunctiveGraph, rdflib.Dataset)):
if not isinstance(target_ds, rdflib.Dataset):
raise RuntimeError("Cannot inoculate ConjunctiveGraph, use Dataset instead.")
else:
raise RuntimeError("Cannot inoculate datasets if target_ds passed in is not a Dataset itself.")
ont_default_context_id: Union[IdentifiedNode, str, None]
if isinstance(ontology_ds, (rdflib.Dataset, rdflib.ConjunctiveGraph)):
ont_graphs = [
(
rdflib.Graph(ontology_ds.store, i, namespace_manager=ontology_ds.namespace_manager) # type: ignore[arg-type]
if not isinstance(i, rdflib.Graph)
else i
)
for i in ontology_ds.store.contexts(None)
]
ont_default_context_id = ontology_ds.default_context.identifier
else:
ont_graphs = [ontology_ds]
ont_default_context_id = None
if target_ds is base_ds or target_ds == "inplace" or target_ds == "base":
target_ds = base_ds
for bg in base_named_graphs:
if len(base_named_graphs) > 1 and bg.identifier == base_default_context_id and len(bg) < 1:
# skip empty default named graph in base_graph
continue
for og in ont_graphs:
if len(ont_graphs) > 1 and og.identifier == ont_default_context_id and len(og) < 1:
# skip empty default named graph in ontology_graph
continue
inoculate(bg, og)

if target_graph_identifier:
dest_graph = target_ds.get_context(target_graph_identifier)
else:
inoculated_graphs = {}
for bg in base_named_graphs:
if len(base_named_graphs) > 1 and bg.identifier == base_default_context_id and len(bg) < 1:
# skip empty default named graph in base_graph
continue
target_g = rdflib.Graph(store=target_ds.store, identifier=bg.identifier)
clone_g = clone_graph(bg, target_graph=target_g)
for og in ont_graphs:
if len(ont_graphs) > 1 and og.identifier == ont_default_context_id and len(og) < 1:
# skip empty default named graph in ontology_graph
continue
inoculate(clone_g, og)
inoculated_graphs[bg.identifier] = clone_g

base_graph_identifiers = [bg.identifier for bg in base_named_graphs]
base_default_context_id = base_ds.default_context.identifier
target_default_context_id = target_ds.default_context.identifier
if base_default_context_id != target_default_context_id:
old_target_default_context = target_ds.default_context
old_target_default_context_id = old_target_default_context.identifier
if isinstance(target_ds, rdflib.Dataset):
new_target_default_context = target_ds.graph(base_default_context_id)
else:
new_target_default_context = target_ds.get_context(base_default_context_id)
target_ds.store.add_graph(new_target_default_context)
target_ds.default_context = new_target_default_context
if old_target_default_context_id not in base_graph_identifiers:
if isinstance(target_ds, rdflib.Dataset):
target_ds.remove_graph(old_target_default_context)
else:
target_ds.store.remove_graph(old_target_default_context)
target_default_context_id = new_target_default_context.identifier
else:
if isinstance(target_ds, rdflib.Dataset):
_ = target_ds.graph(target_default_context_id)
else:
t_default = target_ds.get_context(target_default_context_id)
target_ds.store.add_graph(t_default)
for i, ig in inoculated_graphs.items():
if ig == target_ds.default_context or i == target_default_context_id:
continue
if isinstance(target_ds, rdflib.Dataset):
_ = target_ds.graph(ig) # alias to Dataset.add_graph()
else:
target_ds.store.add_graph(ig)
dest_graph = target_ds.default_context

# inoculate() routine will set default_union on the ontology_ds if it is a Dataset
inoculate(dest_graph, ontology_ds)

return target_ds
4 changes: 2 additions & 2 deletions pyshacl/rule_expand_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ def mix_in_ontology(self):
else:
to_graph = clone_graph(self.data_graph, identifier=self.data_graph.identifier)
return inoculate(to_graph, self.ont_graph)
return inoculate_dataset(self.data_graph, self.ont_graph, self.data_graph if self.inplace else None)
return inoculate_dataset(self.data_graph, self.ont_graph, self.data_graph if self.inplace else None, URIRef("urn:pyshacl:inoculation"))

def make_executor(self) -> SHACLExecutor:
return SHACLExecutor(
Expand Down Expand Up @@ -134,7 +134,7 @@ def run(self) -> GraphLike:
datagraph = clone_graph(datagraph)
has_cloned = True
self.logger.debug(f"Running pre-inferencing with option='{inference_option}'.")
self._run_pre_inference(datagraph, inference_option, logger=self.logger)
self._run_pre_inference(datagraph, inference_option, URIRef("urn:pyshacl:inference"), logger=self.logger)
self.pre_inferenced = True
if not has_cloned and not self.inplace:
# We still need to clone in advanced mode, because of triple rules
Expand Down
11 changes: 7 additions & 4 deletions pyshacl/rules/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def gather_rules(
ret_rules[shape].append(rule)
return ret_rules

RULES_ITERATE_LIMIT = 100

def apply_rules(
executor: SHACLExecutor,
Expand All @@ -98,11 +99,13 @@ def apply_rules(
for shape, rules in sorted_shapes_rules:
# sort the rules by the sh:order before execution
rules = sorted(rules, key=lambda x: x.order)
iterate_limit = 100
_iterate_limit = int(RULES_ITERATE_LIMIT)
while True:
if iterate_limit < 1:
raise ReportableRuntimeError("SHACL Shape Rule iteration exceeded iteration limit of 100.")
iterate_limit -= 1
if _iterate_limit < 1:
raise ReportableRuntimeError(
f"SHACL Shape Rule iteration exceeded iteration limit of {RULES_ITERATE_LIMIT}."
)
_iterate_limit -= 1
this_modified = 0
for r in rules:
if r.deactivated:
Expand Down
13 changes: 9 additions & 4 deletions pyshacl/rules/shacl_rule.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
# -*- coding: utf-8 -*-
from decimal import Decimal
from typing import Sequence, Union
from typing import Sequence, TYPE_CHECKING, Optional

from rdflib import RDF, Literal

from pyshacl.consts import SH_condition, SH_deactivated, SH_order
from pyshacl.errors import RuleLoadError
from pyshacl.pytypes import RDFNode, SHACLExecutor

if TYPE_CHECKING:
from pyshacl.pytypes import GraphLike
from rdflib.term import URIRef

RDF_first = RDF.first


Expand Down Expand Up @@ -41,7 +45,7 @@ def __init__(self, executor: SHACLExecutor, shape, rule_node, iterate=False):
self.executor = executor
self.shape = shape
self.node = rule_node
self.iterate = False
self.iterate = iterate

deactivated_nodes = list(self.shape.sg.objects(self.node, SH_deactivated))
self._deactivated = len(deactivated_nodes) > 0 and bool(deactivated_nodes[0])
Expand Down Expand Up @@ -111,7 +115,8 @@ def filter_conditions(self, focus_nodes: Sequence[RDFNode], data_graph):

def apply(
self,
data_graph,
focus_nodes: Union[Sequence[RDFNode], None] = None,
data_graph: 'GraphLike',
focus_nodes: Optional[Sequence[RDFNode]] = None,
target_graph_identifier: Optional['URIRef'] = None,
):
raise NotImplementedError()
22 changes: 17 additions & 5 deletions pyshacl/rules/sparql/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
from typing import TYPE_CHECKING, List, Sequence, Union
from typing import TYPE_CHECKING, List, Sequence, Union, Optional

import rdflib
from rdflib import Literal
Expand All @@ -13,11 +13,13 @@
from ..shacl_rule import SHACLRule

if TYPE_CHECKING:
from rdflib.term import URIRef
from pyshacl.pytypes import GraphLike, RDFNode, SHACLExecutor
from pyshacl.shape import Shape

XSD_string = XSD.string

SPARQL_RULE_ITERATE_LIMIT = 100

class SPARQLRule(SHACLRule):
__slots__ = ("_constructs", "_qh")
Expand Down Expand Up @@ -52,7 +54,8 @@ def __init__(self, executor: 'SHACLExecutor', shape: 'Shape', rule_node: 'rdflib
def apply(
self,
data_graph: 'GraphLike',
focus_nodes: Union[Sequence['RDFNode'], None] = None,
focus_nodes: Optional[Sequence['RDFNode']] = None,
target_graph_identifier: Optional['URIRef'] = None,
) -> int:
focus_list: Sequence['RDFNode']
if focus_nodes is not None:
Expand All @@ -70,10 +73,12 @@ def apply(
focus_list = filtered_focus_nodes
all_added = 0
SPARQLQueryHelper = get_query_helper_cls()
iterate_limit = 100
iterate_limit = int(SPARQL_RULE_ITERATE_LIMIT)
while True:
if iterate_limit < 1:
raise ReportableRuntimeError("Local SPARQLRule iteration exceeded iteration limit of 100.")
raise ReportableRuntimeError(
f"Local SPARQLRule iteration exceeded iteration limit of {SPARQL_RULE_ITERATE_LIMIT}."
)
iterate_limit -= 1
added = 0
applicable_nodes = self.filter_conditions(focus_list, data_graph)
Expand Down Expand Up @@ -101,8 +106,15 @@ def apply(
added += 1
construct_graphs.add(result_graph)
if added > 0:
if isinstance(data_graph, (rdflib.Dataset, rdflib.ConjunctiveGraph)):
if target_graph_identifier is not None:
target_graph = data_graph.get_context(target_graph_identifier)
else:
target_graph = data_graph.default_context
else:
target_graph = data_graph
for g in construct_graphs:
data_graph = clone_graph(g, target_graph=data_graph)
data_graph = clone_graph(g, target_graph=target_graph)
all_added += added
if self.iterate:
continue # Jump up to iterate
Expand Down
Loading

0 comments on commit 83684fe

Please sign in to comment.