Skip to content

Commit

Permalink
Merge pull request #861 from cognitedata/main
Browse files Browse the repository at this point in the history
Release 0.104.0
  • Loading branch information
Juliamg authored Dec 20, 2024
2 parents d6b0193 + ba8bd92 commit 3a2935d
Show file tree
Hide file tree
Showing 116 changed files with 8,697 additions and 3,827 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -177,4 +177,6 @@ real_cases/
*.env

# Excel chache files
~*.xlsx
~*.xlsx
docs/**/*.aml
docs/**/*.xml
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
.PHONY: run-explorer run-tests run-linters build-ui build-python build-docker run-docker compose-up
version="0.103.1"
version="0.104.0"
run-explorer:
@echo "Running explorer API server..."
# open "http://localhost:8000/static/index.html" || true
Expand Down
2 changes: 1 addition & 1 deletion cognite/neat/_graph/extractors/_mock_graph_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def __init__(
# fixes potential issues with circular dependencies
from cognite.neat._rules.transformers import DMSToInformation

self.rules = DMSToInformation().transform(rules).rules
self.rules = DMSToInformation().transform(rules)
elif isinstance(rules, InformationRules):
self.rules = rules
else:
Expand Down
110 changes: 109 additions & 1 deletion cognite/neat/_graph/transformers/_base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,27 @@
import dataclasses
import warnings
from abc import ABC, abstractmethod
from typing import ClassVar
from typing import ClassVar, TypeAlias, cast

from rdflib import Graph
from rdflib.query import ResultRow

from cognite.neat._issues.warnings import NeatValueWarning
from cognite.neat._shared import Triple
from cognite.neat._utils.collection_ import iterate_progress_bar
from cognite.neat._utils.graph_transformations_report import GraphTransformationResult

To_Add_Triples: TypeAlias = list[Triple]
To_Remove_Triples: TypeAlias = list[Triple]


@dataclasses.dataclass
class RowTransformationOutput:
remove_triples: To_Remove_Triples = dataclasses.field(default_factory=list)
add_triples: To_Add_Triples = dataclasses.field(default_factory=list)
instances_removed_count: int = 0
instances_added_count: int = 0
instances_modified_count: int = 0


class BaseTransformer(ABC):
Expand All @@ -12,3 +32,91 @@ class BaseTransformer(ABC):
@abstractmethod
def transform(self, graph: Graph) -> None:
raise NotImplementedError()


class BaseTransformerStandardised(ABC):
"""Standardised base transformer to use in case a transformer is adding or removing triples from a graph. If you
are doing more specialised operations, please overwrite the .transform() method.
"""

description: str
_use_only_once: bool = False
_need_changes: ClassVar[frozenset[str]] = frozenset()
_use_iterate_bar_threshold: int = 500

@abstractmethod
def operation(self, query_result_row: ResultRow) -> RowTransformationOutput:
"""The operations to perform on each row resulting from the ._iterate_query() method.
The operation should return a list of triples to add and to remove.
"""
raise NotImplementedError()

@abstractmethod
def _count_query(self) -> str:
"""
Overwrite to fetch all affected properties in the graph as a result of the transformation.
Returns:
A query string.
"""
raise NotImplementedError()

@abstractmethod
def _iterate_query(self) -> str:
"""
The query to use for extracting target triples from the graph and performing the transformation.
Returns:
A query string.
"""
raise NotImplementedError()

def _skip_count_query(self) -> str:
"""
The query to use for extracting target triples from the graph and performing the transformation.
Returns:
A query string.
"""
return ""

def transform(self, graph: Graph) -> GraphTransformationResult:
outcome = GraphTransformationResult(self.__class__.__name__)
outcome.added = outcome.modified = outcome.removed = 0

iteration_count_res = list(graph.query(self._count_query()))
iteration_count = int(iteration_count_res[0][0]) # type: ignore [index, arg-type]

outcome.affected_nodes_count = iteration_count

if self._skip_count_query():
skipped_count_res = list(graph.query(self._skip_count_query()))
skipped_count = int(skipped_count_res[0][0]) # type: ignore [index, arg-type]
warnings.warn(
NeatValueWarning(f"Skipping {skipped_count} properties in transformation {self.__class__.__name__}"),
stacklevel=2,
)
outcome.skipped = skipped_count

if iteration_count == 0:
return outcome

result_iterable = graph.query(self._iterate_query())
if iteration_count > self._use_iterate_bar_threshold:
result_iterable = iterate_progress_bar( # type: ignore[misc, assignment]
result_iterable,
total=iteration_count,
description=self.description,
)

for row in result_iterable:
row = cast(ResultRow, row)
row_output = self.operation(row)

outcome.added += row_output.instances_added_count
outcome.removed += row_output.instances_removed_count
outcome.modified += row_output.instances_modified_count

for triple in row_output.add_triples:
graph.add(triple)
for triple in row_output.remove_triples:
graph.remove(triple)

return outcome
4 changes: 4 additions & 0 deletions cognite/neat/_graph/transformers/_classic_cdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from ._base import BaseTransformer


# TODO: standardise
class AddAssetDepth(BaseTransformer):
description: str = "Adds depth of asset in the asset hierarchy to the graph"
_use_only_once: bool = True
Expand Down Expand Up @@ -84,6 +85,7 @@ def get_depth(
return None


# TODO: standardise
class BaseAssetConnector(BaseTransformer, ABC):
_asset_type: URIRef = DEFAULT_NAMESPACE.Asset
_item_type: URIRef
Expand Down Expand Up @@ -166,6 +168,7 @@ class AssetEventConnector(BaseAssetConnector):
_connection_type = DEFAULT_NAMESPACE.event


# TODO: standardise
class AssetRelationshipConnector(BaseTransformer):
description: str = "Connects assets via relationships"
_use_only_once: bool = True
Expand Down Expand Up @@ -242,6 +245,7 @@ def transform(self, graph: Graph) -> None:
graph.remove((relationship_id, self.relationship_target_xid_prop, None))


# TODO: standardise
class RelationshipAsEdgeTransformer(BaseTransformer):
"""Converts relationships into edges in the graph.
Expand Down
150 changes: 103 additions & 47 deletions cognite/neat/_graph/transformers/_prune_graph.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
from typing import cast

from rdflib import Graph, Namespace, URIRef
from rdflib.query import ResultRow

from cognite.neat._constants import DEFAULT_NAMESPACE
from cognite.neat._shared import Triple
from cognite.neat._utils.rdf_ import as_neat_compliant_uri
from cognite.neat._utils.text import sentence_or_string_to_camel

from ._base import BaseTransformer
from ._base import BaseTransformer, BaseTransformerStandardised, RowTransformationOutput


# TODO: Standardise after figuring out the bug which appears when running test_iodd_transformers.py
class AttachPropertyFromTargetToSource(BaseTransformer):
"""
Transformer that considers a TargetNode and SourceNode relationship, to extract a property that is attached to
Expand Down Expand Up @@ -148,6 +150,7 @@ def transform(self, graph) -> None:
graph.remove((target_node, None, None))


# TODO: Remove or adapt IODD
class PruneDanglingNodes(BaseTransformer):
"""
Knowledge graph pruner and resolver. Will remove rdf triples from graph that does not have connections
Expand Down Expand Up @@ -189,69 +192,122 @@ def transform(self, graph: Graph) -> None:
graph.remove((subject, None, None))


class PruneTypes(BaseTransformer):
class PruneTypes(BaseTransformerStandardised):
"""
Removes all the instances of specific type
"""

description: str = "Prunes nodes of specific rdf types"
_query_template = """
SELECT ?subject
WHERE {{
?subject a <{rdf_type}> .
}}
"""

def __init__(
self,
node_prune_types: list[URIRef],
):
self.node_prune_types = node_prune_types

def transform(self, graph: Graph) -> None:
for type_ in self.node_prune_types:
for (subject,) in list(graph.query(self._query_template.format(rdf_type=type_))): # type: ignore
graph.remove((subject, None, None))


class PruneDeadEndEdges(BaseTransformer):
def _iterate_query(self) -> str:
filter_string = ""
for node in self.node_prune_types:
filter_string += f" <{node}> "

query = """
SELECT ?subject
WHERE {{
?subject a ?type .
VALUES ?type {{ {rdf_types_string} }}
}}
"""
return query.format(rdf_types_string=filter_string)

def _count_query(self) -> str:
filter_string = ""
for node in self.node_prune_types:
filter_string += f" <{node}> "

query = """
SELECT ( COUNT( ?subject ) as ?count )
WHERE {{
?subject a ?type .
VALUES ?type {{ {rdf_types_string} }}
}}
"""
return query.format(rdf_types_string=filter_string)

def operation(self, query_result_row: ResultRow) -> RowTransformationOutput:
row_output = RowTransformationOutput()

(subject,) = query_result_row
row_output.remove_triples.append((subject, None, None)) # type: ignore
row_output.instances_removed_count = 1

return row_output


class PruneDeadEndEdges(BaseTransformerStandardised):
"""
Removes all the triples where object is a node that is not found in graph
"""

description: str = "Prunes the graph of specified rdf types that do not have connections to other nodes."
_query_template = """
SELECT ?subject ?predicate ?object
WHERE {
?subject ?predicate ?object .
FILTER (isIRI(?object) && ?predicate != rdf:type)
FILTER NOT EXISTS {?object ?p ?o .}
}
"""

def transform(self, graph: Graph) -> None:
for triple in graph.query(self._query_template):
graph.remove(cast(Triple, triple))


class PruneInstancesOfUnknownType(BaseTransformer):
description: str = "Pruning the graph of triples where object is a node that is not found in graph."

def _iterate_query(self) -> str:
return """
SELECT ?subject ?predicate ?object
WHERE {
?subject ?predicate ?object .
FILTER (isIRI(?object) && ?predicate != rdf:type)
FILTER NOT EXISTS {?object ?p ?o .}
}
"""

def _count_query(self) -> str:
return """
SELECT (COUNT(?object) AS ?count)
WHERE {
?subject ?predicate ?object .
FILTER (isIRI(?object) && ?predicate != rdf:type)
FILTER NOT EXISTS {?object ?p ?o .}
}
"""

def operation(self, row: ResultRow) -> RowTransformationOutput:
row_output = RowTransformationOutput()
row_output.remove_triples.append(cast(Triple, row))
row_output.instances_modified_count = 1

return row_output


class PruneInstancesOfUnknownType(BaseTransformerStandardised):
"""
Removes all the triples where object is a node that is not found in graph
"""

description: str = "Prunes the graph of specified rdf types that do not have connections to other nodes."
_query_template = """
SELECT DISTINCT ?subject
WHERE {
?subject ?p ?o .
FILTER NOT EXISTS {?subject a ?object .}
}
"""

def transform(self, graph: Graph) -> None:
for (subject,) in graph.query(self._query_template): # type: ignore
graph.remove((subject, None, None))
description: str = "Prunes the graph of triples where the object is a node that is not found in the graph."

def _iterate_query(self) -> str:
return """
SELECT DISTINCT ?subject
WHERE {
?subject ?p ?o .
FILTER NOT EXISTS {?subject a ?object .}
}
"""

def _count_query(self) -> str:
return """
SELECT (COUNT(DISTINCT ?subject) as ?count)
WHERE {
?subject ?p ?o .
FILTER NOT EXISTS {?subject a ?object .}
}
"""

def operation(self, query_result_row: ResultRow) -> RowTransformationOutput:
row_output = RowTransformationOutput()
(subject,) = query_result_row
row_output.remove_triples.append(cast(Triple, (subject, None, None)))
row_output.instances_removed_count = 1

return row_output
Loading

0 comments on commit 3a2935d

Please sign in to comment.