
Commit

improve examples
tonyseale committed Jul 25, 2024
1 parent 89ebe1a commit b82b022
Showing 5 changed files with 217 additions and 157 deletions.
46 changes: 0 additions & 46 deletions examples/data-linage/README.md

This file was deleted.

74 changes: 0 additions & 74 deletions examples/data-linage/example.json

This file was deleted.

141 changes: 141 additions & 0 deletions examples/data-lineage/README.md
@@ -0,0 +1,141 @@
## Data Lineage

It is important to be able to trace the lineage of data. Within DPROD, this can be done in two ways: at a high level, from one data product to another, and, where needed, at a more detailed level between the underlying datasets.

### High Level Lineage: Between Data Products

Data products have input and output ports, and one data product’s input port will point to another data product’s output port.

This allows a user to query the lineage. The data products all have URLs as identifiers, and these properties connect them to each other, so you can walk from one data product to the upstream data products that feed it.

You can follow the path that leads from one data product to another like this:

```text
Data Product >> inputPort >> isAccessServiceOf >> isDistributionOf >> Input Data Product
```
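In SPARQL, this traversal becomes a single property path. Here is a minimal sketch (the concrete query against the example data appears further below), assuming the `dprod:` prefix resolves to the DPROD namespace:

```sparql
PREFIX dprod: <https://ekgf.github.io/data-product-spec/dprod/>

# For every data product, find the datasets that feed it via its input ports.
SELECT ?dataProduct ?inputDataset
WHERE
{
  ?dataProduct dprod:inputPort/dprod:isAccessServiceOf/dprod:isDistributionOf ?inputDataset .
}
```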

Let's look at some example data with three data products that connect to each other through their input and output ports:
```json
{
  "@context": "https://ekgf.github.io/data-product-spec/dprod.jsonld",
  "dataProducts": [
    {
      "id": "https://y.com/data-product/company-finance",
      "type": "DataProduct",
      "inputPort": [
        {
          "id": "https://y.com/data-product/company-sales/port/2025-sales",
          "type": "DataService"
        },
        {
          "id": "https://y.com/data-product/company-hr/port/2025-payroll",
          "type": "DataService"
        }
      ],
      "outputPort": {
        "id": "https://y.com/data-product/company-finance/port/2025-balance-sheet",
        "type": "DataService",
        "label": "Balance Sheet",
        "endpointURL": "https://y.com/data-product/company-finance/port/2025-balance-sheet",
        "isAccessServiceOf": {
          "type": "Distribution",
          "format": "https://www.iana.org/assignments/media-types/application/json",
          "isDistributionOf": {
            "type": "Dataset",
            "id": "https://y.com/data-product/company-finance/dataset/2025-balance-sheet",
            "conformsTo": "https://y.com/schema/BalanceSheet"
          }
        }
      }
    },
    {
      "id": "https://y.com/data-product/company-sales",
      "type": "DataProduct",
      "outputPort": {
        "id": "https://y.com/data-product/company-sales/port/2025-sales",
        "type": "DataService",
        "label": "Sales",
        "endpointURL": "https://y.com/data-product/company-sales/port/2025-sales",
        "isAccessServiceOf": {
          "type": "Distribution",
          "format": "https://www.iana.org/assignments/media-types/application/json",
          "isDistributionOf": {
            "type": "Dataset",
            "label": "Sales",
            "id": "https://y.com/data-product/company-sales/dataset/2025-sales",
            "conformsTo": "https://y.com/schema/Sale"
          }
        }
      }
    },
    {
      "id": "https://y.com/data-product/company-hr",
      "type": "DataProduct",
      "outputPort": {
        "id": "https://y.com/data-product/company-hr/port/2025-payroll",
        "type": "DataService",
        "label": "Payroll",
        "endpointURL": "https://y.com/data-product/company-hr/port/2025-payroll",
        "isAccessServiceOf": {
          "type": "Distribution",
          "format": "https://www.iana.org/assignments/media-types/text/csv",
          "isDistributionOf": {
            "type": "Dataset",
            "label": "Payroll",
            "id": "https://y.com/data-product/company-hr/dataset/2025-payroll",
            "conformsTo": "https://y.com/schema/Payroll"
          }
        }
      }
    }
  ]
}
```

Given this example data, if we started at the data product `https://y.com/data-product/company-finance`, we could walk the relationships to find the input data products that feed it:
```text
https://y.com/data-product/company-finance >> :inputPort >> :isAccessServiceOf >> :isDistributionOf >> [https://y.com/data-product/company-sales , https://y.com/data-product/company-hr]
```

In Linked Data, we would do this with a SPARQL query like the one below, which walks from the input ports to the labels of the datasets behind them:
```sparql
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX dprod: <https://ekgf.github.io/data-product-spec/dprod/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX : <https://y.com/data-product/>
SELECT DISTINCT ?input
WHERE
{
  :company-finance dprod:inputPort ?inputPort .
  ?inputPort dprod:isAccessServiceOf/dprod:isDistributionOf/rdfs:label ?input .
}
```
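Because a data product's input port and the upstream product's output port are the same resource (they share the same URL), a small variant of this query (a sketch, not part of the original example) returns the upstream data products themselves rather than the dataset labels:

```sparql
PREFIX dprod: <https://ekgf.github.io/data-product-spec/dprod/>
PREFIX : <https://y.com/data-product/>

SELECT DISTINCT ?upstreamProduct
WHERE
{
  # Any product whose output port is one of company-finance's input ports
  # is an upstream data product that feeds it.
  :company-finance dprod:inputPort ?port .
  ?upstreamProduct dprod:outputPort ?port .
}
```

Against the example data above, this should return `https://y.com/data-product/company-sales` and `https://y.com/data-product/company-hr`, matching the walk shown earlier.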



### Detailed Level: Between Datasets

If you wish to track lineage at a more granular level, you can also use PROV (https://www.w3.org/TR/prov-o/) at the dataset level.

```ttl
dap:atnf-P366-2003SEPT
rdf:type dcat:Dataset ;
dcterms:bibliographicCitation "Burgay, M; McLaughlin, M; Kramer, M; Lyne, A; Joshi, B; Pearce, G; D'Amico, N; Possenti, A; Manchester, R; Camilo, F (2017): Parkes observations for project P366 semester 2003SEPT. v1. CSIRO. Data Collection. https://doi.org/10.4225/08/598dc08d07bb7" ;
dcterms:title "Parkes observations for project P366 semester 2003SEPT"@en ;
dcat:landingPage <https://data.csiro.au/dap/landingpage?pid=csiro:P366-2003SEPT> ;
prov:wasGeneratedBy dap:P366 ;
.
dap:P366
rdf:type prov:Activity ;
dcterms:type <http://dbpedia.org/resource/Observation> ;
prov:startedAtTime "2000-11-01"^^xsd:date ;
prov:used dap:Parkes-radio-telescope ;
prov:wasInformedBy dap:ATNF ;
rdfs:label "P366 - Parkes multibeam high-latitude pulsar survey"@en ;
rdfs:seeAlso <https://doi.org/10.1111/j.1365-2966.2006.10100.x> ;
.
```

See: https://www.w3.org/TR/vocab-dcat-3/#examples-dataset-provenance.
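As a sketch, assuming the `dcat:` and `prov:` prefixes above resolve to the standard DCAT and PROV namespaces, this dataset-level lineage can be queried like so:

```sparql
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX prov: <http://www.w3.org/ns/prov#>

SELECT ?dataset ?activity ?input
WHERE
{
  # Each dataset, the activity that generated it, and whatever that activity used.
  ?dataset a dcat:Dataset ;
           prov:wasGeneratedBy ?activity .
  OPTIONAL { ?activity prov:used ?input . }
}
```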
79 changes: 76 additions & 3 deletions examples/observability/README.md
@@ -1,8 +1,81 @@
## Observability Ports

An Observability Port is a designated interface or endpoint in a system or application specifically used for monitoring and diagnostic purposes. It allows external tools or services to collect and analyze data related to the system's performance, health, and behaviour. By exposing metrics, logs, and traces through this port, administrators and developers can gain insights into the system's state, troubleshoot issues, and ensure it operates efficiently and reliably.

### Defining Observability Ports in DPROD

DPROD has a schema-first design. The first thing you would need to do is define a schema for your logging information. It could be a schema based on OpenTelemetry, but in this example, we use RLOG (which is a semantic ontology for logging).

To find the Observability Port, you would query the ports to identify the ones whose dataset conforms to your logging schema (for example, an `rlog:Entry`):

```text
outputPort >> isAccessServiceOf >> isDistributionOf >> conformsTo >> rlog:Entry
```

### Example Data Product with Observability Port

Here is an example of a data product with two ports: one exposing the data and one exposing the logging information.

```json
{
  "@context": "https://ekgf.github.io/data-product-spec/dprod.jsonld",
  "dataProducts": [
    {
      "id": "https://y.com/data-product/uk-bonds",
      "type": "DataProduct",
      "inputPort": [
        {
          "id": "https://y.com/data-product/uk-bonds/port/2024-data",
          "type": "DataService"
        }
      ],
      "outputPort": [
        {
          "id": "https://y.com/data-product/uk-bonds/port/2024-observability",
          "type": "DataService",
          "label": "Observability Port",
          "endpointURL": "https://y.com/data-product/uk-bonds/port/2024-observability",
          "isAccessServiceOf": {
            "type": "Distribution",
            "format": "https://www.iana.org/assignments/media-types/application/json",
            "isDistributionOf": {
              "type": "Dataset",
              "id": "https://y.com/data-product/uk-bonds/dataset/2024-observability",
              "conformsTo": "https://y.com/schema/ObservabilityLog"
            }
          }
        },
        {
          "id": "https://y.com/data-product/uk-bonds/port/2024-data",
          "type": "DataService",
          "label": "Data Port",
          "endpointURL": "https://y.com/data-product/uk-bonds/port/2024-data",
          "isAccessServiceOf": {
            "type": "Distribution",
            "format": "https://www.iana.org/assignments/media-types/application/json",
            "isDistributionOf": {
              "type": "Dataset",
              "id": "https://y.com/data-product/uk-bonds/dataset/2024-data",
              "conformsTo": "https://y.com/schema/Data"
            }
          }
        }
      ]
    }
  ]
}
```

Given that our schema defines the class for the observability log entries, we can use it to find all observability ports on a data product like this:

```text
[https://y.com/data-product/uk-bonds/port/2024-observability] >> isAccessServiceOf >> isDistributionOf >> conformsTo >> https://y.com/schema/ObservabilityLog
```

In Linked Data, we would use a SPARQL query to do this, selecting the ports whose dataset conforms to the logging schema:
```sparql
SELECT ?port
WHERE
@@ -12,4 +85,4 @@ WHERE
}
```
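The diff collapses the middle of that query. As a sketch (assuming `conformsTo` maps to `dct:conformsTo`, as it does in DCAT), the complete query would look something like this:

```sparql
PREFIX dprod: <https://ekgf.github.io/data-product-spec/dprod/>
PREFIX dct: <http://purl.org/dc/terms/>

SELECT ?port
WHERE
{
  # Output ports of uk-bonds whose dataset conforms to the logging schema.
  <https://y.com/data-product/uk-bonds> dprod:outputPort ?port .
  ?port dprod:isAccessServiceOf/dprod:isDistributionOf/dct:conformsTo <https://y.com/schema/ObservabilityLog> .
}
```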

This query will return the URI of the port that provides logging data: `https://y.com/data-product/uk-bonds/port/2024-observability`.
34 changes: 0 additions & 34 deletions examples/observability/example.json

This file was deleted.
