From 9d0b2c6b32d3679037aaa0cb82348a8185e05efb Mon Sep 17 00:00:00 2001 From: Alex Nelson Date: Wed, 15 Nov 2023 10:31:38 -0500 Subject: [PATCH] Define DownloadableObjects hierarchy and DownloadableRelation This patch re-introduces `DownloadableFile`, as well as some related classes, to describe a concept IRI that behaves as a file-yielding URL. This patch removes a qualified SHACL constraint that had been written on a conflated-classes guess that instigated UCO Issue 534. After discussion on Issue 534, this shape should have been removed, but it wasn't until pySHACL Issue 213 was addressed that the shape started triggering and raising awareness it was still around. Hand-maintained data, as well as the Digital Corpora rendering script, have been updated to accommodate the new shapes around `DownloadableObject` and `DownloadableRelation`. A later patch will need to add the `DownloadableRelation`s to the chain of custody. A follow-on patch will regenerate Make-managed files. References: * https://github.com/RDFLib/pySHACL/issues/213 * https://github.com/ucoProject/UCO/issues/534 Signed-off-by: Alex Nelson --- catalog/catalog.ttl | 16 +- catalog/datasets/crossover/supplemental.ttl | 145 +++++++++++++++--- .../dfrws2017-challenge/supplemental.ttl | 65 +++++++- catalog/kb-datasets-shapes.ttl | 8 +- ontology/case-corpora.ttl | 74 ++++++++- shapes/sh-case-corpora.ttl | 89 +++++++++-- src/digital_corpora_supplement_ttl.py | 109 ++++++++++--- tests/distribution_PASS.ttl | 21 ++- tests/distribution_XFAIL.ttl | 65 +++++--- tests/test_shapes.py | 7 +- 10 files changed, 505 insertions(+), 94 deletions(-) diff --git a/catalog/catalog.ttl b/catalog/catalog.ttl index a2e21ad..a571423 100644 --- a/catalog/catalog.ttl +++ b/catalog/catalog.ttl @@ -20,6 +20,19 @@ owl:imports ; . 
+kb:DownloadableRelation-db9bbe6f-8a71-41b4-bab2-122d7a048ca0 + a case-corpora:DownloadableRelation ; + uco-core:isDirectional "true"^^xsd:boolean ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:URL-1816262c-bd39-458a-b7a9-face4a0a35bb ; + . + +kb:URL-1816262c-bd39-458a-b7a9-face4a0a35bb + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-9b081add-c597-4bf4-80d6-2a391eba3a50 ; + . + kb:catalog-f4ffe9dd-9f19-4756-ad7f-24356e5e9752 a case-corpora:Catalog ; case-corpora:hasDistribution kb:distribution-adc5fd35-e9fa-4d1b-8598-f5ce3bba3e9c ; @@ -64,7 +77,6 @@ kb:url-facet-9b081add-c597-4bf4-80d6-2a391eba3a50 . - a uco-observable:URL ; - uco-core:hasFacet kb:url-facet-9b081add-c597-4bf4-80d6-2a391eba3a50 ; + a case-corpora:DownloadableObject ; . diff --git a/catalog/datasets/crossover/supplemental.ttl b/catalog/datasets/crossover/supplemental.ttl index cbb8978..94c8c76 100644 --- a/catalog/datasets/crossover/supplemental.ttl +++ b/catalog/datasets/crossover/supplemental.ttl @@ -1,3 +1,4 @@ +@prefix case-corpora: . @prefix drafting: . @prefix kb: . @prefix owl: . @@ -8,6 +9,123 @@ @prefix wd: . @prefix xsd: . +kb:DownloadableRelation-0d749b77-bb7a-4fd7-adbd-ebafae6ae9ee + a case-corpora:DownloadableRelation ; + uco-core:isDirectional "true"^^xsd:boolean ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:URL-55a8eeff-4609-451e-b29b-3f4528d09054 ; + . + +kb:DownloadableRelation-2aa6c5fe-ea85-4e5f-a398-5a743222ec94 + a case-corpora:DownloadableRelation ; + uco-core:isDirectional "true"^^xsd:boolean ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:URL-28ebac3b-bd82-47a9-98fa-c5eade040705 ; + . 
+ +kb:DownloadableRelation-374af858-bfe4-4419-a1c9-02dc68fc01b5 + a case-corpora:DownloadableRelation ; + uco-core:isDirectional "true"^^xsd:boolean ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:URL-650ec343-ae9f-4889-9dfd-0f45606f526b ; + . + +kb:DownloadableRelation-3b1752c2-7806-4cb5-926a-f58067fb68c7 + a case-corpora:DownloadableRelation ; + uco-core:isDirectional "true"^^xsd:boolean ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:URL-3585f3b1-8279-4f8c-8e1e-3b18010cafcd ; + . + +kb:DownloadableRelation-3ed4592e-e33a-4e6c-893f-6ac7973156c4 + a case-corpora:DownloadableRelation ; + uco-core:isDirectional "true"^^xsd:boolean ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:URL-11ebee68-c7bc-4130-be73-1298181575a1 ; + . + +kb:DownloadableRelation-44907477-cfa5-4e37-a407-379a58627ccb + a case-corpora:DownloadableRelation ; + uco-core:isDirectional "true"^^xsd:boolean ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:URL-f1cec27b-b243-4a98-8d57-719376b059d1 ; + . + +kb:DownloadableRelation-73e77820-e08f-4196-a1f0-5bfb95fb07a6 + a case-corpora:DownloadableRelation ; + uco-core:isDirectional "true"^^xsd:boolean ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:URL-05e18d67-0d0d-4522-8c73-c4fc73a37585 ; + . + +kb:DownloadableRelation-a1696319-ae91-4741-895b-19e23e6bc387 + a case-corpora:DownloadableRelation ; + uco-core:isDirectional "true"^^xsd:boolean ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:URL-0ceb353f-d908-478d-8f8d-07d75589046c ; + . 
+ +kb:DownloadableRelation-e73e5990-d2e6-45fa-b801-dbdf1b96838e + a case-corpora:DownloadableRelation ; + uco-core:isDirectional "true"^^xsd:boolean ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:URL-44c35322-f850-4338-969e-45d9fb5e511e ; + . + +kb:URL-05e18d67-0d0d-4522-8c73-c4fc73a37585 + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-88fc82c0-3994-434c-8ec7-8e2261060d71 ; + . + +kb:URL-0ceb353f-d908-478d-8f8d-07d75589046c + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-a1647374-f59d-48c1-b763-b93b37d32536 ; + . + +kb:URL-11ebee68-c7bc-4130-be73-1298181575a1 + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-bc8514bd-3852-4c91-a45f-5e587b0062e9 ; + . + +kb:URL-28ebac3b-bd82-47a9-98fa-c5eade040705 + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-2645d00d-1b22-44c6-bd39-82550044a50f ; + . + +kb:URL-3585f3b1-8279-4f8c-8e1e-3b18010cafcd + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-92bc17e0-745c-48bb-acc0-ea634746711a ; + . + +kb:URL-44c35322-f850-4338-969e-45d9fb5e511e + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-74e92a95-d670-454f-b606-4ad7135dbb73 ; + . + +kb:URL-55a8eeff-4609-451e-b29b-3f4528d09054 + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-0fa77d25-7669-4486-bc57-02b9f2fe5a6f ; + . + +kb:URL-650ec343-ae9f-4889-9dfd-0f45606f526b + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-abc1e0e4-c4b3-42f6-b83d-e0f6567c2903 ; + . + +kb:URL-f1cec27b-b243-4a98-8d57-719376b059d1 + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-9aa3ffbf-0251-4d56-b22e-dbe6f9cbda2d ; + . + kb:device-facet-1ee161fc-5b41-48e0-bbdc-69ffa8c03230 a uco-observable:DeviceFacet ; uco-observable:manufacturer wd:Q20716 ; @@ -66,47 +184,38 @@ kb:url-facet-bc8514bd-3852-4c91-a45f-5e587b0062e9 . - a uco-observable:URL ; - uco-core:hasFacet kb:url-facet-0fa77d25-7669-4486-bc57-02b9f2fe5a6f ; + a case-corpora:DownloadableFile ; . 
- a uco-observable:URL ; - uco-core:hasFacet kb:url-facet-2645d00d-1b22-44c6-bd39-82550044a50f ; + a case-corpora:DownloadableFile ; . - a uco-observable:URL ; - uco-core:hasFacet kb:url-facet-74e92a95-d670-454f-b606-4ad7135dbb73 ; + a case-corpora:DownloadableFile ; . - a uco-observable:URL ; - uco-core:hasFacet kb:url-facet-88fc82c0-3994-434c-8ec7-8e2261060d71 ; + a case-corpora:DownloadableFile ; . - a uco-observable:URL ; - uco-core:hasFacet kb:url-facet-92bc17e0-745c-48bb-acc0-ea634746711a ; + a case-corpora:DownloadableFile ; . - a uco-observable:URL ; - uco-core:hasFacet kb:url-facet-9aa3ffbf-0251-4d56-b22e-dbe6f9cbda2d ; + a case-corpora:DownloadableFile ; . - a uco-observable:URL ; - uco-core:hasFacet kb:url-facet-a1647374-f59d-48c1-b763-b93b37d32536 ; + a case-corpora:DownloadableFile ; . - a uco-observable:URL ; - uco-core:hasFacet kb:url-facet-abc1e0e4-c4b3-42f6-b83d-e0f6567c2903 ; + a case-corpora:DownloadableFile ; . - a uco-observable:URL ; - uco-core:hasFacet kb:url-facet-bc8514bd-3852-4c91-a45f-5e587b0062e9 ; + a case-corpora:DownloadableFile ; . diff --git a/catalog/datasets/dfrws2017-challenge/supplemental.ttl b/catalog/datasets/dfrws2017-challenge/supplemental.ttl index bfc0d80..b2a4238 100644 --- a/catalog/datasets/dfrws2017-challenge/supplemental.ttl +++ b/catalog/datasets/dfrws2017-challenge/supplemental.ttl @@ -1,3 +1,4 @@ +@prefix case-corpora: . @prefix kb: . @prefix owl: . @prefix prov: . @@ -8,6 +9,58 @@ @prefix wd: . @prefix xsd: . +kb:DownloadableRelation-03131d31-e559-49d5-a3a6-e75d2fbf0ee3 + a case-corpora:DownloadableRelation ; + uco-core:isDirectional "true"^^xsd:boolean ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:URL-d7608b02-86e1-4973-ad57-0e7b3e16256a ; + . 
+ +kb:DownloadableRelation-45729df9-dbb6-402f-a946-121b78af234f + a case-corpora:DownloadableRelation ; + uco-core:isDirectional "true"^^xsd:boolean ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:URL-f21d9735-65ea-41a3-a032-ce7afae23455 ; + . + +kb:DownloadableRelation-aca66eb5-d2d0-4c4a-a96e-1153008fd05a + a case-corpora:DownloadableRelation ; + uco-core:isDirectional "true"^^xsd:boolean ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:URL-69e88242-f05e-43fb-b94d-74a03023e372 ; + . + +kb:DownloadableRelation-f3f089bb-ac1c-4250-b006-da46e89cd230 + a case-corpora:DownloadableRelation ; + uco-core:isDirectional "true"^^xsd:boolean ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:URL-7023cbc0-a453-4972-84ce-58f8c1636407 ; + . + +kb:URL-69e88242-f05e-43fb-b94d-74a03023e372 + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-2f2958a3-7e9d-4d14-b31f-4148d35ab52c ; + . + +kb:URL-7023cbc0-a453-4972-84ce-58f8c1636407 + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-9b9d4a1a-059e-47f8-aee0-d93edf61b5f2 ; + . + +kb:URL-d7608b02-86e1-4973-ad57-0e7b3e16256a + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-d9e4bbae-4b47-4d8c-b547-9e524d3a4ca0 ; + . + +kb:URL-f21d9735-65ea-41a3-a032-ce7afae23455 + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-785c662e-8261-4a17-8031-114f86f99af1 ; + . + kb:device-650599c6-701f-4f2e-becb-74398b366ba3 a uco-observable:Device ; uco-core:description "Google OnHub wifi router connected to SmartThings Hub and IPTime switch" ; @@ -85,22 +138,18 @@ kb:url-facet-d9e4bbae-4b47-4d8c-b547-9e524d3a4ca0 . - a uco-observable:URL ; - uco-core:hasFacet kb:url-facet-2f2958a3-7e9d-4d14-b31f-4148d35ab52c ; + a case-corpora:DownloadableFile ; . - a uco-observable:URL ; - uco-core:hasFacet kb:url-facet-785c662e-8261-4a17-8031-114f86f99af1 ; + a case-corpora:DownloadableFile ; . 
- a uco-observable:URL ; - uco-core:hasFacet kb:url-facet-9b9d4a1a-059e-47f8-aee0-d93edf61b5f2 ; + a case-corpora:DownloadableFile ; . - a uco-observable:URL ; - uco-core:hasFacet kb:url-facet-d9e4bbae-4b47-4d8c-b547-9e524d3a4ca0 ; + a case-corpora:DownloadableFile ; . diff --git a/catalog/kb-datasets-shapes.ttl b/catalog/kb-datasets-shapes.ttl index f2651e2..706840f 100644 --- a/catalog/kb-datasets-shapes.ttl +++ b/catalog/kb-datasets-shapes.ttl @@ -1,3 +1,4 @@ +@prefix case-corpora: . @prefix owl: . @prefix rdf: . @prefix rdfs: . @@ -10,8 +11,13 @@ rdfs:comment "This file contains adjustments to make solely for validating kb-datasets.ttl."@en ; . +case-corpora:DownloadableObject + sh:deactivated "true"^^xsd:boolean ; + sh:description "This shape is deactivated for kb-datasets.ttl, so each distribution.ttl can make references to download URLs without needing to specify DownloadableRelations." ; + . + sh-case-corpora:hasDownloadURL-range-shape sh:deactivated "true"^^xsd:boolean ; - sh:description "This shape is deactivated for kb-datasets.ttl, so each distribution.ttl can make references to download URLs without needing to fully specify the URLs." ; + sh:description "This shape is deactivated for kb-datasets.ttl, so each distribution.ttl can make references to download URLs without needing to fully specify the \"download URL's\" class." ; . diff --git a/ontology/case-corpora.ttl b/ontology/case-corpora.ttl index 6c9cd83..f30719f 100644 --- a/ontology/case-corpora.ttl +++ b/ontology/case-corpora.ttl @@ -73,6 +73,78 @@ case-corpora:Distribution ; . +case-corpora:DownloadableContentData + a owl:Class ; + rdfs:subClassOf + case-corpora:DownloadableObject , + uco-observable:ContentData + ; + . + +case-corpora:DownloadableFile + a owl:Class ; + rdfs:subClassOf + case-corpora:DownloadableObject , + uco-observable:File + ; + . + +case-corpora:DownloadableObject + a owl:Class ; + rdfs:comment "An ObservableObject that can be downloaded via some URL.
DownloadableObject is defined to specifically exclude some classes that participate in a download action in different ways, especially URL and DownloadableRelation."@en ; + owl:equivalentClass [ + a owl:Class ; + owl:intersectionOf ( + uco-observable:ObservableObject + [ + a owl:Class ; + owl:complementOf uco-observable:ObservableRelationship ; + ] + [ + a owl:Class ; + owl:complementOf uco-observable:URL ; + ] + [ + a owl:Restriction ; + owl:onProperty [ + owl:inverseOf uco-core:source ; + ] ; + owl:someValuesFrom case-corpora:DownloadableRelation ; + ] + ) ; + ] ; + skos:editorialNote "Members of this class are added through extrinsic relationships, namely by being part of a DownloadableRelation."@en ; + . + +case-corpora:DownloadableRelation + a owl:Class ; + rdfs:subClassOf + uco-observable:ObservableRelationship , + [ + a owl:Restriction ; + owl:onProperty uco-core:source ; + owl:allValuesFrom case-corpora:DownloadableObject ; + ] , + [ + a owl:Restriction ; + owl:onProperty uco-core:target ; + owl:allValuesFrom uco-observable:URL ; + ] , + [ + a owl:Restriction ; + owl:onProperty uco-core:kindOfRelationship ; + owl:hasValue "Downloadable_From" ; + ] , + [ + a owl:Restriction ; + owl:onProperty uco-core:isDirectional ; + owl:hasValue "true"^^xsd:boolean ; + ] + ; + rdfs:comment "A Relationship that ties a File or ContentData to a URL from which the File (or ContentData) can be downloaded."@en ; + skos:editorialNote "This class name pattern is drawn from PathRelationFacet."@en ; + . + case-corpora:IANAMediaType a owl:Class ; rdfs:subClassOf dcterms:MediaType ; @@ -148,7 +220,7 @@ case-corpora:hasDownloadURL a owl:ObjectProperty ; rdfs:subPropertyOf dcat:downloadURL ; rdfs:domain case-corpora:Distribution ; - rdfs:range uco-observable:URL ; + rdfs:range case-corpora:DownloadableObject ; . 
drafting:FileHashQuality diff --git a/shapes/sh-case-corpora.ttl b/shapes/sh-case-corpora.ttl index d2e58ca..82b3316 100644 --- a/shapes/sh-case-corpora.ttl +++ b/shapes/sh-case-corpora.ttl @@ -136,6 +136,78 @@ case-corpora:Distribution sh:targetClass case-corpora:Distribution ; . +case-corpora:DownloadableObject + a sh:NodeShape ; + sh:not [ + a sh:NodeShape ; + sh:or ( + [ + a sh:NodeShape ; + sh:class uco-observable:ObservableRelationship ; + ] + [ + a sh:NodeShape ; + sh:class uco-observable:URL ; + ] + ) ; + ] ; + sh:property [ + a sh:PropertyShape ; + sh:message "CASE-Corpora requires a DownloadableObject have a stated tie to a DownloadableRelation."@en ; + sh:path [ + sh:inversePath uco-core:source ; + ] ; + sh:qualifiedMinCount "1"^^xsd:integer ; + sh:qualifiedValueShape [ + a sh:NodeShape ; + sh:class case-corpora:DownloadableRelation ; + ] ; + ] ; + sh:targetClass case-corpora:DownloadableObject ; + . + +case-corpora:DownloadableRelation + a sh:NodeShape ; + sh:property + [ + a sh:PropertyShape ; + sh:class case-corpora:DownloadableObject ; + sh:path uco-core:source ; + ] , + [ + a sh:PropertyShape ; + sh:class uco-observable:URL ; + sh:message "CASE-Corpora requires a URL tied to a DownloadableRelation have its full value stored in the fullValue of a URLFacet."@en ; + sh:path uco-core:target ; + sh:property [ + a sh:PropertyShape ; + sh:path uco-core:hasFacet ; + sh:qualifiedMinCount "1"^^xsd:integer ; + sh:qualifiedValueShape [ + a sh:NodeShape ; + sh:class uco-observable:URLFacet ; + sh:property [ + a sh:PropertyShape ; + sh:minCount "1"^^xsd:integer ; + sh:path uco-observable:fullValue ; + ] ; + ] ; + ] ; + ] , + [ + a sh:PropertyShape ; + sh:hasValue "Downloadable_From" ; + sh:path uco-core:kindOfRelationship ; + ] , + [ + a sh:PropertyShape ; + sh:hasValue "true"^^xsd:boolean ; + sh:path uco-core:isDirectional ; + ] + ; + sh:targetClass case-corpora:DownloadableRelation ; + . 
+ drafting:FileHashQuality a sh:NodeShape ; sh:property [ @@ -233,22 +305,7 @@ sh-case-corpora:hasDistribution-subjects-shape sh-case-corpora:hasDownloadURL-objects-shape a sh:NodeShape ; - sh:class uco-observable:URL ; - sh:property [ - a sh:PropertyShape ; - sh:message "CASE-Corpora requires any referenced Distribution have a downloadable URL, stored in the fullValue of a URLFacet."@en ; - sh:path uco-core:hasFacet ; - sh:qualifiedMinCount "1"^^xsd:integer ; - sh:qualifiedValueShape [ - a sh:NodeShape ; - sh:class uco-observable:URLFacet ; - sh:property [ - a sh:PropertyShape ; - sh:minCount "1"^^xsd:integer ; - sh:path uco-observable:fullValue ; - ] ; - ] ; - ] ; + sh:class case-corpora:DownloadableObject ; sh:targetObjectsOf case-corpora:hasDownloadURL ; . diff --git a/src/digital_corpora_supplement_ttl.py b/src/digital_corpora_supplement_ttl.py index 71e6c1d..974278f 100644 --- a/src/digital_corpora_supplement_ttl.py +++ b/src/digital_corpora_supplement_ttl.py @@ -16,8 +16,9 @@ import logging import os import re +import uuid from csv import DictReader -from typing import Any, Dict, List, Set +from typing import Any, Dict, List, Optional, Set import urllib.parse from case_utils.namespace import ( @@ -32,6 +33,7 @@ L_SHA256, L_SHA3_256, get_facet_uriref, + inherence_uuid, ) from case_utils_extras import method_value_to_node from rdflib import Graph, Literal, Namespace, URIRef @@ -42,6 +44,72 @@ RX_HEXBINARY = re.compile("^[0-9a-f]+$", re.IGNORECASE) +def characterize_url(graph: Graph, n_thing: URIRef, ns_kb: Namespace) -> URIRef: + """ + Guarantee URLFacet is present. Populate, if not found already defined. + + :returns: Returns generated observable:URL IRI. 
+ """ + url_uuid = uuid.uuid5(uuid.NAMESPACE_URL, str(n_thing)) + n_url = ns_kb["URL-" + str(url_uuid)] + n_url_facet: Optional[URIRef] = None + for n_object in graph.objects(n_url, NS_UCO_CORE.hasFacet): + assert isinstance(n_object, URIRef) + for triple in graph.triples( + (n_object, NS_RDF.type, NS_UCO_OBSERVABLE.URLFacet) + ): + n_url_facet = n_object + if n_url_facet is None: + n_url_facet = get_facet_uriref( + n_url, NS_UCO_OBSERVABLE.URLFacet, namespace=ns_kb + ) + graph.add((n_url, NS_UCO_CORE.hasFacet, n_url_facet)) + graph.add((n_url_facet, NS_RDF.type, NS_UCO_OBSERVABLE.URLFacet)) + graph.add((n_url_facet, NS_UCO_OBSERVABLE.fullValue, Literal(str(n_thing)))) + return n_url + + +def get_digital_corpora_sends_relation( + graph: Graph, + n_downloadable_object: URIRef, + n_s3_object: URIRef, + namespace: Namespace, +) -> URIRef: + """ + :returns: Returns a deterministic IRI for an S3Object's mapping to an HTTPS URL. + """ + uuid0 = inherence_uuid(n_downloadable_object) + uuid1 = uuid.uuid5(uuid0, str(n_s3_object)) + uuid2 = uuid.uuid5(uuid1, "Sends") + n_relationship = namespace["ObservableRelationship-" + str(uuid2)] + graph.add((n_relationship, NS_RDF.type, NS_UCO_OBSERVABLE.ObservableRelationship)) + graph.add((n_relationship, NS_UCO_CORE.isDirectional, Literal(True))) + graph.add((n_relationship, NS_UCO_CORE.kindOfRelationship, Literal("Sends"))) + graph.add((n_relationship, NS_UCO_CORE.source, n_downloadable_object)) + graph.add((n_relationship, NS_UCO_CORE.target, n_s3_object)) + return n_relationship + + +def get_digital_corpora_downloadable_relation( + graph: Graph, n_downloadable_object: URIRef, n_url: URIRef, namespace: Namespace +) -> URIRef: + """ + :returns: Returns a deterministic IRI for an S3Object downloadable by Digital Corpora's directory presentation protocol. 
+ """ + uuid0 = inherence_uuid(n_downloadable_object) + uuid1 = uuid.uuid5(uuid0, str(n_url)) + uuid2 = uuid.uuid5(uuid1, "Downloadable_From") + n_relationship = namespace["DownloadableRelation-" + str(uuid2)] + graph.add((n_relationship, NS_RDF.type, NS_CASE_CORPORA.DownloadableRelation)) + graph.add((n_relationship, NS_UCO_CORE.isDirectional, Literal(True))) + graph.add( + (n_relationship, NS_UCO_CORE.kindOfRelationship, Literal("Downloadable_From")) + ) + graph.add((n_relationship, NS_UCO_CORE.source, n_downloadable_object)) + graph.add((n_relationship, NS_UCO_CORE.target, n_url)) + return n_relationship + + def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--debug", action="store_true") @@ -85,15 +153,19 @@ def main() -> None: continue s3key_quoted = urllib.parse.quote(row["s3key"]) n_s3_object = URIRef("s3://digitalcorpora/" + s3key_quoted) - n_download_url = URIRef( + n_downloadable_object = URIRef( "https://digitalcorpora.s3.amazonaws.com/" + s3key_quoted ) - if n_download_url not in n_subjects: + if n_downloadable_object not in n_subjects: continue - n_subjects.remove(n_download_url) + n_subjects.remove(n_downloadable_object) graph.add((n_s3_object, NS_RDF.type, NS_DRAFTING.S3Object)) - graph.add((n_download_url, NS_RDF.type, NS_UCO_OBSERVABLE.URL)) + graph.add( + (n_downloadable_object, NS_RDF.type, NS_CASE_CORPORA.DownloadableObject) + ) + n_url = characterize_url(graph, n_downloadable_object, NS_KB) + graph.add((n_url, NS_RDF.type, NS_UCO_OBSERVABLE.URL)) graph.add( ( n_s3_object, @@ -103,7 +175,7 @@ def main() -> None: ) graph.add( ( - n_download_url, + n_downloadable_object, NS_UCO_CORE.createdBy, NS_KB["organization-72ec45c9-ea94-4503-9428-ad73300056f5"], ) @@ -122,13 +194,23 @@ def main() -> None: (n_content_data_facet, NS_RDF.type, NS_UCO_OBSERVABLE.ContentDataFacet) ) graph.add((n_s3_object, NS_UCO_CORE.hasFacet, n_content_data_facet)) + + # Tie S3 Object to URL, two ways. + # First, uco-observable:dataPayloadReferenceURL. 
+ # Second, DownloadableRelation. graph.add( ( n_content_data_facet, NS_UCO_OBSERVABLE.dataPayloadReferenceURL, - n_download_url, + n_downloadable_object, ) ) + _ = get_digital_corpora_downloadable_relation( + graph, n_downloadable_object, n_url, NS_KB + ) + _ = get_digital_corpora_sends_relation( + graph, n_downloadable_object, n_s3_object, NS_KB + ) n_file_facet = get_facet_uriref( n_s3_object, NS_UCO_OBSERVABLE.FileFacet, namespace=NS_KB @@ -136,19 +218,6 @@ def main() -> None: graph.add((n_file_facet, NS_RDF.type, NS_UCO_OBSERVABLE.FileFacet)) graph.add((n_s3_object, NS_UCO_CORE.hasFacet, n_file_facet)) - n_url_facet = get_facet_uriref( - n_download_url, NS_UCO_OBSERVABLE.URLFacet, namespace=NS_KB - ) - graph.add((n_url_facet, NS_RDF.type, NS_UCO_OBSERVABLE.URLFacet)) - graph.add((n_download_url, NS_UCO_CORE.hasFacet, n_url_facet)) - graph.add( - ( - n_url_facet, - NS_UCO_OBSERVABLE.fullValue, - Literal(str(n_download_url)), - ) - ) - graph.add( ( n_file_facet, diff --git a/tests/distribution_PASS.ttl b/tests/distribution_PASS.ttl index ea9a937..c0e6b58 100644 --- a/tests/distribution_PASS.ttl +++ b/tests/distribution_PASS.ttl @@ -26,20 +26,29 @@ kb:file-facet-446a4dcc-5eb8-4df7-941a-d0552c2c354c uco-observable:fileName "dataset.dat" . +kb:downloadable-relation-75ad1d7a-8c10-4890-8f86-68c79528cc43 + a case-corpora:DownloadableRelation ; + uco-core:isDirectional true ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:url-ce3d16e9-85d7-48e2-b898-a88994375677 ; + . + +kb:url-ce3d16e9-85d7-48e2-b898-a88994375677 + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-6e2f1463-d1c2-4281-ab6f-b31256f71715 ; + . + kb:url-facet-6e2f1463-d1c2-4281-ab6f-b31256f71715 a uco-observable:URLFacet ; uco-observable:fullValue "http://example.org/dataset.dat" ; . 
- a - uco-observable:File , - uco-observable:URL - ; + a case-corpora:DownloadableFile ; uco-core:hasFacet kb:content-data-facet-2d69d86e-fb28-44bc-9615-f8dbb9f8a6de , - kb:file-facet-446a4dcc-5eb8-4df7-941a-d0552c2c354c , - kb:url-facet-6e2f1463-d1c2-4281-ab6f-b31256f71715 + kb:file-facet-446a4dcc-5eb8-4df7-941a-d0552c2c354c ; . diff --git a/tests/distribution_XFAIL.ttl b/tests/distribution_XFAIL.ttl index ff52152..e3ae1fe 100644 --- a/tests/distribution_XFAIL.ttl +++ b/tests/distribution_XFAIL.ttl @@ -23,7 +23,7 @@ kb:content-data-facet-ccd3fac6-e2ad-4966-9614-80b1e72f2da9 kb:distribution-9e4bd389-eaba-4d18-90e6-6f0a171fe9fe a case-corpora:Distribution ; - rdfs:comment "This will trigger an error from the download URL not having a URL Facet."@en ; + rdfs:comment "This will trigger an error from the download URL not tying (indirectly) to a URL Facet."@en ; case-corpora:hasDownloadURL ; dcat:mediaType ; . @@ -33,11 +33,20 @@ kb:file-facet-d5526f1f-ce26-4248-9322-a54a4887fe90 uco-observable:fileName "dataset-1.dat" . +kb:url-dd3e2489-37f0-478c-92bb-c45831337dd4 + a uco-observable:URL ; + . + +kb:downloadable-relation-2f7b00ba-6574-4645-9ace-0f59a50c6bf8 + a case-corpora:DownloadableRelation ; + uco-core:isDirectional true ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:url-dd3e2489-37f0-478c-92bb-c45831337dd4 ; + . 
+ - a - uco-observable:File , - uco-observable:URL - ; + a case-corpora:DownloadableFile ; uco-core:hasFacet kb:content-data-facet-ccd3fac6-e2ad-4966-9614-80b1e72f2da9 , kb:file-facet-d5526f1f-ce26-4248-9322-a54a4887fe90 @@ -53,7 +62,7 @@ kb:content-data-facet-c39833a3-13f2-4131-8415-7c16296e907b kb:distribution-868aafbd-f783-4b9c-849e-c3436f806810 a case-corpora:Distribution ; - rdfs:comment "This will trigger an error from the download URL referencing a Facet that could be a URL Facet, but lacks an explicit type."@en ; + rdfs:comment "This will trigger an error from the associated download URL referencing a Facet that could be a URL Facet, but lacks an explicit type."@en ; case-corpora:hasDownloadURL ; dcat:mediaType ; . @@ -63,19 +72,28 @@ kb:file-facet-facd7584-bcb2-41c7-a26a-fe3972a87fc0 uco-observable:fileName "dataset-2.dat" . +kb:url-34d751fd-9039-45fc-87c5-b1cdbfc7ef10 + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-fcfb2373-24f7-457b-9dd0-b9b909cadb86 ; + . + kb:url-facet-fcfb2373-24f7-457b-9dd0-b9b909cadb86 uco-observable:fullValue "http://example.org/dataset-2.dat" ; . +kb:downloadable-relation-0477d45f-1112-4df6-aa6d-8f61b91deb59 + a case-corpora:DownloadableRelation ; + uco-core:isDirectional true ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:url-34d751fd-9039-45fc-87c5-b1cdbfc7ef10 ; + . + - a - uco-observable:File , - uco-observable:URL - ; + a case-corpora:DownloadableFile ; uco-core:hasFacet kb:content-data-facet-c39833a3-13f2-4131-8415-7c16296e907b , - kb:file-facet-facd7584-bcb2-41c7-a26a-fe3972a87fc0 , - kb:url-facet-fcfb2373-24f7-457b-9dd0-b9b909cadb86 + kb:file-facet-facd7584-bcb2-41c7-a26a-fe3972a87fc0 ; . 
@@ -89,7 +107,7 @@ kb:content-data-facet-84af03ab-efce-401f-8fc7-84ffc680eb3b kb:distribution-79b6e1c6-2823-470e-b603-e136b31ecdce a case-corpora:Distribution ; - rdfs:comment "This will trigger an error from the download URL having a URL Facet without a fullValue."@en ; + rdfs:comment "This will trigger an error from the associated download URL having a URL Facet without a fullValue."@en ; case-corpora:hasDownloadURL ; dcat:mediaType ; . @@ -99,19 +117,28 @@ kb:file-facet-81d9b1f4-a3aa-4b33-b23e-e9532d8bf526 uco-observable:fileName "dataset-3.dat" . +kb:url-b51dfe9c-48bf-41a8-97fd-cf139845fa8c + a uco-observable:URL ; + uco-core:hasFacet kb:url-facet-4bcc4d8f-d201-477e-9bdf-dd3bf4b243cc ; + . + kb:url-facet-4bcc4d8f-d201-477e-9bdf-dd3bf4b243cc a uco-observable:URLFacet ; . +kb:downloadable-relation-0b234d00-dfbf-495c-8306-3f8927f5bb57 + a case-corpora:DownloadableRelation ; + uco-core:isDirectional true ; + uco-core:kindOfRelationship "Downloadable_From" ; + uco-core:source ; + uco-core:target kb:url-b51dfe9c-48bf-41a8-97fd-cf139845fa8c ; + . + - a - uco-observable:File , - uco-observable:URL - ; + a case-corpora:DownloadableFile ; uco-core:hasFacet kb:content-data-facet-84af03ab-efce-401f-8fc7-84ffc680eb3b , - kb:file-facet-81d9b1f4-a3aa-4b33-b23e-e9532d8bf526 , - kb:url-facet-4bcc4d8f-d201-477e-9bdf-dd3bf4b243cc + kb:file-facet-81d9b1f4-a3aa-4b33-b23e-e9532d8bf526 ; . diff --git a/tests/test_shapes.py b/tests/test_shapes.py index e82c889..7fe77a2 100644 --- a/tests/test_shapes.py +++ b/tests/test_shapes.py @@ -21,10 +21,11 @@ def test_qualified_url_shape() -> None: + # TODO: This IRI requires pySHACL >= 0.24.0 to appear in the results. 
+ # URIRef("http://example.org/kb/url-dd3e2489-37f0-478c-92bb-c45831337dd4") expected: Set[URIRef] = { - URIRef("http://example.org/dataset-1.dat"), - URIRef("http://example.org/dataset-2.dat"), - URIRef("http://example.org/dataset-3.dat"), + URIRef("http://example.org/kb/url-34d751fd-9039-45fc-87c5-b1cdbfc7ef10"), + URIRef("http://example.org/kb/url-b51dfe9c-48bf-41a8-97fd-cf139845fa8c"), } computed: Set[URIRef] = set()