From ea8d9b30968fb89488caaa41da3871174b358400 Mon Sep 17 00:00:00 2001
From: Fuhu Xia
Date: Tue, 24 Sep 2024 17:59:47 -0400
Subject: [PATCH 1/5] fix docker

---
 Makefile           | 10 +++++-----
 docker-compose.yml |  4 +++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/Makefile b/Makefile
index cca17664..7ea0a97f 100644
--- a/Makefile
+++ b/Makefile
@@ -2,20 +2,20 @@ CKAN_VERSION ?= 2.10
 COMPOSE_FILE ?= docker-compose.yml
 
 build: ## Build the docker containers
-	CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) build
+	CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) build
 
 lint: ## Lint the code
-	SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker-compose -f docker-compose.yml run --rm app flake8 ckanext --count --show-source --statistics --exclude ckan
+	SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker compose -f docker-compose.yml run --rm app flake8 ckanext --count --show-source --statistics --exclude ckan
 
 clean: ## Clean workspace and containers
 	find . -name *.pyc -delete
-	SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) down -v --remove-orphan
+	SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) down -v --remove-orphans
 
 test: ## Run tests in a new container
-	SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) run --rm app ./test.sh
+	SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) run --rm app ./test.sh
 
 up: ## Start the containers
-	SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) up app
+	SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) up app
 
 .DEFAULT_GOAL := help
 
diff --git a/docker-compose.yml b/docker-compose.yml
index a74be2f9..037f4dd7 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -44,4 +44,6 @@ services:
   redis:
     image: redis
   solr:
-    image: ckan/ckan-solr-dev:2.9
+    image: datagov/catalog.data.gov.solr:latest
+    ports:
+      - "8983:8983"

From 8ddf851eee2390c4bbf430345086d5f10d11148d Mon Sep 17 00:00:00 2001
From: Fuhu Xia
Date: Tue, 24 Sep 2024 18:01:10 -0400
Subject: [PATCH 2/5] set max_resource_count

---
 ckanext/datajson/datajson.py       | 6 +++++-
 ckanext/datajson/parse_datajson.py | 9 +++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/ckanext/datajson/datajson.py b/ckanext/datajson/datajson.py
index b37b3d1b..163c7636 100644
--- a/ckanext/datajson/datajson.py
+++ b/ckanext/datajson/datajson.py
@@ -772,7 +772,11 @@ def import_stage(self, harvest_object):
             extras.append({'key': k, 'value': v})
 
         # Set specific information about the dataset.
-        self.set_dataset_info(pkg, dataset_processed, dataset_defaults, schema_version)
+        try:
+            self.set_dataset_info(pkg, dataset_processed, dataset_defaults, schema_version)
+        except DataError as e:
+            self._save_object_error(e.error, harvest_object, 'Import')
+            return None
 
         # Try to update an existing package with the ID set in harvest_object.guid. If that GUID
         # corresponds with an existing package, get its current metadata.
diff --git a/ckanext/datajson/parse_datajson.py b/ckanext/datajson/parse_datajson.py
index b9c02230..12323298 100644
--- a/ckanext/datajson/parse_datajson.py
+++ b/ckanext/datajson/parse_datajson.py
@@ -1,4 +1,6 @@
 from ckan.lib.munge import munge_title_to_name
+from ckan.lib.navl.dictization_functions import DataError
+from ckan.plugins.toolkit import config
 import re
 
 
@@ -108,6 +110,13 @@ def parse_datajson_entry(datajson, package, defaults, schema_version):
         }
         distribution.append(d)
 
+    # Values read from the .ini file are strings, so coerce to int before comparing.
+    max_resource_count = int(config.get('ckanext.datajson.max_resource_count') or 0)
+    if max_resource_count and len(distribution) > max_resource_count:
+        error_message = (f'Too many resources. Maximum allowed is {max_resource_count}. '
+                         f'Actual size is {len(distribution)}.')
+        raise DataError(error_message)
+
     datajson["distribution"] = distribution
 
     for d in datajson.get("distribution", []):

From 11764f5bf23c569a6e1664653f2f7d693c99e603 Mon Sep 17 00:00:00 2001
From: Fuhu Xia
Date: Tue, 24 Sep 2024 18:01:26 -0400
Subject: [PATCH 3/5] add test

---
 .../datajson-samples/many-resources.data.json | 118 ++++++++++++++++++
 .../datajson/tests/mock_datajson_source.py    |   3 +
 .../tests/test_datajson_ckan_all_harvester.py |  11 ++
 3 files changed, 132 insertions(+)
 create mode 100644 ckanext/datajson/tests/datajson-samples/many-resources.data.json

diff --git a/ckanext/datajson/tests/datajson-samples/many-resources.data.json b/ckanext/datajson/tests/datajson-samples/many-resources.data.json
new file mode 100644
index 00000000..cae87879
--- /dev/null
+++ b/ckanext/datajson/tests/datajson-samples/many-resources.data.json
@@ -0,0 +1,118 @@
+{
+  "conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
+  "describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
+  "@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
+  "@type": "dcat:Catalog",
+  "dataset": [
+    {
+      "@type": "dcat:Dataset",
+      "title": "Many Resources",
+      "description": "range of topics of interest for this community. These training opportunities are interactive and allow the participants to ask questions of the presenters. This presentation covers Absence Without Leave (AWOL) and various aspects of handling leave abuse from placing an employee on leave restriction to taking a disciplinary or adverse action. The session also contains federal case law regarding this topic.",
+      "modified": "2011-12-15",
+      "accessLevel": "public",
+      "identifier": "OPM-ERround-0001-AWOL",
+      "landingPage": "http://www.opm.gov/policy-data-oversight/employee-relations/training/#url=Roundtables",
+      "license": "http://www.usa.gov/publicdomain/label/1.0/",
+      "publisher": {
+        "@type": "org:Organization",
+        "name": "U.S. Office of Personnel Management"
+      },
+      "contactPoint": {
+        "@type": "vcard:Contact",
+        "fn": "LaShann Freeman",
+        "hasEmail": "mailto:lashann.freeman@opm.gov"
+      },
+      "distribution": [
+        {
+          "@type": "dcat:Distribution",
+          "mediaType": "application/pdf",
+          "title": "Addressing AWOL",
+          "downloadURL": "http://www.opm.gov/1"
+        },
+        {
+          "@type": "dcat:Distribution",
+          "mediaType": "application/pdf",
+          "title": "Addressing AWOL",
+          "downloadURL": "http://www.opm.gov/2"
+        },
+        {
+          "@type": "dcat:Distribution",
+          "mediaType": "application/pdf",
+          "title": "Addressing AWOL",
+          "downloadURL": "http://www.opm.gov/3"
+        },
+        {
+          "@type": "dcat:Distribution",
+          "mediaType": "application/pdf",
+          "title": "Addressing AWOL",
+          "downloadURL": "http://www.opm.gov/4"
+        },
+        {
+          "@type": "dcat:Distribution",
+          "mediaType": "application/pdf",
+          "title": "Addressing AWOL",
+          "downloadURL": "http://www.opm.gov/5"
+        },
+        {
+          "@type": "dcat:Distribution",
+          "mediaType": "application/pdf",
+          "title": "Addressing AWOL",
+          "downloadURL": "http://www.opm.gov/6"
+        },
+        {
+          "@type": "dcat:Distribution",
+          "mediaType": "application/pdf",
+          "title": "Addressing AWOL",
+          "downloadURL": "http://www.opm.gov/7"
+        },
+        {
+          "@type": "dcat:Distribution",
+          "mediaType": "application/pdf",
+          "title": "Addressing AWOL",
+          "downloadURL": "http://www.opm.gov/8"
+        },
+        {
+          "@type": "dcat:Distribution",
+          "mediaType": "application/pdf",
+          "title": "Addressing AWOL",
+          "downloadURL": "http://www.opm.gov/9"
+        },
+        {
+          "@type": "dcat:Distribution",
+          "mediaType": "application/pdf",
+          "title": "Addressing AWOL",
+          "downloadURL": "http://www.opm.gov/10"
+        },
+        {
+          "@type": "dcat:Distribution",
+          "accessURL": "https://www.youtube.com/watch?v=08bK8zHH9No",
+          "format": "video",
+          "title": "Addressing AWOL"
+        }
+      ],
+      "keyword": [
+        "AWOL",
+        "Absence without Leave",
+        "LWOP",
+        "Leave without Pay",
+        "disciplinary procedures",
+        "employee relations",
+        "leave abuse",
+        "leave restriction",
+        "practitioners",
+        "roundtables",
+        "training",
+        "webcast"
+      ],
+      "bureauCode": [
+        "027:00"
+      ],
+      "programCode": [
+        "027:000"
+      ],
+      "language": [
+        "en-US"
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/ckanext/datajson/tests/mock_datajson_source.py b/ckanext/datajson/tests/mock_datajson_source.py
index 7519704f..6bc1d8d2 100644
--- a/ckanext/datajson/tests/mock_datajson_source.py
+++ b/ckanext/datajson/tests/mock_datajson_source.py
@@ -41,6 +41,9 @@ def do_GET(self):
         elif self.path == '/null-spatial':
             self.sample_datajson_file = 'null-spatial.data.json'
             self.test_name = 'null-spatial'
+        elif self.path == '/many-resources':
+            self.sample_datajson_file = 'many-resources.data.json'
+            self.test_name = 'many-resources'
         elif self.path == '/numerical-title':
             self.sample_datajson_file = 'numerical-title.data.json'
             self.test_name = 'numerical-title'
diff --git a/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py b/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py
index 5861b8f0..d9390ff6 100644
--- a/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py
+++ b/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py
@@ -613,6 +613,17 @@ def test_harvesting_parent_child_2_collections(self):
         parent = model.Package.get(parent_package_id)
         assert parent.title == 'Employee Relations Roundtables 2'
 
+    # The limit is given as a string because that is how it arrives from the .ini file.
+    @pytest.mark.ckan_config('ckanext.datajson.max_resource_count', '10')
+    def test_too_many_resources(self):
+        url = 'http://127.0.0.1:%s/many-resources' % self.mock_port
+        self.run_source(url=url)
+        errors = self.errors
+        expected_error_stage = "Import"
+        assert errors[0].stage == expected_error_stage
+        expected_error_message = "Too many resources. Maximum allowed is 10. Actual size is 11."
+        assert errors[0].message == expected_error_message
+
     def test_datajson_reserverd_word_as_title(self):
         url = 'http://127.0.0.1:%s/error-reserved-title' % self.mock_port
         self.run_source(url=url)

From b861210f589cc29af9915ff786559931079252b7 Mon Sep 17 00:00:00 2001
From: Fuhu Xia
Date: Tue, 24 Sep 2024 18:01:44 -0400
Subject: [PATCH 4/5] README

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 0c5cae52..f20c650e 100644
--- a/README.md
+++ b/README.md
@@ -67,6 +67,8 @@ To make the datajson validator route and web form available, also add:
 
     ckan.plugins = (other plugins here...) datajson_validator
 
+[ Optional ] Set the maximum number of resources allowed in one record so that fetch-consumer does not run out of memory during harvesting. Default is unlimited. Once set, records with more resources than the limit are rejected with an import error.
+    `ckanext.datajson.max_resource_count = 1000`
 
 ## Development
 

From 08b911efdd4e4626862490b71196370c801b85e3 Mon Sep 17 00:00:00 2001
From: Fuhu Xia
Date: Tue, 24 Sep 2024 18:01:57 -0400
Subject: [PATCH 5/5] bump version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 16725c51..33b30b77 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name='ckanext-datajson',
-    version='0.1.25',
+    version='0.1.26',
     description='CKAN extension to generate /data.json',
     long_description=long_description,
     long_description_content_type='text/markdown',