diff --git a/Makefile b/Makefile index cca17664..7ea0a97f 100644 --- a/Makefile +++ b/Makefile @@ -2,20 +2,20 @@ CKAN_VERSION ?= 2.10 COMPOSE_FILE ?= docker-compose.yml build: ## Build the docker containers - CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) build + CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) build lint: ## Lint the code - SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker-compose -f docker-compose.yml run --rm app flake8 ckanext --count --show-source --statistics --exclude ckan + SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker compose -f docker-compose.yml run --rm app flake8 ckanext --count --show-source --statistics --exclude ckan clean: ## Clean workspace and containers find . -name *.pyc -delete - SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) down -v --remove-orphan + SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) down -v --remove-orphans test: ## Run tests in a new container - SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) run --rm app ./test.sh + SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) run --rm app ./test.sh up: ## Start the containers - SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) up app + SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) up app .DEFAULT_GOAL := help diff --git a/README.md b/README.md index 0c5cae52..f20c650e 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,8 @@ To make the datajson validator route and web form available, also add: ckan.plugins = (other plugins here...) datajson_validator +[ Optional ] Set the resource count limit allowed in one record so that fetch-consumer does not run out of memory during harvesting. 
Default is unlimited. Once set, records with higher resource count will see import errors. + `ckanext.datajson.max_resource_count = 1000` ## Development diff --git a/ckanext/datajson/datajson.py b/ckanext/datajson/datajson.py index b37b3d1b..163c7636 100644 --- a/ckanext/datajson/datajson.py +++ b/ckanext/datajson/datajson.py @@ -772,7 +772,11 @@ def import_stage(self, harvest_object): extras.append({'key': k, 'value': v}) # Set specific information about the dataset. - self.set_dataset_info(pkg, dataset_processed, dataset_defaults, schema_version) + try: + self.set_dataset_info(pkg, dataset_processed, dataset_defaults, schema_version) + except DataError as e: + self._save_object_error(e.error, harvest_object, 'Import') + return None # Try to update an existing package with the ID set in harvest_object.guid. If that GUID # corresponds with an existing package, get its current metadata. diff --git a/ckanext/datajson/parse_datajson.py b/ckanext/datajson/parse_datajson.py index b9c02230..12323298 100644 --- a/ckanext/datajson/parse_datajson.py +++ b/ckanext/datajson/parse_datajson.py @@ -1,4 +1,6 @@ from ckan.lib.munge import munge_title_to_name +from ckan.lib.navl.dictization_functions import DataError +from ckan.plugins.toolkit import config import re @@ -108,6 +110,12 @@ def parse_datajson_entry(datajson, package, defaults, schema_version): } distribution.append(d) + max_resource_count = int(config.get('ckanext.datajson.max_resource_count') or 0) + if max_resource_count and len(distribution) > max_resource_count: + error_message = (f'Too many resources. Maximum allowed is {max_resource_count}. 
' + f'Actual size is {len(distribution)}.') + raise DataError(error_message) + datajson["distribution"] = distribution for d in datajson.get("distribution", []): diff --git a/ckanext/datajson/tests/datajson-samples/many-resources.data.json b/ckanext/datajson/tests/datajson-samples/many-resources.data.json new file mode 100644 index 00000000..cae87879 --- /dev/null +++ b/ckanext/datajson/tests/datajson-samples/many-resources.data.json @@ -0,0 +1,118 @@ +{ + "conformsTo": "https://project-open-data.cio.gov/v1.1/schema", + "describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json", + "@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld", + "@type": "dcat:Catalog", + "dataset": [ + { + "@type": "dcat:Dataset", + "title": "Many Resources", + "description": "range of topics of interest for this community. These training opportunities are interactive and allow the participants to ask questions of the presenters. This presentation covers Absence Without Leave (AWOL) and various aspects of handling leave abuse from placing an employee on leave restriction to taking a disciplinary or adverse action. The session also contains federal case law regarding this topic.", + "modified": "2011-12-15", + "accessLevel": "public", + "identifier": "OPM-ERround-0001-AWOL", + "landingPage": "http://www.opm.gov/policy-data-oversight/employee-relations/training/#url=Roundtables", + "license": "http://www.usa.gov/publicdomain/label/1.0/", + "publisher": { + "@type": "org:Organization", + "name": "U.S. 
Office of Personnel Management" + }, + "contactPoint": { + "@type": "vcard:Contact", + "fn": "LaShann Freeman", + "hasEmail": "mailto:lashann.freeman@opm.gov" + }, + "distribution": [ + { + "@type": "dcat:Distribution", + "mediaType": "application/pdf", + "title": "Addressing AWOL", + "downloadURL": "http://www.opm.gov/1" + }, + { + "@type": "dcat:Distribution", + "mediaType": "application/pdf", + "title": "Addressing AWOL", + "downloadURL": "http://www.opm.gov/2" + }, + { + "@type": "dcat:Distribution", + "mediaType": "application/pdf", + "title": "Addressing AWOL", + "downloadURL": "http://www.opm.gov/3" + }, + { + "@type": "dcat:Distribution", + "mediaType": "application/pdf", + "title": "Addressing AWOL", + "downloadURL": "http://www.opm.gov/4" + }, + { + "@type": "dcat:Distribution", + "mediaType": "application/pdf", + "title": "Addressing AWOL", + "downloadURL": "http://www.opm.gov/5" + }, + { + "@type": "dcat:Distribution", + "mediaType": "application/pdf", + "title": "Addressing AWOL", + "downloadURL": "http://www.opm.gov/6" + }, + { + "@type": "dcat:Distribution", + "mediaType": "application/pdf", + "title": "Addressing AWOL", + "downloadURL": "http://www.opm.gov/7" + }, + { + "@type": "dcat:Distribution", + "mediaType": "application/pdf", + "title": "Addressing AWOL", + "downloadURL": "http://www.opm.gov/8" + }, + { + "@type": "dcat:Distribution", + "mediaType": "application/pdf", + "title": "Addressing AWOL", + "downloadURL": "http://www.opm.gov/9" + }, + { + "@type": "dcat:Distribution", + "mediaType": "application/pdf", + "title": "Addressing AWOL", + "downloadURL": "http://www.opm.gov/10" + }, + { + "@type": "dcat:Distribution", + "accessURL": "https://www.youtube.com/watch?v=08bK8zHH9No", + "format": "video", + "title": "Addressing AWOL" + } + ], + "keyword": [ + "AWOL", + "Absence without Leave", + "LWOP", + "Leave without Pay", + "disciplinary procedures", + "employee relations", + "leave abuse", + "leave restriction", + "practitioners", + 
"roundtables", + "training", + "webcast" + ], + "bureauCode": [ + "027:00" + ], + "programCode": [ + "027:000" + ], + "language": [ + "en-US" + ] + } + ] +} \ No newline at end of file diff --git a/ckanext/datajson/tests/mock_datajson_source.py b/ckanext/datajson/tests/mock_datajson_source.py index 7519704f..6bc1d8d2 100644 --- a/ckanext/datajson/tests/mock_datajson_source.py +++ b/ckanext/datajson/tests/mock_datajson_source.py @@ -41,6 +41,9 @@ def do_GET(self): elif self.path == '/null-spatial': self.sample_datajson_file = 'null-spatial.data.json' self.test_name = 'null-spatial' + elif self.path == '/many-resources': + self.sample_datajson_file = 'many-resources.data.json' + self.test_name = 'many-resources' elif self.path == '/numerical-title': self.sample_datajson_file = 'numerical-title.data.json' self.test_name = 'numerical-title' diff --git a/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py b/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py index 5861b8f0..d9390ff6 100644 --- a/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py +++ b/ckanext/datajson/tests/test_datajson_ckan_all_harvester.py @@ -613,6 +613,16 @@ def test_harvesting_parent_child_2_collections(self): parent = model.Package.get(parent_package_id) assert parent.title == 'Employee Relations Roundtables 2' + @pytest.mark.ckan_config('ckanext.datajson.max_resource_count', 10) + def test_too_many_resources(self): + url = 'http://127.0.0.1:%s/many-resources' % self.mock_port + self.run_source(url=url) + errors = self.errors + expected_error_stage = "Import" + assert errors[0].stage == expected_error_stage + expected_error_message = "Too many resources. Maximum allowed is 10. Actual size is 11." 
+ assert errors[0].message == expected_error_message + def test_datajson_reserverd_word_as_title(self): url = 'http://127.0.0.1:%s/error-reserved-title' % self.mock_port self.run_source(url=url) diff --git a/docker-compose.yml b/docker-compose.yml index a74be2f9..037f4dd7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -44,4 +44,6 @@ services: redis: image: redis solr: - image: ckan/ckan-solr-dev:2.9 + image: datagov/catalog.data.gov.solr:latest + ports: + - "8983:8983" diff --git a/setup.py b/setup.py index 16725c51..33b30b77 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='ckanext-datajson', - version='0.1.25', + version='0.1.26', description='CKAN extension to generate /data.json', long_description=long_description, long_description_content_type='text/markdown',