Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Set max_resource_count #158

Merged
merged 5 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,20 @@ CKAN_VERSION ?= 2.10
COMPOSE_FILE ?= docker-compose.yml

build: ## Build the docker containers
CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) build
CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) build

lint: ## Lint the code
SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker-compose -f docker-compose.yml run --rm app flake8 ckanext --count --show-source --statistics --exclude ckan
SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker compose -f docker-compose.yml run --rm app flake8 ckanext --count --show-source --statistics --exclude ckan

clean: ## Clean workspace and containers
find . -name *.pyc -delete
SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) down -v --remove-orphan
SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) down -v

test: ## Run tests in a new container
SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) run --rm app ./test.sh
SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) run --rm app ./test.sh

up: ## Start the containers
SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) up app
SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) up app


.DEFAULT_GOAL := help
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ To make the datajson validator route and web form available, also add:

ckan.plugins = (other plugins here...) datajson_validator

[Optional] Set the maximum number of resources allowed in a single record so that fetch-consumer does not run out of memory during harvesting. The default is unlimited. Once the limit is set, any record with more resources than the limit fails with an import error.
`ckanext.datajson.max_resource_count = 1000`

## Development

Expand Down
6 changes: 5 additions & 1 deletion ckanext/datajson/datajson.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,7 +772,11 @@ def import_stage(self, harvest_object):
extras.append({'key': k, 'value': v})

# Set specific information about the dataset.
self.set_dataset_info(pkg, dataset_processed, dataset_defaults, schema_version)
try:
self.set_dataset_info(pkg, dataset_processed, dataset_defaults, schema_version)
except DataError as e:
self._save_object_error(e.error, harvest_object, 'Import')
return None

# Try to update an existing package with the ID set in harvest_object.guid. If that GUID
# corresponds with an existing package, get its current metadata.
Expand Down
8 changes: 8 additions & 0 deletions ckanext/datajson/parse_datajson.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from ckan.lib.munge import munge_title_to_name
from ckan.lib.navl.dictization_functions import DataError
from ckan.plugins.toolkit import config

import re

Expand Down Expand Up @@ -108,6 +110,12 @@ def parse_datajson_entry(datajson, package, defaults, schema_version):
}
distribution.append(d)

max_resource_count = config.get('ckanext.datajson.max_resource_count')
if max_resource_count and len(distribution) > max_resource_count:
error_message = (f'Too many resources. Maximum allowed is {max_resource_count}. '
f'Actual size is {len(distribution)}.')
raise DataError(error_message)

datajson["distribution"] = distribution

for d in datajson.get("distribution", []):
Expand Down
118 changes: 118 additions & 0 deletions ckanext/datajson/tests/datajson-samples/many-resources.data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
{
"conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
"describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
"@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
"@type": "dcat:Catalog",
"dataset": [
{
"@type": "dcat:Dataset",
"title": "Many Resources",
"description": "range of topics of interest for this community. These training opportunities are interactive and allow the participants to ask questions of the presenters. This presentation covers Absence Without Leave (AWOL) and various aspects of handling leave abuse from placing an employee on leave restriction to taking a disciplinary or adverse action. The session also contains federal case law regarding this topic.",
"modified": "2011-12-15",
"accessLevel": "public",
"identifier": "OPM-ERround-0001-AWOL",
"landingPage": "http://www.opm.gov/policy-data-oversight/employee-relations/training/#url=Roundtables",
"license": "http://www.usa.gov/publicdomain/label/1.0/",
"publisher": {
"@type": "org:Organization",
"name": "U.S. Office of Personnel Management"
},
"contactPoint": {
"@type": "vcard:Contact",
"fn": "LaShann Freeman",
"hasEmail": "mailto:lashann.freeman@opm.gov"
},
"distribution": [
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/1"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/2"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/3"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/4"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/5"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/6"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/7"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/8"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/9"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/10"
},
{
"@type": "dcat:Distribution",
"accessURL": "https://www.youtube.com/watch?v=08bK8zHH9No",
"format": "video",
"title": "Addressing AWOL"
}
],
"keyword": [
"AWOL",
"Absence without Leave",
"LWOP",
"Leave without Pay",
"disciplinary procedures",
"employee relations",
"leave abuse",
"leave restriction",
"practitioners",
"roundtables",
"training",
"webcast"
],
"bureauCode": [
"027:00"
],
"programCode": [
"027:000"
],
"language": [
"en-US"
]
}
]
}
3 changes: 3 additions & 0 deletions ckanext/datajson/tests/mock_datajson_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ def do_GET(self):
elif self.path == '/null-spatial':
self.sample_datajson_file = 'null-spatial.data.json'
self.test_name = 'null-spatial'
elif self.path == '/many-resources':
self.sample_datajson_file = 'many-resources.data.json'
self.test_name = 'many-resources'
elif self.path == '/numerical-title':
self.sample_datajson_file = 'numerical-title.data.json'
self.test_name = 'numerical-title'
Expand Down
10 changes: 10 additions & 0 deletions ckanext/datajson/tests/test_datajson_ckan_all_harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,16 @@ def test_harvesting_parent_child_2_collections(self):
parent = model.Package.get(parent_package_id)
assert parent.title == 'Employee Relations Roundtables 2'

@pytest.mark.ckan_config('ckanext.datajson.max_resource_count', 10)
def test_too_many_resources(self):
    """Harvest '/many-resources' with max_resource_count set to 10.

    The first recorded harvest error must come from the Import stage
    and carry the 'Too many resources' message reporting 11 resources.
    """
    self.run_source(url='http://127.0.0.1:%s/many-resources' % self.mock_port)

    # Only the first recorded error is inspected.
    first_error = self.errors[0]
    assert first_error.stage == "Import"
    assert first_error.message == (
        "Too many resources. Maximum allowed is 10. Actual size is 11."
    )

def test_datajson_reserverd_word_as_title(self):
url = 'http://127.0.0.1:%s/error-reserved-title' % self.mock_port
self.run_source(url=url)
Expand Down
4 changes: 3 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,6 @@ services:
redis:
image: redis
solr:
image: ckan/ckan-solr-dev:2.9
image: datagov/catalog.data.gov.solr:latest
ports:
- "8983:8983"
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name='ckanext-datajson',
version='0.1.25',
version='0.1.26',
description='CKAN extension to generate /data.json',
long_description=long_description,
long_description_content_type='text/markdown',
Expand Down
Loading