Skip to content

Commit

Permalink
Merge pull request #158 from GSA/max-resource
Browse files Browse the repository at this point in the history
Set max_resource_count
  • Loading branch information
FuhuXia authored Sep 25, 2024
2 parents 3506258 + 08b911e commit fe4ccb6
Show file tree
Hide file tree
Showing 9 changed files with 155 additions and 8 deletions.
10 changes: 5 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,20 @@ CKAN_VERSION ?= 2.10
COMPOSE_FILE ?= docker-compose.yml

build: ## Build the docker containers
CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) build
CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) build

lint: ## Lint the code
SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker-compose -f docker-compose.yml run --rm app flake8 ckanext --count --show-source --statistics --exclude ckan
SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker compose -f docker-compose.yml run --rm app flake8 ckanext --count --show-source --statistics --exclude ckan

clean: ## Clean workspace and containers
	# Quote the pattern so the shell passes it to find unexpanded; an unquoted
	# *.pyc is globbed by the shell when a .pyc file exists in the current directory.
	find . -name '*.pyc' -delete
	SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) down -v --remove-orphan
	SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) down -v

test: ## Run tests in a new container
SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) run --rm app ./test.sh
SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) run --rm app ./test.sh

up: ## Start the containers
SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker-compose -f $(COMPOSE_FILE) up app
SERVICES_VERSION=$(CKAN_VERSION:%.5=%) CKAN_VERSION=$(CKAN_VERSION) docker compose -f $(COMPOSE_FILE) up app


.DEFAULT_GOAL := help
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ To make the datajson validator route and web form available, also add:

ckan.plugins = (other plugins here...) datajson_validator

[ Optional ] Set the maximum number of resources allowed in a single record so that fetch-consumer does not run out of memory during harvesting. The default is unlimited. Once set, records with more resources than the limit will fail with an import error.
`ckanext.datajson.max_resource_count = 1000`

## Development

Expand Down
6 changes: 5 additions & 1 deletion ckanext/datajson/datajson.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,7 +772,11 @@ def import_stage(self, harvest_object):
extras.append({'key': k, 'value': v})

# Set specific information about the dataset.
self.set_dataset_info(pkg, dataset_processed, dataset_defaults, schema_version)
try:
self.set_dataset_info(pkg, dataset_processed, dataset_defaults, schema_version)
except DataError as e:
self._save_object_error(e.error, harvest_object, 'Import')
return None

# Try to update an existing package with the ID set in harvest_object.guid. If that GUID
# corresponds with an existing package, get its current metadata.
Expand Down
8 changes: 8 additions & 0 deletions ckanext/datajson/parse_datajson.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from ckan.lib.munge import munge_title_to_name
from ckan.lib.navl.dictization_functions import DataError
from ckan.plugins.toolkit import config

import re

Expand Down Expand Up @@ -108,6 +110,12 @@ def parse_datajson_entry(datajson, package, defaults, schema_version):
}
distribution.append(d)

# Optional upper bound on the number of resources (distributions) per dataset.
# An unset, empty or zero value means "unlimited".
max_resource_count = config.get('ckanext.datajson.max_resource_count')
if max_resource_count:
    # Options read from the .ini file arrive as strings; coerce to int so the
    # comparison below does not raise TypeError (int > str) in Python 3.
    # A non-numeric setting raises ValueError here, surfacing the misconfiguration.
    max_resource_count = int(max_resource_count)
if max_resource_count and len(distribution) > max_resource_count:
    error_message = (f'Too many resources. Maximum allowed is {max_resource_count}. '
                     f'Actual size is {len(distribution)}.')
    # DataError is caught by the harvester's import stage and recorded as an Import error.
    raise DataError(error_message)

datajson["distribution"] = distribution

for d in datajson.get("distribution", []):
Expand Down
118 changes: 118 additions & 0 deletions ckanext/datajson/tests/datajson-samples/many-resources.data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
{
"conformsTo": "https://project-open-data.cio.gov/v1.1/schema",
"describedBy": "https://project-open-data.cio.gov/v1.1/schema/catalog.json",
"@context": "https://project-open-data.cio.gov/v1.1/schema/catalog.jsonld",
"@type": "dcat:Catalog",
"dataset": [
{
"@type": "dcat:Dataset",
"title": "Many Resources",
"description": "range of topics of interest for this community. These training opportunities are interactive and allow the participants to ask questions of the presenters. This presentation covers Absence Without Leave (AWOL) and various aspects of handling leave abuse from placing an employee on leave restriction to taking a disciplinary or adverse action. The session also contains federal case law regarding this topic.",
"modified": "2011-12-15",
"accessLevel": "public",
"identifier": "OPM-ERround-0001-AWOL",
"landingPage": "http://www.opm.gov/policy-data-oversight/employee-relations/training/#url=Roundtables",
"license": "http://www.usa.gov/publicdomain/label/1.0/",
"publisher": {
"@type": "org:Organization",
"name": "U.S. Office of Personnel Management"
},
"contactPoint": {
"@type": "vcard:Contact",
"fn": "LaShann Freeman",
"hasEmail": "mailto:lashann.freeman@opm.gov"
},
"distribution": [
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/1"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/2"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/3"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/4"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/5"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/6"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/7"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/8"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/9"
},
{
"@type": "dcat:Distribution",
"mediaType": "application/pdf",
"title": "Addressing AWOL",
"downloadURL": "http://www.opm.gov/10"
},
{
"@type": "dcat:Distribution",
"accessURL": "https://www.youtube.com/watch?v=08bK8zHH9No",
"format": "video",
"title": "Addressing AWOL"
}
],
"keyword": [
"AWOL",
"Absence without Leave",
"LWOP",
"Leave without Pay",
"disciplinary procedures",
"employee relations",
"leave abuse",
"leave restriction",
"practitioners",
"roundtables",
"training",
"webcast"
],
"bureauCode": [
"027:00"
],
"programCode": [
"027:000"
],
"language": [
"en-US"
]
}
]
}
3 changes: 3 additions & 0 deletions ckanext/datajson/tests/mock_datajson_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ def do_GET(self):
elif self.path == '/null-spatial':
self.sample_datajson_file = 'null-spatial.data.json'
self.test_name = 'null-spatial'
elif self.path == '/many-resources':
self.sample_datajson_file = 'many-resources.data.json'
self.test_name = 'many-resources'
elif self.path == '/numerical-title':
self.sample_datajson_file = 'numerical-title.data.json'
self.test_name = 'numerical-title'
Expand Down
10 changes: 10 additions & 0 deletions ckanext/datajson/tests/test_datajson_ckan_all_harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,16 @@ def test_harvesting_parent_child_2_collections(self):
parent = model.Package.get(parent_package_id)
assert parent.title == 'Employee Relations Roundtables 2'

# Harvest the many-resources fixture (11 distribution entries) with the resource
# limit configured to 10, and expect the record to be rejected at the Import stage.
@pytest.mark.ckan_config('ckanext.datajson.max_resource_count', 10)
def test_too_many_resources(self):
# The mock source serves many-resources.data.json for this path.
url = 'http://127.0.0.1:%s/many-resources' % self.mock_port
self.run_source(url=url)
errors = self.errors
expected_error_stage = "Import"
assert errors[0].stage == expected_error_stage
# The message reports both the configured limit and the actual resource count.
expected_error_message = "Too many resources. Maximum allowed is 10. Actual size is 11."
assert errors[0].message == expected_error_message

def test_datajson_reserverd_word_as_title(self):
url = 'http://127.0.0.1:%s/error-reserved-title' % self.mock_port
self.run_source(url=url)
Expand Down
4 changes: 3 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,6 @@ services:
redis:
image: redis
solr:
image: ckan/ckan-solr-dev:2.9
image: datagov/catalog.data.gov.solr:latest
ports:
- "8983:8983"
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name='ckanext-datajson',
version='0.1.25',
version='0.1.26',
description='CKAN extension to generate /data.json',
long_description=long_description,
long_description_content_type='text/markdown',
Expand Down

0 comments on commit fe4ccb6

Please sign in to comment.