Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

desy: fix path to attached documents #340

Merged
merged 1 commit into from
Sep 20, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions hepcrawl/spiders/desy_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,10 @@ def _is_local_path(cls, url):
parsed_url = urllib.parse.urlparse(url)
return not parsed_url.scheme.startswith("http")

def _get_full_uri(self, file_name, subdirectory_name, schema='https'):
    """Return the output-bucket S3 URL for a locally referenced document.

    Args:
        file_name: Document path as found in the record; a leading
            ``/api/files/`` prefix is removed when present.
        subdirectory_name: Key prefix prepended verbatim to the file name
            (no separator is inserted between the two).
        schema: Not used by this method; kept so existing callers that
            pass it keep working.

    Returns:
        The value of ``self.s3_url_for_file`` for the combined key in
        ``self.s3_output_bucket``.
    """
    api_prefix = '/api/files/'
    # str.lstrip() takes a *set of characters*, not a prefix: the previous
    # ``lstrip('/api/files/')`` turned '/api/files/file.pdf' into '.pdf'
    # and ate the start of any name beginning with a/p/i/f/l/e/s or '/'.
    # Remove the literal prefix only.
    if file_name.startswith(api_prefix):
        file_name = file_name[len(api_prefix):]
    full_file_s3_path = "{}{}".format(subdirectory_name, file_name)
    return self.s3_url_for_file(full_file_s3_path, bucket=self.s3_output_bucket)

def move_all_files_for_subdirectory(self, prefix):
Expand Down Expand Up @@ -213,7 +214,8 @@ def parse(self, response):

parsed_items = self._parsed_items_from_json(
json_records=json_records,
file_name=file_name
file_name=file_name,
subdirectory_name=response.meta['s3_subdirectory']
)

self.move_all_files_for_subdirectory(
Expand All @@ -228,7 +230,6 @@ def parse(self, response):

self.logger.info('Processed all JSON records in %s', file_name)


def move_file_to_processed(self, file_name, file_bucket=None, output_bucket=None):
file_bucket = file_bucket or self.s3_input_bucket
output_bucket = output_bucket or self.s3_output_bucket
Expand All @@ -243,7 +244,8 @@ def move_file_to_processed(self, file_name, file_bucket=None, output_bucket=None
def _parsed_items_from_json(
self,
json_records,
file_name
file_name,
subdirectory_name
):
self.logger.info('parsing record')
app = Flask('hepcrawl')
Expand All @@ -259,7 +261,7 @@ def _parsed_items_from_json(
self.logger.info("Record has documents: %s", "documents" in parsed_item.record)
for document in parsed_item.record.get('documents', []):
if self._is_local_path(document['url']):
document['url'] = self._get_full_uri(document['url'])
document['url'] = self._get_full_uri(document['url'], subdirectory_name)
self.logger.info("Updating document %s", document)
else:
files_to_download.append(document['url'])
Expand Down
Loading