Skip to content

Commit

Permalink
desy: fix path to attached documents
Browse files (browse the repository at this point in the history)
  • Loading branch information
MJedr committed Sep 20, 2023
1 parent c0d5911 commit fd0a45d
Showing 1 changed file with 8 additions and 6 deletions.
14 changes: 8 additions & 6 deletions hepcrawl/spiders/desy_spider.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -165,9 +165,10 @@ def _is_local_path(cls, url):
parsed_url = urllib.parse.urlparse(url)
return not parsed_url.scheme.startswith("http")

def _get_full_uri(self, file_name, schema='https'):
def _get_full_uri(self, file_name, subdirectory_name, schema='https'):
file_name = file_name.lstrip('/api/files/')
url = self.s3_url_for_file(file_name, bucket=self.s3_output_bucket)
full_file_s3_path = "{}{}".format(subdirectory_name, file_name)
url = self.s3_url_for_file(full_file_s3_path, bucket=self.s3_output_bucket)
return url

def move_all_files_for_subdirectory(self, prefix):
Expand Down Expand Up @@ -213,7 +214,8 @@ def parse(self, response):

parsed_items = self._parsed_items_from_json(
json_records=json_records,
file_name=file_name
file_name=file_name,
subdirectory_name=response.meta['s3_subdirectory']
)

self.move_all_files_for_subdirectory(
Expand All @@ -228,7 +230,6 @@ def parse(self, response):

self.logger.info('Processed all JSON records in %s', file_name)


def move_file_to_processed(self, file_name, file_bucket=None, output_bucket=None):
file_bucket = file_bucket or self.s3_input_bucket
output_bucket = output_bucket or self.s3_output_bucket
Expand All @@ -243,7 +244,8 @@ def move_file_to_processed(self, file_name, file_bucket=None, output_bucket=None
def _parsed_items_from_json(
self,
json_records,
file_name
file_name,
subdirectory_name
):
self.logger.info('parsing record')
app = Flask('hepcrawl')
Expand All @@ -259,7 +261,7 @@ def _parsed_items_from_json(
self.logger.info("Record has documents: %s", "documents" in parsed_item.record)
for document in parsed_item.record.get('documents', []):
if self._is_local_path(document['url']):
document['url'] = self._get_full_uri(document['url'])
document['url'] = self._get_full_uri(document['url'], subdirectory_name)
self.logger.info("Updating document %s", document)
else:
files_to_download.append(document['url'])
Expand Down

0 comments on commit fd0a45d

Please sign in to comment.