Skip to content

Commit

Permalink
Update get_data.sh (#81)
Browse files Browse the repository at this point in the history
* Update get_data.sh

Only the most recent dump should be used. You are making numerous duplicates of the same data.

* Lint download_and_convert_to_md.py and add comments
  • Loading branch information
conceptofmind authored Jun 3, 2024
1 parent 1aa3704 commit 1ef9a1c
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 22 deletions.
19 changes: 4 additions & 15 deletions courtlistener/get_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,11 @@ download_dir="./data/courtlistener/raw"
# Create the directory if it does not exist
mkdir -p "$download_dir"

# Only download the data from most recent CL dump
# The newest dump already contains the data from all previous dumps
# Differences from the previous data should not be included
dates=(
"2022-08-02"
"2022-08-31"
"2022-09-30"
"2022-10-31"
"2022-11-30"
"2022-12-31"
"2023-01-31"
"2023-02-28"
"2023-03-31"
"2023-04-30"
"2023-05-31"
"2023-07-31"
"2023-08-31"
"2023-12-04"
"2024-03-11"
"2024-05-06"
)

max_jobs=8
Expand Down
13 changes: 6 additions & 7 deletions pubmedcentral/download_and_convert_to_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,13 @@
help="Number of processes to use for conversion.",
)


def get_date_from_tree(tree):
date_created = None
# get date from tree
# date can be found under a number of tags
pub_types = ["pub", "epub", "pmc-release", "ppub"]
for pub_type in pub_types:

# try most common location first
date = tree.find(f".//pub-date[@pub-type='{pub_type}']")
if date is not None:
Expand All @@ -64,7 +64,6 @@ def get_date_from_tree(tree):
date_created = f"{year}-01-01"
continue


# if we found the month, try the day
try:
day = date.find("day").text
Expand All @@ -73,12 +72,11 @@ def get_date_from_tree(tree):
date_created = f"{year}-{month}-01"
continue


# If we successfully found all date components,
# convert to YYYY-MM-DD format
date_created = f"{year}-{month}-{day}"
break

# try the next location
date = tree.find(f".//pub-date[@date-type='{pub_type}']")
if date is not None:
Expand All @@ -100,7 +98,6 @@ def get_date_from_tree(tree):
date_created = f"{year}-01-01"
continue


# if we found the month, try the day
try:
day = date.find("day").text
Expand All @@ -109,14 +106,14 @@ def get_date_from_tree(tree):
date_created = f"{year}-{month}-01"
continue


# If we successfully found all date components,
# convert to YYYY-MM-DD format
date_created = f"{year}-{month}-{day}"
break

return date_created


def get_authors_and_date(nxml_file: str, pmcid: str):
# get authors from nxml file
authors = []
Expand All @@ -138,7 +135,9 @@ def get_authors_and_date(nxml_file: str, pmcid: str):
# not a fatal error, just log it
if date_created is None:
logger = logs.get_logger("pubmedcentral")
logger.info(f"Date not found for {pmcid}. Setting to default value of '1900-01-01'")
logger.info(
f"Date not found for {pmcid}. Setting to default value of '1900-01-01'"
)
date_created = "1900-01-01"

return authors, date_created
Expand Down

0 comments on commit 1ef9a1c

Please sign in to comment.