Skip to content

Commit

Permalink
Account for multiple OpenAlex works with the same DOI
Browse files Browse the repository at this point in the history
The dois_from_orcid() function now returns a list of unique DOIs, instead of an
iterator of possibly unique DOIs. This allows a failing test to pass.

The test for openalex_publications_from_dois() was relaxed a bit to look
for 231 or more publications, since a lookup for an individual DOI can
sometimes pull back multiple works.

The number of columns for OpenAlex is now 53 because we added
`institution_assertions`.
  • Loading branch information
edsu committed Sep 26, 2024
1 parent ccf6816 commit 9ae9adc
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 11 deletions.
15 changes: 9 additions & 6 deletions rialto_airflow/harvest/openalex.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def doi_orcids_pickle(authors_csv, pickle_file, limit=None):

def dois_from_orcid(orcid: str, limit=None):
"""
Pass in the ORCID ID and get back an iterator of DOIs for publications authored by that person.
Pass in the ORCID ID and get back a list of DOIs for publications authored by that person.
"""

# TODO: I think we can maybe have this function take a list of orcids and
Expand All @@ -57,16 +57,19 @@ def dois_from_orcid(orcid: str, limit=None):
author_id = authors[0]["id"]

# get all the works for the openalex author id
work_count = 0
dois = set()
for page in (
Works().filter(author={"id": author_id}).select(["doi"]).paginate(per_page=200)
):
for pub in page:
if pub.get("doi"):
work_count += 1
if limit is not None and work_count > limit:
return
yield pub.get("doi").replace("https://doi.org/", "")
doi = pub.get("doi").replace("https://doi.org/", "")
dois.add(doi)
if limit is not None and len(dois) == limit:
return list(dois)

return list(dois)



def publications_csv(dois: list, csv_file: str) -> None:
Expand Down
11 changes: 6 additions & 5 deletions test/harvest/test_openalex.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def test_dois_from_orcid_paging():
# for Shanhui Fan who has a lot of publications (> 1300)
dois = list(openalex.dois_from_orcid("0000-0002-0081-9732", limit=300))
assert len(dois) == 300, "paging is limiting to 200 works"
assert len(set(dois)) == 300, "the dois are unique"
assert len(set(dois)) == len(dois), "the dois are unique"


def test_doi_orcids_pickle(tmp_path):
Expand Down Expand Up @@ -48,11 +48,12 @@ def test_publications_from_dois():

# look up the publication metadata for them
pubs = list(openalex.publications_from_dois(dois))
assert len(pubs) == 231, "should paginate (page size=200)"
assert len(pubs) == len(set([pub["doi"] for pub in pubs])), "DOIs are unique"

# >= is used because sometimes there can be multiple works for a DOI!
assert len(pubs) >= 231, "should paginate (page size=200)"
assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for."
assert len(pubs[0].keys()) == 52, "first publication has 52 columns"
assert len(pubs[1].keys()) == 52, "second publication has 52 columns"
assert len(pubs[0].keys()) == 53, "first publication has 53 columns"
assert len(pubs[1].keys()) == 53, "second publication has 53 columns"


def test_publications_from_invalid_dois(caplog):
Expand Down

0 comments on commit 9ae9adc

Please sign in to comment.