Added openalex and dimensions publication links
It might be useful to include links to the article for (possibly) retrieving the fulltext; a minimal sketch of how these fields might be combined follows the links below.

* OpenAlex best_oa_location: https://docs.openalex.org/api-entities/works/work-object#best_oa_location
* Dimensions linkout: https://docs.dimensions.ai/dsl/datasource-publications.html
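
A minimal sketch (not code from this commit) of how these two fields might be combined downstream to pick a fulltext link for a merged publication row. Only the openalex_best_oa_location column name is confirmed by the tests below; the Dimensions column name and the dict shape of best_oa_location are assumptions.

def fulltext_url(row: dict) -> str | None:
    """Prefer an open-access PDF from OpenAlex, then fall back to the Dimensions linkout."""
    best_oa = row.get("openalex_best_oa_location") or {}
    if isinstance(best_oa, dict):
        # per the OpenAlex docs, best_oa_location carries pdf_url and landing_page_url
        url = best_oa.get("pdf_url") or best_oa.get("landing_page_url")
        if url:
            return url
    # "dim_linkout" is a guess at the merged column name; the Dimensions prefix isn't shown here
    return row.get("dim_linkout")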
edsu committed Aug 29, 2024
1 parent a7bd397 commit 1083a3c
Showing 4 changed files with 20 additions and 4 deletions.
10 changes: 9 additions & 1 deletion rialto_airflow/harvest/merge_pubs.py
@@ -57,6 +57,7 @@ def dimensions_pubs_df(dimensions_pubs):
"document_type",
"funders",
"funding_section",
"linkout",
"open_access",
"publisher",
"research_orgs",
@@ -79,7 +80,14 @@ def openalex_pubs_df(openalex_pubs):
)
df = df.select(
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String),
pl.col("apc_paid", "grants", "publication_year", "title", "type"),
pl.col(
"apc_paid",
"grants",
"publication_year",
"title",
"type",
"best_oa_location",
),
)
df = df.rename(lambda column_name: "openalex_" + column_name)
return df
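To make the select/rename above concrete, here is a standalone sketch of the same pattern on a tiny in-memory frame; normalize_doi below is a stand-in for the project's helper, not its actual implementation.

import polars as pl

def normalize_doi(doi: str) -> str:
    # stand-in: lower-case the DOI and strip the resolver prefix
    return doi.lower().removeprefix("https://doi.org/")

df = pl.LazyFrame(
    {
        "doi": ["https://doi.org/10.0000/CCCC"],
        "apc_paid": [0],
        "grants": ["[]"],
        "publication_year": [2024],
        "title": ["A Publication"],
        "type": ["article"],
        "best_oa_location": ['{"pdf_url": null}'],
        "bogus": ["dropped"],  # columns that are not selected simply fall away
    }
)
df = df.select(
    pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String),
    pl.col("apc_paid", "grants", "publication_year", "title", "type", "best_oa_location"),
)
df = df.rename(lambda column_name: "openalex_" + column_name)
print(df.collect().columns)
# ['openalex_doi', 'openalex_apc_paid', ..., 'openalex_best_oa_location']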
1 change: 1 addition & 0 deletions rialto_airflow/harvest/openalex.py
@@ -127,6 +127,7 @@ def normalize_publication(pub) -> dict:
"apc_paid",
"best_oa_location",
"biblio",
"citation_normalized_percentile",
"cited_by_api_url",
"cited_by_count",
"cited_by_percentile_year",
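As an aside, a guess at how a whitelist like FIELDS is typically applied when normalizing an OpenAlex work; the actual normalize_publication in this repository may differ.

FIELDS = [
    "apc_paid",
    "best_oa_location",
    "biblio",
    "citation_normalized_percentile",
    "cited_by_count",
    # ... remaining OpenAlex work fields elided
]

def normalize_publication(pub: dict) -> dict:
    # keep only whitelisted keys, filling absent ones with None, so every
    # harvested publication ends up with the same set of columns
    return {field: pub.get(field) for field in FIELDS}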
9 changes: 8 additions & 1 deletion test/harvest/test_merge_pubs.py
@@ -19,6 +19,7 @@ def dimensions_pubs_csv(tmp_path):
"doi",
"funders",
"funding_section",
"linkout",
"open_access",
"publisher",
"research_orgs",
@@ -37,6 +38,7 @@ def dimensions_pubs_csv(tmp_path):
"10.0000/aAaA",
"[]",
"[]",
"https://example.com/my-awesome-paper",
"True",
"publisher",
"[]",
@@ -55,6 +57,7 @@ def dimensions_pubs_csv(tmp_path):
"10.0000/1234",
"[]",
"[]",
"https://example.com/yet-another-awesome-paper",
"True",
"publisher",
"[]",
@@ -81,6 +84,7 @@ def openalex_pubs_csv(tmp_path):
"title",
"type",
"doi",
"best_oa_location",
]
writer.writerow(header)
writer.writerow(
@@ -93,6 +97,7 @@
"A Publication",
"article",
"https://doi.org/10.0000/cccc",
'{is_oa: true, landing_page_url: "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1398957", pdf_url: null, source: { id: "https://openalex.org/S2764455111", display_name: "PubMed Central", issn_l: null, issn: null, host_organization: "https://openalex.org/I1299303238", type: "repository" }, license: null, version: "publishedVersion"}',
]
)
writer.writerow(
@@ -105,6 +110,7 @@
"A Research Article",
"article",
"https://doi.org/10.0000/1234",
"",
]
)
return fixture_file
@@ -159,6 +165,7 @@ def test_openalex_pubs_df(openalex_pubs_csv):
df = lazy_df.collect()
assert df.shape[0] == 2
assert "bogus" not in df.columns, "Unneeded columns have been dropped"
assert "openalex_best_oa_location" in df.columns
assert df["openalex_doi"].to_list() == ["10.0000/cccc", "10.0000/1234"]


@@ -186,7 +193,7 @@ def test_merge(tmp_path, sul_pubs_csv, openalex_pubs_csv, dimensions_pubs_csv):
assert output.is_file(), "output file has been created"
df = pl.read_parquet(output)
assert df.shape[0] == 5
assert df.shape[1] == 23
assert df.shape[1] == 25
assert set(df["doi"].to_list()) == set(
["10.0000/aaaa", "10.0000/1234", "10.0000/cccc", "10.0000/dddd", "10.0000/eeee"]
)
4 changes: 2 additions & 2 deletions test/harvest/test_openalex.py
@@ -51,8 +51,8 @@ def test_publications_from_dois():
assert len(pubs) == 231, "should paginate (page size=200)"
assert len(pubs) == len(set([pub["doi"] for pub in pubs])), "DOIs are unique"
assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for."
assert len(pubs[0].keys()) == 51, "first publication has 51 columns"
assert len(pubs[1].keys()) == 51, "second publication has 51 columns"
assert len(pubs[0].keys()) == 52, "first publication has 52 columns"
assert len(pubs[1].keys()) == 52, "second publication has 52 columns"


def test_publications_from_invalid_dois(caplog):
