Added openalex and dimensions publication links
It might be useful to include links to the article for (possibly) retrieving the fulltext; a minimal sketch of how these fields might be combined follows the links below.

* OpenAlex best_oa_location: https://docs.openalex.org/api-entities/works/work-object#best_oa_location
* Dimensions linkout: https://docs.dimensions.ai/dsl/datasource-publications.html
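
A minimal sketch (not code from this commit) of how these two fields might be combined downstream to pick a fulltext link for a merged publication row. Only the openalex_best_oa_location column name is confirmed by the tests below; the Dimensions column name and the dict shape of best_oa_location are assumptions.

def fulltext_url(row: dict) -> str | None:
    """Prefer an open-access PDF from OpenAlex, then fall back to the Dimensions linkout."""
    best_oa = row.get("openalex_best_oa_location") or {}
    if isinstance(best_oa, dict):
        # per the OpenAlex docs, best_oa_location carries pdf_url and landing_page_url
        url = best_oa.get("pdf_url") or best_oa.get("landing_page_url")
        if url:
            return url
    # "dim_linkout" is a guess at the merged column name; the Dimensions prefix isn't shown here
    return row.get("dim_linkout")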
edsu committed Aug 29, 2024
1 parent a7bd397 commit 1083a3c
Showing 4 changed files with 20 additions and 4 deletions.
10 changes: 9 additions & 1 deletion rialto_airflow/harvest/merge_pubs.py
@@ -57,6 +57,7 @@ def dimensions_pubs_df(dimensions_pubs):
"document_type",
"funders",
"funding_section",
"linkout",
"open_access",
"publisher",
"research_orgs",
@@ -79,7 +80,14 @@ def openalex_pubs_df(openalex_pubs):
)
df = df.select(
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String),
pl.col("apc_paid", "grants", "publication_year", "title", "type"),
pl.col(
"apc_paid",
"grants",
"publication_year",
"title",
"type",
"best_oa_location",
),
)
df = df.rename(lambda column_name: "openalex_" + column_name)
return df
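To make the select/rename above concrete, here is a standalone sketch of the same pattern on a tiny in-memory frame; normalize_doi below is a stand-in for the project's helper, not its actual implementation.

import polars as pl

def normalize_doi(doi: str) -> str:
    # stand-in: lower-case the DOI and strip the resolver prefix
    return doi.lower().removeprefix("https://doi.org/")

df = pl.LazyFrame(
    {
        "doi": ["https://doi.org/10.0000/CCCC"],
        "apc_paid": [0],
        "grants": ["[]"],
        "publication_year": [2024],
        "title": ["A Publication"],
        "type": ["article"],
        "best_oa_location": ['{"pdf_url": null}'],
        "bogus": ["dropped"],  # columns that are not selected simply fall away
    }
)
df = df.select(
    pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String),
    pl.col("apc_paid", "grants", "publication_year", "title", "type", "best_oa_location"),
)
df = df.rename(lambda column_name: "openalex_" + column_name)
print(df.collect().columns)
# ['openalex_doi', 'openalex_apc_paid', ..., 'openalex_best_oa_location']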
1 change: 1 addition & 0 deletions rialto_airflow/harvest/openalex.py
@@ -127,6 +127,7 @@ def normalize_publication(pub) -> dict:
"apc_paid",
"best_oa_location",
"biblio",
"citation_normalized_percentile",
"cited_by_api_url",
"cited_by_count",
"cited_by_percentile_year",
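As an aside, a guess at how a whitelist like FIELDS is typically applied when normalizing an OpenAlex work; the actual normalize_publication in this repository may differ.

FIELDS = [
    "apc_paid",
    "best_oa_location",
    "biblio",
    "citation_normalized_percentile",
    "cited_by_count",
    # ... remaining OpenAlex work fields elided
]

def normalize_publication(pub: dict) -> dict:
    # keep only whitelisted keys, filling absent ones with None, so every
    # harvested publication ends up with the same set of columns
    return {field: pub.get(field) for field in FIELDS}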
9 changes: 8 additions & 1 deletion test/harvest/test_merge_pubs.py
@@ -19,6 +19,7 @@ def dimensions_pubs_csv(tmp_path):
"doi",
"funders",
"funding_section",
"linkout",
"open_access",
"publisher",
"research_orgs",
@@ -37,6 +38,7 @@ def dimensions_pubs_csv(tmp_path):
"10.0000/aAaA",
"[]",
"[]",
"https://example.com/my-awesome-paper",
"True",
"publisher",
"[]",
@@ -55,6 +57,7 @@ def dimensions_pubs_csv(tmp_path):
"10.0000/1234",
"[]",
"[]",
"https://example.com/yet-another-awesome-paper",
"True",
"publisher",
"[]",
@@ -81,6 +84,7 @@ def openalex_pubs_csv(tmp_path):
"title",
"type",
"doi",
"best_oa_location",
]
writer.writerow(header)
writer.writerow(
@@ -93,6 +97,7 @@
"A Publication",
"article",
"https://doi.org/10.0000/cccc",
'{is_oa: true, landing_page_url: "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1398957", pdf_url: null, source: { id: "https://openalex.org/S2764455111", display_name: "PubMed Central", issn_l: null, issn: null, host_organization: "https://openalex.org/I1299303238", type: "repository" }, license: null, version: "publishedVersion"}',
]
)
writer.writerow(
@@ -105,6 +110,7 @@
"A Research Article",
"article",
"https://doi.org/10.0000/1234",
"",
]
)
return fixture_file
@@ -159,6 +165,7 @@ def test_openalex_pubs_df(openalex_pubs_csv):
df = lazy_df.collect()
assert df.shape[0] == 2
assert "bogus" not in df.columns, "Unneeded columns have been dropped"
assert "openalex_best_oa_location" in df.columns
assert df["openalex_doi"].to_list() == ["10.0000/cccc", "10.0000/1234"]


@@ -186,7 +193,7 @@ def test_merge(tmp_path, sul_pubs_csv, openalex_pubs_csv, dimensions_pubs_csv):
assert output.is_file(), "output file has been created"
df = pl.read_parquet(output)
assert df.shape[0] == 5
assert df.shape[1] == 23
assert df.shape[1] == 25
assert set(df["doi"].to_list()) == set(
["10.0000/aaaa", "10.0000/1234", "10.0000/cccc", "10.0000/dddd", "10.0000/eeee"]
)
4 changes: 2 additions & 2 deletions test/harvest/test_openalex.py
@@ -51,8 +51,8 @@ def test_publications_from_dois():
assert len(pubs) == 231, "should paginate (page size=200)"
assert len(pubs) == len(set([pub["doi"] for pub in pubs])), "DOIs are unique"
assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for."
assert len(pubs[0].keys()) == 51, "first publication has 51 columns"
assert len(pubs[1].keys()) == 51, "second publication has 51 columns"
assert len(pubs[0].keys()) == 52, "first publication has 52 columns"
assert len(pubs[1].keys()) == 52, "second publication has 52 columns"


def test_publications_from_invalid_dois(caplog):
