diff --git a/rialto_airflow/harvest/merge_pubs.py b/rialto_airflow/harvest/merge_pubs.py index 2df32c5..3ba11de 100644 --- a/rialto_airflow/harvest/merge_pubs.py +++ b/rialto_airflow/harvest/merge_pubs.py @@ -57,6 +57,7 @@ def dimensions_pubs_df(dimensions_pubs): "document_type", "funders", "funding_section", + "linkout", "open_access", "publisher", "research_orgs", @@ -79,7 +80,14 @@ def openalex_pubs_df(openalex_pubs): ) df = df.select( pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String), - pl.col("apc_paid", "grants", "publication_year", "title", "type"), + pl.col( + "apc_paid", + "grants", + "publication_year", + "title", + "type", + "best_oa_location", + ), ) df = df.rename(lambda column_name: "openalex_" + column_name) return df diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py index c6b932d..c51368c 100644 --- a/rialto_airflow/harvest/openalex.py +++ b/rialto_airflow/harvest/openalex.py @@ -127,6 +127,7 @@ def normalize_publication(pub) -> dict: "apc_paid", "best_oa_location", "biblio", + "citation_normalized_percentile", "cited_by_api_url", "cited_by_count", "cited_by_percentile_year", diff --git a/test/harvest/test_merge_pubs.py b/test/harvest/test_merge_pubs.py index 2674e78..28c8807 100644 --- a/test/harvest/test_merge_pubs.py +++ b/test/harvest/test_merge_pubs.py @@ -19,6 +19,7 @@ def dimensions_pubs_csv(tmp_path): "doi", "funders", "funding_section", + "linkout", "open_access", "publisher", "research_orgs", @@ -37,6 +38,7 @@ def dimensions_pubs_csv(tmp_path): "10.0000/aAaA", "[]", "[]", + "https://example.com/my-awesome-paper", "True", "publisher", "[]", @@ -55,6 +57,7 @@ def dimensions_pubs_csv(tmp_path): "10.0000/1234", "[]", "[]", + "https://example.com/yet-another-awesome-paper", "True", "publisher", "[]", @@ -81,6 +84,7 @@ def openalex_pubs_csv(tmp_path): "title", "type", "doi", + "best_oa_location", ] writer.writerow(header) writer.writerow( @@ -93,6 +97,7 @@ def openalex_pubs_csv(tmp_path): "A Publication", "article", "https://doi.org/10.0000/cccc", + '{is_oa: true, landing_page_url: "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1398957", pdf_url: null, source: { id: "https://openalex.org/S2764455111", display_name: "PubMed Central", issn_l: null, issn: null, host_organization: "https://openalex.org/I1299303238", type: "repository" }, license: null, version: "publishedVersion"}', ] ) writer.writerow( @@ -105,6 +110,7 @@ def openalex_pubs_csv(tmp_path): "A Research Article", "article", "https://doi.org/10.0000/1234", + "", ] ) return fixture_file @@ -159,6 +165,7 @@ def test_openalex_pubs_df(openalex_pubs_csv): df = lazy_df.collect() assert df.shape[0] == 2 assert "bogus" not in df.columns, "Unneeded columns have been dropped" + assert "openalex_best_oa_location" in df.columns assert df["openalex_doi"].to_list() == ["10.0000/cccc", "10.0000/1234"] @@ -186,7 +193,7 @@ def test_merge(tmp_path, sul_pubs_csv, openalex_pubs_csv, dimensions_pubs_csv): assert output.is_file(), "output file has been created" df = pl.read_parquet(output) assert df.shape[0] == 5 - assert df.shape[1] == 23 + assert df.shape[1] == 25 assert set(df["doi"].to_list()) == set( ["10.0000/aaaa", "10.0000/1234", "10.0000/cccc", "10.0000/dddd", "10.0000/eeee"] ) diff --git a/test/harvest/test_openalex.py b/test/harvest/test_openalex.py index ad87591..bbb0cab 100644 --- a/test/harvest/test_openalex.py +++ b/test/harvest/test_openalex.py @@ -51,8 +51,8 @@ def test_publications_from_dois(): assert len(pubs) == 231, "should paginate (page size=200)" assert len(pubs) == len(set([pub["doi"] for pub in pubs])), "DOIs are unique" assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for." - assert len(pubs[0].keys()) == 51, "first publication has 51 columns" - assert len(pubs[1].keys()) == 51, "second publication has 51 columns" + assert len(pubs[0].keys()) == 52, "first publication has 52 columns" + assert len(pubs[1].keys()) == 52, "second publication has 52 columns" def test_publications_from_invalid_dois(caplog):