Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get article references, either just PMIDs or details #1407

Merged
merged 3 commits into from
May 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 41 additions & 4 deletions indra/literature/pubmed_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -367,7 +367,10 @@ def _find_date(element):

def _parse_author(author_info, include_details=False):
if not include_details:
return author_info.find("LastName").text
last_name = author_info.find("LastName")
if last_name is None:
return None
return last_name.text

parsed_info = {
"last_name": None,
Expand Down Expand Up @@ -397,6 +400,27 @@ def _parse_author(author_info, include_details=False):
return parsed_info


def _get_references(reference_list, only_pmid=True):
    """Collect the references found under a reference list element.

    Parameters
    ----------
    reference_list : Optional[xml.etree.ElementTree.Element]
        The element whose direct ``Reference`` children are read, or None
        if the article has no reference list. (Element type is assumed from
        the ``findall`` usage.)
    only_pmid : bool
        If True, each entry of the result is just the text looked up for
        the reference's ``pubmed`` ArticleId. Otherwise each entry is a
        dict with the keys 'pmid', 'doi', 'pmcid' and 'citation'.
        Default: True

    Returns
    -------
    Optional[list]
        None if ``reference_list`` is None, otherwise a list with exactly
        one entry per ``Reference`` child, in document order. Values come
        straight from ``_find_elem_text`` and are not filtered, so an
        identifier that is absent shows up as whatever that helper returns
        for a missing element (presumably None).
    """
    if reference_list is None:
        return None

    entries = reference_list.findall('Reference')

    if only_pmid:
        return [
            _find_elem_text(entry, '*/ArticleId[@IdType="pubmed"]')
            for entry in entries
        ]

    # The identifiers sit one level below each Reference, hence the
    # leading '*/' in the paths; Citation is a direct child.
    return [
        {
            'pmid': _find_elem_text(entry, '*/ArticleId[@IdType="pubmed"]'),
            'doi': _find_elem_text(entry, '*/ArticleId[@IdType="doi"]'),
            'pmcid': _find_elem_text(entry, '*/ArticleId[@IdType="pmcid"]'),
            'citation': _find_elem_text(entry, 'Citation'),
        }
        for entry in entries
    ]


def _get_article_info(medline_citation, pubmed_data, detailed_authors=False):
article = medline_citation.find('Article')
pmid = _find_elem_text(medline_citation, './PMID')
Expand Down Expand Up @@ -431,7 +455,8 @@ def _get_article_info(medline_citation, pubmed_data, detailed_authors=False):

def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False,
get_abstracts=False, prepend_title=False,
mesh_annotations=True, detailed_authors=False):
mesh_annotations=True, detailed_authors=False,
references_included=None):
"""Get metadata for an XML tree containing PubmedArticle elements.

Documentation on the XML structure can be found at:
Expand Down Expand Up @@ -459,6 +484,9 @@ def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False,
If True, extract as many of the author details as possible, such as
first name, identifiers, and institutions. If false, only last names
are returned. Default: False
references_included : Optional[str]
If 'detailed', include detailed references in the results. If 'pmid', only include
the PMID of the reference. If None, don't include references. Default: None

Returns
-------
Expand All @@ -483,6 +511,11 @@ def get_metadata_from_xml_tree(tree, get_issns_from_nlm=False,
if mesh_annotations:
context_info = _get_annotations(medline_citation)
result.update(context_info)
if references_included:
references = _get_references(pubmed_data.find('ReferenceList'),
only_pmid=(references_included == 'pmid'))
result['references'] = references

publication_date = _get_pubmed_publication_date(pubmed_data)
result['publication_date'] = publication_date

Expand Down Expand Up @@ -566,7 +599,7 @@ def _major_topic(e):

def get_metadata_for_ids(pmid_list, get_issns_from_nlm=False,
get_abstracts=False, prepend_title=False,
detailed_authors=False):
detailed_authors=False, references_included=None):
"""Get article metadata for up to 200 PMIDs from the Pubmed database.

Parameters
Expand All @@ -586,6 +619,9 @@ def get_metadata_for_ids(pmid_list, get_issns_from_nlm=False,
If True, extract as many of the author details as possible, such as
first name, identifiers, and institutions. If false, only last names
are returned. Default: False
references_included : Optional[str]
If 'detailed', include detailed references in the results. If 'pmid', only include
the PMID of the reference. If None, don't include references. Default: None

Returns
-------
Expand All @@ -604,7 +640,8 @@ def get_metadata_for_ids(pmid_list, get_issns_from_nlm=False,
return None
return get_metadata_from_xml_tree(tree, get_issns_from_nlm, get_abstracts,
prepend_title,
detailed_authors=detailed_authors)
detailed_authors=detailed_authors,
references_included=references_included)


@lru_cache(maxsize=1000)
Expand Down
16 changes: 16 additions & 0 deletions indra/tests/test_pubmed_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def test_get_complex_title():
assert title.lower().startswith('atomic structures')
assert title.lower().endswith('vascular plants.')


@pytest.mark.webservice
def test_expand_pagination():
time.sleep(0.5)
Expand Down Expand Up @@ -144,6 +145,21 @@ def test_get_metadata_for_ids():
metadata2[pmids1[0]]['authors'][0]['affiliations'][0]['name']


@pytest.mark.webservice
def test_get_paper_references():
    """Check that references come back in both 'pmid' and 'detailed' modes.

    Fetches metadata for three PMIDs from the live service and verifies
    that one of them has a non-empty reference list whose first entry is
    the expected PMID, both as a bare string and inside a detail dict.
    """
    # Short pause before hitting the web service (presumably to stay
    # within request rate limits, as in the other webservice tests).
    time.sleep(0.5)

    article_pmid = '27121204'
    expected_first_ref = '25439075'
    query_pmids = ['27123883', article_pmid, '27115606']

    # PMID-only mode: each reference is the PMID itself
    pmid_only = pubmed_client.get_metadata_for_ids(
        query_pmids, references_included='pmid')
    pmid_refs = pmid_only[article_pmid]['references']
    assert len(pmid_refs) != 0
    assert pmid_refs[0] == expected_first_ref

    # Detailed mode: each reference is a dict carrying the PMID under 'pmid'
    detailed = pubmed_client.get_metadata_for_ids(
        query_pmids, references_included='detailed')
    detailed_refs = detailed[article_pmid]['references']
    assert len(detailed_refs) != 0
    assert detailed_refs[0]['pmid'] == expected_first_ref


@pytest.mark.webservice
def test_get_pub_date():
time.sleep(0.5)
Expand Down