From b3d99a12f17d513e9bea198682c7027188408917 Mon Sep 17 00:00:00 2001 From: Jonathan de Bruin Date: Thu, 8 Dec 2022 23:28:53 +0100 Subject: [PATCH 1/2] Add built-in pager --- README.md | 54 +++++++-------------------------- pyalex/api.py | 36 ++++++++++++++++++++++ tests/test_pyalex.py | 71 ++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 114 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index ad2f81c..8141f5f 100644 --- a/README.md +++ b/README.md @@ -193,57 +193,23 @@ PyAlex, although cursor paging seems to be easier to implement and less error-pr ##### Basic paging -See limitations of [basic paging](https://docs.openalex.org/api#basic-paging) in the OpenAlex documentation. - -```python -from pyalex import Authors - -# example query -query = Authors().search_filter(display_name="einstein") - -# set the page -page = 1 - -# store the results -results = [] - -# loop till page is None -while page is not None: - - # get the results - r, m = query.get(return_meta=True, per_page=200, page=page) - - # results - results.append(r) - - page = m["page"] + 1 if page is not None else None -``` +See limitations of +[basic paging](https://docs.openalex.org/api#basic-paging) in the OpenAlex documentation. +It's relatively easy to implement basic paging with PyAlex; however, it is +advised to use the built-in pager based on cursor paging. ##### Cursor paging +Use `paginate()` for paging results. By default, `paginate`'s argument `n_max` +is set to 10000. Use `None` to retrieve all results.
+ ```python from pyalex import Authors -# example query -query = Authors().search_filter(display_name="einstein") - -# set the next_cursor (to *) -next_cursor = "*" - -# store the results -results = [] - -# loop till next_cursor is None -while next_cursor is not None: - - # get the results - r, m = query.get(return_meta=True, per_page=200, cursor=next_cursor) - - # results - results.extend(r) +pager = Authors().search_filter(display_name="einstein").paginate(per_page=200) - # set the next cursor - next_cursor = m["next_cursor"] +for page in pager: + print(len(page)) ``` ### Get N-grams diff --git a/pyalex/api.py b/pyalex/api.py index f4daee9..36df0d1 100644 --- a/pyalex/api.py +++ b/pyalex/api.py @@ -93,6 +93,38 @@ class Concept(OpenAlexEntity): pass +class CursorPaginator(object): + def __init__(self, alex_class=None, per_page=None, cursor="*", n_max=None): + + self.alex_class = alex_class + self.per_page = per_page + self.cursor = cursor + self.n_max = n_max + + def __iter__(self): + + self.n = 0 + + return self + + def __next__(self): + + if self.n_max and self.n >= self.n_max: + raise StopIteration + + r, m = self.alex_class.get( + return_meta=True, per_page=self.per_page, cursor=self.cursor + ) + + if m["next_cursor"] is None: + raise StopIteration + + self.n = self.n + len(r) + self.cursor = m["next_cursor"] + + return r + + class BaseOpenAlex(object): """Base class for OpenAlex objects.""" @@ -153,6 +185,10 @@ def get(self, return_meta=False, page=None, per_page=None, cursor=None): else: return results + def paginate(self, per_page=None, cursor="*", n_max=10000): + + return CursorPaginator(self, per_page=per_page, cursor=cursor, n_max=n_max) + def random(self): return self.__getitem__("random") diff --git a/tests/test_pyalex.py b/tests/test_pyalex.py index b77ebd2..796d1dd 100644 --- a/tests/test_pyalex.py +++ b/tests/test_pyalex.py @@ -157,7 +157,7 @@ def test_search_filter(): assert r["meta"]["count"] == m["count"] -def test_cursor_paging(): +def 
test_cursor_by_hand(): # example query query = Authors().search_filter(display_name="einstein") @@ -170,8 +170,6 @@ def test_cursor_paging(): # loop till next_cursor is None while next_cursor is not None: - print(next_cursor) - # get the results r, m = query.get(return_meta=True, per_page=200, cursor=next_cursor) @@ -184,6 +182,73 @@ def test_cursor_paging(): assert len(results) > 200 +def test_basic_paging(): + + # example query + query = Authors().search_filter(display_name="einstein") + + # set the page + page = 1 + + # store the results + results = [] + + # loop till page is None + while page is not None: + + # get the results + r, m = query.get(return_meta=True, per_page=200, page=page) + + # results + results.append(r) + + page = m["page"] + 1 if page is not None else None + + assert len(results) > 200 + + +def test_cursor_paging(): + + # example query + pager = Authors().search_filter(display_name="einstein").paginate(per_page=200) + + for page in pager: + + assert len(page) >= 1 and len(page) <= 200 + + +def test_cursor_paging_n_max(): + + # example query + pager = ( + Authors() + .search_filter(display_name="einstein") + .paginate(per_page=200, n_max=400) + ) + + n = 0 + for page in pager: + + n = n + len(page) + + assert n == 400 + + +def test_cursor_paging_n_max_none(): + + # example query + pager = ( + Authors() + .search_filter(display_name="einstein") + .paginate(per_page=200, n_max=None) + ) + + n = 0 + for page in pager: + + n = n + len(page) + + def test_referenced_works(): # the work to extract the referenced works of From 791e543c1b274511e41118d4c2a90a2413accac6 Mon Sep 17 00:00:00 2001 From: Jonathan de Bruin Date: Thu, 8 Dec 2022 23:42:13 +0100 Subject: [PATCH 2/2] Fix unit test --- tests/test_pyalex.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/test_pyalex.py b/tests/test_pyalex.py index 796d1dd..a7464c5 100644 --- a/tests/test_pyalex.py +++ b/tests/test_pyalex.py @@ -200,9 +200,8 @@ def test_basic_paging(): 
r, m = query.get(return_meta=True, per_page=200, page=page) # results - results.append(r) - - page = m["page"] + 1 if page is not None else None + results.extend(r) + page = None if len(r) == 0 else m["page"] + 1 assert len(results) > 200