From cf7d6241a259c9ee061856bff2b05b2ff9b8e056 Mon Sep 17 00:00:00 2001 From: Mirko Lenz Date: Wed, 22 Jun 2022 14:44:48 +0200 Subject: [PATCH 1/4] Add sort_order parameter for search api --- twarc/client2.py | 16 +++++++++++++++- twarc/command2.py | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/twarc/client2.py b/twarc/client2.py index b9ca272f..03813cab 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -181,7 +181,7 @@ def _prepare_params(self, **kwargs): ) # Any other parameters passed as is, - # these include backfill_minutes, next_token, pagination_token + # these include backfill_minutes, next_token, pagination_token, sort_order params = {**params, **{k: v for k, v in kwargs.items() if v is not None}} return params @@ -201,6 +201,7 @@ def _search( media_fields, poll_fields, place_fields, + sort_order, next_token=None, granularity=None, sleep_between=0, @@ -217,6 +218,7 @@ def _search( start_time=start_time, end_time=end_time, next_token=next_token, + sort_order=sort_order ) if granularity: @@ -657,6 +659,7 @@ def search_recent( poll_fields=None, place_fields=None, next_token=None, + sort_order=None, ): """ Search Twitter for the given query in the last seven days, @@ -677,6 +680,8 @@ def search_recent( Return all tweets before this time (UTC datetime). max_results (int): The maximum number of results per request. Max is 100. + sort_order (string): + Order tweets based on relevancy or recency. Returns: generator[dict]: a generator, dict for each paginated response. @@ -696,6 +701,7 @@ def search_recent( poll_fields=poll_fields, place_fields=place_fields, next_token=next_token, + sort_order=sort_order, ) @requires_app_auth @@ -714,6 +720,7 @@ def search_all( poll_fields=None, place_fields=None, next_token=None, + sort_order=None, ): """ Search Twitter for the given query in the full archive, @@ -735,6 +742,8 @@ def search_all( Return all tweets before this time (UTC datetime). 
max_results (int): The maximum number of results per request. Max is 500. + sort_order (string): + Order tweets based on relevancy or recency. Returns: generator[dict]: a generator, dict for each paginated response. @@ -762,6 +771,7 @@ def search_all( place_fields=place_fields, next_token=next_token, sleep_between=1.05, + sort_order=sort_order, ) @requires_app_auth @@ -794,6 +804,8 @@ def counts_recent( granularity (str): Count aggregation level: `day`, `hour`, `minute`. Default is `hour`. + sort_order (string): + Order tweets based on relevancy or recency. Returns: generator[dict]: a generator, dict for each paginated response. @@ -813,6 +825,7 @@ def counts_recent( poll_fields=None, place_fields=None, granularity=granularity, + sort_order=None, ) @requires_app_auth @@ -867,6 +880,7 @@ def counts_all( next_token=next_token, granularity=granularity, sleep_between=1.05, + sort_order=None, ) def tweet_lookup( diff --git a/twarc/command2.py b/twarc/command2.py index 66c2c0d3..22628b66 100644 --- a/twarc/command2.py +++ b/twarc/command2.py @@ -240,6 +240,7 @@ def _search( media_fields, poll_fields, place_fields, + sort_order, ): """ Common function to Search for tweets. 
@@ -281,6 +282,7 @@ def _search( media_fields=media_fields, poll_fields=poll_fields, place_fields=place_fields, + sort_order=sort_order, ): _write(result, outfile) tweet_ids = [t["id"] for t in result.get("data", [])] @@ -617,6 +619,11 @@ def command_line_verbose_options(f): @twarc2.command("search") +@click.option( + "--sort-order", + type=click.Choice(["recency", "relevancy"]), + help='Filter tweets based on their date ("recency") (default) or based on their relevance as indicated by Twitter ("relevancy")' +) @command_line_search_options @command_line_search_archive_options @command_line_expansions_shortcuts @@ -1290,6 +1297,11 @@ def mentions(T, user_id, outfile, hide_progress, **kwargs): @command_line_expansions_options @command_line_progressbar_option @click.option("--limit", default=0, help="Maximum number of tweets to return") +@click.option( + "--sort-order", + type=click.Choice(["recency", "relevancy"]), + help='Filter tweets based on their date ("recency") (default) or based on their relevance as indicated by Twitter ("relevancy")' +) @click.argument("user_id", type=str) @click.argument("outfile", type=click.File("w"), default="-") @click.pass_obj @@ -1307,6 +1319,7 @@ def timeline( exclude_retweets, exclude_replies, hide_progress, + sort_order, **kwargs, ): """ @@ -1363,6 +1376,7 @@ def timeline( end_time=end_time, exclude_retweets=exclude_retweets, exclude_replies=exclude_replies, + sort_order=sort_order, **kwargs, ) @@ -1394,6 +1408,11 @@ def timeline( default=0, help="Maximum number of tweets to return per-timeline", ) +@click.option( + "--sort-order", + type=click.Choice(["recency", "relevancy"]), + help='Filter tweets based on their date ("recency") (default) or based on their relevance as indicated by Twitter ("relevancy")' +) @command_line_search_options @command_line_timelines_options @command_line_expansions_shortcuts @@ -1408,6 +1427,7 @@ def timelines( limit, timeline_limit, use_search, + sort_order, hide_progress, **kwargs, ): @@ -1489,6 
+1509,7 @@ def timelines( tweets = _timeline_tweets( T, use_search=use_search, + sort_order=sort_order, user_id=user, **kwargs, ) @@ -1516,6 +1537,7 @@ def _timeline_tweets( end_time, exclude_retweets, exclude_replies, + sort_order, **kwargs, ): if use_search: @@ -1530,6 +1552,7 @@ def _timeline_tweets( until_id=until_id, start_time=start_time, end_time=end_time, + sort_order=sort_order, **kwargs, ) else: @@ -1549,6 +1572,11 @@ def _timeline_tweets( @twarc2.command("searches") @command_line_search_options @command_line_search_archive_options +@click.option( + "--sort-order", + type=click.Choice(["recency", "relevancy"]), + help='Filter tweets based on their date ("recency") (default) or based on their relevance as indicated by Twitter ("relevancy")' +) @click.option( "--counts-only", is_flag=True, @@ -1591,6 +1619,7 @@ def searches( granularity, combine_queries, hide_progress, + sort_order, **kwargs, ): """ @@ -1641,6 +1670,7 @@ def searches( kwargs.pop("media_fields", None) kwargs.pop("poll_fields", None) kwargs.pop("place_fields", None) + kwargs.pop("sort_order", None) kwargs = { **kwargs, **{ @@ -1665,6 +1695,7 @@ def searches( "start_time": start_time, "end_time": end_time, "max_results": max_results, + "sort_order": sort_order, }, } @@ -1768,6 +1799,11 @@ def searches( @twarc2.command("conversation") +@click.option( + "--sort-order", + type=click.Choice(["recency", "relevancy"]), + help='Filter tweets based on their date ("recency") (default) or based on their relevance as indicated by Twitter ("relevancy")' +) @command_line_search_options @command_line_search_archive_options @command_line_expansions_shortcuts @@ -1804,6 +1840,11 @@ def conversation( default=0, help="Maximum number of tweets to return per-conversation", ) +@click.option( + "--sort-order", + type=click.Choice(["recency", "relevancy"]), + help='Filter tweets based on their date ("recency") (default) or based on their relevance as indicated by Twitter ("relevancy")' +) 
@command_line_search_options @command_line_search_archive_options @command_line_expansions_shortcuts From 6d2f988f5b7817a446200efed2d2b8275f836dad Mon Sep 17 00:00:00 2001 From: Mirko Lenz Date: Wed, 22 Jun 2022 16:34:45 +0200 Subject: [PATCH 2/4] Fix docstrings of client --- twarc/client2.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/twarc/client2.py b/twarc/client2.py index 03813cab..47cb0539 100644 --- a/twarc/client2.py +++ b/twarc/client2.py @@ -680,7 +680,7 @@ def search_recent( Return all tweets before this time (UTC datetime). max_results (int): The maximum number of results per request. Max is 100. - sort_order (string): + sort_order (str): Order tweets based on relevancy or recency. Returns: @@ -742,7 +742,7 @@ def search_all( Return all tweets before this time (UTC datetime). max_results (int): The maximum number of results per request. Max is 500. - sort_order (string): + sort_order (str): Order tweets based on relevancy or recency. Returns: @@ -804,8 +804,6 @@ def counts_recent( granularity (str): Count aggregation level: `day`, `hour`, `minute`. Default is `hour`. - sort_order (string): - Order tweets based on relevancy or recency. Returns: generator[dict]: a generator, dict for each paginated response. 
From 02f32183782ed726af3169879d657a13ef1f26ca Mon Sep 17 00:00:00 2001 From: Mirko Lenz Date: Wed, 22 Jun 2022 16:48:20 +0200 Subject: [PATCH 3/4] Add test for sort_order --- test_twarc2.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test_twarc2.py b/test_twarc2.py index 19db9600..429c0b5c 100644 --- a/test_twarc2.py +++ b/test_twarc2.py @@ -99,12 +99,13 @@ def test_sample(): assert count == 11 -def test_search_recent(): +@pytest.mark.parametrize("sort_order", ["recency", "relevancy"]) +def test_search_recent(sort_order): found_tweets = 0 pages = 0 - for response_page in T.search_recent("#auspol"): + for response_page in T.search_recent("#auspol", sort_order=sort_order): pages += 1 tweets = response_page["data"] found_tweets += len(tweets) From c4f7fc9219c29f86f74839642981b82680d14acf Mon Sep 17 00:00:00 2001 From: Mirko Lenz Date: Wed, 22 Jun 2022 16:54:30 +0200 Subject: [PATCH 4/4] Add docs for sort_order --- docs/twarc2_en_us.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/twarc2_en_us.md b/docs/twarc2_en_us.md index 5391352d..8f32271b 100644 --- a/docs/twarc2_en_us.md +++ b/docs/twarc2_en_us.md @@ -133,6 +133,13 @@ leave off the `--start-time`: twarc2 search --end-time 2014-07-24 '"eric garner"' tweets.jsonl +### Sort Order + +By default, Twitter returns search results ordered by publication date, newest first. +To change this, pass the `--sort-order` option. +It currently accepts `recency` (the default) and `relevancy`. +With `relevancy`, tweets are ordered by what Twitter determines to be the best results for your query. + ## Searches Searches works like the [search](#search) command, but instead of taking a single query, it reads from a file containing many queries. You can use the same limit and time options just like a single search command, but it will be applied to every query.