From 5a19cc104440375a09f5cd2a9af18b7356cbb5fe Mon Sep 17 00:00:00 2001 From: Deep Ganguli Date: Thu, 6 Mar 2014 16:17:24 -0800 Subject: [PATCH 1/2] Update README.md Timeseries with postaggregators and a picture! --- README.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/README.md b/README.md index 1c4a25cc..88e4c64e 100644 --- a/README.md +++ b/README.md @@ -6,3 +6,39 @@ pydruid exposes a simple API to create, execute, and analyze [Druid](http://drui #documentation #examples + +The following examples show how to execute and analyze the results of three types of queries: timeseries, topN, and groupby. We analyze the twitter data set + +## timeseries query + +What was the average tweet length, per day, surrounding the 2014 Sochi olympics? + +```python +from pydruid.client import * +from pylab import plt + +query = PyDruid(bard_url_goes_here, 'druid/v2') + +ts = query.timeseries( + datasource='twitterstream', + granularity='day', + intervals='2014-02-02/p4w', + aggregations={'length': doublesum('tweet_length'), 'count': doublesum('count')}, + post_aggregations={'avg_tweet_length': (Field('length') / Field('count'))}, + filter=Dimension('first_hashtag') == 'sochi2014' +) +df = query.export_pandas() +df['timestamp'] = df['timestamp'].map(lambda x: x.split('T')[0]) +df.plot(x='timestamp', y='avg_tweet_length', ylim=(80, 140), rot=20, + title='Sochi 2014') +plt.ylabel('avg tweet length (chars)') +plt.show() +``` + +![alt text](https://github.com/metamx/pydruid/raw/docs/docs/figures/avg_tweet_length.png "Avg. 
tweet length") + + + + + + From 1c97eb3b081c8f362583dcebe94be0b9319180ef Mon Sep 17 00:00:00 2001 From: Deep Ganguli Date: Thu, 6 Mar 2014 16:22:52 -0800 Subject: [PATCH 2/2] Update README.md topN example --- README.md | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 88e4c64e..bef86f1a 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ pydruid exposes a simple API to create, execute, and analyze [Druid](http://drui The following examples show how to execute and analyze the results of three types of queries: timeseries, topN, and groupby. We analyze the twitter data set -## timeseries query +## timeseries What was the average tweet length, per day, surrounding the 2014 Sochi olympics? @@ -37,6 +37,42 @@ plt.show() ![alt text](https://github.com/metamx/pydruid/raw/docs/docs/figures/avg_tweet_length.png "Avg. tweet length") +## topN + +Who were the top ten mentions (@user_name) during the 2014 Oscars? + +```python +top = query.topn( + datasource='twitterstream', + granularity='all', + intervals='2014-03-03/p1d', # utc time of 2014 oscars + aggregations={'count': doublesum('count')}, + dimension='user_mention_name', + filter=(Dimension('user_lang') == 'en') & (Dimension('first_hashtag') == 'oscars') & + (Dimension('user_time_zone') == 'Pacific Time (US & Canada)') & + ~(Dimension('user_mention_name') == 'No Mention'), + metric='count', + threshold=10 +) + +df = query.export_pandas() +print df + + count timestamp user_mention_name +0 1303 2014-03-03T00:00:00.000Z TheEllenShow +1 44 2014-03-03T00:00:00.000Z TheAcademy +2 21 2014-03-03T00:00:00.000Z MTV +3 21 2014-03-03T00:00:00.000Z peoplemag +4 17 2014-03-03T00:00:00.000Z THR +5 16 2014-03-03T00:00:00.000Z ItsQueenElsa +6 16 2014-03-03T00:00:00.000Z eonline +7 15 2014-03-03T00:00:00.000Z PerezHilton +8 14 2014-03-03T00:00:00.000Z realjohngreen +9 12 2014-03-03T00:00:00.000Z KevinSpacey + +``` + +