-
Notifications
You must be signed in to change notification settings - Fork 63
/
user_stats_individual.py
119 lines (105 loc) · 6.52 KB
/
user_stats_individual.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
Twitter APIv2 individual user statistics
"""
from common.lib.helpers import UserInput
from processors.twitter.base_twitter_stats import TwitterStatsBase
from common.lib.exceptions import ProcessorException
# Module-level authorship and maintenance metadata
__author__ = "Dale Wahl"
__credits__ = ["Dale Wahl"]
__maintainer__ = "Dale Wahl"
__email__ = "4cat@oilab.eu"
class TwitterStats(TwitterStatsBase):
    """
    Collect Twitter statistics per individual user. Built to emulate TCAT statistics.
    """
    type = "twitter-user-stats-individual"  # job type ID
    category = "Twitter Analysis"  # category
    title = "Individual User Statistics"  # title displayed in UI
    description = "Lists users and their number of tweets, number of followers, number of friends, how many times they are listed, their UTC time offset, whether the user has a verified account and how many times they appear in the data set."  # description displayed in UI
    extension = "csv"  # extension of result file, used internally and in UI

    # Same label as the "Tweets (in interval)" key produced by map_data();
    # presumably the column TwitterStatsBase sorts results on - confirm in the base class
    sorted = "Tweets (in interval)"

    options = {
        "timeframe": {
            "type": UserInput.OPTION_CHOICE,
            "default": "month",
            "options": {"all": "Overall", "year": "Year", "month": "Month", "week": "Week", "day": "Day",
                        "hour": "Hour", "minute": "Minute"},
            "help": "Produce counts per"
        },
        # Padding would require padding for all authors/users to make any sense! That's a bit more complex than
        # existing code allows
        # Disabling for now
        # "pad": {
        #     "type": UserInput.OPTION_TOGGLE,
        #     "default": True,
        #     "help": "Include dates where the count is zero",
        #     "tooltip": "Makes the counts continuous. For example, if there are posts in May and July but not June, June will be included with 0 posts."
        # }
    }

    @classmethod
    def is_compatible_with(cls, module=None, user=None):
        """
        Determine if processor is compatible with dataset

        :param module: Dataset or processor to determine compatibility with
        :param user: Not used by this check
        :return bool: True if `module.type` is one of the supported Twitter
          dataset types; False otherwise, including when no module is given
        """
        # `module` defaults to None, so guard before reading its type
        if module is None:
            return False

        return module.type in ("twitterv2-search", "dmi-tcat-search")

    def map_data(self, post):
        """
        Maps a post to collect aggregate data.

        E.g. number of tweets might be aggregated (summed over interval), but
        username of tweeter will be static.

        :param dict post: Tweet object; must contain `author_user` and
          `public_metrics` dictionaries
        :return tuple: `(group_by_key_category, group_by_key, sum_map, static_map, list_map)`:
          the label of the grouping key ("Username"), the username itself, a
          dictionary of aggregate data that can be summed when the key is
          encountered again, a dictionary of static (per-user) data that
          should be updated, and a dictionary that is always empty here
        :raises ProcessorException: if the author's username is 'REDACTED'
        """
        group_by_key_category = "Username"
        author = post.get("author_user")
        group_by_key = str(author.get("username"))
        if group_by_key == 'REDACTED':
            # Cannot calculate user stats when users have been removed!
            raise ProcessorException("Author information has been removed; cannot calculate user stats")

        referenced_tweets = post.get("referenced_tweets", [])
        reference_types = {ref.get("type") for ref in referenced_tweets}

        # Entities are counted for the tweet itself plus any tweet it retweets or quotes
        tweets_with_entities = [post]
        tweets_with_entities.extend(
            ref for ref in referenced_tweets if ref.get("type") in ("retweeted", "quoted"))

        # Entity list name -> field identifying one entity. Sets are used as a
        # hashtag/mention/URL is either in the tweet or not AND adding it from
        # the referenced tweet should not duplicate it
        entity_fields = {"hashtags": "tag", "mentions": "username", "urls": "expanded_url"}
        found = {name: set() for name in entity_fields}
        for tweet in tweets_with_entities:
            entities = tweet.get("entities", {})
            for name, field in entity_fields.items():
                # URLs are read from the referenced tweet as well, not a second time from `post`
                found[name].update(item[field] for item in entities.get(name, []))

        num_urls = len(found["urls"])
        num_hashtags = len(found["hashtags"])
        num_mentions = len(found["mentions"])

        # Only the number of photos matters; media entries that are not dictionaries are ignored
        num_images = sum(
            1 for item in post.get("attachments", {}).get("media_keys", [])
            if isinstance(item, dict) and item.get("type") == "photo")

        tweet_metrics = post.get("public_metrics")
        sum_map = {
            "Tweets (in interval)": 1,
            "Retweets (in interval)": int("retweeted" in reference_types),
            "Quotes (in interval)": int("quoted" in reference_types),
            "Replies (in interval)": int("replied_to" in reference_types),
            "Number of Tweets with URL (in interval)": int(num_urls > 0),
            "Total Number of URLs Used (in interval)": num_urls,
            "Number of Tweets with Hashtag (in interval)": int(num_hashtags > 0),
            "Total Number of Hashtags Used (in interval)": num_hashtags,
            "Number of Tweets with Mention (in interval)": int(num_mentions > 0),
            "Total Number of Mentions Used (in interval)": num_mentions,
            "Number of Tweets with Image (in interval)": int(num_images > 0),
            "Total Number of Images Used (in interval)": num_images,
            "Total Retweets of User's Tweets (in interval)": tweet_metrics.get("retweet_count"),
            "Total Replies of User's Tweets (in interval)": tweet_metrics.get("reply_count"),
            "Total Likes of User's Tweets (in interval)": tweet_metrics.get("like_count"),
            "Total Quotes of User's Tweets (in interval)": tweet_metrics.get("quote_count"),
        }

        # These are user-specific metrics and not per tweet/post like above
        author_metrics = author.get("public_metrics")
        static_map = {
            "User ID": author["id"],
            "Name": author["name"],
            "Location": author.get("location"),
            "Verified": author.get("verified"),
            "Number User is Following (at time of collection)": author_metrics.get("following_count"),
            "Number Followers of User (at time of collection)": author_metrics.get("followers_count"),
            "Total Number of Tweets (at time of collection)": author_metrics.get("tweet_count"),
        }

        # No list-type data is collected for individual users
        list_map = {}

        return group_by_key_category, group_by_key, sum_map, static_map, list_map