-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse.py
170 lines (129 loc) · 3.58 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import json
import networkx as nx
import os
import sys
import pickle
import copy
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
# local
import gutils
import User
import Review
import community
# OPTIONS
# Delete nodes that are friendless: when this is True, parseUserFile skips the
# explicit add_node call for any user whose 'friends' list is empty.
OPT_DELETE_FRIENDLESS = True
def parseJSON(line):
    """
    Decode one JSON document held in a string.

    Unescaped control characters (carriage returns, newlines, tabs, ...) are
    accepted inside string values.

    Input:
        line - string containing a single JSON document
    Return:
        the decoded Python object (a dict for a JSON object)
    Raises:
        ValueError if the text is not valid JSON
    """
    # strict=False makes the decoder tolerate raw control characters inside
    # strings. Textually replacing CR/LF with their escaped forms instead also
    # rewrites line breaks that sit *between* tokens, which turns valid JSON
    # into a syntax error, and it leaves other control characters unhandled.
    return json.loads(line, strict=False)
def parseUser(filename):
    """
    Read filename, one JSON user record per line, into a dictionary.

    Return:
        dict where key is the record's 'user_id' with 'u' appended and value
        is the User.User built from that record
    """
    parsed = {}
    with open(filename) as handle:
        for raw in handle:
            # strip() drops the trailing newline before decoding
            record = parseJSON(raw.strip())
            # build the object first, then read the id for the key
            entry = User.User(record)
            parsed[record['user_id'] + 'u'] = entry
    return parsed
def parseUserFile(filename, ret_nx=True):
    """
    Load the user records in filename and build the friendship graph.

    Input:
        filename - path to a file with one JSON user record per line
        ret_nx - when True (default) return the networkx graph, otherwise
                 return the dictionary of parsed users
    Return:
        networkx Graph whose nodes are user keys ('user_id' + 'u') and whose
        edges join each user to every entry of its 'friends' list; or, when
        ret_nx is False, the dict of user key -> User.User
    """
    # Same per-line parsing as parseUser, so reuse it instead of duplicating.
    users = parseUser(filename)

    # Nothing below is needed when the caller only wants the dictionary.
    if not ret_nx:
        return users

    G = nx.Graph()

    # add all nodes
    for user_id, user in users.items():
        # Skip friendless users only when the option asks for it; with the
        # option off, every user becomes a node.
        if OPT_DELETE_FRIENDLESS and len(user['friends']) == 0:
            continue
        # NOTE(review): passing the attributes positionally is the networkx 1.x
        # add_node(n, attr_dict) convention -- confirm against the installed
        # networkx version.
        G.add_node(user_id, user)

    # add edges
    for user_id, user in users.items():
        for friend_id in user['friends']:
            # A friendship listed by both users is added twice; Graph keeps a
            # single edge per node pair, so the repeat is harmless.
            # NOTE(review): friend ids are used as-is; presumably User.User
            # stores them in the same 'u'-suffixed form as the node keys --
            # verify. add_edge also creates any endpoint not yet in the graph.
            G.add_edge(user_id, friend_id)

    return G
def parseReviewFile(filename, ratio=1.0):
    '''
    Build the user/business review graph from a file of JSON reviews.

    Users and businesses are derived from the reviews themselves rather than
    from the user or business files.

    Input:
        filename - path to a file with one JSON review record per line
        ratio - fraction of the file's lines to use, counted from the start
                of the file (1.0 uses every line)
    Return:
        networkx graph of users to businesses: user nodes carry bipartite=0,
        business nodes carry bipartite=1, and each reviewed (user, business)
        pair is joined by an edge whose 'stars' attribute is the review's
        'stars' value
    '''
    with open(filename) as fp:
        lines = fp.readlines()

    # Report how many lines were read and how many are kept after applying
    # ratio. The parenthesised form prints the same text on Python 2 and 3.
    print(len(lines))
    lines = lines[:int(len(lines) * ratio)]
    print(len(lines))

    # strip() removes the trailing newline before each line is decoded
    reviews = [Review.Review(parseJSON(line.strip())) for line in lines]

    G = nx.Graph()

    # add all user nodes first, once each
    users = set()
    for r in reviews:
        user_id = r['user_id']
        if user_id not in users:
            users.add(user_id)
            G.add_node(user_id, bipartite=0)

    # then the business nodes, plus one edge for every review
    businesses = set()
    for r in reviews:
        business_id = r['business_id']
        if business_id not in businesses:
            businesses.add(business_id)
            G.add_node(business_id, bipartite=1)

        # add the edge (done for every review, not only the first one seen
        # for a business)
        G.add_edge(r['user_id'], business_id, stars=r['stars'])

    return G
def parseBusinessFile(filename):
    '''
    Build a lookup of star ratings from a file with one JSON business record
    per line.

    Return:
        dict where key is the record's 'business_id' with 'b' appended and
        value is the record's 'stars' field (the Yelp avg star)
    '''
    stars_by_business = {}
    with open(filename) as handle:
        for raw in handle:
            # strip() drops the trailing newline before decoding
            record = parseJSON(raw.strip())
            stars_by_business[record['business_id'] + 'b'] = record['stars']
    return stars_by_business
def calcCommunities(graph):
    '''
    Partition graph with community.best_partition and index the result both
    ways.

    Input:
        graph - graph object accepted by community.best_partition
    Return:
        dict['byUser'] = dict where key is user_id and value is the group_id
        dict['byGroup'] = dict where key is group_id and value is a list of
                          users
    Both views use the group ids returned by best_partition, so a group_id
    taken from byUser can be looked up directly in byGroup.
    '''
    partition = community.best_partition(graph)

    # Group the nodes in one pass, keyed by the partition's own community id.
    # Keying by a separate running counter would not be guaranteed to line up
    # with the ids stored in byUser.
    by_group = {}
    for node, group_id in partition.items():
        by_group.setdefault(group_id, []).append(node)

    # The count is printed as a float to keep the output text unchanged.
    print("Total communities are: " + str(float(len(by_group))))

    return {'byUser': partition.copy(), 'byGroup': by_group}