# CreatingDictionaryformsofProfiles.py
# Builds the thresholded rating matrix and the per-company (user) / per-hotel
# (item) feature dictionaries from the preprocessing CSV files.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse as sp
from scipy.special import expit
import pickle
import csv
import copy
import itertools
from lightfm import LightFM
import lightfm.evaluation
import sys
"""
These CSV files are the result of creating the profiles for companies (users) and hotels (items) in the previous step.
"""
def _load_table(path, columns):
    """Read one preprocessing CSV, replacing its header row with ``columns``.

    The file's first row is skipped and ``columns`` is used for the column
    names instead.
    NOTE(review): ``error_bad_lines`` was removed in pandas 2.0 in favour of
    ``on_bad_lines='skip'`` (pandas >= 1.3) -- confirm the target pandas
    version before upgrading.
    """
    return pd.read_csv(
        path,
        sep=',',
        header=None,
        skiprows=[0],
        names=columns,
        error_bad_lines=False,
        engine='python',
    )


# Long-format side information: one (id, type, value) row per profile feature.
sideinfoU = _load_table(
    'C:/Users/marman/preprocessing/Userprofile.csv',
    ['uid', 'type', 'value'],
)
sideinfoI = _load_table(
    'C:/Users/marman/preprocessing/itemprofile.csv',
    ['mid', 'type', 'value'],
)

# Utility matrix in triplet form (user, item, scaled rate).
header = ['uid', 'mid', 'rate']
ratings = _load_table(
    'C:/Users/marman/preprocessing/utilityMatrix_scale.csv',
    header,
)
def _print_rates_info(title, df):
    """Print the user count, item count and fill percentage of ``df``."""
    n_users = df.uid.unique().shape[0]
    n_items = df.mid.unique().shape[0]
    n_cells = n_users * n_items
    # An empty frame has no cells; report 0% instead of dividing by zero.
    # (The figure is the share of filled user/item cells, printed under the
    # historical label "Sparsity".)
    sparsity = float(df.shape[0]) / float(n_cells) * 100 if n_cells else 0.0
    print(title)
    print('Number of users: {}'.format(n_users))
    print('Number of models: {}'.format(n_items))
    print('Sparsity: {:4.3f}%'.format(sparsity))


def threshold_rates(df, uid_min, mid_min):
    """Drop rarely-seen users and items so the utility matrix becomes denser.

    Rows are removed repeatedly until every remaining user (``uid``) has at
    least ``mid_min`` rows and every remaining item (``mid``) has at least
    ``uid_min`` rows. Removing items can push a user back under its
    threshold (and vice versa), hence the loop until nothing changes.

    Args:
        df: DataFrame with at least the columns ``uid`` and ``mid``.
        uid_min: minimum number of rows an item must keep.
        mid_min: minimum number of rows a user must keep.

    Returns:
        The filtered DataFrame. It is empty when no row satisfies both
        thresholds (previously this case raised instead of returning).
    """
    _print_rates_info('Starting rates info', df)

    while True:
        starting_shape = df.shape[0]
        mid_counts = df.groupby('uid').mid.count()
        df = df[~df.uid.isin(mid_counts[mid_counts < mid_min].index)]
        uid_counts = df.groupby('mid').uid.count()
        df = df[~df.mid.isin(uid_counts[uid_counts < uid_min].index)]
        if df.shape[0] == starting_shape:
            break

    # Internal invariant of the loop above. Skipped for an empty result,
    # where ``min()`` is NaN and the comparison would fail spuriously.
    if not df.empty:
        assert(df.groupby('uid').mid.count().min() >= mid_min)
        assert(df.groupby('mid').uid.count().min() >= uid_min)

    _print_rates_info('Ending rates info', df)
    return df
# Keep only users and items that each have at least 5 rating rows.
df_lim = threshold_rates(ratings, 5, 5)

# Contiguous integer indices for the surviving hotel (mid) and company (uid)
# identifiers, numbered in order of first appearance in df_lim, together with
# the inverse lookups.
idx_to_mid = dict(enumerate(df_lim.mid.unique().tolist()))
mid_to_idx = {mid: idx for idx, mid in idx_to_mid.items()}

idx_to_uid = dict(enumerate(df_lim.uid.unique().tolist()))
uid_to_idx = {uid: idx for idx, uid in idx_to_uid.items()}
def map_ids(row, mapper):
    """Look up ``row`` in ``mapper`` and return the mapped value.

    Despite its name, ``row`` is used as a plain lookup key; a missing key
    propagates the mapper's own lookup error.
    """
    mapped = mapper[row]
    return mapped
"""
Create a sparse matrix in COOrdinate format, i.e. the triplet format (user, item, value).
"""
from scipy.sparse import coo_matrix

# Row (user) and column (item) indices plus the rating of every remaining
# row. ``Series.as_matrix()`` was removed in pandas 1.0; ``.values`` returns
# the same ndarray and works on old and new pandas alike.
I = df_lim.uid.apply(map_ids, args=[uid_to_idx]).values
J = df_lim.mid.apply(map_ids, args=[mid_to_idx]).values
V = df_lim['rate'].values

# Assemble in COO (triplet) form, then convert to CSR for the later steps.
likes = coo_matrix((V, (I, J)), dtype=np.float64).tocsr()
import math
"""
There's probably a fancy pandas groupby way to do
this but I couldn't figure it out :(
Build a list of dictionaries containing features
and weights in the same order as idx_to_uid prescribes.
featdlistU_newprofile.txt: stores the result of transforming the company profile into a list of dictionaries (the file is written with pickle, despite the .txt extension).
"""
def _root_bucket(root):
    """Return a function mapping a value to ``ceil(float(value) ** (1 / root))``."""
    exponent = 1 / root

    def bucket(value):
        return math.ceil(float(value) ** exponent)

    return bucket


def _ceil_bucket(value):
    """Round a numeric value up to the next integer."""
    return math.ceil(float(value))


def _keep_raw(value):
    """Use the profile value unchanged."""
    return value


def _build_feature_dicts(sideinfo, id_column, id_to_idx, n_entities, transforms):
    """Turn a long-format profile table into one feature dict per entity.

    Args:
        sideinfo: DataFrame with the columns ``id_column``, ``type`` and
            ``value`` (one row per profile feature).
        id_column: name of the id column (``'uid'`` or ``'mid'``).
        id_to_idx: mapping from raw id to position in the returned list.
        n_entities: length of the returned list.
        transforms: mapping from feature type to a function applied to the
            value before it becomes part of the key. Types that are not
            listed produce a key made of the type name alone.

    Returns:
        A list of ``n_entities`` dicts mapping feature key to weight 1, e.g.
        ``{'AvgPrice_5': 1, 'Month_3': 1}``.
    """
    feat_dlist = [{} for _ in range(n_entities)]
    for row in sideinfo.itertuples(index=False):
        entity_idx = id_to_idx.get(getattr(row, id_column))
        if entity_idx is None:
            # Id was removed by thresholding; skip it before doing any work.
            continue
        transform = transforms.get(row.type)
        if transform is None:
            feat_key = '{}'.format(row.type)
        else:
            feat_key = '{}_{}'.format(row.type, str(transform(row.value)).lower())
        # Every feature gets weight 1 (use row.value here to keep real weights).
        feat_dlist[entity_idx][feat_key] = 1
    return feat_dlist


_cube_root_bucket = _root_bucket(3)
_square_root_bucket = _root_bucket(2)

# Company (user) features: numeric values are compressed into root buckets,
# categorical values are kept as they are.
_USER_FEATURE_TRANSFORMS = {
    'AvgPrice': _cube_root_bucket,
    'Nbbooking': _cube_root_bucket,
    'Month': _keep_raw,
    'Nb_city': _keep_raw,
    'NoNight': _square_root_bucket,
}

# Hotel (item) features.
_ITEM_FEATURE_TRANSFORMS = {
    'AvgPrice': _cube_root_bucket,
    'Nbbooking': _cube_root_bucket,
    'city': _keep_raw,
    'AvgLeadTime': _ceil_bucket,
    'NoNight': _square_root_bucket,
    'AvgCommission': _keep_raw,
    'suppliercode': _keep_raw,
    'Month': _keep_raw,
}

# One dict per company, in the order given by idx_to_uid.
feat_dlistU = _build_feature_dicts(
    sideinfoU, 'uid', uid_to_idx, len(idx_to_uid), _USER_FEATURE_TRANSFORMS
)
dictfeat = {}  # never populated; kept so the module's public names are unchanged
# The file is a pickle, despite its .txt extension.
with open('featdlistU_newprofile.txt', 'wb') as handle:
    pickle.dump(feat_dlistU, handle)

# The same holds for items: one dict per hotel, in the order given by
# idx_to_mid, pickled to feat_dlistI_newprofile.txt.
feat_dlistI = _build_feature_dicts(
    sideinfoI, 'mid', mid_to_idx, len(idx_to_mid), _ITEM_FEATURE_TRANSFORMS
)
with open('feat_dlistI_newprofile.txt', 'wb') as handle:
    pickle.dump(feat_dlistI, handle)