-
Notifications
You must be signed in to change notification settings - Fork 2
/
CF.py
150 lines (138 loc) · 7.15 KB
/
CF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 6 10:00:24 2017
@author: Dovla
"""
import time
import pandas as pd
import numpy as np
import random
from random import randint
from sklearn.metrics import mean_squared_error
from math import sqrt
def similarity(ratings, kind='user', epsilon=1e-9):
    """Return a cosine-similarity matrix for a users-x-items ratings array.

    Parameters
    ----------
    ratings : 2-D array-like, shape (n_users, n_items)
        Utility matrix; zero means "not rated".
    kind : {'user', 'item'}
        'user' -> user-user similarity, 'item' -> item-item similarity.
    epsilon : float
        Small constant added to every dot product so all-zero rows do not
        produce a division by zero when normalising.

    Raises
    ------
    ValueError
        If `kind` is not 'user' or 'item' (the original code fell through
        with `sim` unbound, raising UnboundLocalError).
    """
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    else:
        raise ValueError("kind must be 'user' or 'item', got %r" % (kind,))
    # The diagonal of the Gram matrix holds squared vector norms; dividing
    # by norms on both axes turns dot products into cosine similarities.
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)
def rmse1(prediction, test):
    """Root-mean-squared error between predicted and held-out ratings.

    Only cells that are non-zero in `test` (i.e. actually rated) are
    scored; every other cell of the prediction matrix is ignored, which
    is crucial because 0 encodes "not rated" in the utility matrix.
    """
    rated = test.nonzero()
    diff = prediction[rated].flatten() - test[rated].flatten()
    return np.sqrt(np.mean(diff ** 2))
# ============================================================
# Load the three Book-Crossing CSV dumps (';'-separated, latin-1 encoded).
# NOTE: error_bad_lines/warn_bad_lines were deprecated in pandas 1.3 and
# removed in 2.0; on_bad_lines='skip' is the supported equivalent.
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', encoding='latin1')
books = pd.read_csv('BX-Books.csv', sep=';', encoding='latin1', on_bad_lines='skip')
users = pd.read_csv('BX-Users.csv', sep=';', encoding='latin1', on_bad_lines='skip')
# Drop zero ratings -- in this dataset 0 means "implicit / no rating"
ratings.drop(ratings[ratings['Book-Rating'] < 1].index, inplace=True)
# Count how many books each user rated
ratingsUserGroup = ratings.groupby(ratings['User-ID'], as_index=False)['Book-Rating'].count()
# Sort users by number of ratings, most active first
ratingsUserGroupSort = ratingsUserGroup.sort_values(['Book-Rating'], ascending=False)
# Attach each user's rating count to every rating row; the merge suffixes
# the duplicated column as Book-Rating_x (the rating) / Book-Rating_y (count)
ratingsAndUserCount = pd.merge(ratings, ratingsUserGroupSort, how='left', on='User-ID')
# Count how many times each book is rated
ratingsISBNGroup = ratings.groupby(ratings['ISBN'], as_index=False)['Book-Rating'].count()
# Sort books by number of ratings, most rated first
ratingsISBNGroupSort = ratingsISBNGroup.sort_values(['Book-Rating'], ascending=False)
# Keep books rated more than 10 times -- 10 chosen as a speed/quality trade-off
topBooks = ratingsISBNGroupSort[ratingsISBNGroupSort['Book-Rating'] > 10]
# Restrict the rating rows (with user counts) to those popular books
topBooksAllData = ratingsAndUserCount[ratingsAndUserCount['ISBN'].isin(topBooks['ISBN'])]
# ...and further to users who rated more than 10 books
filteredRatings = pd.DataFrame(topBooksAllData[topBooksAllData['Book-Rating_y'] > 10])
# Replace the merge-suffixed column names with readable ones
colnames = ['User-ID', 'ISBN', 'Book-Rating', 'UserRatingCount']
filteredRatings.columns = colnames
# 0-based categorical codes for users and books; these index the rows and
# columns of the utility matrix and can be mapped back to a particular
# user/book later if needed.
filteredRatings['User-ID'] = pd.Categorical(filteredRatings['User-ID'])
filteredRatings['Ucode'] = filteredRatings['User-ID'].cat.codes
filteredRatings['ISBN'] = pd.Categorical(filteredRatings['ISBN'])
filteredRatings['Icode'] = filteredRatings['ISBN'].cat.codes
# Matrix dimensions: distinct users and books that survived filtering
nUsers = filteredRatings['Ucode'].unique().shape[0]
nItems = filteredRatings['Icode'].unique().shape[0]
# Sequential index column, used below to carve out test/train folds
completeData = filteredRatings.reset_index(drop=True)
completeData['Index'] = completeData.index
# ============================================================
# Cross-validation bookkeeping
errTot = []                                # collects one RMSE per fold
kfolds = 5                                 # 5 folds -> 20% held out per round
subsetSize = len(completeData) / kfolds    # rows per fold (float; cast at use)
# Print basic data info
print(f"Data consists of {nUsers} who rated {nItems} books with "
      f"{completeData['Book-Rating'].count()} ratings.")
for i in range(kfolds):
    start = time.time()
    # Carve the i-th contiguous slice out as the test fold; train on the rest.
    testData = completeData.iloc[int(subsetSize) * i:int(subsetSize) * (i + 1)]
    trainData = completeData[~completeData['Index'].isin(testData['Index'])]
    # Build users-x-items utility matrices indexed by the categorical codes.
    # itertuples order: (df index, User-ID, ISBN, Book-Rating,
    # UserRatingCount, Ucode, Icode, Index) -> row[5]=Ucode, row[6]=Icode.
    # Ucode/Icode are 0-based cat codes, so they index directly; the original
    # subtracted 1, which wrapped code 0 onto the last row/column through
    # negative indexing (a consistent relabeling, but an off-by-one trap for
    # anyone mapping cells back to users/books).
    # Ratings are halved to rescale 1-10 onto 0.5-5.
    trainMatrix = np.zeros((nUsers, nItems))
    for row in trainData.itertuples():
        trainMatrix[row[5], row[6]] = row[3] / 2
    testMatrix = np.zeros((nUsers, nItems))
    for row in testData.itertuples():
        testMatrix[row[5], row[6]] = row[3] / 2
    # ============================================================
    # Mean-centre each user's ratings; zeros mean "unrated" and are excluded
    # from the mean by turning them into NaN first.
    trainArrFl = trainMatrix.copy()
    trainArrFl[trainArrFl == 0] = np.nan
    userMeans = np.nanmean(trainArrFl, axis=1)
    trainMinMean = trainArrFl - userMeans[:, np.newaxis]
    # Zero-filled copies: raw ratings and mean-centred ratings
    trainZ = np.nan_to_num(trainArrFl)
    trainMinMeanZ = np.nan_to_num(trainMinMean)
    # ============================================================
    # User-user cosine similarity on the mean-centred data; negative
    # similarities are clipped to zero so they cannot contribute.
    sim = similarity(trainMinMeanZ)
    simMinZero = sim.clip(min=0)
    # ============================================================
    # Predict in two steps:
    # 1) similarity-weighted sum of the raw ratings,
    # 2) divide by the sum of similarities that actually contributed
    #    (a 0/1 rated-indicator matrix selects them); the tiny epsilon
    #    keeps cells with no contributing neighbours from dividing by zero.
    predictStep1 = simMinZero.dot(trainZ)
    ratedIndicator = trainZ.clip(max=1)
    simUsed = simMinZero.dot(ratedIndicator)
    predict = (predictStep1 + 0.0000001) / (simUsed + 0.0000001)
    # ============================================================
    # Score this fold (only cells present in test count) and report.
    err = rmse1(predict, testMatrix)
    errTot.append(err)
    print ('\n CF RMSE: '+ str(round(err,4)))
    end = time.time()
    print("\nRound completed: " + str(i+1) + " of " + str(kfolds))
    print("Total time for round: " + str(round((end - start),2)) + " seconds")
    print("=====================================================")
# Calculate and print average RMSE for the kfold specified rounds
print("\nAverage RMSE: " + str(round(sum(errTot)/len(errTot),4)))