-
Notifications
You must be signed in to change notification settings - Fork 8
/
utils.py
74 lines (64 loc) · 2.34 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
'''
Utility class with some helper functions
'''
import pandas as pd
import numpy as np
import random
import os
class Util(object):
    '''
    Helper routines for the recommender: loading the CSV inputs,
    subsetting and vectorising the ratings, splitting them into
    train/validation sets and computing the RBM free energy.

    The class keeps no state; every method works only on its arguments.
    '''

    def read_data(self, folder):
        '''
        Read the three CSV files used to build the recommender system.

        Parameters
        ----------
        folder : str
            Directory containing "ratings.csv", "to_read.csv" and
            "books.csv".

        Returns
        -------
        tuple
            (ratings, to_read, books) as pandas DataFrames, in that order.
        '''
        print("Reading the data")
        ratings = pd.read_csv(os.path.join(folder, "ratings.csv"))
        to_read = pd.read_csv(os.path.join(folder, "to_read.csv"))
        books = pd.read_csv(os.path.join(folder, "books.csv"))
        return ratings, to_read, books

    def clean_subset(self, ratings, num_rows):
        '''
        Keep the first num_rows ratings after ordering them by user,
        so the data set can be sized to the power of the machine.

        Parameters
        ----------
        ratings : pandas.DataFrame
            Must contain a "user_id" column.
        num_rows : int
            Number of leading rows (after sorting) to keep.

        Returns
        -------
        pandas.DataFrame
            The sorted rows [0, num_rows); the original row labels are
            retained. The input frame is not modified.
        '''
        print("Extracting num_rows from ratings")
        by_user = ratings.sort_values(by=['user_id'], ascending=True)
        return by_user.iloc[:num_rows, :]

    def preprocess(self, ratings):
        '''
        Turn the ratings table into one dense vector per user.

        Every vector has len(ratings) entries, one slot per rating ROW
        (not per book). A user's vector holds rating / 5.0 in the slots
        of that user's own rows and 0.0 everywhere else.

        Parameters
        ----------
        ratings : pandas.DataFrame
            Must contain "user_id" and "rating" columns. The frame passed
            in is not modified.

        Returns
        -------
        list of numpy.ndarray
            One float vector per distinct user_id, in the order produced
            by groupby (ascending user_id).
        '''
        print("Preprocessing the dataset")
        # After the reset the row labels are exactly the row positions
        # 0..n-1, so they can be used directly as vector offsets.
        ratings = ratings.reset_index(drop=True)
        vector_size = len(ratings)

        total = []
        for _, reader_rows in ratings.groupby("user_id"):
            vector = np.zeros(vector_size)
            # One vectorised assignment per user. Row-by-row iteration
            # would be slow and would upcast the integer positions to
            # float whenever the frame has a float column, which breaks
            # array indexing.
            vector[reader_rows.index.values] = reader_rows['rating'].values / 5.0
            total.append(vector)
        return total

    def split_data(self, total_data):
        '''
        Shuffle the samples and split them 75% / 25% into training
        and validation sets.

        Parameters
        ----------
        total_data : list
            Samples to split. NOTE: the list is shuffled IN PLACE with
            the global `random` generator, so the caller's list is
            reordered; seed `random` beforehand for a reproducible split.

        Returns
        -------
        tuple
            (X_train, X_valid): the first int(n * 0.75) shuffled samples
            and the remaining ones.
        '''
        print("Free energy required, dividing into train and validation sets")
        random.shuffle(total_data)
        n = len(total_data)
        print("Total size of the data is: {0}".format(n))
        size_train = int(n * 0.75)
        X_train = total_data[:size_train]
        X_valid = total_data[size_train:]
        print("Size of the training data is: {0}".format(len(X_train)))
        print("Size of the validation data is: {0}".format(len(X_valid)))
        return X_train, X_valid

    def free_energy(self, v_sample, W, vb, hb):
        '''
        Compute the free energy of each visible sample:

            F(v) = - v . vb - sum_j log(1 + exp((v . W + hb)_j))

        Parameters
        ----------
        v_sample : array-like, 2-D
            One visible configuration per row (the hidden term is summed
            over axis 1, so a 2-D input is required).
        W : array-like
            Weight matrix; np.dot(v_sample, W) must be defined.
        vb : array-like
            Visible bias, dotted with v_sample.
        hb : array-like
            Hidden bias, added to np.dot(v_sample, W).

        Returns
        -------
        numpy.ndarray
            Free energy of every row of v_sample.
        '''
        wx_b = np.dot(v_sample, W) + hb
        vbias_term = np.dot(v_sample, vb)
        # logaddexp(0, x) == log(1 + exp(x)) but does not overflow to inf
        # for large activations the way the naive expression does.
        hidden_term = np.sum(np.logaddexp(0, wx_b), axis=1)
        return -hidden_term - vbias_term