#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 25 20:10:49 2017
@author: khelanpatel
"""
import pandas as pd
import datetime
import numpy as np
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
# NOTE: in scikit-learn >= 0.20 train_test_split lives in sklearn.model_selection
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
data_frame = pd.DataFrame()
train_cols = []
train_op = pd.DataFrame()
def encode_label(col_names, data_frame):
    '''
    Encode the labels in the given columns to integer values using "LabelEncoder".
    '''
    le = preprocessing.LabelEncoder()
    for col in col_names:
        le.fit(np.array(data_frame[col]))
        data_frame[col] = le.transform(np.array(data_frame[col]))
    return data_frame
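# Illustration (not part of the original pipeline): LabelEncoder maps each distinct
# string to an integer index, e.g. for a hypothetical 'site_category' column:
#
#     le = preprocessing.LabelEncoder()
#     le.fit(np.array(['news', 'sports', 'news', 'weather']))
#     le.transform(np.array(['news', 'sports', 'weather']))   # -> array([0, 1, 2])
#
# The assignment back into data_frame[col] above replaces the raw strings with these
# integer codes so the scikit-learn estimators can consume them.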
def preprocess_data():
    '''
    This function reads the data from the CSV file generated by "dataset_creation.py"
    and pre-processes it as follows:
    1) Remove the columns that have the highest number of unique values.
    2) Convert the hour format (split the "hour" column into day of week, date and hour of day).
    3) Encode the labels to integer values using "LabelEncoder".
    4) Cut the output label column (click) out of the training data frame and keep it
       in a separate data frame.
    '''
    # Please enter the appropriate path to the output file generated by "dataset_creation.py"
    path = '/Users/khelanpatel/Desktop/CSC591-BI-Spring2017/Capstone project/ctr_dataset.csv'
    data_frame = pd.read_csv(path)
    # Removing the output label column
    X, y = data_frame.drop('click', 1), data_frame['click']
    # Splitting the data into a training set and a validation set
    data_frame, test_data_frame, train_op, test_op = train_test_split(X, y, test_size=0.33, random_state=42)
    # Re-append the validation rows so the preprocessing steps below see the full data
    data_frame = data_frame.append(test_data_frame)
    return data_frame, len(test_data_frame), train_op, test_op
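# Illustration (hypothetical names, not part of the original script): train_test_split
# shuffles the rows and holds out roughly a third of them for validation, e.g.
#
#     X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.33, random_state=42)
#
# X_tr/y_tr stay aligned row-for-row, as do X_te/y_te, and random_state=42 fixes the
# shuffle so the split is reproducible across runs.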
def col_remove(data_frame):
    '''
    Calculate the number of unique values in each column and drop the columns
    where all (or nearly all) values are unique.
    '''
    for col in list(data_frame.columns.values):
        if (len(data_frame) * 0.95) <= len(data_frame[col].unique()):
            data_frame = data_frame.drop(col, 1)
    return data_frame
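# Illustration (hypothetical column, not from the original data): a per-impression
# identifier has one distinct value per row, so
#
#     len(data_frame[col].unique()) == len(data_frame) >= len(data_frame) * 0.95
#
# and the column is dropped; such near-unique features carry no reusable signal for
# the classifier and only inflate the label-encoded feature space.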
def hour_change(data_frame):
    '''
    Convert the hour format: split the "hour" column (YYMMDDHH) into day of week,
    date and hour of day, then drop the original column.
    '''
    data_frame['date'] = data_frame['hour'].apply(lambda x: x % 10000 // 100)
    data_frame['day_hour'] = data_frame['hour'].apply(lambda x: x % 100)
    data_frame['dow'] = data_frame['hour'].apply(lambda x: datetime.datetime.strptime(str((x - x % 100) // 100 + 20000000), '%Y%m%d').strftime('%u'))
    data_frame = data_frame.drop('hour', 1)
    return data_frame
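# Worked example (assumed value in the YYMMDDHH format): for x = 14102100,
# i.e. 2014-10-21, hour 00:
#
#     x % 10000 // 100                 -> 21        ('date', day of month)
#     x % 100                          -> 0         ('day_hour', hour of day)
#     (x - x % 100) // 100 + 20000000  -> 20141021  ('%u' gives '2', ISO weekday for Tuesday)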
def logisticModel(train_data, test_data, train_op, test_op):
    '''Part of the Generalized Linear Model family: used when the output label is binary.
    Creates a logistic regression model and fits it on the training data.'''
    model = LogisticRegression()
    model.fit(train_data, train_op)
    # Predicting the probability of each data row belonging to either class
    predicted = model.predict_proba(test_data)
    # Log loss of the logistic regression model
    print 'Log Loss of simple logistic regression model (Model evaluation parameter) :', metrics.log_loss(test_op, predicted[:, 1])
    print 'RMSE = ', sqrt(mean_squared_error(test_op, predicted[:, 1]))
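# Note (illustrative, not part of the original script): log loss penalises confident
# wrong probability estimates. For a true label y in {0, 1} and a predicted click
# probability p, the per-row contribution is -(y*log(p) + (1-y)*log(1-p)), e.g.
#
#     metrics.log_loss([1, 0], [0.9, 0.2])   # ~0.164, small: both predictions are "right"
#     metrics.log_loss([1, 0], [0.1, 0.8])   # ~1.956, large: both predictions are "wrong"
#
# predicted[:, 1] is used above because predict_proba returns one column per class and
# column 1 holds P(click = 1).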
def logisticSGDModel(train_data, test_data, train_op, test_op):
    '''Stochastic Gradient Descent: fits linear classification models with different
    (convex) loss functions and penalties. It is particularly useful when the number of
    samples (and the number of features) is very large, as in our case.'''
    model = linear_model.SGDClassifier(alpha=0.00025, loss="log", penalty="l2")
    model.fit(train_data, train_op)
    predicted = model.predict_proba(test_data)
    print 'Log Loss of Logistic Model with SGD and L2 regularization (Model evaluation parameter) :', metrics.log_loss(test_op, predicted[:, 1])
    print 'RMSE = ', sqrt(mean_squared_error(test_op, predicted[:, 1]))
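# Note (illustrative, not part of the original script): with loss="log" SGDClassifier
# optimises the same logistic loss as LogisticRegression, but via per-sample gradient
# updates, so it scales to data sets that are too large for a full-batch solver.
# alpha is the weight of the L2 penalty; a rough tuning sketch over the same
# train_data/train_op arrays could look like:
#
#     for alpha in (1e-5, 2.5e-4, 1e-3):
#         m = linear_model.SGDClassifier(alpha=alpha, loss="log", penalty="l2")
#         m.fit(train_data, train_op)
#         print alpha, metrics.log_loss(test_op, m.predict_proba(test_data)[:, 1])
#
# (In newer scikit-learn releases the same loss is spelled loss="log_loss".)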
def main():
    data_frame, test_len, train_op, test_op = preprocess_data()
    # Removing the columns that have the maximum number of unique values
    data_frame = col_remove(data_frame)
    # Convert the hour column into day of week, date and hour of day,
    # as suggested by the discussion forum on the Kaggle competition page
    data_frame = hour_change(data_frame)
    # Columns whose values will be label-encoded
    col_names_encode_list = ['site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category',
                             'device_id', 'device_model', 'device_ip']
    # Encode the non-numeric labels to unique integer values
    data_frame = encode_label(col_names_encode_list, data_frame)
    # Re-split into the training set and the validation set (the last test_len rows
    # are the validation rows appended in preprocess_data)
    train_data = data_frame[:(len(data_frame) - test_len)]
    test_data = data_frame[(len(data_frame) - test_len):]
    # Standardize features by removing the mean and scaling to unit variance;
    # the scaler is fitted on the training set only and reused for the validation set
    sl_scaler = preprocessing.StandardScaler()
    sl_scaler.fit(train_data)
    train_data = sl_scaler.transform(train_data)
    test_data = sl_scaler.transform(test_data)
    print('************** (Generalized linear model : Link="Logit") Logistic Regression Model ********')
    logisticModel(train_data, test_data, train_op, test_op)
    print('************** (Generalized linear model) Logistic Regression Model with SGD and Regularization ********')
    logisticSGDModel(train_data, test_data, train_op, test_op)


if __name__ == '__main__':
    main()