#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 25 20:10:49 2017
@author: khelanpatel
"""
import pandas as pd
import datetime
import numpy as np
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
# NOTE: in scikit-learn >= 0.20 train_test_split lives in sklearn.model_selection
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
data_frame = pd.DataFrame()
train_cols = []
train_op = pd.DataFrame()
def encode_label(col_names, data_frame):
    '''
    Encode the labels in the given columns to integer values using "LabelEncoder".
    '''
    le = preprocessing.LabelEncoder()
    for col in col_names:
        le.fit(np.array(data_frame[col]))
        data_frame[col] = le.transform(np.array(data_frame[col]))
    return data_frame
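# Illustration (not part of the original pipeline): LabelEncoder maps each distinct
# string to an integer index, e.g. for a hypothetical 'site_category' column:
#
#     le = preprocessing.LabelEncoder()
#     le.fit(np.array(['news', 'sports', 'news', 'weather']))
#     le.transform(np.array(['news', 'sports', 'weather']))   # -> array([0, 1, 2])
#
# The assignment back into data_frame[col] above replaces the raw strings with these
# integer codes so the scikit-learn estimators can consume them.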
def preprocess_data():
    '''
    This function reads the data from the CSV file generated by "dataset_creation.py"
    and pre-processes it as follows:
    1) Remove the columns that have the highest number of unique values.
    2) Convert the hour format (split the "hour" column into day of week, date and hour of day).
    3) Encode the labels to integer values using "LabelEncoder".
    4) Cut the output label column (click) out of the training data frame and keep it
       in a separate data frame.
    '''
    # Please enter the appropriate path to the output file generated by "dataset_creation.py"
    path = '/Users/khelanpatel/Desktop/CSC591-BI-Spring2017/Capstone project/ctr_dataset.csv'
    data_frame = pd.read_csv(path)
    # Removing the output label column
    X, y = data_frame.drop('click', 1), data_frame['click']
    # Splitting the data into a training set and a validation set
    data_frame, test_data_frame, train_op, test_op = train_test_split(X, y, test_size=0.33, random_state=42)
    # Re-append the validation rows so the preprocessing steps below see the full data
    data_frame = data_frame.append(test_data_frame)
    return data_frame, len(test_data_frame), train_op, test_op
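# Illustration (hypothetical names, not part of the original script): train_test_split
# shuffles the rows and holds out roughly a third of them for validation, e.g.
#
#     X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.33, random_state=42)
#
# X_tr/y_tr stay aligned row-for-row, as do X_te/y_te, and random_state=42 fixes the
# shuffle so the split is reproducible across runs.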
def col_remove(data_frame):
    '''
    Calculate the number of unique values in each column and drop the columns
    where all (or nearly all) values are unique.
    '''
    for col in list(data_frame.columns.values):
        if (len(data_frame) * 0.95) <= len(data_frame[col].unique()):
            data_frame = data_frame.drop(col, 1)
    return data_frame
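# Illustration (hypothetical column, not from the original data): a per-impression
# identifier has one distinct value per row, so
#
#     len(data_frame[col].unique()) == len(data_frame) >= len(data_frame) * 0.95
#
# and the column is dropped; such near-unique features carry no reusable signal for
# the classifier and only inflate the label-encoded feature space.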
def hour_change(data_frame):
    '''
    Convert the hour format: split the "hour" column (YYMMDDHH) into day of week,
    date and hour of day, then drop the original column.
    '''
    data_frame['date'] = data_frame['hour'].apply(lambda x: x % 10000 // 100)
    data_frame['day_hour'] = data_frame['hour'].apply(lambda x: x % 100)
    data_frame['dow'] = data_frame['hour'].apply(lambda x: datetime.datetime.strptime(str((x - x % 100) // 100 + 20000000), '%Y%m%d').strftime('%u'))
    data_frame = data_frame.drop('hour', 1)
    return data_frame
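# Worked example (assumed value in the YYMMDDHH format): for x = 14102100,
# i.e. 2014-10-21, hour 00:
#
#     x % 10000 // 100                 -> 21        ('date', day of month)
#     x % 100                          -> 0         ('day_hour', hour of day)
#     (x - x % 100) // 100 + 20000000  -> 20141021  ('%u' gives '2', ISO weekday for Tuesday)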
def logisticModel(train_data, test_data, train_op, test_op):
    '''Part of the Generalized Linear Model family: used when the output label is binary.
    Creates a logistic regression model and fits it on the training data.'''
    model = LogisticRegression()
    model.fit(train_data, train_op)
    # Predicting the probability of each data row belonging to either class
    predicted = model.predict_proba(test_data)
    # Log loss of the logistic regression model
    print 'Log Loss of simple logistic regression model (Model evaluation parameter) :', metrics.log_loss(test_op, predicted[:, 1])
    print 'RMSE = ', sqrt(mean_squared_error(test_op, predicted[:, 1]))
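# Note (illustrative, not part of the original script): log loss penalises confident
# wrong probability estimates. For a true label y in {0, 1} and a predicted click
# probability p, the per-row contribution is -(y*log(p) + (1-y)*log(1-p)), e.g.
#
#     metrics.log_loss([1, 0], [0.9, 0.2])   # ~0.164, small: both predictions are "right"
#     metrics.log_loss([1, 0], [0.1, 0.8])   # ~1.956, large: both predictions are "wrong"
#
# predicted[:, 1] is used above because predict_proba returns one column per class and
# column 1 holds P(click = 1).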
def logisticSGDModel(train_data, test_data, train_op, test_op):
    '''Stochastic Gradient Descent: fits linear classification models with different
    (convex) loss functions and penalties. It is particularly useful when the number of
    samples (and the number of features) is very large, as in our case.'''
    model = linear_model.SGDClassifier(alpha=0.00025, loss="log", penalty="l2")
    model.fit(train_data, train_op)
    predicted = model.predict_proba(test_data)
    print 'Log Loss of Logistic Model with SGD and L2 regularization (Model evaluation parameter) :', metrics.log_loss(test_op, predicted[:, 1])
    print 'RMSE = ', sqrt(mean_squared_error(test_op, predicted[:, 1]))
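# Note (illustrative, not part of the original script): with loss="log" SGDClassifier
# optimises the same logistic loss as LogisticRegression, but via per-sample gradient
# updates, so it scales to data sets that are too large for a full-batch solver.
# alpha is the weight of the L2 penalty; a rough tuning sketch over the same
# train_data/train_op arrays could look like:
#
#     for alpha in (1e-5, 2.5e-4, 1e-3):
#         m = linear_model.SGDClassifier(alpha=alpha, loss="log", penalty="l2")
#         m.fit(train_data, train_op)
#         print alpha, metrics.log_loss(test_op, m.predict_proba(test_data)[:, 1])
#
# (In newer scikit-learn releases the same loss is spelled loss="log_loss".)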
def main():
    data_frame, test_len, train_op, test_op = preprocess_data()
    # Removing the columns that have the maximum number of unique values
    data_frame = col_remove(data_frame)
    # Convert the hour column into day of week, date and hour of day,
    # as suggested by the discussion forum on the Kaggle competition page
    data_frame = hour_change(data_frame)
    # Columns whose values will be label-encoded
    col_names_encode_list = ['site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category',
                             'device_id', 'device_model', 'device_ip']
    # Encode the non-numeric labels to unique integer values
    data_frame = encode_label(col_names_encode_list, data_frame)
    # Re-split into the training set and the validation set (the last test_len rows
    # are the validation rows appended in preprocess_data)
    train_data = data_frame[:(len(data_frame) - test_len)]
    test_data = data_frame[(len(data_frame) - test_len):]
    # Standardize features by removing the mean and scaling to unit variance;
    # the scaler is fitted on the training set only and reused for the validation set
    sl_scaler = preprocessing.StandardScaler()
    sl_scaler.fit(train_data)
    train_data = sl_scaler.transform(train_data)
    test_data = sl_scaler.transform(test_data)
    print('************** (Generalized linear model : Link="Logit") Logistic Regression Model ********')
    logisticModel(train_data, test_data, train_op, test_op)
    print('************** (Generalized linear model) Logistic Regression Model with SGD and Regularization ********')
    logisticSGDModel(train_data, test_data, train_op, test_op)


if __name__ == '__main__':
    main()