using XGBoost
# load data from a LibSVM text file; a binary buffer generated by xgboost also works
const DATAPATH = joinpath(@__DIR__, "../data")
dtrain = DMatrix(joinpath(DATAPATH, "agaricus.txt.train"))
dtest = DMatrix(joinpath(DATAPATH, "agaricus.txt.test"))
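# A DMatrix can also be built from an in-memory dense matrix. This is a
# minimal sketch; it assumes the dense-matrix constructor with a `label`
# keyword, as used in the other demos:
#   X = rand(100, 4); y = rand(0:1, 100)
#   dtmp = DMatrix(X, label = y)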
# define the parameters for xgboost (a Dict, so fpreproc below can mutate it)
param = Dict("max_depth" => 2,
             "eta" => 1,
             "silent" => 1,
             "objective" => "binary:logistic")
num_round = 2
nfold = 5
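# For reference, the same parameter Dict drives a plain training run as well.
# A sketch only; it assumes the `xgboost` convenience trainer accepts `param`
# the same way `nfold_cv` does:
#   bst = xgboost(dtrain, num_round, param = param)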
print("running cross validation\n")
# run cross validation; this prints results as
# [iteration] metric_name:mean_value+std_value
# where std_value is the standard deviation of the metric across folds
nfold_cv(dtrain, num_round, nfold, param = param, metrics = ["error"], seed = 0)
print("running cross validation, disable standard deviation display\n")
# do cross validation, this will print result out as
# [iteration] metric_name:mean_value+std_value
# std_value is standard deviation of the metric
nfold_cv(dtrain, num_round, nfold, param = param, metrics = ["error"], seed = 0, show_stdv = false)
print("running cross validation, with preprocessing function\n")
# define the preprocessing function
# it returns the preprocessed training data, test data, and parameters
# we can use this to rescale weights, etc.
# as an example, we set scale_pos_weight from the label ratio
function fpreproc(dtrain::DMatrix, dtest::DMatrix, param)
    label = get_info(dtrain, "label")
    ratio = sum(label .== 0) / sum(label .== 1)
    param["scale_pos_weight"] = ratio
    return dtrain, dtest, param
end
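# quick sanity check of fpreproc on the full data (illustrative only; during
# cross validation nfold_cv calls it once per fold on that fold's slices)
_, _, p = fpreproc(dtrain, dtest, copy(param))
println("scale_pos_weight = ", p["scale_pos_weight"])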
# run cross validation; for each fold the dtrain, dtest, and param
# are passed into fpreproc, and its return values are used to
# generate the results of that fold
nfold_cv(dtrain, num_round, nfold, param = param, metrics = ["auc"],
seed = 0, show_stdv = false, fpreproc = fpreproc)
print("running cross validation, with customized loss function\n")
###
# you can also do cross validation with a customized loss function
# see custom_objective.jl
###
function logregobj(preds::Vector{Float32}, dtrain::DMatrix)
    labels = get_info(dtrain, "label")
    preds = 1.0 ./ (1.0 .+ exp.(-preds))
    grad = preds .- labels
    hess = preds .* (1.0 .- preds)
    return grad, hess
end
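# sanity check (illustrative): at zero margin the predicted probability is 0.5,
# so every hessian entry should be 0.5 * (1 - 0.5) = 0.25
n = length(get_info(dtrain, "label"))
g, h = logregobj(zeros(Float32, n), dtrain)
@assert all(isapprox.(h, 0.25))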
function evalerror(preds::Vector{Float32}, dtrain::DMatrix)
    labels = get_info(dtrain, "label")
    # return a pair (metric_name, result)
    # preds are margins (before the logistic transformation), so cut off at 0
    return "self-error", sum((preds .> 0.0) .!= labels) / length(preds)
end
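# quick check (illustrative): with all-zero margins every row is predicted
# negative, so the reported error equals the positive-label fraction
name, err = evalerror(zeros(Float32, n), dtrain)
println(name, " on all-zero margins: ", err)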
# run cross validation with the customized objective and evaluation metric
nfold_cv(dtrain, num_round, nfold, metrics = [], seed = 0, obj = logregobj,
         feval = evalerror, max_depth = 2, eta = 1, silent = 1)