
Commit

Merge branch 'master' into master
nicovdijk authored Oct 18, 2021
2 parents b0e639c + 4fd149b commit b24f9ec
Showing 69 changed files with 1,065 additions and 832 deletions.
6 changes: 5 additions & 1 deletion CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
-project(xgboost LANGUAGES CXX C VERSION 1.5.0)
+project(xgboost LANGUAGES CXX C VERSION 1.6.0)
 include(cmake/Utils.cmake)
 list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
 cmake_policy(SET CMP0022 NEW)
@@ -28,6 +28,7 @@ set_default_configuration_release()
 option(BUILD_C_DOC "Build documentation for C APIs using Doxygen." OFF)
 option(USE_OPENMP "Build with OpenMP support." ON)
 option(BUILD_STATIC_LIB "Build static library" OFF)
+option(FORCE_SHARED_CRT "Build with dynamic CRT on Windows (/MD)" OFF)
 option(RABIT_BUILD_MPI "Build MPI" OFF)
 ## Bindings
 option(JVM_BINDINGS "Build JVM bindings" OFF)
@@ -160,6 +161,9 @@ endif (USE_NCCL)

 # dmlc-core
 msvc_use_static_runtime()
+if (FORCE_SHARED_CRT)
+  set(DMLC_FORCE_SHARED_CRT ON)
+endif ()
 add_subdirectory(${xgboost_SOURCE_DIR}/dmlc-core)

 if (MSVC)
4 changes: 2 additions & 2 deletions CONTRIBUTORS.md
@@ -10,8 +10,8 @@ The Project Management Committee(PMC) consists group of active committers that m
   - Tianqi is a Ph.D. student working on large-scale machine learning. He is the creator of the project.
 * [Michael Benesty](https://github.com/pommedeterresautee)
   - Michael is a lawyer and data scientist in France. He is the creator of XGBoost interactive analysis module in R.
-* [Yuan Tang](https://github.com/terrytangyuan), Ant Group
-  - Yuan is a software engineer in Ant Group. He contributed mostly in R and Python packages.
+* [Yuan Tang](https://github.com/terrytangyuan), Akuity
+  - Yuan is a founding engineer at Akuity. He contributed mostly in R and Python packages.
 * [Nan Zhu](https://github.com/CodingCat), Uber
   - Nan is a software engineer in Uber. He contributed mostly in JVM packages.
 * [Jiaming Yuan](https://github.com/trivialfis)
4 changes: 2 additions & 2 deletions R-package/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 1.5.0.1
-Date: 2020-08-28
+Version: 1.6.0.1
+Date: 2021-09-25
 Authors@R: c(
  person("Tianqi", "Chen", role = c("aut"),
         email = "tianqi.tchen@gmail.com"),
1 change: 1 addition & 0 deletions R-package/R/xgb.DMatrix.R
@@ -11,6 +11,7 @@
 #' @param missing a float value to represent missing values in data (used only when input is a dense matrix).
 #'        It is useful when a 0 or some other extreme value represents missing values in data.
 #' @param silent whether to suppress printing an informational message after loading from a file.
+#' @param nthread Number of threads used for creating DMatrix.
 #' @param ... the \code{info} data could be passed directly as parameters, without creating an \code{info} list.
 #'
 #' @examples
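For orientation, the Python binding exposes the same parameter; the following is a minimal sketch (illustrative only, not part of this commit) of building a DMatrix with an explicit thread count:

import numpy as np
import xgboost as xgb

# Illustrative only: construct a DMatrix using 4 threads.
# nthread=-1 would use all available cores.
X = np.random.rand(1000, 10)
y = np.random.rand(1000)
dtrain = xgb.DMatrix(X, label=y, nthread=4)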
11 changes: 10 additions & 1 deletion R-package/man/xgb.DMatrix.Rd

(Generated documentation file; diff not rendered.)

2 changes: 1 addition & 1 deletion cmake/Utils.cmake
@@ -15,7 +15,7 @@ endfunction(auto_source_group)

 # Force static runtime for MSVC
 function(msvc_use_static_runtime)
-  if(MSVC)
+  if(MSVC AND (NOT BUILD_SHARED_LIBS) AND (NOT FORCE_SHARED_CRT))
     set(variables
         CMAKE_C_FLAGS_DEBUG
         CMAKE_C_FLAGS_MINSIZEREL
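Taken together with the CMakeLists.txt hunk above, this lets a Windows build keep a static xgboost library while linking the dynamic CRT. Purely as a hedged sketch (only the FORCE_SHARED_CRT flag comes from this commit; everything else is assumption), the configure step might be driven like so:

import subprocess

# Hypothetical configure step (assumes cmake is on PATH and the working
# directory is the xgboost source root): request the dynamic CRT (/MD)
# instead of the default static one (/MT).
subprocess.run(["cmake", "-B", "build", "-DFORCE_SHARED_CRT=ON"], check=True)
subprocess.run(["cmake", "--build", "build", "--config", "Release"], check=True)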
2 changes: 2 additions & 0 deletions demo/guide-python/README.md
@@ -16,3 +16,5 @@ XGBoost Python Feature Walkthrough
 * [External Memory](external_memory.py)
 * [Training continuation](continuation.py)
 * [Feature weights for column sampling](feature_weights.py)
+* [Basic Categorical data support](categorical.py)
+* [Compare builtin categorical data support with one-hot encoding](cat_in_the_dat.py)
118 changes: 118 additions & 0 deletions demo/guide-python/cat_in_the_dat.py
@@ -0,0 +1,118 @@
"""A simple demo for categorical data support using dataset from Kaggle categorical data
tutorial.
The excellent tutorial is at:
https://www.kaggle.com/shahules/an-overview-of-encoding-techniques
And the data can be found at:
https://www.kaggle.com/shahules/an-overview-of-encoding-techniques/data
.. versionadded 1.6.0
"""

from __future__ import annotations
from time import time
import os
from tempfile import TemporaryDirectory

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import xgboost as xgb


def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]:
"""Assuming you have already downloaded the data into `input` directory."""

df_train = pd.read_csv("./input/cat-in-the-dat/train.csv")

print(
"train data set has got {} rows and {} columns".format(
df_train.shape[0], df_train.shape[1]
)
)
X = df_train.drop(["target"], axis=1)
y = df_train["target"]

for i in range(0, 5):
X["bin_" + str(i)] = X["bin_" + str(i)].astype("category")

for i in range(0, 5):
X["nom_" + str(i)] = X["nom_" + str(i)].astype("category")

for i in range(5, 10):
X["nom_" + str(i)] = X["nom_" + str(i)].apply(int, base=16)

for i in range(0, 6):
X["ord_" + str(i)] = X["ord_" + str(i)].astype("category")

print(X.shape)

print(
"train data set has got {} rows and {} columns".format(X.shape[0], X.shape[1])
)
return X, y


params = {"tree_method": "gpu_hist", "use_label_encoder": False, "n_estimators": 32}


def categorical_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
"""Train using builtin categorical data support from XGBoost"""
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=1994, test_size=0.2
)

clf = xgb.XGBClassifier(**params, enable_categorical=True)
clf.fit(
X_train,
y_train,
eval_set=[(X_test, y_test), (X_train, y_train)],
eval_metric="auc",
)
print(clf.n_classes_)
clf.save_model(os.path.join(output_dir, "categorical.json"))

y_score = clf.predict_proba(X_test)[:, 1] # proba of positive samples
auc = roc_auc_score(y_test, y_score)
print("AUC of using builtin categorical data support:", auc)


def onehot_encoding_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
"""Train using one-hot encoded data."""

X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=42, test_size=0.2
)
print(X_train.shape, y_train.shape)

clf = xgb.XGBClassifier(**params, enable_categorical=False)
clf.fit(
X_train,
y_train,
eval_set=[(X_test, y_test), (X_train, y_train)],
eval_metric="auc",
)
clf.save_model(os.path.join(output_dir, "one-hot.json"))

y_score = clf.predict_proba(X_test)[:, 1] # proba of positive samples
auc = roc_auc_score(y_test, y_score)
print("AUC of using onehot encoding:", auc)


if __name__ == "__main__":
X, y = load_cat_in_the_dat()

with TemporaryDirectory() as tmpdir:
start = time()
categorical_model(X, y, tmpdir)
end = time()
print("Duration:categorical", end - start)

X = pd.get_dummies(X)
start = time()
onehot_encoding_model(X, y, tmpdir)
end = time()
print("Duration:onehot", end - start)
37 changes: 15 additions & 22 deletions demo/guide-python/external_memory.py
@@ -8,23 +8,24 @@
 import os
 import xgboost
 from typing import Callable, List, Tuple
+from sklearn.datasets import make_regression
 import tempfile
 import numpy as np


 def make_batches(
-    n_samples_per_batch: int, n_features: int, n_batches: int
-) -> Tuple[List[np.ndarray], List[np.ndarray]]:
-    """Generate random batches."""
-    X = []
-    y = []
+    n_samples_per_batch: int, n_features: int, n_batches: int, tmpdir: str,
+) -> List[Tuple[str, str]]:
+    files: List[Tuple[str, str]] = []
     rng = np.random.RandomState(1994)
     for i in range(n_batches):
-        _X = rng.randn(n_samples_per_batch, n_features)
-        _y = rng.randn(n_samples_per_batch)
-        X.append(_X)
-        y.append(_y)
-    return X, y
+        X, y = make_regression(n_samples_per_batch, n_features, random_state=rng)
+        X_path = os.path.join(tmpdir, "X-" + str(i) + ".npy")
+        y_path = os.path.join(tmpdir, "y-" + str(i) + ".npy")
+        np.save(X_path, X)
+        np.save(y_path, y)
+        files.append((X_path, y_path))
+    return files


 class Iterator(xgboost.DataIter):
@@ -38,8 +39,8 @@ def __init__(self, file_paths: List[Tuple[str, str]]):

     def load_file(self) -> Tuple[np.ndarray, np.ndarray]:
         X_path, y_path = self._file_paths[self._it]
-        X = np.loadtxt(X_path)
-        y = np.loadtxt(y_path)
+        X = np.load(X_path)
+        y = np.load(y_path)
         assert X.shape[0] == y.shape[0]
         return X, y

@@ -66,15 +67,7 @@ def reset(self) -> None:

 def main(tmpdir: str) -> xgboost.Booster:
     # generate some random data for demo
-    batches = make_batches(1024, 17, 31)
-    files = []
-    for i, (X, y) in enumerate(zip(*batches)):
-        X_path = os.path.join(tmpdir, "X-" + str(i) + ".txt")
-        np.savetxt(X_path, X)
-        y_path = os.path.join(tmpdir, "y-" + str(i) + ".txt")
-        np.savetxt(y_path, y)
-        files.append((X_path, y_path))
-
+    files = make_batches(1024, 17, 31, tmpdir)
     it = Iterator(files)
     # For non-data arguments, specify it here once instead of passing them by the `next`
     # method.
@@ -83,7 +76,7 @@ def main(tmpdir: str) -> xgboost.Booster:

     # Other tree methods including ``hist`` and ``gpu_hist`` also work, but have some
     # caveats. This is still an experimental feature.
-    booster = xgboost.train({"tree_method": "approx"}, Xy)
+    booster = xgboost.train({"tree_method": "approx"}, Xy, evals=[(Xy, "Train")])
     return booster

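Only fragments of the Iterator class appear in the hunks above; for orientation, here is a minimal self-contained sketch of the xgboost.DataIter protocol the demo relies on (the class name and cache path are illustrative, not from the commit):

import os
import tempfile
from typing import Callable, List, Tuple

import numpy as np
import xgboost

class NpyIterator(xgboost.DataIter):  # hypothetical name, not from the commit
    def __init__(self, file_paths: List[Tuple[str, str]]) -> None:
        self._file_paths = file_paths
        self._it = 0
        # XGBoost writes its external-memory cache next to this prefix.
        super().__init__(cache_prefix=os.path.join(tempfile.gettempdir(), "cache"))

    def next(self, input_data: Callable) -> int:
        if self._it == len(self._file_paths):
            return 0  # no more batches; end of one pass over the data
        X_path, y_path = self._file_paths[self._it]
        input_data(data=np.load(X_path), label=np.load(y_path))
        self._it += 1
        return 1  # a batch was fed; more may remain

    def reset(self) -> None:
        self._it = 0  # rewind for the next pass

# Usage mirrors the demo: Xy = xgboost.DMatrix(NpyIterator(files), missing=np.nan)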
