Showing 69 changed files with 1,065 additions and 832 deletions.
@@ -0,0 +1,118 @@
"""A simple demo for categorical data support using dataset from Kaggle categorical data | ||
tutorial. | ||
The excellent tutorial is at: | ||
https://www.kaggle.com/shahules/an-overview-of-encoding-techniques | ||
And the data can be found at: | ||
https://www.kaggle.com/shahules/an-overview-of-encoding-techniques/data | ||
.. versionadded 1.6.0 | ||
""" | ||

from __future__ import annotations

import os
from tempfile import TemporaryDirectory
from time import time

import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import xgboost as xgb


def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]:
    """Assuming you have already downloaded the data into the `input` directory."""

    df_train = pd.read_csv("./input/cat-in-the-dat/train.csv")

    print(
        "Train data set has {} rows and {} columns".format(
            df_train.shape[0], df_train.shape[1]
        )
    )
    X = df_train.drop(["target"], axis=1)
    y = df_train["target"]

    # bin_0 .. bin_4 are binary features; mark them as categorical.
    for i in range(0, 5):
        X["bin_" + str(i)] = X["bin_" + str(i)].astype("category")

    # nom_0 .. nom_4 are low-cardinality nominal features; mark them as categorical.
    for i in range(0, 5):
        X["nom_" + str(i)] = X["nom_" + str(i)].astype("category")

    # nom_5 .. nom_9 are hex-encoded strings; decode them into integers.
    for i in range(5, 10):
        X["nom_" + str(i)] = X["nom_" + str(i)].apply(int, base=16)

    # ord_0 .. ord_5 are ordinal features; mark them as categorical.
    for i in range(0, 6):
        X["ord_" + str(i)] = X["ord_" + str(i)].astype("category")

    print(
        "Train data set has {} rows and {} columns".format(X.shape[0], X.shape[1])
    )
    return X, y

# Hyper-parameters shared by both models.  `gpu_hist` runs on a CUDA device,
# and the label encoder is disabled since the target is already numeric.
params = {"tree_method": "gpu_hist", "use_label_encoder": False, "n_estimators": 32}


def categorical_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
    """Train using the builtin categorical data support from XGBoost."""
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1994, test_size=0.2
    )

    # `enable_categorical=True` lets the model consume pandas `category`
    # columns directly, with no manual encoding step.
    clf = xgb.XGBClassifier(**params, enable_categorical=True)
    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test), (X_train, y_train)],
        eval_metric="auc",
    )
    print(clf.n_classes_)
    clf.save_model(os.path.join(output_dir, "categorical.json"))

    y_score = clf.predict_proba(X_test)[:, 1]  # probability of the positive class
    auc = roc_auc_score(y_test, y_score)
    print("AUC of using builtin categorical data support:", auc)


def onehot_encoding_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
    """Train using one-hot encoded data."""
    # Use the same seed as `categorical_model` so both models see the same split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1994, test_size=0.2
    )
    print(X_train.shape, y_train.shape)

    clf = xgb.XGBClassifier(**params, enable_categorical=False)
    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_test, y_test), (X_train, y_train)],
        eval_metric="auc",
    )
    clf.save_model(os.path.join(output_dir, "one-hot.json"))

    y_score = clf.predict_proba(X_test)[:, 1]  # probability of the positive class
    auc = roc_auc_score(y_test, y_score)
    print("AUC of using onehot encoding:", auc)


if __name__ == "__main__":
    X, y = load_cat_in_the_dat()

    with TemporaryDirectory() as tmpdir:
        start = time()
        categorical_model(X, y, tmpdir)
        end = time()
        print("Duration:categorical", end - start)

        # Expand every categorical column into indicator columns for the
        # one-hot baseline.
        X = pd.get_dummies(X)
        start = time()
        onehot_encoding_model(X, y, tmpdir)
        end = time()
        print("Duration:onehot", end - start)