Fix loading old logit model, helper for converting old pickle. #5281

Merged: 10 commits, Feb 13, 2020.
2 changes: 2 additions & 0 deletions R-package/R/xgb.Booster.R
@@ -139,6 +139,8 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) {
 #' @param reshape whether to reshape the vector of predictions to a matrix form when there are several
 #'        prediction outputs per case. This option has no effect when either of predleaf, predcontrib,
 #'        or predinteraction flags is TRUE.
+#' @param training whether the prediction result is used for training. For the dart booster,
+#'        predicting in training mode will perform dropout.
 #' @param ... Parameters passed to \code{predict.xgb.Booster}
 #'
 #' @details
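
For reference, a minimal sketch of the behaviour the new `training` flag documents, written against the Python interface's matching `training` argument (an assumption: the flag is present on `Booster.predict` in XGBoost >= 1.0; the random data is purely illustrative):

    # Dropout only fires when the prediction is flagged as part of training.
    import numpy as np
    import xgboost

    X = np.random.rand(100, 4)
    y = np.random.randint(2, size=100)
    dtrain = xgboost.DMatrix(X, label=y)
    bst = xgboost.train({'booster': 'dart', 'objective': 'binary:logistic',
                         'rate_drop': 0.5}, dtrain, num_boost_round=10)

    preds_drop = bst.predict(dtrain, training=True)   # dart applies dropout
    preds_full = bst.predict(dtrain)                  # all trees are used
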
6 changes: 3 additions & 3 deletions R-package/man/agaricus.test.Rd


6 changes: 3 additions & 3 deletions R-package/man/agaricus.train.Rd


3 changes: 3 additions & 0 deletions R-package/man/predict.xgb.Booster.Rd


2 changes: 0 additions & 2 deletions R-package/tests/testthat/test_custom_objective.R
@@ -31,7 +31,6 @@ num_round <- 2
 test_that("custom objective works", {
   bst <- xgb.train(param, dtrain, num_round, watchlist)
   expect_equal(class(bst), "xgb.Booster")
-  expect_equal(length(bst$raw), 1100)
   expect_false(is.null(bst$evaluation_log))
   expect_false(is.null(bst$evaluation_log$eval_error))
   expect_lt(bst$evaluation_log[num_round, eval_error], 0.03)
@@ -58,5 +57,4 @@ test_that("custom objective using DMatrix attr works", {
   param$objective = logregobjattr
   bst <- xgb.train(param, dtrain, num_round, watchlist)
   expect_equal(class(bst), "xgb.Booster")
-  expect_equal(length(bst$raw), 1100)
 })
76 changes: 76 additions & 0 deletions doc/python/convert_090to100.py
@@ -0,0 +1,76 @@
'''This is a simple script that converts a pickled XGBoost
Scikit-Learn interface object from 0.90 to a native model. The pickle
format is not stable as it's a direct serialization of a Python object.
We advise not to use it when stability is needed.

'''
import pickle
import json
import os
import argparse
import warnings
import numpy as np
import xgboost


def save_label_encoder(le):
    '''Save the label encoder in XGBClassifier.'''
    meta = dict()
    for k, v in le.__dict__.items():
        if isinstance(v, np.ndarray):
            meta[k] = v.tolist()
        else:
            meta[k] = v
    return meta


def xgboost_skl_90to100(skl_model):
    '''Extract the model and related metadata from the SKL model.'''
    model = {}
    with open(skl_model, 'rb') as fd:
        old = pickle.load(fd)
    if not isinstance(old, xgboost.XGBModel):
        raise TypeError(
            'The script only handles Scikit-Learn interface objects.')

    # Save Scikit-Learn specific Python attributes into a JSON document.
    for k, v in old.__dict__.items():
        if k == '_le':
            model[k] = save_label_encoder(v)
        elif k == 'classes_':
            model[k] = v.tolist()
        elif k == '_Booster':
            continue
        else:
            try:
                json.dumps({k: v})
                model[k] = v
            except TypeError:
                warnings.warn(str(k) + ' is not saved in Scikit-Learn meta.')
    booster = old.get_booster()
    # Store the JSON serialization as a booster attribute.
    booster.set_attr(scikit_learn=json.dumps(model))

    # Save it as a native model, picking the first unused file name.
    i = 0
    while True:
        path = str(i) + '_xgboost_model_from_old_pickle.model'
        if os.path.exists(path):
            i += 1
            continue
        booster.save_model(path)
        break


if __name__ == '__main__':
    assert xgboost.__version__ != '1.0.0', ('Please use the XGBoost version'
                                            ' that generated this pickle.')
    parser = argparse.ArgumentParser(
        description=('A simple script to convert a pickle generated by'
                     ' XGBoost 0.90 to an XGBoost 1.0.0 model (not a pickle).')
    )
    parser.add_argument('--old-pickle', type=str,
                        help='Path to the old pickle file.')
    args = parser.parse_args()

    xgboost_skl_90to100(args.old_pickle)
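
A sketch of the intended workflow (file names here are hypothetical): run `python convert_090to100.py --old-pickle old_skl.pkl` under the XGBoost version that produced the pickle, then load the emitted native model under 1.0.0:

    # Run under XGBoost 1.0.0: load the native model written by the script
    # (it names output files sequentially, e.g.
    # '0_xgboost_model_from_old_pickle.model').
    import json
    import xgboost

    booster = xgboost.Booster(
        model_file='0_xgboost_model_from_old_pickle.model')
    # The Scikit-Learn metadata stored by the script can be read back too.
    meta = json.loads(booster.attr('scikit_learn'))
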
7 changes: 6 additions & 1 deletion doc/tutorials/saving_model.rst
@@ -91,7 +91,12 @@ Loading pickled file from different version of XGBoost

 As noted, pickled model is neither portable nor stable, but in some cases the pickled
 models are valuable. One way to restore it in the future is to load it back with that
-specific version of Python and XGBoost, export the model by calling `save_model`.
+specific version of Python and XGBoost, export the model by calling `save_model`. To
+help ease the migration, we created a simple script for converting a pickled XGBoost
+0.90 Scikit-Learn interface object to an XGBoost 1.0.0 native model. Please note that
+the script suits simple use cases only, and it's advised not to use pickle when
+stability is needed. It's located in ``xgboost/doc/python`` with the name
+``convert_090to100.py``. See comments in the script for more details.
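
A minimal sketch of the manual route described above (file names are hypothetical):

    # Run under the same Python/XGBoost version that produced the pickle.
    import pickle

    with open('old_booster.pkl', 'rb') as fd:
        bst = pickle.load(fd)
    # Export to the native format, which newer XGBoost versions can load.
    bst.save_model('converted.model')
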

********************************************************
Saving and Loading the internal parameters configuration
2 changes: 2 additions & 0 deletions include/xgboost/learner.h
@@ -208,6 +208,8 @@ struct LearnerModelParam {
   // As the old `LearnerModelParamLegacy` is still used by binary IO, we keep
   // this one as an immutable copy.
   LearnerModelParam(LearnerModelParamLegacy const& user_param, float base_margin);
+  /* \brief Whether this parameter is initialized with LearnerModelParamLegacy. */
+  bool Initialized() const { return num_feature != 0; }
 };

 }  // namespace xgboost
99 changes: 61 additions & 38 deletions python-package/xgboost/core.py
@@ -896,11 +896,12 @@ def slice(self, rindex, allow_groups=False):
         res = DMatrix(None, feature_names=self.feature_names,
                       feature_types=self.feature_types)
         res.handle = ctypes.c_void_p()
-        _check_call(_LIB.XGDMatrixSliceDMatrixEx(self.handle,
-                                                 c_array(ctypes.c_int, rindex),
-                                                 c_bst_ulong(len(rindex)),
-                                                 ctypes.byref(res.handle),
-                                                 ctypes.c_int(1 if allow_groups else 0)))
+        _check_call(_LIB.XGDMatrixSliceDMatrixEx(
+            self.handle,
+            c_array(ctypes.c_int, rindex),
+            c_bst_ulong(len(rindex)),
+            ctypes.byref(res.handle),
+            ctypes.c_int(1 if allow_groups else 0)))
         return res

     @property
@@ -954,7 +955,8 @@ def feature_names(self, feature_names):
             if not all(isinstance(f, STRING_TYPES) and
                        not any(x in f for x in set(('[', ']', '<')))
                        for f in feature_names):
-                raise ValueError('feature_names must be string, and may not contain [, ] or <')
+                raise ValueError('feature_names must be string, and may not '
+                                 'contain [, ] or <')
         else:
             # reset feature_types also
             self.feature_types = None
@@ -996,7 +998,8 @@ def feature_types(self, feature_types):
             valid = ('int', 'float', 'i', 'q')
             if not all(isinstance(f, STRING_TYPES) and f in valid
                        for f in feature_types):
-                raise ValueError('All feature_names must be {int, float, i, q}')
+                raise ValueError(
+                    'All feature_names must be {int, float, i, q}')
         self._feature_types = feature_types

@@ -1024,7 +1027,8 @@ def __init__(self, params=None, cache=(), model_file=None):
         """
         for d in cache:
             if not isinstance(d, DMatrix):
-                raise TypeError('invalid cache item: {}'.format(type(d).__name__), cache)
+                raise TypeError('invalid cache item: {}'.format(
+                    type(d).__name__), cache)
             self._validate_features(d)

         dmats = c_array(ctypes.c_void_p, [d.handle for d in cache])
@@ -1033,7 +1037,7 @@ def __init__(self, params=None, cache=(), model_file=None):
                                          ctypes.byref(self.handle)))

         if isinstance(params, dict) and \
-           'validate_parameters' not in params.keys():
+                'validate_parameters' not in params.keys():
             params['validate_parameters'] = 1
         self.set_param(params or {})
         if (params is not None) and ('booster' in params):
@@ -1162,7 +1166,8 @@ def attr(self, key):
         Returns
         -------
         value : str
-            The attribute value of the key, returns None if attribute do not exist.
+            The attribute value of the key, returns None if attribute do not
+            exist.
         """
         ret = ctypes.c_char_p()
         success = ctypes.c_int()
@@ -1177,8 +1182,8 @@ def attributes(self):

         Returns
         -------
-        result : dictionary of attribute_name: attribute_value pairs of strings.
-            Returns an empty dict if there's no attributes.
+        result : dictionary of attribute_name: attribute_value pairs of
+            strings. Returns an empty dict if there's no attributes.
         """
         length = c_bst_ulong()
         sarr = ctypes.POINTER(ctypes.c_char_p)()
@@ -1194,7 +1199,8 @@ def set_attr(self, **kwargs):
         Parameters
         ----------
         **kwargs
-            The attributes to set. Setting a value to None deletes an attribute.
+            The attributes to set. Setting a value to None deletes an
+            attribute.
         """
         for key, value in kwargs.items():
             if value is not None:
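
A small sketch of the attribute round trip these docstrings describe (the key name is hypothetical, and `bst` is assumed to be an existing Booster):

    bst.set_attr(note='converted from 0.90')  # set a string attribute
    print(bst.attr('note'))                   # 'converted from 0.90'
    print(bst.attributes())                   # dict of all attributes
    bst.set_attr(note=None)                   # None deletes the attribute
    print(bst.attr('note'))                   # None
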
@@ -1267,9 +1273,11 @@ def boost(self, dtrain, grad, hess):

        """
        if len(grad) != len(hess):
-            raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess)))
+            raise ValueError('grad / hess length mismatch: {} / {}'.format(
+                len(grad), len(hess)))
        if not isinstance(dtrain, DMatrix):
-            raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__))
+            raise TypeError('invalid training matrix: {}'.format(
+                type(dtrain).__name__))
        self._validate_features(dtrain)

        _check_call(_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle,
@@ -1619,14 +1627,16 @@ def get_fscore(self, fmap=''):

        .. note:: Feature importance is defined only for tree boosters

-            Feature importance is only defined when the decision tree model is chosen as base
-            learner (`booster=gbtree`). It is not defined for other base learner types, such
-            as linear learners (`booster=gblinear`).
+            Feature importance is only defined when the decision tree model is
+            chosen as base learner (`booster=gbtree`). It is not defined for
+            other base learner types, such as linear learners
+            (`booster=gblinear`).

        .. note:: Zero-importance features will not be included

-            Keep in mind that this function does not include zero-importance feature, i.e.
-            those features that have not been used in any split conditions.
+            Keep in mind that this function does not include zero-importance
+            feature, i.e. those features that have not been used in any split
+            conditions.

        Parameters
        ----------
@@ -1640,31 +1650,40 @@ def get_score(self, fmap='', importance_type='weight'):
        """Get feature importance of each feature.
        Importance type can be defined as:

-        * 'weight': the number of times a feature is used to split the data across all trees.
+        * 'weight': the number of times a feature is used to split the data
+          across all trees.
        * 'gain': the average gain across all splits the feature is used in.
-        * 'cover': the average coverage across all splits the feature is used in.
-        * 'total_gain': the total gain across all splits the feature is used in.
-        * 'total_cover': the total coverage across all splits the feature is used in.
+        * 'cover': the average coverage across all splits the feature is used
+          in.
+        * 'total_gain': the total gain across all splits the feature is used
+          in.
+        * 'total_cover': the total coverage across all splits the feature is
+          used in.

        .. note:: Feature importance is defined only for tree boosters

-            Feature importance is only defined when the decision tree model is chosen as base
-            learner (`booster=gbtree`). It is not defined for other base learner types, such
-            as linear learners (`booster=gblinear`).
+            Feature importance is only defined when the decision tree
+            model is chosen as base learner (`booster=gbtree`). It is
+            not defined for other base learner types, such as linear
+            learners (`booster=gblinear`).

        Parameters
        ----------
        fmap: str or os.PathLike (optional)
           The name of feature map file.
        importance_type: str, default 'weight'
           One of the importance types defined above.

        """
        fmap = os_fspath(fmap)
-        if getattr(self, 'booster', None) is not None and self.booster not in {'gbtree', 'dart'}:
-            raise ValueError('Feature importance is not defined for Booster type {}'
-                             .format(self.booster))
+        if getattr(self, 'booster', None) is not None and self.booster not in {
+                'gbtree', 'dart'}:
+            raise ValueError(
+                'Feature importance is not defined for Booster type {}'
+                .format(self.booster))

-        allowed_importance_types = ['weight', 'gain', 'cover', 'total_gain', 'total_cover']
+        allowed_importance_types = ['weight', 'gain', 'cover', 'total_gain',
+                                    'total_cover']
        if importance_type not in allowed_importance_types:
            msg = ("importance_type mismatch, got '{}', expected one of " +
                   repr(allowed_importance_types))
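
For illustration, a minimal sketch of the importance types listed above (assumes a trained tree-based Booster `bst`):

    # Each call returns a dict mapping feature name to its importance score.
    for imp_type in ('weight', 'gain', 'cover', 'total_gain', 'total_cover'):
        print(imp_type, bst.get_score(importance_type=imp_type))
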
@@ -1716,7 +1735,8 @@ def get_score(self, fmap='', importance_type='weight'):
                if len(arr) == 1:
                    continue

-                # look for the closing bracket, extract only info within that bracket
+                # look for the closing bracket, extract only info within that
+                # bracket
                fid = arr[1].split(']')

                # extract gain or cover from string after closing bracket
@@ -1743,9 +1763,9 @@ def get_score(self, fmap='', importance_type='weight'):
    def trees_to_dataframe(self, fmap=''):
        """Parse a boosted tree model text dump into a pandas DataFrame structure.

-        This feature is only defined when the decision tree model is chosen as base
-        learner (`booster in {gbtree, dart}`). It is not defined for other base learner
-        types, such as linear learners (`booster=gblinear`).
+        This feature is only defined when the decision tree model is chosen as
+        base learner (`booster in {gbtree, dart}`). It is not defined for other
+        base learner types, such as linear learners (`booster=gblinear`).

        Parameters
        ----------
@@ -1758,7 +1778,8 @@ def trees_to_dataframe(self, fmap=''):
            raise Exception(('pandas must be available to use this method.'
                             'Install pandas before calling again.'))

-        if getattr(self, 'booster', None) is not None and self.booster not in {'gbtree', 'dart'}:
+        if getattr(self, 'booster', None) is not None and self.booster not in {
+                'gbtree', 'dart'}:
            raise ValueError('This method is not defined for Booster type {}'
                             .format(self.booster))

@@ -1814,7 +1835,8 @@ def trees_to_dataframe(self, fmap=''):
                    gains.append(float(stats[7]))
                    covers.append(float(stats[9]))

-        ids = [str(t_id) + '-' + str(n_id) for t_id, n_id in zip(tree_ids, node_ids)]
+        ids = [str(t_id) + '-' + str(n_id)
+               for t_id, n_id in zip(tree_ids, node_ids)]
        df = DataFrame({'Tree': tree_ids, 'Node': node_ids, 'ID': ids,
                        'Feature': fids, 'Split': splits, 'Yes': y_directs,
                        'No': n_directs, 'Missing': missings, 'Gain': gains,
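
A minimal usage sketch (requires pandas; assumes a trained tree booster `bst`):

    # One row per tree node; columns are built as shown above.
    df = bst.trees_to_dataframe()
    print(df[['Tree', 'Node', 'Feature', 'Split', 'Gain']].head())
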
@@ -1894,5 +1916,6 @@ def get_split_value_histogram(self, feature, fmap='', bins=None,
            return DataFrame(nph, columns=['SplitValue', 'Count'])
        if as_pandas and not PANDAS_INSTALLED:
            sys.stderr.write(
-                "Returning histogram as ndarray (as_pandas == True, but pandas is not installed).")
+                "Returning histogram as ndarray (as_pandas == True, but pandas"
+                " is not installed).")
        return nph