diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index daf7bea44d06..ae16d6fa0c98 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -996,10 +996,22 @@ def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False,
         """
         Predict with data.
 
-        NOTE: This function is not thread safe.
-              For each booster object, predict can only be called from one thread.
-              If you want to run prediction using multiple thread, call bst.copy() to make copies
-              of model object and then call predict
+        .. note:: This function is not thread safe.
+
+          For each booster object, predict can only be called from one thread.
+          If you want to run prediction using multiple threads, call ``bst.copy()`` to make copies
+          of the model object and then call ``predict()``.
+
+        .. note:: Using ``predict()`` with DART booster
+
+          If the booster object is DART type, ``predict()`` will perform dropouts, i.e. only
+          some of the trees will be evaluated. This will produce incorrect results if ``data`` is
+          not the training data. To obtain correct results on test sets, set ``ntree_limit`` to
+          a nonzero value, e.g.
+
+          .. code-block:: python
+
+            preds = bst.predict(dtest, ntree_limit=num_round)
 
         Parameters
         ----------
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 173f8e51024d..69784b68299f 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -578,10 +578,24 @@ def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
     def predict(self, data, output_margin=False, ntree_limit=0):
         """
         Predict with `data`.
-        NOTE: This function is not thread safe.
-              For each booster object, predict can only be called from one thread.
-              If you want to run prediction using multiple thread, call xgb.copy() to make copies
-              of model object and then call predict
+
+        .. note:: This function is not thread safe.
+
+          For each booster object, predict can only be called from one thread.
+          If you want to run prediction using multiple threads, call ``xgb.copy()`` to make copies
+          of the model object and then call ``predict()``.
+
+        .. note:: Using ``predict()`` with DART booster
+
+          If the booster object is DART type, ``predict()`` will perform dropouts, i.e. only
+          some of the trees will be evaluated. This will produce incorrect results if ``data`` is
+          not the training data. To obtain correct results on test sets, set ``ntree_limit`` to
+          a nonzero value, e.g.
+
+          .. code-block:: python
+
+            preds = bst.predict(dtest, ntree_limit=num_round)
+
         Parameters
         ----------
         data : DMatrix