"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[20:47:10] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:12] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:13] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:14] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:16] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:17] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:18] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:19] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:20] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:21] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:23] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:24] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:25] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:26] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:28] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:29] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:30] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:32] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:33] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:34] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:36] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:37] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:39] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:40] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:41] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:43] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:44] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:46] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:47] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:48] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:49] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:51] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:52] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:53] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:54] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:55] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:57] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:58] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:47:59] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:48:01] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:48:02] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:48:03] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:48:05] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:48:06] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:48:07] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:48:08] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:48:09] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:48:10] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:48:12] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[20:48:13] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/user/venv/local-feast/lib/python3.7/site-packages/yellowbrick/base.py:259: DeprecationWarning:\n",
- "\n",
- "this method is deprecated, please use show() instead\n",
- "\n"
- ]
- },
- {
- "data": {
- "image/png": "\n",
- "text/plain": [
- "
"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": 10,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Pair each feature importance with its column name, largest importance first\n",
- "coefficients = pd.DataFrame(model.feature_importances_)\n",
- "column_df = pd.DataFrame(cols)\n",
- "coef_sumry = (pd.merge(coefficients, column_df, left_index=True,\n",
- "                       right_index=True, how=\"left\"))\n",
- "coef_sumry.columns = [\"coefficients\", \"features\"]\n",
- "coef_sumry = coef_sumry.sort_values(by=\"coefficients\", ascending=False)\n",
- "\n",
- "print(model)\n",
- "print(\"\\n Classification report : \\n\", classification_report(testing_y, predictions))\n",
- "print(\"Accuracy Score : \", accuracy_score(testing_y, predictions))\n",
- "# confusion matrix\n",
- "conf_matrix = confusion_matrix(testing_y, predictions)\n",
- "# roc_auc_score: score on the positive-class probabilities (the same scores\n",
- "# passed to roc_curve below) rather than on the thresholded 0/1 predictions,\n",
- "# so the reported AUC matches the ROC curve that is plotted\n",
- "model_roc_auc = roc_auc_score(testing_y, probabilities[:, 1])\n",
- "print(\"Area under curve : \", model_roc_auc, \"\\n\")\n",
- "fpr, tpr, thresholds = roc_curve(testing_y, probabilities[:, 1])\n",
- "\n",
- "# plot confusion matrix\n",
- "trace1 = go.Heatmap(z=conf_matrix,\n",
- "                    x=[\"Not churn\", \"Churn\"],\n",
- "                    y=[\"Not churn\", \"Churn\"],\n",
- "                    showscale=False, colorscale=\"Picnic\",\n",
- "                    name=\"matrix\")\n",
- "\n",
- "# plot roc curve (trace3 is the dotted diagonal from (0, 0) to (1, 1))\n",
- "trace2 = go.Scatter(x=fpr, y=tpr,\n",
- "                    name=\"Roc : \" + str(model_roc_auc),\n",
- "                    line=dict(color=('rgb(22, 96, 167)'), width=2))\n",
- "trace3 = go.Scatter(x=[0, 1], y=[0, 1],\n",
- "                    line=dict(color=('rgb(205, 12, 24)'), width=2,\n",
- "                              dash='dot'))\n",
- "\n",
- "# plot coeffs\n",
- "trace4 = go.Bar(x=coef_sumry[\"features\"], y=coef_sumry[\"coefficients\"],\n",
- "                name=\"coefficients\",\n",
- "                marker=dict(color=coef_sumry[\"coefficients\"],\n",
- "                            colorscale=\"Picnic\",\n",
- "                            line=dict(width=.6, color=\"black\")))\n",
- "\n",
- "# subplots: two panels on the first row, one panel spanning both columns on the second\n",
- "fig = tls.make_subplots(rows=2, cols=2, specs=[[{}, {}], [{'colspan': 2}, None]],\n",
- "                        subplot_titles=('Confusion Matrix',\n",
- "                                        'Receiver operating characteristic',\n",
- "                                        'Feature Importances'))\n",
- "\n",
- "fig.append_trace(trace1, 1, 1)\n",
- "fig.append_trace(trace2, 1, 2)\n",
- "fig.append_trace(trace3, 1, 2)\n",
- "fig.append_trace(trace4, 2, 1)\n",
- "\n",
- "fig['layout'].update(showlegend=False, title=\"Model performance\",\n",
- "                     autosize=False, height=900, width=800,\n",
- "                     plot_bgcolor='rgba(240,240,240, 0.95)',\n",
- "                     paper_bgcolor='rgba(240,240,240, 0.95)',\n",
- "                     margin=dict(b=195))\n",
- "fig[\"layout\"][\"xaxis2\"].update(dict(title=\"false positive rate\"))\n",
- "fig[\"layout\"][\"yaxis2\"].update(dict(title=\"true positive rate\"))\n",
- "fig[\"layout\"][\"xaxis3\"].update(dict(showgrid=True, tickfont=dict(size=10),\n",
- "                                    tickangle=90))\n",
- "py.iplot(fig)\n",
- "\n",
- "visualizer = DiscriminationThreshold(model)\n",
- "visualizer.fit(training_x, training_y)\n",
- "# poof() emits a DeprecationWarning asking for show() instead\n",
- "visualizer.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## 2. Churn Modelling (with Feast)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In the rest of the tutorial we will\n",
- "1. Use our already processed feature data to define feature schemas (through a feature set)\n",
- "1. Load/ingest the feature data into Feast\n",
- "1. Create a training dataset based on an entity dataframe and train an XGBoost model\n",
- "1. Make online predictions for our model using Feast for online serving"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 2.1 Configure Feast"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Feast Core acts as the central feature registry\n",
- "FEAST_CORE_URL = os.getenv('FEAST_CORE_URL', 'localhost:6565')\n",
- "\n",
- "# Feast Online Serving allows for the retrieval of real-time feature data\n",
- "FEAST_ONLINE_SERVING_URL = os.getenv('FEAST_ONLINE_SERVING_URL', 'localhost:6566')\n",
- "\n",
- "# Feast Batch Serving allows for the retrieval of historical feature data\n",
- "FEAST_HISTORICAL_SERVING_URL = os.getenv('FEAST_HISTORICAL_SERVING_URL', 'localhost:6567')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create the Feast client we will use to register our feature set"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [],
- "source": [
- "client = Client(core_url=FEAST_CORE_URL)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 2.2 Create a Feature Set"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "First we define a `customer_churn` feature set. A feature set is a storage-level grouping of features. It is used when loading and ingesting data into Feast, and to describe the properties of features.\n",
- "\n",
- "During retrieval, features can be accessed directly from any feature set."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "customer_churn_fs = FeatureSet('customer_churn')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "All data loaded into Feast must have a time dimension. This allows us to ensure point-in-time correctness."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Stamp every row with the same timestamp. Assigning the scalar broadcasts it to\n",
- "# all rows; wrapping it in a default-indexed pd.Series would align on the index\n",
- "# and leave NaT for any row whose label is not in 0..len(telcom)-1.\n",
- "telcom['datetime'] = dt.datetime.now()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Next we will infer the schema of the data from the Pandas DataFrame of features"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Entity customer_id(ValueType.STRING) manually updated (replacing an existing field).\n",
- "Feature gender (ValueType.INT64) added from dataframe.\n",
- "Feature seniorcitizen (ValueType.INT64) added from dataframe.\n",
- "Feature partner (ValueType.INT64) added from dataframe.\n",
- "Feature dependents (ValueType.INT64) added from dataframe.\n",
- "Feature phoneservice (ValueType.INT64) added from dataframe.\n",
- "Feature onlinesecurity (ValueType.INT64) added from dataframe.\n",
- "Feature onlinebackup (ValueType.INT64) added from dataframe.\n",
- "Feature deviceprotection (ValueType.INT64) added from dataframe.\n",
- "Feature techsupport (ValueType.INT64) added from dataframe.\n",
- "Feature streamingtv (ValueType.INT64) added from dataframe.\n",
- "Feature streamingmovies (ValueType.INT64) added from dataframe.\n",
- "Feature paperlessbilling (ValueType.INT64) added from dataframe.\n",
- "Feature churn (ValueType.INT64) added from dataframe.\n",
- "Feature multiplelines_no (ValueType.INT64) added from dataframe.\n",
- "Feature multiplelines_no_phone_service (ValueType.INT64) added from dataframe.\n",
- "Feature multiplelines_yes (ValueType.INT64) added from dataframe.\n",
- "Feature internetservice_dsl (ValueType.INT64) added from dataframe.\n",
- "Feature internetservice_fiber_optic (ValueType.INT64) added from dataframe.\n",
- "Feature internetservice_no (ValueType.INT64) added from dataframe.\n",
- "Feature contract_month_to_month (ValueType.INT64) added from dataframe.\n",
- "Feature contract_one_year (ValueType.INT64) added from dataframe.\n",
- "Feature contract_two_year (ValueType.INT64) added from dataframe.\n",
- "Feature paymentmethod_bank_transfer_automatic (ValueType.INT64) added from dataframe.\n",
- "Feature paymentmethod_credit_card_automatic (ValueType.INT64) added from dataframe.\n",
- "Feature paymentmethod_electronic_check (ValueType.INT64) added from dataframe.\n",
- "Feature paymentmethod_mailed_check (ValueType.INT64) added from dataframe.\n",
- "Feature tenure_group_tenure_0_12 (ValueType.INT64) added from dataframe.\n",
- "Feature tenure_group_tenure_12_24 (ValueType.INT64) added from dataframe.\n",
- "Feature tenure_group_tenure_24_48 (ValueType.INT64) added from dataframe.\n",
- "Feature tenure_group_tenure_48_60 (ValueType.INT64) added from dataframe.\n",
- "Feature tenure_group_tenure_gt_60 (ValueType.INT64) added from dataframe.\n",
- "Feature tenure (ValueType.DOUBLE) added from dataframe.\n",
- "Feature monthlycharges (ValueType.DOUBLE) added from dataframe.\n",
- "Feature totalcharges (ValueType.DOUBLE) added from dataframe.\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# Derive the feature fields from telcom's columns; customer_id is declared\n",
- "# explicitly as a STRING entity\n",
- "customer_churn_fs.infer_fields_from_df(\n",
- "    telcom,\n",
- "    entities=[Entity(name='customer_id', dtype=ValueType.STRING)],\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now we will register our feature set with Feast. This will create the necessary tables in our historical store as well as start any ingestion jobs that populate the stores (BigQuery, Redis) with data that is ingested."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Feature set created: \"customer_churn\"\n"
- ]
- }
- ],
- "source": [
- "client.apply(customer_churn_fs)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Next we will confirm that our feature set schema looks correct and that it is in a READY state"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{\n",
- " \"spec\": {\n",
- " \"name\": \"customer_churn\",\n",
- " \"entities\": [\n",
- " {\n",
- " \"name\": \"customer_id\",\n",
- " \"valueType\": \"STRING\"\n",
- " }\n",
- " ],\n",
- " \"features\": [\n",
- " {\n",
- " \"name\": \"tenure\",\n",
- " \"valueType\": \"DOUBLE\"\n",
- " },\n",
- " {\n",
- " \"name\": \"churn\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"monthlycharges\",\n",
- " \"valueType\": \"DOUBLE\"\n",
- " },\n",
- " {\n",
- " \"name\": \"paymentmethod_credit_card_automatic\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"paymentmethod_electronic_check\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"dependents\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"internetservice_no\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"onlinesecurity\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"streamingtv\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"multiplelines_no_phone_service\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"contract_two_year\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"totalcharges\",\n",
- " \"valueType\": \"DOUBLE\"\n",
- " },\n",
- " {\n",
- " \"name\": \"streamingmovies\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"internetservice_dsl\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"contract_one_year\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"tenure_group_tenure_24_48\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"multiplelines_no\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"techsupport\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"paperlessbilling\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"phoneservice\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"tenure_group_tenure_12_24\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"gender\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"partner\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"paymentmethod_mailed_check\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"contract_month_to_month\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"tenure_group_tenure_48_60\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"tenure_group_tenure_0_12\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"onlinebackup\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"seniorcitizen\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"multiplelines_yes\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"deviceprotection\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"paymentmethod_bank_transfer_automatic\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"internetservice_fiber_optic\",\n",
- " \"valueType\": \"INT64\"\n",
- " },\n",
- " {\n",
- " \"name\": \"tenure_group_tenure_gt_60\",\n",
- " \"valueType\": \"INT64\"\n",
- " }\n",
- " ],\n",
- " \"source\": {\n",
- " \"type\": \"KAFKA\",\n",
- " \"kafkaSourceConfig\": {\n",
- " \"bootstrapServers\": \"localhost:9094\",\n",
- " \"topic\": \"feast-features\"\n",
- " }\n",
- " },\n",
- " \"project\": \"default\"\n",
- " },\n",
- " \"meta\": {\n",
- " \"createdTimestamp\": \"2020-06-26T12:48:29Z\",\n",
- " \"status\": \"STATUS_PENDING\"\n",
- " }\n",
- "}\n"
- ]
- }
- ],
- "source": [
- "customer_churn_fs = client.get_feature_set('customer_churn')\n",
- "print(client.get_feature_set('customer_churn'))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 2.3 Load Features Into Feast"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The `ingest` method pushes data to Feast for storage. All stores (online and historical) will be updated with this feature data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Waiting for feature set to be ready for ingestion...\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|██████████| 7032/7032 [00:04<00:00, 1735.70rows/s]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Ingestion complete!\n",
- "\n",
- "Ingestion statistics:\n",
- "Success: 7032/7032\n",
- "Removing temporary file(s)...\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "'ca7c1deb-4cdd-3989-8173-3a5c1f37adfb'"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "client.ingest(customer_churn_fs, telcom)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 2.4 (Optional) Create a Model Class"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The following is a wrapper class that we have created for this tutorial. It neatly wraps a Feast Client as well as model training and serving into a single object. \n",
- "\n",
- "This ChurnModel object can be used to\n",
- "* Train a model\n",
- "* Serve the model in production\n",
- "\n",
- "Do note that the use of this class is not a requirement. The Feast retrieval methods below can be used in a standalone manner as well."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [],
- "source": [
- "import joblib\n",
- "import os\n",
- "\n",
- "from feast import Client\n",
- "from feast.serving.ServingService_pb2 import GetOnlineFeaturesRequest\n",
- "from feast.types.Value_pb2 import Value\n",
- "from sklearn.model_selection import train_test_split\n",
- "from xgboost import XGBClassifier\n",
- "from pandas import DataFrame\n",
- "\n",
- "\n",
- "class ChurnModel:\n",
- " def __init__(self, features, target, model_path=None):\n",
- " # Set up Feast clients to retrieve training and online serving data\n",
- " self._feast_online_client = Client(serving_url=os.environ['FEAST_ONLINE_SERVING_URL'])\n",
- " self._feast_batch_client = Client(serving_url=os.environ['FEAST_HISTORICAL_SERVING_URL'],\n",
- " core_url=os.environ['FEAST_CORE_URL'])\n",
- " \n",
- " # Path to either save models after training or load models for serving\n",
- " self._model = None\n",
- " self._model_path = model_path\n",
- " self._features = features\n",
- " self._target = target\n",
- "\n",
- " def train(self, entity_df):\n",
- " # Initialize a new XGBoost model\n",
- " self._model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
- " colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n",
- " max_depth=7, min_child_weight=1, missing=None, n_estimators=50,\n",
- " n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,\n",
- " reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
- " silent=True, subsample=1)\n",
- "\n",
- " # Get training dataset from Feast\n",
- " train_x, test_x, train_y, test_y = self._get_training_data(entity_df)\n",
- "\n",
- " # Train model\n",
- " self._model.fit(train_x, train_y, early_stopping_rounds=40, eval_set=[(test_x, test_y)])\n",
- "\n",
- " def _get_training_data(self, entity_df):\n",
- " # Add the target variable to our feature list\n",
- " features = self._features + [self._target]\n",
- "\n",
- " # Retrieve training dataset from Feast\n",
- " dataset = self._feast_batch_client.get_historical_features(\n",
- " feature_refs=features,\n",
- " entity_rows=entity_df).to_dataframe()\n",
- "\n",
- " # Split into a train and test set\n",
- " return train_test_split(dataset[self._features], dataset[self._target],\n",
- " test_size=0.25, random_state=111)\n",
- "\n",
- " def predict(self, customer_ids):\n",
- " # Load external model if no model exists\n",
- " if not self._model:\n",
- " self._model = joblib.load(self._model_path)\n",
- "\n",
- " # Get online features from Feast for the list of customers\n",
- " prediction = self._model.predict(data=self._get_online_features(customer_ids))\n",
- "\n",
- " # Return churn predictions\n",
- " return prediction\n",
- "\n",
- " def _get_online_features(self, customer_ids):\n",
- " # Build request out of entities (customer ids)\n",
- " entity_rows = []\n",
- " for customer_id in customer_ids:\n",
- " entity_rows.append(\n",
- " {'customer_id': customer_id}\n",
- " )\n",
- "\n",
- " # Retrieve online features from Feast for given entities\n",
- " data = self._feast_online_client.get_online_features(feature_refs=self._features,\n",
- " entity_rows=entity_rows)\n",
- " # Convert to Pandas dataframe\n",
- " features_dict = dict.fromkeys(self._features)\n",
- " for row in data.field_values:\n",
- " for feature in features_dict.keys():\n",
- " if features_dict[feature] is None:\n",
- " features_dict[feature] = []\n",
- " features_dict[feature].append(row.fields[feature].int64_val)\n",
- " return DataFrame.from_dict(features_dict)\n",
- "\n",
- " def save_model(self, model_path=None):\n",
- " # Export trained model to local path\n",
- " if not model_path:\n",
- " model_path = self._model_path\n",
- " joblib.dump(self._model, model_path)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 2.5 Define Features and Target"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Next we will create our list of features and target variable"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [],
- "source": [
- "features = [\n",
- " 'gender',\n",
- " 'seniorcitizen',\n",
- " 'partner',\n",
- " 'dependents',\n",
- " 'phoneservice',\n",
- " 'onlinesecurity',\n",
- " 'onlinebackup',\n",
- " 'deviceprotection',\n",
- " 'techsupport',\n",
- " 'streamingtv',\n",
- " 'streamingmovies',\n",
- " 'paperlessbilling',\n",
- " 'multiplelines_no',\n",
- " 'multiplelines_no_phone_service',\n",
- " 'multiplelines_yes',\n",
- " 'internetservice_dsl',\n",
- " 'internetservice_fiber_optic',\n",
- " 'internetservice_no',\n",
- " 'contract_month_to_month',\n",
- " 'contract_one_year',\n",
- " 'contract_two_year',\n",
- " 'paymentmethod_bank_transfer_automatic',\n",
- " 'paymentmethod_credit_card_automatic',\n",
- " 'paymentmethod_electronic_check',\n",
- " 'paymentmethod_mailed_check',\n",
- " 'tenure_group_tenure_0_12',\n",
- " 'tenure_group_tenure_12_24',\n",
- " 'tenure_group_tenure_24_48',\n",
- " 'tenure_group_tenure_48_60',\n",
- " 'tenure_group_tenure_gt_60',\n",
- " 'tenure',\n",
- " 'monthlycharges',\n",
- " 'totalcharges']\n",
- "\n",
- "target = 'churn'"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We create an instance of our ChurnModel class. This will also initialize the Feast clients"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [],
- "source": [
- "cm = ChurnModel(features=features, target=target)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 2.6 Train Model"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The entity DataFrame is used to retrieve features. Each row pairs a `customer_id` with the timestamp at which we want to make a prediction. This DataFrame is uploaded to Feast, which joins features onto it in a point-in-time correct way and returns the result to the user for training their model."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [],
- "source": [
- "entity_df = telcom[['customer_id']].copy()\n",
- "entity_df['datetime'] = pd.Series([dt.datetime.now()] * len(entity_df))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Next we train our model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[04:13:07] WARNING: /workspace/src/learner.cc:480: \n",
- "Parameters: { silent } might not be used.\n",
- "\n",
- " This may not be accurate due to some parameters are only used in language bindings but\n",
- " passed down to XGBoost core. Or some parameters are not used but slip through this\n",
- " verification. Please open an issue if you find above cases.\n",
- "\n",
- "\n",
- "[0]\tvalidation_0-error:0.21388\n",
- "Will train until validation_0-error hasn't improved in 40 rounds.\n",
- "[1]\tvalidation_0-error:0.21786\n",
- "[2]\tvalidation_0-error:0.21388\n",
- "[3]\tvalidation_0-error:0.20876\n",
- "[4]\tvalidation_0-error:0.21331\n",
- "[5]\tvalidation_0-error:0.20762\n",
- "[6]\tvalidation_0-error:0.20762\n",
- "[7]\tvalidation_0-error:0.21217\n",
- "[8]\tvalidation_0-error:0.21445\n",
- "[9]\tvalidation_0-error:0.21047\n",
- "[10]\tvalidation_0-error:0.21160\n",
- "[11]\tvalidation_0-error:0.21502\n",
- "[12]\tvalidation_0-error:0.21616\n",
- "[13]\tvalidation_0-error:0.21274\n",
- "[14]\tvalidation_0-error:0.21388\n",
- "[15]\tvalidation_0-error:0.21331\n",
- "[16]\tvalidation_0-error:0.21502\n",
- "[17]\tvalidation_0-error:0.21672\n",
- "[18]\tvalidation_0-error:0.21843\n",
- "[19]\tvalidation_0-error:0.21786\n",
- "[20]\tvalidation_0-error:0.21900\n",
- "[21]\tvalidation_0-error:0.21729\n",
- "[22]\tvalidation_0-error:0.21502\n",
- "[23]\tvalidation_0-error:0.21729\n",
- "[24]\tvalidation_0-error:0.21559\n",
- "[25]\tvalidation_0-error:0.21502\n",
- "[26]\tvalidation_0-error:0.21616\n",
- "[27]\tvalidation_0-error:0.21672\n",
- "[28]\tvalidation_0-error:0.21331\n",
- "[29]\tvalidation_0-error:0.21445\n",
- "[30]\tvalidation_0-error:0.21388\n",
- "[31]\tvalidation_0-error:0.21331\n",
- "[32]\tvalidation_0-error:0.21331\n",
- "[33]\tvalidation_0-error:0.21445\n",
- "[34]\tvalidation_0-error:0.21502\n",
- "[35]\tvalidation_0-error:0.21445\n",
- "[36]\tvalidation_0-error:0.21616\n",
- "[37]\tvalidation_0-error:0.21616\n",
- "[38]\tvalidation_0-error:0.21616\n",
- "[39]\tvalidation_0-error:0.21559\n",
- "[40]\tvalidation_0-error:0.21616\n",
- "[41]\tvalidation_0-error:0.21616\n",
- "[42]\tvalidation_0-error:0.21502\n",
- "[43]\tvalidation_0-error:0.21616\n",
- "[44]\tvalidation_0-error:0.21616\n",
- "[45]\tvalidation_0-error:0.21616\n",
- "Stopping. Best iteration:\n",
- "[5]\tvalidation_0-error:0.20762\n",
- "\n"
- ]
- }
- ],
- "source": [
- "cm.train(entity_df)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### 2.7 Predict"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Once the model is trained we can use it to make predictions using Feast Online Serving"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [],
- "source": [
- "churn_prediction = cm.predict(['9237-HQITU', '9305-CDSKC', '7892-POOKP', '4190-MFLUW', '6467-CHFZW', '8665-UTDHZ'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[0 0 0 1 1 1]\n"
- ]
- }
- ],
- "source": [
- "print(churn_prediction)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "local-feast",
- "language": "python",
- "name": "local-feast"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.6"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/examples/statistics/Historical Feature Statistics with Feast, TFDV and Facets.ipynb b/examples/statistics/Historical Feature Statistics with Feast, TFDV and Facets.ipynb
deleted file mode 100644
index c4d91340cf..0000000000
--- a/examples/statistics/Historical Feature Statistics with Feast, TFDV and Facets.ipynb
+++ /dev/null
@@ -1,690 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Historical Feature Statistics with Feast, TFDV and Facets\n",
- "\n",
- "This tutorial covers how Feast can be used in conjunction with TFDV and Facets to retrieve statistics about feature datasets. \n",
- "\n",
- "The notebook showcases how Feast's integration with TFDV allows users to:\n",
- "\n",
- "1. Define TFX feature schemas and persist these properties in the Feature Store\n",
- "2. Validate new data against the defined schema\n",
- "3. Validate data already in Feast against the defined schema\n",
- "\n",
- "**Prerequisites**:\n",
- "\n",
- "- Feast running with at least one BigQuery warehouse store. This example uses a BigQuery store named `historical`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "setting project to statistics...\n"
- ]
- }
- ],
- "source": [
- "import pandas as pd\n",
- "import pytest\n",
- "import pytz\n",
- "import uuid\n",
- "import time\n",
- "from datetime import datetime, timedelta\n",
- "\n",
- "from feast.client import Client\n",
- "from feast.entity import Entity\n",
- "from feast.feature import Feature\n",
- "from feast.feature_set import FeatureSet\n",
- "from feast.type_map import ValueType\n",
- "from google.protobuf import json_format\n",
- "from google.protobuf.duration_pb2 import Duration\n",
- "from tensorflow_metadata.proto.v0 import statistics_pb2\n",
- "from tensorflow_metadata.proto.v0 import schema_pb2\n",
- "import tensorflow_data_validation as tfdv\n",
- "\n",
- "PROJECT_NAME = \"statistics\"\n",
- "IRIS_DATASET = \"http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data\"\n",
- "BIGQUERY_STORE_NAME = \"historical\"\n",
- "client = Client(core_url=\"localhost:6565\")\n",
- "print(f\"setting project to {PROJECT_NAME}...\")\n",
- "client.set_project(PROJECT_NAME)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In this example, we are using the iris dataset. More information about this dataset can be found [here](http://archive.ics.uci.edu/ml/datasets/iris)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
sepal_length
\n",
- "
sepal_width
\n",
- "
petal_length
\n",
- "
petal_width
\n",
- "
class
\n",
- "
datetime
\n",
- "
\n",
- " \n",
- " \n",
- "
\n",
- "
0
\n",
- "
5.1
\n",
- "
3.5
\n",
- "
1.4
\n",
- "
0.2
\n",
- "
Iris-setosa
\n",
- "
2020-05-25 07:31:28.230582+00:00
\n",
- "
\n",
- "
\n",
- "
1
\n",
- "
4.9
\n",
- "
3.0
\n",
- "
1.4
\n",
- "
0.2
\n",
- "
Iris-setosa
\n",
- "
2020-05-25 07:31:28.230582+00:00
\n",
- "
\n",
- "
\n",
- "
2
\n",
- "
4.7
\n",
- "
3.2
\n",
- "
1.3
\n",
- "
0.2
\n",
- "
Iris-setosa
\n",
- "
2020-05-25 07:31:28.230582+00:00
\n",
- "
\n",
- "
\n",
- "
3
\n",
- "
4.6
\n",
- "
3.1
\n",
- "
1.5
\n",
- "
0.2
\n",
- "
Iris-setosa
\n",
- "
2020-05-25 07:31:28.230582+00:00
\n",
- "
\n",
- "
\n",
- "
4
\n",
- "
5.0
\n",
- "
3.6
\n",
- "
1.4
\n",
- "
0.2
\n",
- "
Iris-setosa
\n",
- "
2020-05-25 07:31:28.230582+00:00
\n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " sepal_length sepal_width petal_length petal_width class \\\n",
- "0 5.1 3.5 1.4 0.2 Iris-setosa \n",
- "1 4.9 3.0 1.4 0.2 Iris-setosa \n",
- "2 4.7 3.2 1.3 0.2 Iris-setosa \n",
- "3 4.6 3.1 1.5 0.2 Iris-setosa \n",
- "4 5.0 3.6 1.4 0.2 Iris-setosa \n",
- "\n",
- " datetime \n",
- "0 2020-05-25 07:31:28.230582+00:00 \n",
- "1 2020-05-25 07:31:28.230582+00:00 \n",
- "2 2020-05-25 07:31:28.230582+00:00 \n",
- "3 2020-05-25 07:31:28.230582+00:00 \n",
- "4 2020-05-25 07:31:28.230582+00:00 "
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "iris_feature_names = [\"sepal_length\",\"sepal_width\",\"petal_length\",\"petal_width\"]\n",
- "df = pd.read_csv(IRIS_DATASET, names=iris_feature_names + [\"class\"])\n",
- "\n",
- "# Add datetime to satisfy Feast\n",
- "current_datetime = datetime.utcnow().replace(tzinfo=pytz.utc)\n",
- "df['datetime'] = current_datetime - timedelta(days=1)\n",
- "\n",
- "df.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## TFDV schema as part of the feature set definition\n",
- "\n",
- "An integral part of TFDV is the feature [schemas](https://github.com/tensorflow/metadata/blob/master/tensorflow_metadata/proto/v0/schema.proto) that describe the expected properties of the data in a dataset, such as:\n",
- "- expected feature presence\n",
- "- type\n",
- "- expected domains of features\n",
- "\n",
- "These schemas, which can be [manually defined or generated by TFDV](https://www.tensorflow.org/tfx/data_validation/get_started#inferring_a_schema_over_the_data), can then be used to extend the definition of features within the feature set. As part of the spec, the schema is persisted within Feast and is used for both in-flight data validation and offline integration with TFDV.\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "WARNING:root:Ignoring feature datetime of type datetime64[ns, UTC]\n",
- "/Users/zhiling/.pyenv/versions/3.7.2/envs/test-feast/lib/python3.7/site-packages/tensorflow_data_validation/arrow/arrow_util.py:236: FutureWarning: Calling .data on ChunkedArray is provided for compatibility after Column was removed, simply drop this attribute\n",
- " types.FeaturePath([column_name]), column.data.chunk(0), weights):\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Entity class(ValueType.STRING) manually updated (replacing an existing field).\n",
- "Feature sepal_length (ValueType.DOUBLE) added from dataframe.\n",
- "Feature sepal_width (ValueType.DOUBLE) added from dataframe.\n",
- "Feature petal_length (ValueType.DOUBLE) added from dataframe.\n",
- "Feature petal_width (ValueType.DOUBLE) added from dataframe.\n",
- "\n",
- "{\n",
- " \"spec\": {\n",
- " \"name\": \"iris\",\n",
- " \"entities\": [\n",
- " {\n",
- " \"name\": \"class\",\n",
- " \"valueType\": \"STRING\"\n",
- " }\n",
- " ],\n",
- " \"features\": [\n",
- " {\n",
- " \"name\": \"sepal_length\",\n",
- " \"valueType\": \"DOUBLE\",\n",
- " \"presence\": {\n",
- " \"minFraction\": 1.0,\n",
- " \"minCount\": \"1\"\n",
- " },\n",
- " \"shape\": {\n",
- " \"dim\": [\n",
- " {\n",
- " \"size\": \"1\"\n",
- " }\n",
- " ]\n",
- " }\n",
- " },\n",
- " {\n",
- " \"name\": \"sepal_width\",\n",
- " \"valueType\": \"DOUBLE\",\n",
- " \"presence\": {\n",
- " \"minFraction\": 1.0,\n",
- " \"minCount\": \"1\"\n",
- " },\n",
- " \"shape\": {\n",
- " \"dim\": [\n",
- " {\n",
- " \"size\": \"1\"\n",
- " }\n",
- " ]\n",
- " }\n",
- " },\n",
- " {\n",
- " \"name\": \"petal_length\",\n",
- " \"valueType\": \"DOUBLE\",\n",
- " \"presence\": {\n",
- " \"minFraction\": 1.0,\n",
- " \"minCount\": \"1\"\n",
- " },\n",
- " \"shape\": {\n",
- " \"dim\": [\n",
- " {\n",
- " \"size\": \"1\"\n",
- " }\n",
- " ]\n",
- " }\n",
- " },\n",
- " {\n",
- " \"name\": \"petal_width\",\n",
- " \"valueType\": \"DOUBLE\",\n",
- " \"presence\": {\n",
- " \"minFraction\": 1.0,\n",
- " \"minCount\": \"1\"\n",
- " },\n",
- " \"shape\": {\n",
- " \"dim\": [\n",
- " {\n",
- " \"size\": \"1\"\n",
- " }\n",
- " ]\n",
- " },\n",
- " \"floatDomain\": {\n",
- " \"min\": 0.0\n",
- " }\n",
- " }\n",
- " ]\n",
- " },\n",
- " \"meta\": {}\n",
- "}\n"
- ]
- }
- ],
- "source": [
- "# Infer a schema over the iris dataset. These values can be tweaked as necessary.\n",
- "stats = tfdv.generate_statistics_from_dataframe(df)\n",
- "schema = tfdv.infer_schema(statistics=stats)\n",
- "width_domain = schema_pb2.FloatDomain(min=0)\n",
- "tfdv.set_domain(schema, 'petal_width', width_domain)\n",
- "\n",
- "# Create a new FeatureSet or retrieve an existing FeatureSet in Feast\n",
- "feature_set = FeatureSet(name=\"iris\")\n",
- "feature_set.infer_fields_from_df(df[['datetime'] + iris_feature_names], \n",
- " entities=[Entity(name=\"class\", dtype=ValueType.STRING)])\n",
- "\n",
- "# Update the entities and features with constraints defined in the schema\n",
- "feature_set.import_tfx_schema(schema)\n",
- "print(feature_set)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Computing statistics over an ingested dataset\n",
- "\n",
- "Feast is able to compute statistics for any data that has been ingested into the system. Statistics can be computed over either discrete datasets using *ingestion_ids* or periods of time using a specified time range.\n",
- "\n",
- "These statistics are computed at a historical store (caveat: only BigQuery is supported at the moment). The feature statistics are returned in the form of TFX's `DatasetFeatureStatisticsList`, which can then be fed directly back into TFDV methods to either visualise the data statistics or validate the dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Feature set created: \"iris\"\n",
- "Waiting for feature set to be ready for ingestion...\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|██████████| 150/150 [00:01<00:00, 122.33rows/s]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Ingestion complete!\n",
- "\n",
- "Ingestion statistics:\n",
- "Success: 150/150\n",
- "Removing temporary file(s)...\n",
- "\n",
- "ingestion id: 73ed84b1-1218-3702-b4c6-673503233264\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "source": [
- "# Apply the featureset\n",
- "client.apply(feature_set)\n",
- "\n",
- "# When a dataset is ingested into Feast, a unique ingestion id referencing the ingested dataset is returned. \n",
- "ingestion_id = client.ingest(feature_set, df)\n",
- "print(\"\\ningestion id: \" + ingestion_id)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Get statistics from Feast for the ingested dataset.\n",
- "# The statistics are calculated over the data in the store specified.\n",
- "stats = client.get_statistics(\n",
- " feature_set_id='iris', \n",
- " store=BIGQUERY_STORE_NAME, \n",
- " features=iris_feature_names, \n",
- " ingestion_ids=[ingestion_id])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Visualising statistics with Facets\n",
- "\n",
- "Since Feast outputs statistics in a format compatible with the TFDV API, the stats object can be directly passed to `tfdv.visualize_statistics()` to visualise, in-line, the output statistics on [Facets](https://pair-code.github.io/facets/), allowing for easy and interactive exploration of the shape and distribution of the data inside Feast."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- " "
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "tfdv.visualize_statistics(stats)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Validating correctness of subsequent datasets\n",
- "\n",
- "While it is useful to explore dataset statistics using Facets, we have already defined a schema that specifies a dataset's bounds of correctness, so we can leverage TFDV's `validate_statistics` to check whether subsequent datasets are problematic."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "It is possible to validate correctness of a new dataset prior to ingestion by retrieving the schema from the feature set, and comparing computed statistics against that schema. \n",
- "\n",
- "This can be useful if we want to avoid ingesting problematic data into Feast."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "WARNING:root:Ignoring feature datetime of type datetime64[ns, UTC]\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
Anomaly short description
\n",
- "
Anomaly long description
\n",
- "
\n",
- "
\n",
- "
Feature name
\n",
- "
\n",
- "
\n",
- "
\n",
- " \n",
- " \n",
- "
\n",
- "
'petal_width'
\n",
- "
Out-of-range values
\n",
- "
Unexpectedly low values: -1<-1(upto six significant digits)
\n",
- "
\n",
- "
\n",
- "
'class'
\n",
- "
New column
\n",
- "
New column (column in data but not in schema)
\n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Anomaly short description \\\n",
- "Feature name \n",
- "'petal_width' Out-of-range values \n",
- "'class' New column \n",
- "\n",
- " Anomaly long description \n",
- "Feature name \n",
- "'petal_width' Unexpectedly low values: -1<-1(upto six significant digits) \n",
- "'class' New column (column in data but not in schema) "
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Ingest a new dataset with obviously incorrect data\n",
- "df_2 = pd.DataFrame(\n",
- " {\n",
- " \"datetime\": current_datetime,\n",
- " \"class\": [\"Iris-setosa\", \"Iris-virginica\", \"Iris-nonsensica\"],\n",
- " \"sepal_length\": [4.3, 6.9, 12],\n",
- " \"sepal_width\": [3.0, 2.8, 1.1],\n",
- " \"petal_length\": [1.2, 4.9, 2.2],\n",
- " \"petal_width\": [0.1, 1.8, -1.0]\n",
- " }\n",
- ")\n",
- "\n",
- "# Validate correctness\n",
- "stats_2 = tfdv.generate_statistics_from_dataframe(df_2)\n",
- "anomalies = tfdv.validate_statistics(statistics=stats_2, schema=feature_set.export_tfx_schema())\n",
- "tfdv.display_anomalies(anomalies)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Alternatively, the data can be ingested into Feast, and the statistics computed at the store. This has the benefit of offloading statistics computation for large datasets to Feast."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "scrolled": false
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\r",
- " 0%| | 0/3 [00:00, ?rows/s]"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Waiting for feature set to be ready for ingestion...\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "100%|██████████| 3/3 [00:01<00:00, 2.85rows/s]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Ingestion complete!\n",
- "\n",
- "Ingestion statistics:\n",
- "Success: 3/3\n",
- "Removing temporary file(s)...\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
Anomaly short description
\n",
- "
Anomaly long description
\n",
- "
\n",
- "
\n",
- "
Feature name
\n",
- "
\n",
- "
\n",
- "
\n",
- " \n",
- " \n",
- "
\n",
- "
'petal_width'
\n",
- "
Out-of-range values
\n",
- "
Unexpectedly low values: -1<-1(upto six significant digits)
\n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Anomaly short description \\\n",
- "Feature name \n",
- "'petal_width' Out-of-range values \n",
- "\n",
- " Anomaly long description \n",
- "Feature name \n",
- "'petal_width' Unexpectedly low values: -1<-1(upto six significant digits) "
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Ingest the data into Feast\n",
- "ingestion_id_2 = client.ingest(feature_set, df_2)\n",
- "time.sleep(10) # Sleep is not necessary if not using DirectRunner\n",
- "\n",
- "# Compute statistics over the new dataset\n",
- "stats_2 = client.get_statistics(\n",
- " feature_set_id='iris', \n",
- " store=BIGQUERY_STORE_NAME, \n",
- " features=iris_feature_names, \n",
- " ingestion_ids=[ingestion_id_2])\n",
- "\n",
- "# Detect anomalies in the dataset\n",
- "anomalies = tfdv.validate_statistics(statistics=stats_2, schema=feature_set.export_tfx_schema())\n",
- "tfdv.display_anomalies(anomalies)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.2"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
\ No newline at end of file