From aa6b24755a4328bfe605b927e0c9f2f072498449 Mon Sep 17 00:00:00 2001 From: Scott Graham Date: Wed, 20 Mar 2019 11:48:26 -0400 Subject: [PATCH 01/30] adding lightgbm on spark notebook --- .../02_model/mmlspark_lightgbm_criteo.ipynb | 441 ++++++++++++++++++ 1 file changed, 441 insertions(+) create mode 100644 notebooks/02_model/mmlspark_lightgbm_criteo.ipynb diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb new file mode 100644 index 0000000000..32b35f1080 --- /dev/null +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -0,0 +1,441 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# LightGBM: A Highly Efficient Gradient Boosting Decision Tree\n", + "This notebook will give you a quick example of how to train LightGBM model on Spark and deploy it using MML Spark for a content personalization scenario.
\n", + "LightGBM \\[1\\] is a gradient boosting framework that uses tree-based learning algorithms.
\n", + "MML Spark \\[2\\] allows LightGBM to be called in a Spark environment which provides several advantages:\n", + "- Distributed computation for model development\n", + "- Easy integration into existing Spark workflows\n", + "- Model serving through Spark Serving \\[3\\]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Global Settings and Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n", + "[GCC 7.3.0]\n", + "PySpark version: 2.3.1\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "from tempfile import TemporaryDirectory\n", + "sys.path.append(\"../../\")\n", + "\n", + "import pyspark\n", + "from pyspark.ml.feature import FeatureHasher\n", + "from pyspark.sql.functions import col, udf\n", + "from pyspark.sql.types import FloatType\n", + "import requests\n", + "\n", + "from reco_utils.common.spark_utils import start_or_get_spark\n", + "from reco_utils.common.notebook_utils import is_databricks\n", + "from reco_utils.dataset.criteo_dac import load_spark_df\n", + "from reco_utils.dataset.spark_splitters import spark_random_split\n", + "\n", + "print(\"System version: {}\".format(sys.version))\n", + "print(\"PySpark version: {}\".format(pyspark.version.__version__))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Setup MML Spark\n", + "if not is_databricks():\n", + " spark = start_or_get_spark(packages=['Azure:mmlspark:0.16'])\n", + "\n", + "from mmlspark import ComputeModelStatistics\n", + "from mmlspark import DiscreteHyperParam\n", + "from mmlspark import HyperparamBuilder\n", + "from mmlspark import LightGBMClassifier\n", + "from mmlspark import RandomSpace\n", + "from mmlspark import RangeHyperParam\n", + "from mmlspark import TuneHyperparameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preparation\n", + "The Criteo Display Advertising Challenge (DAC) dataset [3] is a well-known industry benchmarking dataset for developing CTR prediction models, and is used frequently by research papers. The original dataset is too large for a lightweight demo, so we use a smaller sample for a demo dataset.

\n", + "The sample data consist of 100,000 rows with 1 label column and 39 feature columns, where 13 columns are integer values (int00-int12) and 26 columns are categorical features (cat00-cat25).

\n", + "What the columns represent is not provided, but for this case we can consider the integer and categorical values as features representing the user and / or item content. The label is binary and indicates a user interaction with an item, so this is a useful dataset to demonstrate how to build a model that will predict likelihood of a user interacting with an item based on the user and item content features.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "raw_data = load_spark_df(size='sample', spark=spark)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Feature Processing\n", + "The feature data provided has many missing values across both integer and categorical feature fields. In addition the categorical features have many distinct values, so effectively cleaning and representing the feature data is an important step prior to training a model.
\n", + "One of the simplest ways of managing both features that have missing values as well as high cardinality is to use the hashing trick. The FeatureHasher transformer will pass integer values through and will hash categorical features into a sparse vector of lower dimensionality which can be used effectively by LightGBM.
\n", + "Lastly the dataset is split randomly for training and testing the model." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "columns = [c for c in raw_data.columns if c != 'label']\n", + "feature_processor = FeatureHasher(inputCols=columns, outputCol='features')\n", + "data = feature_processor.transform(raw_data)\n", + "train, test = spark_random_split(data, ratio=0.75, seed=42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Training\n", + "In MML Spark the LightGBM implementation for binary classification is invoked using the LightGBMClassifier class and specifying the objective as 'binary'. In this instance the occurrence of positive labels is quite low, so setting the isUnbalance flag to true helps account for this imbalance.
\n", + "\n", + "### Hyper-parameters\n", + "Key hyper-parameters \\[5\\] for LightGBM classifier on Spark are the number of leaves (numLeaves) in each tree, the number of iterations (numIterations) for training, the learning rate (learningRate) and the fraction of features used during training a tree (featureFraction). Lastly, early stopping round (earlyStoppingRound) can be useful to stop learning at the point where overfitting can begin to occur." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "NUM_LEAVES = 64\n", + "NUM_ITERATIONS = 100\n", + "LEARNING_RATE = 0.15\n", + "FEATURE_FRACTION = 0.8\n", + "EARLY_STOPPING_ROUND = 20" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "lgbm = LightGBMClassifier(\n", + " labelCol='label',\n", + " featuresCol='features',\n", + " objective='binary',\n", + " isUnbalance=True,\n", + " boostingType='gbdt',\n", + " boostFromAverage=True,\n", + " numLeaves=NUM_LEAVES,\n", + " numIterations=NUM_ITERATIONS,\n", + " learningRate=LEARNING_RATE,\n", + " featureFraction=FEATURE_FRACTION,\n", + " earlyStoppingRound=EARLY_STOPPING_ROUND,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model Training and Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------+------------------+\n", + "|evaluation_type| AUC|\n", + "+---------------+------------------+\n", + "| Classification|0.6716842093722328|\n", + "+---------------+------------------+\n", + "\n" + ] + } + ], + "source": [ + "model = lgbm.fit(train)\n", + "\n", + "evaluator = (\n", + " ComputeModelStatistics()\n", + " .setScoredLabelsCol(\"prediction\")\n", + " .setLabelCol(\"label\")\n", + " .setEvaluationMetric(\"AUC\")\n", + ")\n", + "\n", + "predictions = model.transform(test)\n", + "evaluator.transform(predictions).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model Tuning\n", + "\n", + "MML Spark supports hyper-parameter tuning from a specified space of parameters which can be randomly sampled (or sampled from a grid of options) from continuous or discrete ranges of values. TuneHyperparameters can apply n-fold cross-validation with the given evaluation metric to more robustly identify the best set of parameters to use for the given model. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "params = (\n", + " HyperparamBuilder()\n", + " .addHyperparam(lgbm, lgbm.learningRate, RangeHyperParam(0.001, 1.0))\n", + " .addHyperparam(lgbm, lgbm.numIterations, RangeHyperParam(10, 100))\n", + " .addHyperparam(lgbm, lgbm.numLeaves, DiscreteHyperParam([32, 64, 128]))\n", + ").build()\n", + "paramSpace = RandomSpace(params).space()\n", + "\n", + "tuner = TuneHyperparameters(\n", + " evaluationMetric=\"AUC\", \n", + " models=[lgbm], \n", + " numFolds=5,\n", + " numRuns=10, \n", + " parallelism=1,\n", + " paramSpace=paramSpace, \n", + " seed=42\n", + ")\n", + "\n", + "bestModel = tuner.fit(train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(bestModel.getBestModelInfo())\n", + "print(bestModel.getBestModel())\n", + "\n", + "predictions = bestModel.transform(test)\n", + "evaluator.transform(predictions).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Saving and Loading\n", + "The model can be saved and reloaded for use in another workflow." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "with TemporaryDirectory() as tmp:\n", + " save_file = os.path.join(tmp, r'finished.model')\n", + " model.save(save_file)\n", + " loaded_model = model.load(save_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------+------------------+\n", + "|evaluation_type| AUC|\n", + "+---------------+------------------+\n", + "| Classification|0.6716842093722328|\n", + "+---------------+------------------+\n", + "\n" + ] + } + ], + "source": [ + "# Re-evaluate the performance again\n", + "predictions = loaded_model.transform(test)\n", + "evaluator.transform(predictions).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model Deployment\n", + "MML Spark provides an easy way to quickly spin up a server to deploy trained models built on top of Spark Streaming DataFrames. In this example the server reads a request, parses it to the same input as the original raw data and applies feature processing then computes the probability of engagement given the user and item features provided. This probability is written back as a response to the original request.

\n", + "Content-based personalization can be accomplished by leveraging this engagement prediction service as the key machine-learning component inside a larger system. To personalize content for a user, a set of items is selected for evaluation and item-content features are extracted for each. These item features can be combined with the user features for each user-item combination and sent to the engagement prediction service which evaluates the probability that a user will engage with each item. The probability can be used to rank the items and select the top-k desired results." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Define spark serving input\n", + "input_df = (\n", + " spark.readStream.server()\n", + " .address(\"localhost\", 8089, \"predict\")\n", + " .load()\n", + " .parseRequest(raw_data.schema)\n", + ")\n", + "\n", + "# Process features and make predictions\n", + "get_pos_prob = udf(lambda x: float(x[1]))\n", + "\n", + "processed_df = feature_processor.transform(input_df)\n", + "output_df = (\n", + " loaded_model.transform(processed_df)\n", + " .withColumn('p_eng', get_pos_prob(col('probability')).cast(FloatType()))\n", + " .makeReply(\"p_eng\")\n", + ")\n", + "\n", + "# Define spark serving output and start server\n", + "checkpoint = TemporaryDirectory()\n", + "server = (\n", + " output_df.writeStream.server()\n", + " .replyTo(\"predict\")\n", + " .queryName(\"prediction\")\n", + " .option(\"checkpointLocation\", \"file://{}\".format(checkpoint.name))\n", + " .start()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'message': 'Waiting for data to arrive',\n", + " 'isDataAvailable': False,\n", + " 'isTriggerActive': False}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "server.status" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Response {\"p_eng\":0.16379395}\n" + ] + } + ], + "source": [ + "query = raw_data.limit(1).collect()[0].asDict()\n", + "r = requests.post(data=query, url=\"http://localhost:8089/predict\")\n", + "print(\"Response {}\".format(r.text))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Cleanup\n", + "server.stop()\n", + "checkpoint.cleanup()\n", + "server.status" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (reco_pyspark)", + "language": "python", + "name": "reco_pyspark" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From bbda4cc92811b179d7e1f2b337aa33dc600ab7d6 Mon Sep 17 00:00:00 2001 From: Scott Graham Date: Thu, 21 Mar 2019 08:09:44 -0400 Subject: [PATCH 02/30] updates to notebook and spark session creator --- .../02_model/mmlspark_lightgbm_criteo.ipynb | 185 ++++++++++++------ reco_utils/common/spark_utils.py | 17 +- 2 files changed, 137 insertions(+), 65 
deletions(-) diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index 32b35f1080..dfca0ab0f8 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -13,8 +13,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# LightGBM: A Highly Efficient Gradient Boosting Decision Tree\n", - "This notebook will give you a quick example of how to train LightGBM model on Spark and deploy it using MML Spark for a content personalization scenario.
\n", + "# Content Based Personalization\n", + "## LightGBM on Azure Databricks
\n", + "This notebook provides a quick example of how to train LightGBM model on Azure Databricks and deploy it using MML Spark for a content personalization scenario.

\n", "LightGBM \\[1\\] is a gradient boosting framework that uses tree-based learning algorithms.
\n", "MML Spark \\[2\\] allows LightGBM to be called in a Spark environment which provides several advantages:\n", "- Distributed computation for model development\n", @@ -51,6 +52,7 @@ "sys.path.append(\"../../\")\n", "\n", "import pyspark\n", + "from pyspark.ml import PipelineModel\n", "from pyspark.ml.feature import FeatureHasher\n", "from pyspark.sql.functions import col, udf\n", "from pyspark.sql.types import FloatType\n", @@ -58,7 +60,7 @@ "\n", "from reco_utils.common.spark_utils import start_or_get_spark\n", "from reco_utils.common.notebook_utils import is_databricks\n", - "from reco_utils.dataset.criteo_dac import load_spark_df\n", + "from reco_utils.dataset.criteo import load_spark_df\n", "from reco_utils.dataset.spark_splitters import spark_random_split\n", "\n", "print(\"System version: {}\".format(sys.version))\n", @@ -89,18 +91,34 @@ "metadata": {}, "source": [ "## Data Preparation\n", - "The Criteo Display Advertising Challenge (DAC) dataset [3] is a well-known industry benchmarking dataset for developing CTR prediction models, and is used frequently by research papers. The original dataset is too large for a lightweight demo, so we use a smaller sample for a demo dataset.

\n", - "The sample data consist of 100,000 rows with 1 label column and 39 feature columns, where 13 columns are integer values (int00-int12) and 26 columns are categorical features (cat00-cat25).

\n", - "What the columns represent is not provided, but for this case we can consider the integer and categorical values as features representing the user and / or item content. The label is binary and indicates a user interaction with an item, so this is a useful dataset to demonstrate how to build a model that will predict likelihood of a user interacting with an item based on the user and item content features.\n" + "The Criteo Display Advertising Challenge (DAC) dataset [3] is a well-known industry benchmarking dataset for developing CTR prediction models, and is used frequently by research papers. The original dataset contains over 45M rows and provides a good challenge for scalable machine learning. The dataset has a binary label column and 38 feature columns, where 13 columns are integer values (int00-int12) and 25 columns are categorical features (cat00-cat24). There is also a sample data consist of 100,000 rows which can be used by setting DATA_SIZE = 'sample'.

\n", + "What the columns represent is not provided, but for this case we can consider the integer and categorical values as features representing the user and / or item content. The label is binary and indicates a user interaction with an item, so this is a useful dataset to demonstrate how to build a model that will predict likelihood of a user interacting with an item based on the user context and item content features.\n" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "raw_data = load_spark_df(size='sample', spark=spark)" + "DATA_SIZE = 'sample'" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "8.79MB [00:08, 1.04MB/s] \n" + ] + } + ], + "source": [ + "raw_data = load_spark_df(size=DATA_SIZE, spark=spark, local_cache_path='../..')" ] }, { @@ -108,21 +126,42 @@ "metadata": {}, "source": [ "### Feature Processing\n", - "The feature data provided has many missing values across both integer and categorical feature fields. In addition the categorical features have many distinct values, so effectively cleaning and representing the feature data is an important step prior to training a model.
\n", - "One of the simplest ways of managing both features that have missing values as well as high cardinality is to use the hashing trick. The FeatureHasher transformer will pass integer values through and will hash categorical features into a sparse vector of lower dimensionality which can be used effectively by LightGBM.
\n", + "The feature data provided has many missing values across both integer and categorical feature fields. In addition the categorical features have many distinct values, so effectively cleaning and representing the feature data is an important step prior to training a model.

\n", + "One of the simplest ways of managing both features that have missing values as well as high cardinality is to use the hashing trick. The FeatureHasher transformer will pass integer values through and will hash categorical features into a sparse vector of lower dimensionality which can be used effectively by LightGBM.

\n", "Lastly the dataset is split randomly for training and testing the model." ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "columns = [c for c in raw_data.columns if c != 'label']\n", "feature_processor = FeatureHasher(inputCols=columns, outputCol='features')\n", "data = feature_processor.transform(raw_data)\n", - "train, test = spark_random_split(data, ratio=0.75, seed=42)" + "train, test = spark_random_split(data, ratio=0.8, seed=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# save data to tables to finalize feature transformation\n", + "train.write.mode('overwrite').saveAsTable('train')\n", + "test.write.mode('overwrite').saveAsTable('test')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train = spark.table('train')\n", + "test = spark.table('test')" ] }, { @@ -130,15 +169,20 @@ "metadata": {}, "source": [ "## Model Training\n", - "In MML Spark the LightGBM implementation for binary classification is invoked using the LightGBMClassifier class and specifying the objective as 'binary'. In this instance the occurrence of positive labels is quite low, so setting the isUnbalance flag to true helps account for this imbalance.
\n", + "In MML Spark the LightGBM implementation for binary classification is invoked using the LightGBMClassifier class and specifying the objective as 'binary'. In this instance the occurrence of positive labels is quite low, so setting the isUnbalance flag to true helps account for this imbalance.

\n", "\n", "### Hyper-parameters\n", - "Key hyper-parameters \\[5\\] for LightGBM classifier on Spark are the number of leaves (numLeaves) in each tree, the number of iterations (numIterations) for training, the learning rate (learningRate) and the fraction of features used during training a tree (featureFraction). Lastly, early stopping round (earlyStoppingRound) can be useful to stop learning at the point where overfitting can begin to occur." + "Below are some of the key hyper-parameters \\[5\\] for training a LightGBM classifier on Spark\n", + "- numLeaves: the number of leaves in each tree\n", + "- numIterations: the number of iterations to apply boosting\n", + "- learningRate: the learning rate for training across trees\n", + "- featureFraction: the fraction of features used for training a tree\n", + "- earlyStoppingRound: round at which early stopping can be applied to avoid overfitting" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 14, "metadata": { "tags": [ "parameters" @@ -146,8 +190,8 @@ }, "outputs": [], "source": [ - "NUM_LEAVES = 64\n", - "NUM_ITERATIONS = 100\n", + "NUM_LEAVES = 32\n", + "NUM_ITERATIONS = 10\n", "LEARNING_RATE = 0.15\n", "FEATURE_FRACTION = 0.8\n", "EARLY_STOPPING_ROUND = 20" @@ -155,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -183,35 +227,43 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 16, "metadata": { "scrolled": false }, + "outputs": [], + "source": [ + "model = lgbm.fit(train)\n", + "predictions = model.transform(test)\n", + "predictions.cache.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+---------------+------------------+\n", - "|evaluation_type| AUC|\n", + "|evaluation_type| accuracy|\n", "+---------------+------------------+\n", - "| Classification|0.6716842093722328|\n", + "| Classification|0.7075339448071455|\n", "+---------------+------------------+\n", "\n" ] } ], "source": [ - "model = lgbm.fit(train)\n", - "\n", "evaluator = (\n", " ComputeModelStatistics()\n", " .setScoredLabelsCol(\"prediction\")\n", " .setLabelCol(\"label\")\n", - " .setEvaluationMetric(\"AUC\")\n", + " .setEvaluationMetric(\"accuracy\")\n", ")\n", "\n", - "predictions = model.transform(test)\n", "evaluator.transform(predictions).show()" ] }, @@ -224,6 +276,16 @@ "MML Spark supports hyper-parameter tuning from a specified space of parameters which can be randomly sampled (or sampled from a grid of options) from continuous or discrete ranges of values. TuneHyperparameters can apply n-fold cross-validation with the given evaluation metric to more robustly identify the best set of parameters to use for the given model. 
" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "NUM_FOLDS = 3\n", + "NUM_RUNS = 10" + ] + }, { "cell_type": "code", "execution_count": null, @@ -241,14 +303,14 @@ "tuner = TuneHyperparameters(\n", " evaluationMetric=\"AUC\", \n", " models=[lgbm], \n", - " numFolds=5,\n", - " numRuns=10, \n", + " numFolds=NUM_FOLDS,\n", + " numRuns=NUM_ROUNDS, \n", " parallelism=1,\n", " paramSpace=paramSpace, \n", " seed=42\n", ")\n", "\n", - "bestModel = tuner.fit(train)" + "best_model = tuner.fit(train)" ] }, { @@ -257,10 +319,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(bestModel.getBestModelInfo())\n", - "print(bestModel.getBestModel())\n", - "\n", - "predictions = bestModel.transform(test)\n", + "predictions = best_model.transform(test)\n", "evaluator.transform(predictions).show()" ] }, @@ -269,42 +328,43 @@ "metadata": {}, "source": [ "## Model Saving and Loading\n", - "The model can be saved and reloaded for use in another workflow." + "The full pipeline for operating on raw data including feature processing and model prediction can be saved and reloaded for use in another workflow." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "with TemporaryDirectory() as tmp:\n", " save_file = os.path.join(tmp, r'finished.model')\n", - " model.save(save_file)\n", - " loaded_model = model.load(save_file)" + " pipeline = PipelineModel(stages=[feature_processor, model])\n", + " pipeline.save(save_file)\n", + " loaded_pipeline = PipelineModel.load(save_file)" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "+---------------+------------------+\n", - "|evaluation_type| AUC|\n", - "+---------------+------------------+\n", - "| Classification|0.6716842093722328|\n", - "+---------------+------------------+\n", + "+---------------+--------+\n", + "|evaluation_type|accuracy|\n", + "+---------------+--------+\n", + "| Classification| 0.739|\n", + "+---------------+--------+\n", "\n" ] } ], "source": [ - "# Re-evaluate the performance again\n", - "predictions = loaded_model.transform(test)\n", + "# Test the loaded model on raw data\n", + "predictions = loaded_pipeline.transform(raw_data.limit(1000))\n", "evaluator.transform(predictions).show()" ] }, @@ -319,7 +379,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -331,13 +391,11 @@ " .parseRequest(raw_data.schema)\n", ")\n", "\n", - "# Process features and make predictions\n", - "get_pos_prob = udf(lambda x: float(x[1]))\n", - "\n", - "processed_df = feature_processor.transform(input_df)\n", + "# Process features and reply with the probability of engagement\n", + "get_p_eng = udf(lambda p: float(p.toArray()[1]))\n", "output_df = (\n", - " loaded_model.transform(processed_df)\n", - " .withColumn('p_eng', get_pos_prob(col('probability')).cast(FloatType()))\n", + " loaded_pipeline.transform(input_df)\n", + " .withColumn(\"p_eng\", get_p_eng(col(\"probability\")))\n", " .makeReply(\"p_eng\")\n", ")\n", "\n", @@ -354,7 +412,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 60, "metadata": {}, "outputs": [ { @@ -365,7 +423,7 @@ " 'isTriggerActive': False}" ] }, - "execution_count": 12, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } @@ -376,26 +434,26 @@ }, { "cell_type": "code", - 
"execution_count": 13, + "execution_count": 61, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Response {\"p_eng\":0.16379395}\n" + "Response 0.7075531769538851\n" ] } ], "source": [ "query = raw_data.limit(1).collect()[0].asDict()\n", - "r = requests.post(data=query, url=\"http://localhost:8089/predict\")\n", + "r = requests.post(json=query, url=\"http://localhost:8089/predict\")\n", "print(\"Response {}\".format(r.text))" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 62, "metadata": {}, "outputs": [ { @@ -404,7 +462,7 @@ "{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}" ] }, - "execution_count": 18, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } @@ -415,13 +473,20 @@ "checkpoint.cleanup()\n", "server.status" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python (reco_pyspark)", + "display_name": "Python 3", "language": "python", - "name": "reco_pyspark" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/reco_utils/common/spark_utils.py b/reco_utils/common/spark_utils.py index c3f51f12f9..2f79836c97 100644 --- a/reco_utils/common/spark_utils.py +++ b/reco_utils/common/spark_utils.py @@ -1,23 +1,30 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. +import os + + try: from pyspark.sql import SparkSession except ImportError: - pass # skip this import if we are in pure python environment + SparkSession = None # skip this import if we are in pure python environment -def start_or_get_spark(app_name="Sample", url="local[*]", memory="10G"): +def start_or_get_spark(app_name="Sample", url="local[*]", memory="10G", packages=None): """Start Spark if not started Args: app_name (str): Set name of the application - url (str): URL for spark master. - memory (str): Size of memory for spark driver. - + url (str): URL for spark master + memory (str): Size of memory for spark driver + packages (list): list of packages to install Returns: obj: Spark context. """ + + if packages is not None: + os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages {} pyspark-shell'.format(','.join(packages)) + spark = ( SparkSession.builder.appName(app_name) .master(url) From bd920ef93300f329223f7d53f9238bf2f8692b4d Mon Sep 17 00:00:00 2001 From: Andreas Argyriou Date: Mon, 25 Mar 2019 18:11:51 +0000 Subject: [PATCH 03/30] Update mmlspark_lightgbm_criteo.ipynb --- notebooks/02_model/mmlspark_lightgbm_criteo.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index dfca0ab0f8..f9d19c92cb 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -91,7 +91,7 @@ "metadata": {}, "source": [ "## Data Preparation\n", - "The Criteo Display Advertising Challenge (DAC) dataset [3] is a well-known industry benchmarking dataset for developing CTR prediction models, and is used frequently by research papers. The original dataset contains over 45M rows and provides a good challenge for scalable machine learning. The dataset has a binary label column and 38 feature columns, where 13 columns are integer values (int00-int12) and 25 columns are categorical features (cat00-cat24). 
There is also a sample data consist of 100,000 rows which can be used by setting DATA_SIZE = 'sample'.

\n", + "The Criteo Display Advertising Challenge (DAC) dataset [3] is a well-known industry benchmarking dataset for developing CTR prediction models, and is used frequently by research papers. The original dataset contains over 45M rows and provides a good challenge for scalable machine learning. The dataset has a binary label column and 38 feature columns, where 13 columns are integer values (int00-int12) and 25 columns are categorical features (cat00-cat24). There is also a sample data set consisting of 100,000 rows which can be used by setting DATA_SIZE = 'sample'.

\n", "What the columns represent is not provided, but for this case we can consider the integer and categorical values as features representing the user and / or item content. The label is binary and indicates a user interaction with an item, so this is a useful dataset to demonstrate how to build a model that will predict likelihood of a user interacting with an item based on the user context and item content features.\n" ] }, From 93579b9bda1e180fa1008a13b4697bc906b6600d Mon Sep 17 00:00:00 2001 From: Scott Graham Date: Mon, 25 Mar 2019 14:33:41 -0400 Subject: [PATCH 04/30] updates to mmlspark notebook, add references, fix small code review issues --- .gitignore | 5 +- .../02_model/mmlspark_lightgbm_criteo.ipynb | 302 ++++++------------ 2 files changed, 93 insertions(+), 214 deletions(-) diff --git a/.gitignore b/.gitignore index 416a173f25..0721e6d33d 100644 --- a/.gitignore +++ b/.gitignore @@ -112,6 +112,9 @@ aml_config aml_scripts aml_data +# Spark +spark-warehouse/ + ########################## .DS_Store .~* @@ -139,4 +142,4 @@ ml-1m/ ml-20m/ *.jar *.item -*.pkl \ No newline at end of file +*.pkl diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index dfca0ab0f8..2a74ef7045 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -30,9 +30,17 @@ "## Global Settings and Imports" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A python script is provided to simplify setting up Azure Databricks with the correct\n", + "dependencies.
Run ```python scripts/databricks_install.py -h``` for more details." + ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -69,21 +77,47 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Setup MML Spark\n", "if not is_databricks():\n", - " spark = start_or_get_spark(packages=['Azure:mmlspark:0.16'])\n", + " # get the maven coordinates for MML Spark from databricks_install script\n", + " from reco_utils.scripts.databricks_install import MMLSPARK_INFO\n", + " packages = [MMLSPARK_INFO['maven']['coordinates']]\n", + " spark = start_or_get_spark(packages=packages)\n", "\n", "from mmlspark import ComputeModelStatistics\n", - "from mmlspark import DiscreteHyperParam\n", - "from mmlspark import HyperparamBuilder\n", - "from mmlspark import LightGBMClassifier\n", - "from mmlspark import RandomSpace\n", - "from mmlspark import RangeHyperParam\n", - "from mmlspark import TuneHyperparameters" + "from mmlspark import LightGBMClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'azure.mgmt.authorization'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mWorkspace\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mModel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Azure ML SDK version: \"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mVERSION\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/reco_pyspark/lib/python3.6/site-packages/azureml/core/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0mruns\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mrun\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \"\"\"\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mworkspace\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mWorkspace\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexperiment\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mExperiment\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mfrom\u001b[0m 
\u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrunconfig\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mRunConfiguration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/reco_pyspark/lib/python3.6/site-packages/azureml/core/workspace.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0moperator\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mattrgetter\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_project\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0m_commands\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 22\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_file_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfile_utils\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnormalize_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnormalize_path_and_join\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0mcheck_and_create_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraverse_up_path_and_find_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnormalize_file_ext\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/reco_pyspark/lib/python3.6/site-packages/azureml/_project/_commands.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_base_sdk_common\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommon\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcreate_role_assignment\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_base_sdk_common\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommon\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mresource_client_factory\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgive_warning\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_base_sdk_common\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommon\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mresource_error_handling\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/envs/reco_pyspark/lib/python3.6/site-packages/azureml/_base_sdk_common/common.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0madal\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madal_error\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAdalError\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mazure\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmgmt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauthorization\u001b[0m \u001b[0;32mimport\u001b[0m 
\u001b[0mAuthorizationManagementClient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mazure\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmgmt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauthorization\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodels\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mRoleAssignmentCreateParameters\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mazure\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmgmt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresource\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresources\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mResourceManagementClient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'azure.mgmt.authorization'" + ] + } + ], + "source": [ + "from azureml.core import Workspace\n", + "from azureml.core.model import Model\n", + "\n", + "print(\"Azure ML SDK version: \", azureml.core.VERSION)" ] }, { @@ -91,8 +125,9 @@ "metadata": {}, "source": [ "## Data Preparation\n", - "The Criteo Display Advertising Challenge (DAC) dataset [3] is a well-known industry benchmarking dataset for developing CTR prediction models, and is used frequently by research papers. The original dataset contains over 45M rows and provides a good challenge for scalable machine learning. The dataset has a binary label column and 38 feature columns, where 13 columns are integer values (int00-int12) and 25 columns are categorical features (cat00-cat24). There is also a sample data consist of 100,000 rows which can be used by setting DATA_SIZE = 'sample'.

\n", - "What the columns represent is not provided, but for this case we can consider the integer and categorical values as features representing the user and / or item content. The label is binary and indicates a user interaction with an item, so this is a useful dataset to demonstrate how to build a model that will predict likelihood of a user interacting with an item based on the user context and item content features.\n" + "The Criteo Display Advertising Challenge (DAC) dataset [4] is a well-known industry benchmarking dataset for developing CTR prediction models, and is used frequently by research papers. The original dataset contains over 45M rows, but there is also a down-sampled dataset which has 100,000 rows (this can be used by setting DATA_SIZE = 'sample').

\n", + "The dataset contains 1 label column and 38 feature columns, where 13 columns are integer values (int00-int12) and 25 columns are categorical features (cat00-cat24).

\n", + "What the columns represent is not provided, but for this case we can consider the integer and categorical values as features representing the user and / or item content. The label is binary and indicates a user interaction with an item, so this is a useful dataset to demonstrate how to build a model that will predict likelihood of a user interacting with an item based on the user and item content features.\n" ] }, { @@ -106,19 +141,19 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "8.79MB [00:08, 1.04MB/s] \n" + "8.79MB [00:02, 3.62MB/s] \n" ] } ], "source": [ - "raw_data = load_spark_df(size=DATA_SIZE, spark=spark, local_cache_path='../..')" + "raw_data = load_spark_df(size=DATA_SIZE, spark=spark)" ] }, { @@ -128,24 +163,41 @@ "### Feature Processing\n", "The feature data provided has many missing values across both integer and categorical feature fields. In addition the categorical features have many distinct values, so effectively cleaning and representing the feature data is an important step prior to training a model.

\n", "One of the simplest ways of managing both features that have missing values as well as high cardinality is to use the hashing trick. The FeatureHasher transformer will pass integer values through and will hash categorical features into a sparse vector of lower dimensionality which can be used effectively by LightGBM.

\n", - "Lastly the dataset is split randomly for training and testing the model." + "First the dataset is split randomly for training and testing and feature processing is applied to each dataset." ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "raw_train, raw_test = spark_random_split(raw_data, ratio=0.8, seed=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "columns = [c for c in raw_data.columns if c != 'label']\n", - "feature_processor = FeatureHasher(inputCols=columns, outputCol='features')\n", - "data = feature_processor.transform(raw_data)\n", - "train, test = spark_random_split(data, ratio=0.8, seed=42)" + "feature_processor = FeatureHasher(inputCols=columns, outputCol='features')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "train = feature_processor.transform(raw_train)\n", + "test = feature_processor.transform(raw_test)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -156,10 +208,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ + "# load data from table\n", "train = spark.table('train')\n", "test = spark.table('test')" ] @@ -182,7 +235,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "metadata": { "tags": [ "parameters" @@ -199,7 +252,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -227,20 +280,17 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": { - "scrolled": false - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "model = lgbm.fit(train)\n", - "predictions = model.transform(test)\n", - "predictions.cache.count()" + "predictions = model.transform(test)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -248,9 +298,9 @@ "output_type": "stream", "text": [ "+---------------+------------------+\n", - "|evaluation_type| accuracy|\n", + "|evaluation_type| AUC|\n", "+---------------+------------------+\n", - "| Classification|0.7075339448071455|\n", + "| Classification|0.6310011615829604|\n", "+---------------+------------------+\n", "\n" ] @@ -261,65 +311,9 @@ " ComputeModelStatistics()\n", " .setScoredLabelsCol(\"prediction\")\n", " .setLabelCol(\"label\")\n", - " .setEvaluationMetric(\"accuracy\")\n", - ")\n", - "\n", - "evaluator.transform(predictions).show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Model Tuning\n", - "\n", - "MML Spark supports hyper-parameter tuning from a specified space of parameters which can be randomly sampled (or sampled from a grid of options) from continuous or discrete ranges of values. TuneHyperparameters can apply n-fold cross-validation with the given evaluation metric to more robustly identify the best set of parameters to use for the given model. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "NUM_FOLDS = 3\n", - "NUM_RUNS = 10" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "params = (\n", - " HyperparamBuilder()\n", - " .addHyperparam(lgbm, lgbm.learningRate, RangeHyperParam(0.001, 1.0))\n", - " .addHyperparam(lgbm, lgbm.numIterations, RangeHyperParam(10, 100))\n", - " .addHyperparam(lgbm, lgbm.numLeaves, DiscreteHyperParam([32, 64, 128]))\n", - ").build()\n", - "paramSpace = RandomSpace(params).space()\n", - "\n", - "tuner = TuneHyperparameters(\n", - " evaluationMetric=\"AUC\", \n", - " models=[lgbm], \n", - " numFolds=NUM_FOLDS,\n", - " numRuns=NUM_ROUNDS, \n", - " parallelism=1,\n", - " paramSpace=paramSpace, \n", - " seed=42\n", + " .setEvaluationMetric(\"AUC\")\n", ")\n", "\n", - "best_model = tuner.fit(train)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "predictions = best_model.transform(test)\n", "evaluator.transform(predictions).show()" ] }, @@ -344,134 +338,15 @@ " loaded_pipeline = PipelineModel.load(save_file)" ] }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---------------+--------+\n", - "|evaluation_type|accuracy|\n", - "+---------------+--------+\n", - "| Classification| 0.739|\n", - "+---------------+--------+\n", - "\n" - ] - } - ], - "source": [ - "# Test the loaded model on raw data\n", - "predictions = loaded_pipeline.transform(raw_data.limit(1000))\n", - "evaluator.transform(predictions).show()" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Model Deployment\n", - "MML Spark provides an easy way to quickly spin up a server to deploy trained models built on top of Spark Streaming DataFrames. In this example the server reads a request, parses it to the same input as the original raw data and applies feature processing then computes the probability of engagement given the user and item features provided. This probability is written back as a response to the original request.

\n", - "Content-based personalization can be accomplished by leveraging this engagement prediction service as the key machine-learning component inside a larger system. To personalize content for a user, a set of items is selected for evaluation and item-content features are extracted for each. These item features can be combined with the user features for each user-item combination and sent to the engagement prediction service which evaluates the probability that a user will engage with each item. The probability can be used to rank the items and select the top-k desired results." - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [], - "source": [ - "# Define spark serving input\n", - "input_df = (\n", - " spark.readStream.server()\n", - " .address(\"localhost\", 8089, \"predict\")\n", - " .load()\n", - " .parseRequest(raw_data.schema)\n", - ")\n", - "\n", - "# Process features and reply with the probability of engagement\n", - "get_p_eng = udf(lambda p: float(p.toArray()[1]))\n", - "output_df = (\n", - " loaded_pipeline.transform(input_df)\n", - " .withColumn(\"p_eng\", get_p_eng(col(\"probability\")))\n", - " .makeReply(\"p_eng\")\n", - ")\n", - "\n", - "# Define spark serving output and start server\n", - "checkpoint = TemporaryDirectory()\n", - "server = (\n", - " output_df.writeStream.server()\n", - " .replyTo(\"predict\")\n", - " .queryName(\"prediction\")\n", - " .option(\"checkpointLocation\", \"file://{}\".format(checkpoint.name))\n", - " .start()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'message': 'Waiting for data to arrive',\n", - " 'isDataAvailable': False,\n", - " 'isTriggerActive': False}" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "server.status" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Response 0.7075531769538851\n" - ] - } - ], - "source": [ - "query = raw_data.limit(1).collect()[0].asDict()\n", - "r = requests.post(json=query, url=\"http://localhost:8089/predict\")\n", - "print(\"Response {}\".format(r.text))" - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Cleanup\n", - "server.stop()\n", - "checkpoint.cleanup()\n", - "server.status" + "## Reference\n", + "\\[1\\] Guolin Ke, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, and Tie-Yan Liu. 2017. LightGBM: A highly efficient gradient boosting decision tree. In Advances in Neural Information Processing Systems. 3146–3154.
\n", + "\\[2\\] MML Spark: https://mmlspark.blob.core.windows.net/website/index.html
\n", + "\\[3\\] MML Spark Serving: https://github.com/Azure/mmlspark/blob/master/docs/mmlspark-serving.md
\n", + "\\[4\\] The Criteo dataset: http://labs.criteo.com/wp-content/uploads/2015/04/dac_sample.tar.gz
\n" ] }, { @@ -483,6 +358,7 @@ } ], "metadata": { + "celltoolbar": "Tags", "kernelspec": { "display_name": "Python 3", "language": "python", From c4ba0dca62c1488deaea8eb3dd88dade8556ee7e Mon Sep 17 00:00:00 2001 From: Jeremy Reynolds Date: Tue, 26 Mar 2019 00:05:47 -0600 Subject: [PATCH 05/30] add functional draft of lgbm o16n --- .../05_operationalize/lgbm_criteo_o16n.ipynb | 491 ++++++++++++++++++ 1 file changed, 491 insertions(+) create mode 100644 notebooks/05_operationalize/lgbm_criteo_o16n.ipynb diff --git a/notebooks/05_operationalize/lgbm_criteo_o16n.ipynb b/notebooks/05_operationalize/lgbm_criteo_o16n.ipynb new file mode 100644 index 0000000000..4fbb82f7a5 --- /dev/null +++ b/notebooks/05_operationalize/lgbm_criteo_o16n.ipynb @@ -0,0 +1,491 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Goal\n", + "\n", + "This notebook creates a real-time scoring service for the content-personalization model created in the prior [notebook](../02_model/mmlspark_lightgbm_criteo.ipynb). It is assumed that this notebook is run in an Azure Databricks environment that has had `mmlspark` installed and has been prepared for operationalization. See [Setup instructions](https://github.com/Microsoft/Recommenders/blob/master/SETUP.md) for details.\n", + "\n", + "**NOTE**: Please Register Azure Container Instance (ACI) using Azure Portal: https://docs.microsoft.com/en-us/azure/azure-resource-manager/resource-manager-supported-services#portal in your subscription before using the SDK to deploy your ML model to ACI." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup libraries and variables\n", + "\n", + "The next few cells initialize the environment and varibles: we import relevant libraries and set variables." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import uuid\n", + "import json\n", + "\n", + "from azureml.core import Workspace\n", + "from azureml.core import VERSION as amlversion\n", + "\n", + "from azureml.core.model import Model\n", + "from azureml.core.conda_dependencies import CondaDependencies \n", + "from azureml.core.webservice import Webservice, AciWebservice\n", + "from azureml.core.image import ContainerImage\n", + "\n", + "# Check core SDK version number\n", + "print(\"SDK version:\", amlversion)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# These variables are used to construct names of assets:\n", + "short_uuid = str(uuid.uuid4())[:4]\n", + "prefix = \"reco\" + short_uuid\n", + "data = \"criteo\"\n", + "algo = \"lgbm\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Azure subscription\n", + "subscription_id = ''\n", + "\n", + "# Resource group and workspace\n", + "resource_group = prefix + \"_\" + data\n", + "workspace_name = prefix + \"_\"+data+\"_aml\"\n", + "workspace_region = \"westus2\"\n", + "print(\"Resource group:\", resource_group)\n", + "\n", + "# AzureML\n", + "#NOTE: The name of a asset must be only letters or numerals, not contain spaces, and under 30 characters\n", + "model_name = data+\"-\"+algo+\".model\" \n", + "service_name = data + \"-\" + algo\n", + "\n", + "# add a name for the container\n", + "container_image_name = '-'.join([data, algo])\n", + "\n", + "\n", + "## locations for serializing so it persists. This is a local API URL\n", + "ws_config_path = '/dbfs/FileStore'\n", + "## location of model on **dbfs**:\n", + "model_path = os.path.join('dbfs:/FileStore/dac',model_name)\n", + "## path to the notebook for modeling. Assumes the entire repository has been imported:\n", + "modeling_notebook = '../02_model/mmlspark_lightgbm_criteo'\n", + "\n", + "## names of other files that are used below\n", + "my_conda_file = \"deploy_conda.yml\"\n", + "driver_file = \"score_sparkml.py\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prepare Assets for the Scoring Service\n", + "\n", + "Before walking through the steps taken to create a model, it is useful to set some context. In our example, a \"scoring service\" is a function that is executed by a docker container. It takes in some number of records and produces a set of scores for each record (usually predictions of some type) based on a previously estimated model. In our case, we will take the model we estimated earlier that predicts the probability of a click based on some set of numeric and categorical features. In order to create a scoring service, we will do several steps.\n", + "\n", + "We will:\n", + "\n", + "1. Create an Azure Machine Learning Workspace to simplify all the subsequent steps.\n", + "2. Make sure we have access to the previously estimated model. If we are working on a spark system, that means we will make sure the model is on the local filesystem (**not** DBFS) and registered with the Azure Machine Learning Service.\n", + "3. Define a 'driver' script that defines what the system needs to do in order to generate our predictions. This script needs to have an `init` method that does one-time initialization and a `run` method that is executed each time the service is called.\n", + "4. 
Define all the pre-requisites that that script requries.\n", + "5. Use the model, the driver script, and the pre-requisites to create a docker image.\n", + "6. We will run the docker image on a platform (in our case Azure Container Instance or ACI).\n", + "7. We will test our service." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a Workspace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ws = Workspace.create(name = workspace_name,\n", + " subscription_id = subscription_id,\n", + " resource_group = resource_group, \n", + " location = workspace_region,\n", + " exist_ok=True)\n", + "\n", + "# persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n", + "ws.write_config(ws_config_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Prepare the Serialized Model\n", + "\n", + "First, we will prepare the serialized model. We will make sure the model exists, and if it doesn't, then we will run the notebook to generate the file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## if it doesn't exist, run the relevant notebook:\n", + "if not os.path.exists(model_path.replace('dbfs:','/dbfs')):\n", + " print('Model pipeline does not exist. Creating by running {}'.format(modeling_notebook))\n", + " dbutils.notebook.run(modeling_notebook, timeout_seconds=600)\n", + "else:\n", + " print('Operationalizing model found at: {}'.format(model_path))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Copy from dbfs to local\n", + "\n", + "While you can access files on DBFS with local file APIs, it is better practice to explicitly copy saved models to and from dbfs, because the local file APIs can only access files smaller than 2 GB (see details [here](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html#access-dbfs-using-local-file-apis)). \n", + "\n", + "Model deployment will always get the model from the current working directory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_local = \"file:\" + os.getcwd() + \"/\" + model_name\n", + "dbutils.fs.cp(model_path, model_local, True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Register the Model\n", + "\n", + "Next, we need to register the model in the Azure Machine Learning Workspace." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Register the model\n", + "mymodel = Model.register(model_path = model_name, # this points to a local file\n", + " model_name = model_name, # this is the name the model is registered as\n", + " description = \"LightGBM Criteo Model\",\n", + " workspace = ws)\n", + "\n", + "print(mymodel.name, mymodel.description, mymodel.version)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Create the Driver Script\n", + "\n", + "Next we, need to create the driver script that will be executed when the service is called. The functions that need to be defined for scoring are `init()` and `run()`. 
The `init()` function is run when the service is created, and the `run()` function is run each time the service is called.\n", + "\n", + "In our example, we use the `init()` function to load all the libraries, initialize the spark session, and load the model and pipeline. We use the `run()` method to parse the input json file, generate predictions (in this case the probability of a click), and format for output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "score_sparkml = \"\"\"\n", + "\n", + "import json\n", + " \n", + "def init():\n", + " # One-time initialization of PySpark and predictive model\n", + " import pyspark\n", + " from pyspark.ml import PipelineModel\n", + " from mmlspark import LightGBMClassifier\n", + " from azureml.core.model import Model\n", + " from pyspark.ml import PipelineModel\n", + " from pyspark.sql.types import StructType, StructField, IntegerType, StringType\n", + "\n", + " global trainedModel\n", + " global spark\n", + " global schema\n", + "\n", + " spark = pyspark.sql.SparkSession.builder.appName(\"LightGBM Criteo Predictions\").getOrCreate()\n", + " model_name = \"{model_name}\" \n", + " model_path = Model.get_model_path(model_name)\n", + " trainedModel = PipelineModel.load(model_path)\n", + " \n", + "def run(input_json):\n", + " if isinstance(trainedModel, Exception):\n", + " return json.dumps({{\"trainedModel\":str(trainedModel)}})\n", + " \n", + " try:\n", + " sc = spark.sparkContext\n", + " input_list = json.loads(input_json)\n", + " input_rdd = sc.parallelize(input_list)\n", + " input_df = spark.read.json(input_rdd)\n", + " \n", + " # Compute prediction\n", + " predictions = trainedModel.transform(input_df).collect()\n", + " #Get probability of a click for each row and conver to a str\n", + " click_prob = [str(x.probability[1]) for x in predictions]\n", + "\n", + " # you can return any data type as long as it is JSON-serializable\n", + " result = \",\".join(click_prob)\n", + " return [result]\n", + " except Exception as e:\n", + " result = str(e)\n", + " return result\n", + "\"\"\".format(model_name=model_name)\n", + " \n", + "exec(score_sparkml)\n", + " \n", + "with open(driver_file, \"w\") as file:\n", + " file.write(score_sparkml)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Define Dependencies\n", + "\n", + "Next, we define the dependencies that are required by driver script." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## azureml-sdk is required to load the registered model\n", + "myconda = CondaDependencies.create(pip_packages=['azureml-sdk'])\n", + "with open(my_conda_file,\"w\") as f:\n", + " f.write(myconda.serialize_to_string())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Create the Image\n", + "\n", + "We use the `ContainerImage` class to first configure, then to create the docker image used. 
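Before building the image below, the driver's contract can be sanity-checked without any deployed service: `run()` expects a JSON array of JSON-encoded rows, which is exactly what `toJSON().collect()` produces in the test section. A minimal sketch, with hypothetical column names:

```python
import json

# Hypothetical rows purely for illustration; real requests use the Criteo schema.
rows = [{"label": 0, "int00": 1, "cat00": "68fd1e64"}]

# run() expects a JSON array of JSON-encoded row strings.
payload = json.dumps([json.dumps(r) for r in rows])

# Round-trip check mirroring what run() does before handing the data to Spark.
decoded = [json.loads(s) for s in json.loads(payload)]
assert decoded == rows
```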
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "myimage_config = ContainerImage.image_configuration(execution_script = driver_file, \n", + " runtime = \"spark-py\",\n", + " conda_file=my_conda_file,\n", + " tags={\"runtime\":\"pyspark\", \"algorithm\":\"lightgbm\"})\n", + "\n", + "image = ContainerImage.create(name = service_name,\n", + " models = [mymodel],\n", + " image_config = myimage_config,\n", + " workspace = ws)\n", + "\n", + "image.wait_for_creation(show_output = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Create the Service\n", + "\n", + "Once we have created an image, we configure and run it on ACI.\n", + "\n", + "**NOTE** You *can* create a service directly from the registered model and image_configuration with the `Webservice.deploy_from_model()` function. We create the image here explicitly and use `deploy_from_image()` for two reasons:\n", + "\n", + "1. It provides more transparency in terms of the actual steps that are taking place\n", + "2. It has potential for faster iteration and for more portability. Once we have an image, we can create a new deployment with the exact same code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#configure ACI\n", + "myaci_config = AciWebservice.deploy_configuration(\n", + " cpu_cores = 2, \n", + " memory_gb = 2, \n", + " tags = {'name':'Azure ML ACI for LightGBM', 'algorithm':'LightGBM'}, \n", + " description = 'Light GBM ACI.')\n", + "\n", + "# Webservice creation\n", + "myservice = Webservice.deploy_from_image(\n", + " workspace=ws, \n", + " name=service_name,\n", + " image=image,\n", + " deployment_config = myaci_config\n", + " )\n", + "\n", + "myservice.wait_for_deployment(show_output=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### View the URI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#for using the Web HTTP API \n", + "print(myservice.scoring_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Test the Service\n", + "\n", + "Next, we can use data from the `test` data to test the service.\n", + "\n", + "The service expects JSON as its payload, so we take the test data, fill missing values, convert to JSON, then submit to the service endpoint.\n", + "\n", + "We have to fill in missing values here to create the data, because the webservice expects that the data coming into the webservice is well-formed. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n_samples_to_test = 10\n", + "\n", + "## load the table created in the other notebook:\n", + "test=spark.table('test')\n", + "test_for_service_df = test.drop('features').fillna('M').fillna(0).limit(n_samples_to_test)\n", + "display(test_for_service_df)\n", + "test_json = json.dumps(test_for_service_df.toJSON().collect())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run the Service and Parse the Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## The prediction is the predicted probability of a click for that particular record\n", + "service_out = myservice.run(input_data=test_json)\n", + "print(service_out)\n", + "values=json.loads('['+service_out[0]+']')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete the Service\n", + "\n", + "When you are done, you can delete the service to minimize costs. You can always redeploy from the image using the same command above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Uncomment the following line to delete the web service\n", + "# myservice.delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Additional Resources\n", + "\n", + "- See the notebook for model estimation [here](https://github.com/Microsoft/Recommenders/blob/gramhagen/lgbm_scenario/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb).\n", + "- This notebook is adapted from the notebooks [here](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/azure-databricks/amlsdk/).\n", + "- See an example of leveraging the image on AKS [here](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/azure-databricks/amlsdk/deploy-to-aks-existingimage-05.ipynb).\n" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "pasha" + } + ], + "kernelspec": { + "display_name": "Python (reco_base)", + "language": "python", + "name": "reco_base" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "name": "deploy-to-aci-04", + "notebookId": 2571086681627427 + }, + "nbformat": 4, + "nbformat_minor": 1 +} From dc5e957ab8b22a92dabf288a098289f43eafc17f Mon Sep 17 00:00:00 2001 From: Jeremy Reynolds Date: Tue, 26 Mar 2019 00:25:59 -0600 Subject: [PATCH 06/30] fix minor adjustments for databricks --- notebooks/05_operationalize/lgbm_criteo_o16n.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/05_operationalize/lgbm_criteo_o16n.ipynb b/notebooks/05_operationalize/lgbm_criteo_o16n.ipynb index 4fbb82f7a5..79b1232610 100644 --- a/notebooks/05_operationalize/lgbm_criteo_o16n.ipynb +++ b/notebooks/05_operationalize/lgbm_criteo_o16n.ipynb @@ -40,7 +40,7 @@ "import json\n", "\n", "from azureml.core import Workspace\n", - "from azureml.core import VERSION as amlversion\n", + "from azureml.core import VERSION as azuremlversion\n", "\n", "from azureml.core.model import Model\n", "from azureml.core.conda_dependencies import CondaDependencies \n", @@ -48,7 +48,7 @@ "from azureml.core.image import ContainerImage\n", "\n", "# Check core SDK version number\n", - 
"print(\"SDK version:\", amlversion)" + "print(\"SDK version: {}\".format(azuremlversion))" ] }, { From 16fb74b1c25af3e5213ad5db0d383fa1e9d556bd Mon Sep 17 00:00:00 2001 From: Jeremy Reynolds Date: Tue, 26 Mar 2019 00:29:37 -0600 Subject: [PATCH 07/30] fix minor adjustments for databricks --- .../02_model/mmlspark_lightgbm_criteo.ipynb | 28 +++++++------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index 4bdcc2be5d..54eea30def 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -56,15 +56,14 @@ "source": [ "import os\n", "import sys\n", + "import shutil\n", + "\n", "from tempfile import TemporaryDirectory\n", "sys.path.append(\"../../\")\n", "\n", "import pyspark\n", "from pyspark.ml import PipelineModel\n", "from pyspark.ml.feature import FeatureHasher\n", - "from pyspark.sql.functions import col, udf\n", - "from pyspark.sql.types import FloatType\n", - "import requests\n", "\n", "from reco_utils.common.spark_utils import start_or_get_spark\n", "from reco_utils.common.notebook_utils import is_databricks\n", @@ -94,30 +93,23 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 1, "metadata": {}, "outputs": [ { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'azure.mgmt.authorization'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mWorkspace\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mModel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Azure ML SDK version: \"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mVERSION\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/reco_pyspark/lib/python3.6/site-packages/azureml/core/__init__.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0mruns\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mrun\u001b[0m \u001b[0moutput\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \"\"\"\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mworkspace\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mWorkspace\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexperiment\u001b[0m 
\u001b[0;32mimport\u001b[0m \u001b[0mExperiment\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrunconfig\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mRunConfiguration\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/reco_pyspark/lib/python3.6/site-packages/azureml/core/workspace.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0moperator\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mattrgetter\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 21\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_project\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0m_commands\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 22\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_file_utils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfile_utils\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnormalize_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnormalize_path_and_join\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0mcheck_and_create_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraverse_up_path_and_find_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnormalize_file_ext\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/reco_pyspark/lib/python3.6/site-packages/azureml/_project/_commands.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_base_sdk_common\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommon\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mcreate_role_assignment\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_base_sdk_common\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommon\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mresource_client_factory\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgive_warning\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mazureml\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_base_sdk_common\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommon\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mresource_error_handling\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/envs/reco_pyspark/lib/python3.6/site-packages/azureml/_base_sdk_common/common.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0madal\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0madal_error\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAdalError\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m \u001b[0;32mfrom\u001b[0m 
\u001b[0mazure\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmgmt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauthorization\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAuthorizationManagementClient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 18\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mazure\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmgmt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauthorization\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodels\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mRoleAssignmentCreateParameters\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mazure\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmgmt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresource\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresources\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mResourceManagementClient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'azure.mgmt.authorization'" + "name": "stdout", + "output_type": "stream", + "text": [ + "Azure ML SDK version: 1.0.18\n" ] } ], "source": [ "from azureml.core import Workspace\n", "from azureml.core.model import Model\n", + "from azureml.core import VERSION as azuremlversion\n", "\n", - "print(\"Azure ML SDK version: \", azureml.core.VERSION)" + "print(\"Azure ML SDK version: {}\".format(azuremlversion))" ] }, { @@ -157,7 +149,7 @@ } ], "source": [ - "raw_data = load_spark_df(size=DATA_SIZE, spark=spark)" + "raw_data = load_spark_df(size=DATA_SIZE, spark=spark, dbutils=dbutils)" ] }, { From b51e20154aab58f200eb2b6f4f85a18631a40a29 Mon Sep 17 00:00:00 2001 From: Andreas Argyriou Date: Tue, 26 Mar 2019 11:17:03 +0000 Subject: [PATCH 08/30] Update mmlspark_lightgbm_criteo.ipynb --- notebooks/02_model/mmlspark_lightgbm_criteo.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index 54eea30def..6a0f92d6c1 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -360,7 +360,7 @@ "metadata": {}, "source": [ "## Reference\n", - "\\[1\\] Guolin Ke, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, and Tie-Yan Liu. 2017. LightGBM: A highly efficient gradient boosting decision tree. In Advances in Neural Information Processing Systems. 3146–3154.
\n", + "\\[1\\] Guolin Ke, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, and Tie-Yan Liu. 2017. LightGBM: A highly efficient gradient boosting decision tree. In Advances in Neural Information Processing Systems. 3146–3154. https://papers.nips.cc/paper/6907-lightgbm-a-highly-efficient-gradient-boosting-decision-tree.pdf
\n", "\\[2\\] MML Spark: https://mmlspark.blob.core.windows.net/website/index.html
\n", "\\[3\\] MML Spark Serving: https://github.com/Azure/mmlspark/blob/master/docs/mmlspark-serving.md
\n", "\\[4\\] The Criteo dataset: http://labs.criteo.com/wp-content/uploads/2015/04/dac_sample.tar.gz
\n" From 3281980065a6e765eaacf8e3ce8b1dacf3324a1f Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 26 Mar 2019 11:27:20 +0000 Subject: [PATCH 09/30] update lgb notebook --- .../02_model/mmlspark_lightgbm_criteo.ipynb | 58 +-- .../05_operationalize/lgbm_criteo_o16n.ipynb | 491 ------------------ scripts/generate_conda_file.py | 1 + 3 files changed, 20 insertions(+), 530 deletions(-) delete mode 100644 notebooks/05_operationalize/lgbm_criteo_o16n.ipynb diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index 54eea30def..abe7eea5db 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -16,8 +16,8 @@ "# Content Based Personalization\n", "## LightGBM on Azure Databricks
\n", "This notebook provides a quick example of how to train LightGBM model on Azure Databricks and deploy it using MML Spark for a content personalization scenario.

\n", - "LightGBM \\[1\\] is a gradient boosting framework that uses tree-based learning algorithms.
\n", - "MML Spark \\[2\\] allows LightGBM to be called in a Spark environment which provides several advantages:\n", + "[LightGBM](https://github.com/Microsoft/Lightgbm) \\[1\\] is a gradient boosting framework that uses tree-based learning algorithms.
\n", + "[MMLSpark](https://github.com/Azure/mmlspark) \\[2\\] allows LightGBM to be called in a Spark environment which provides several advantages:\n", "- Distributed computation for model development\n", "- Easy integration into existing Spark workflows\n", "- Model serving through Spark Serving \\[3\\]" @@ -40,16 +40,18 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n", + "System version: 3.6.7 | packaged by conda-forge | (default, Nov 21 2018, 03:09:43) \n", "[GCC 7.3.0]\n", - "PySpark version: 2.3.1\n" + "PySpark version: 2.3.1\n", + "Azure ML SDK version: 1.0.10\n", + "MMLSpark version: Azure:mmlspark:0.16\n" ] } ], @@ -65,21 +67,15 @@ "from pyspark.ml import PipelineModel\n", "from pyspark.ml.feature import FeatureHasher\n", "\n", + "from azureml.core import Workspace\n", + "from azureml.core.model import Model\n", + "from azureml.core import VERSION as azuremlversion\n", + "\n", "from reco_utils.common.spark_utils import start_or_get_spark\n", "from reco_utils.common.notebook_utils import is_databricks\n", "from reco_utils.dataset.criteo import load_spark_df\n", "from reco_utils.dataset.spark_splitters import spark_random_split\n", "\n", - "print(\"System version: {}\".format(sys.version))\n", - "print(\"PySpark version: {}\".format(pyspark.version.__version__))" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ "# Setup MML Spark\n", "if not is_databricks():\n", " # get the maven coordinates for MML Spark from databricks_install script\n", @@ -88,28 +84,12 @@ " spark = start_or_get_spark(packages=packages)\n", "\n", "from mmlspark import ComputeModelStatistics\n", - "from mmlspark import LightGBMClassifier" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Azure ML SDK version: 1.0.18\n" - ] - } - ], - "source": [ - "from azureml.core import Workspace\n", - "from azureml.core.model import Model\n", - "from azureml.core import VERSION as azuremlversion\n", + "from mmlspark import LightGBMClassifier\n", "\n", - "print(\"Azure ML SDK version: {}\".format(azuremlversion))" + "print(\"System version: {}\".format(sys.version))\n", + "print(\"PySpark version: {}\".format(pyspark.version.__version__))\n", + "print(\"Azure ML SDK version: {}\".format(azuremlversion))\n", + "print(\"MMLSpark version: {}\".format(MMLSPARK_INFO['maven']['coordinates']))" ] }, { @@ -377,9 +357,9 @@ "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python (reco_pyspark)", "language": "python", - "name": "python3" + "name": "reco_pyspark" }, "language_info": { "codemirror_mode": { @@ -391,7 +371,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.7" } }, "nbformat": 4, diff --git a/notebooks/05_operationalize/lgbm_criteo_o16n.ipynb b/notebooks/05_operationalize/lgbm_criteo_o16n.ipynb deleted file mode 100644 index 79b1232610..0000000000 --- a/notebooks/05_operationalize/lgbm_criteo_o16n.ipynb +++ /dev/null @@ -1,491 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Goal\n", - "\n", - "This notebook creates a real-time scoring service for the content-personalization model created in the prior [notebook](../02_model/mmlspark_lightgbm_criteo.ipynb). It is assumed that this notebook is run in an Azure Databricks environment that has had `mmlspark` installed and has been prepared for operationalization. See [Setup instructions](https://github.com/Microsoft/Recommenders/blob/master/SETUP.md) for details.\n", - "\n", - "**NOTE**: Please Register Azure Container Instance (ACI) using Azure Portal: https://docs.microsoft.com/en-us/azure/azure-resource-manager/resource-manager-supported-services#portal in your subscription before using the SDK to deploy your ML model to ACI." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Setup libraries and variables\n", - "\n", - "The next few cells initialize the environment and varibles: we import relevant libraries and set variables." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import uuid\n", - "import json\n", - "\n", - "from azureml.core import Workspace\n", - "from azureml.core import VERSION as azuremlversion\n", - "\n", - "from azureml.core.model import Model\n", - "from azureml.core.conda_dependencies import CondaDependencies \n", - "from azureml.core.webservice import Webservice, AciWebservice\n", - "from azureml.core.image import ContainerImage\n", - "\n", - "# Check core SDK version number\n", - "print(\"SDK version: {}\".format(azuremlversion))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# These variables are used to construct names of assets:\n", - "short_uuid = str(uuid.uuid4())[:4]\n", - "prefix = \"reco\" + short_uuid\n", - "data = \"criteo\"\n", - "algo = \"lgbm\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Azure subscription\n", - "subscription_id = ''\n", - "\n", - "# Resource group and workspace\n", - "resource_group = prefix + \"_\" + data\n", - "workspace_name = prefix + \"_\"+data+\"_aml\"\n", - "workspace_region = \"westus2\"\n", - "print(\"Resource group:\", resource_group)\n", - "\n", - "# AzureML\n", - "#NOTE: The name of a asset must be only letters or numerals, not contain spaces, and under 30 characters\n", - "model_name = data+\"-\"+algo+\".model\" \n", - "service_name = data + \"-\" + algo\n", - "\n", - "# add a name for the container\n", - "container_image_name = '-'.join([data, algo])\n", - "\n", - "\n", - "## locations for serializing so it persists. This is a local API URL\n", - "ws_config_path = '/dbfs/FileStore'\n", - "## location of model on **dbfs**:\n", - "model_path = os.path.join('dbfs:/FileStore/dac',model_name)\n", - "## path to the notebook for modeling. Assumes the entire repository has been imported:\n", - "modeling_notebook = '../02_model/mmlspark_lightgbm_criteo'\n", - "\n", - "## names of other files that are used below\n", - "my_conda_file = \"deploy_conda.yml\"\n", - "driver_file = \"score_sparkml.py\"\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepare Assets for the Scoring Service\n", - "\n", - "Before walking through the steps taken to create a model, it is useful to set some context. In our example, a \"scoring service\" is a function that is executed by a docker container. 
It takes in some number of records and produces a set of scores for each record (usually predictions of some type) based on a previously estimated model. In our case, we will take the model we estimated earlier that predicts the probability of a click based on some set of numeric and categorical features. In order to create a scoring service, we will do several steps.\n", - "\n", - "We will:\n", - "\n", - "1. Create an Azure Machine Learning Workspace to simplify all the subsequent steps.\n", - "2. Make sure we have access to the previously estimated model. If we are working on a spark system, that means we will make sure the model is on the local filesystem (**not** DBFS) and registered with the Azure Machine Learning Service.\n", - "3. Define a 'driver' script that defines what the system needs to do in order to generate our predictions. This script needs to have an `init` method that does one-time initialization and a `run` method that is executed each time the service is called.\n", - "4. Define all the pre-requisites that that script requries.\n", - "5. Use the model, the driver script, and the pre-requisites to create a docker image.\n", - "6. We will run the docker image on a platform (in our case Azure Container Instance or ACI).\n", - "7. We will test our service." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create a Workspace" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ws = Workspace.create(name = workspace_name,\n", - " subscription_id = subscription_id,\n", - " resource_group = resource_group, \n", - " location = workspace_region,\n", - " exist_ok=True)\n", - "\n", - "# persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n", - "ws.write_config(ws_config_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Prepare the Serialized Model\n", - "\n", - "First, we will prepare the serialized model. We will make sure the model exists, and if it doesn't, then we will run the notebook to generate the file." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## if it doesn't exist, run the relevant notebook:\n", - "if not os.path.exists(model_path.replace('dbfs:','/dbfs')):\n", - " print('Model pipeline does not exist. Creating by running {}'.format(modeling_notebook))\n", - " dbutils.notebook.run(modeling_notebook, timeout_seconds=600)\n", - "else:\n", - " print('Operationalizing model found at: {}'.format(model_path))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Copy from dbfs to local\n", - "\n", - "While you can access files on DBFS with local file APIs, it is better practice to explicitly copy saved models to and from dbfs, because the local file APIs can only access files smaller than 2 GB (see details [here](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html#access-dbfs-using-local-file-apis)). \n", - "\n", - "Model deployment will always get the model from the current working directory." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model_local = \"file:\" + os.getcwd() + \"/\" + model_name\n", - "dbutils.fs.cp(model_path, model_local, True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Register the Model\n", - "\n", - "Next, we need to register the model in the Azure Machine Learning Workspace." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Register the model\n", - "mymodel = Model.register(model_path = model_name, # this points to a local file\n", - " model_name = model_name, # this is the name the model is registered as\n", - " description = \"LightGBM Criteo Model\",\n", - " workspace = ws)\n", - "\n", - "print(mymodel.name, mymodel.description, mymodel.version)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Create the Driver Script\n", - "\n", - "Next we, need to create the driver script that will be executed when the service is called. The functions that need to be defined for scoring are `init()` and `run()`. The `init()` function is run when the service is created, and the `run()` function is run each time the service is called.\n", - "\n", - "In our example, we use the `init()` function to load all the libraries, initialize the spark session, and load the model and pipeline. We use the `run()` method to parse the input json file, generate predictions (in this case the probability of a click), and format for output." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "score_sparkml = \"\"\"\n", - "\n", - "import json\n", - " \n", - "def init():\n", - " # One-time initialization of PySpark and predictive model\n", - " import pyspark\n", - " from pyspark.ml import PipelineModel\n", - " from mmlspark import LightGBMClassifier\n", - " from azureml.core.model import Model\n", - " from pyspark.ml import PipelineModel\n", - " from pyspark.sql.types import StructType, StructField, IntegerType, StringType\n", - "\n", - " global trainedModel\n", - " global spark\n", - " global schema\n", - "\n", - " spark = pyspark.sql.SparkSession.builder.appName(\"LightGBM Criteo Predictions\").getOrCreate()\n", - " model_name = \"{model_name}\" \n", - " model_path = Model.get_model_path(model_name)\n", - " trainedModel = PipelineModel.load(model_path)\n", - " \n", - "def run(input_json):\n", - " if isinstance(trainedModel, Exception):\n", - " return json.dumps({{\"trainedModel\":str(trainedModel)}})\n", - " \n", - " try:\n", - " sc = spark.sparkContext\n", - " input_list = json.loads(input_json)\n", - " input_rdd = sc.parallelize(input_list)\n", - " input_df = spark.read.json(input_rdd)\n", - " \n", - " # Compute prediction\n", - " predictions = trainedModel.transform(input_df).collect()\n", - " #Get probability of a click for each row and conver to a str\n", - " click_prob = [str(x.probability[1]) for x in predictions]\n", - "\n", - " # you can return any data type as long as it is JSON-serializable\n", - " result = \",\".join(click_prob)\n", - " return [result]\n", - " except Exception as e:\n", - " result = str(e)\n", - " return result\n", - "\"\"\".format(model_name=model_name)\n", - " \n", - "exec(score_sparkml)\n", - " \n", - "with open(driver_file, \"w\") as file:\n", - " file.write(score_sparkml)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. 
Define Dependencies\n", - "\n", - "Next, we define the dependencies that are required by driver script." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## azureml-sdk is required to load the registered model\n", - "myconda = CondaDependencies.create(pip_packages=['azureml-sdk'])\n", - "with open(my_conda_file,\"w\") as f:\n", - " f.write(myconda.serialize_to_string())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Create the Image\n", - "\n", - "We use the `ContainerImage` class to first configure, then to create the docker image used. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "myimage_config = ContainerImage.image_configuration(execution_script = driver_file, \n", - " runtime = \"spark-py\",\n", - " conda_file=my_conda_file,\n", - " tags={\"runtime\":\"pyspark\", \"algorithm\":\"lightgbm\"})\n", - "\n", - "image = ContainerImage.create(name = service_name,\n", - " models = [mymodel],\n", - " image_config = myimage_config,\n", - " workspace = ws)\n", - "\n", - "image.wait_for_creation(show_output = True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Create the Service\n", - "\n", - "Once we have created an image, we configure and run it on ACI.\n", - "\n", - "**NOTE** You *can* create a service directly from the registered model and image_configuration with the `Webservice.deploy_from_model()` function. We create the image here explicitly and use `deploy_from_image()` for two reasons:\n", - "\n", - "1. It provides more transparency in terms of the actual steps that are taking place\n", - "2. It has potential for faster iteration and for more portability. Once we have an image, we can create a new deployment with the exact same code." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#configure ACI\n", - "myaci_config = AciWebservice.deploy_configuration(\n", - " cpu_cores = 2, \n", - " memory_gb = 2, \n", - " tags = {'name':'Azure ML ACI for LightGBM', 'algorithm':'LightGBM'}, \n", - " description = 'Light GBM ACI.')\n", - "\n", - "# Webservice creation\n", - "myservice = Webservice.deploy_from_image(\n", - " workspace=ws, \n", - " name=service_name,\n", - " image=image,\n", - " deployment_config = myaci_config\n", - " )\n", - "\n", - "myservice.wait_for_deployment(show_output=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### View the URI" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#for using the Web HTTP API \n", - "print(myservice.scoring_uri)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 7. Test the Service\n", - "\n", - "Next, we can use data from the `test` data to test the service.\n", - "\n", - "The service expects JSON as its payload, so we take the test data, fill missing values, convert to JSON, then submit to the service endpoint.\n", - "\n", - "We have to fill in missing values here to create the data, because the webservice expects that the data coming into the webservice is well-formed. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "n_samples_to_test = 10\n", - "\n", - "## load the table created in the other notebook:\n", - "test=spark.table('test')\n", - "test_for_service_df = test.drop('features').fillna('M').fillna(0).limit(n_samples_to_test)\n", - "display(test_for_service_df)\n", - "test_json = json.dumps(test_for_service_df.toJSON().collect())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run the Service and Parse the Output" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## The prediction is the predicted probability of a click for that particular record\n", - "service_out = myservice.run(input_data=test_json)\n", - "print(service_out)\n", - "values=json.loads('['+service_out[0]+']')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Delete the Service\n", - "\n", - "When you are done, you can delete the service to minimize costs. You can always redeploy from the image using the same command above." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Uncomment the following line to delete the web service\n", - "# myservice.delete()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Additional Resources\n", - "\n", - "- See the notebook for model estimation [here](https://github.com/Microsoft/Recommenders/blob/gramhagen/lgbm_scenario/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb).\n", - "- This notebook is adapted from the notebooks [here](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/azure-databricks/amlsdk/).\n", - "- See an example of leveraging the image on AKS [here](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/azure-databricks/amlsdk/deploy-to-aks-existingimage-05.ipynb).\n" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "pasha" - } - ], - "kernelspec": { - "display_name": "Python (reco_base)", - "language": "python", - "name": "reco_base" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - }, - "name": "deploy-to-aci-04", - "notebookId": 2571086681627427 - }, - "nbformat": 4, - "nbformat_minor": 1 -} diff --git a/scripts/generate_conda_file.py b/scripts/generate_conda_file.py index fbef862640..6115eee69f 100644 --- a/scripts/generate_conda_file.py +++ b/scripts/generate_conda_file.py @@ -78,6 +78,7 @@ "papermill": "papermill>=0.15.0", "pydocumentdb": "pydocumentdb>=2.3.3", "tqdm": "tqdm==4.31.1", + "databricks-cli": "databricks-cli>=0.8.6", } PIP_PYSPARK = {} From d18e3da38bdd3b32ded5804d845498c90fc824c0 Mon Sep 17 00:00:00 2001 From: Andreas Argyriou Date: Tue, 26 Mar 2019 11:32:29 +0000 Subject: [PATCH 10/30] Update mmlspark_lightgbm_criteo.ipynb --- notebooks/02_model/mmlspark_lightgbm_criteo.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index 6a0f92d6c1..3bb2478c8a 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -240,10 +240,10 @@ "outputs": [], "source": [ "NUM_LEAVES = 32\n", 
- "NUM_ITERATIONS = 10\n", + "NUM_ITERATIONS = 30\n", "LEARNING_RATE = 0.15\n", "FEATURE_FRACTION = 0.8\n", - "EARLY_STOPPING_ROUND = 20" + "EARLY_STOPPING_ROUND = 10" ] }, { From 5ebd8b0dc3537ed84e54f0d7699e24e1000437cb Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 26 Mar 2019 11:37:42 +0000 Subject: [PATCH 11/30] lgb training --- .../02_model/mmlspark_lightgbm_criteo.ipynb | 45 +++---------------- 1 file changed, 5 insertions(+), 40 deletions(-) diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index abe7eea5db..df26caead0 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -50,7 +50,6 @@ "System version: 3.6.7 | packaged by conda-forge | (default, Nov 21 2018, 03:09:43) \n", "[GCC 7.3.0]\n", "PySpark version: 2.3.1\n", - "Azure ML SDK version: 1.0.10\n", "MMLSpark version: Azure:mmlspark:0.16\n" ] } @@ -67,10 +66,6 @@ "from pyspark.ml import PipelineModel\n", "from pyspark.ml.feature import FeatureHasher\n", "\n", - "from azureml.core import Workspace\n", - "from azureml.core.model import Model\n", - "from azureml.core import VERSION as azuremlversion\n", - "\n", "from reco_utils.common.spark_utils import start_or_get_spark\n", "from reco_utils.common.notebook_utils import is_databricks\n", "from reco_utils.dataset.criteo import load_spark_df\n", @@ -88,7 +83,6 @@ "\n", "print(\"System version: {}\".format(sys.version))\n", "print(\"PySpark version: {}\".format(pyspark.version.__version__))\n", - "print(\"Azure ML SDK version: {}\".format(azuremlversion))\n", "print(\"MMLSpark version: {}\".format(MMLSPARK_INFO['maven']['coordinates']))" ] }, @@ -104,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": { "tags": [ "parameters" @@ -112,7 +106,7 @@ }, "outputs": [], "source": [ - "DATA_SIZE = 'sample'" + "DATA_SIZE = 'sample'\n" ] }, { @@ -307,32 +301,10 @@ "metadata": {}, "outputs": [], "source": [ - "tmp = TemporaryDirectory()\n", - "\n", - "# save model to temporary directory\n", + "# save model\n", "model_name = 'finished.model'\n", - "save_file = os.path.join(tmp.name, model_name)\n", "pipeline = PipelineModel(stages=[feature_processor, model])\n", - "pipeline.save(save_file)\n", - "\n", - "# zip file for transfer to Azure ML Workspace\n", - "zip_file = shutil.make_archive(base_name=model_name, \n", - " format='zip', \n", - " root_dir='/dbfs{}'.format(save_file))\n", - "\n", - "# register model in Azure ML Workspace\n", - "ws = Workspace.from_config()\n", - "model = Model.register(workspace=ws, model_path=zip_file, model_name=model_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# cleanup temporary directory\n", - "tmp.cleanup()" + "pipeline.save(model_name)\n" ] }, { @@ -345,13 +317,6 @@ "\\[3\\] MML Spark Serving: https://github.com/Azure/mmlspark/blob/master/docs/mmlspark-serving.md
\n", "\\[4\\] The Criteo dataset: http://labs.criteo.com/wp-content/uploads/2015/04/dac_sample.tar.gz
\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 1e22a11aad9529628103d6385844e75510361d5e Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 26 Mar 2019 11:39:26 +0000 Subject: [PATCH 12/30] jeremy's code --- .../lightgbm_criteo_o16n.ipynb | 491 ++++++++++++++++++ 1 file changed, 491 insertions(+) create mode 100644 notebooks/05_operationalize/lightgbm_criteo_o16n.ipynb diff --git a/notebooks/05_operationalize/lightgbm_criteo_o16n.ipynb b/notebooks/05_operationalize/lightgbm_criteo_o16n.ipynb new file mode 100644 index 0000000000..79b1232610 --- /dev/null +++ b/notebooks/05_operationalize/lightgbm_criteo_o16n.ipynb @@ -0,0 +1,491 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Goal\n", + "\n", + "This notebook creates a real-time scoring service for the content-personalization model created in the prior [notebook](../02_model/mmlspark_lightgbm_criteo.ipynb). It is assumed that this notebook is run in an Azure Databricks environment that has had `mmlspark` installed and has been prepared for operationalization. See [Setup instructions](https://github.com/Microsoft/Recommenders/blob/master/SETUP.md) for details.\n", + "\n", + "**NOTE**: Please Register Azure Container Instance (ACI) using Azure Portal: https://docs.microsoft.com/en-us/azure/azure-resource-manager/resource-manager-supported-services#portal in your subscription before using the SDK to deploy your ML model to ACI." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup libraries and variables\n", + "\n", + "The next few cells initialize the environment and varibles: we import relevant libraries and set variables." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import uuid\n",
+    "import json\n",
+    "\n",
+    "from azureml.core import Workspace\n",
+    "from azureml.core import VERSION as azuremlversion\n",
+    "\n",
+    "from azureml.core.model import Model\n",
+    "from azureml.core.conda_dependencies import CondaDependencies \n",
+    "from azureml.core.webservice import Webservice, AciWebservice\n",
+    "from azureml.core.image import ContainerImage\n",
+    "\n",
+    "# Check core SDK version number\n",
+    "print(\"SDK version: {}\".format(azuremlversion))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# These variables are used to construct names of assets:\n",
+    "short_uuid = str(uuid.uuid4())[:4]\n",
+    "prefix = \"reco\" + short_uuid\n",
+    "data = \"criteo\"\n",
+    "algo = \"lgbm\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Azure subscription\n",
+    "subscription_id = ''\n",
+    "\n",
+    "# Resource group and workspace\n",
+    "resource_group = prefix + \"_\" + data\n",
+    "workspace_name = prefix + \"_\"+data+\"_aml\"\n",
+    "workspace_region = \"westus2\"\n",
+    "print(\"Resource group:\", resource_group)\n",
+    "\n",
+    "# AzureML\n",
+    "#NOTE: The name of an asset must contain only letters or numerals, no spaces, and be under 30 characters\n",
+    "model_name = data+\"-\"+algo+\".model\" \n",
+    "service_name = data + \"-\" + algo\n",
+    "\n",
+    "# add a name for the container\n",
+    "container_image_name = '-'.join([data, algo])\n",
+    "\n",
+    "\n",
+    "## locations for serializing so it persists. This is a local API URL\n",
+    "ws_config_path = '/dbfs/FileStore'\n",
+    "## location of model on **dbfs**:\n",
+    "model_path = os.path.join('dbfs:/FileStore/dac',model_name)\n",
+    "## path to the notebook for modeling. Assumes the entire repository has been imported:\n",
+    "modeling_notebook = '../02_model/mmlspark_lightgbm_criteo'\n",
+    "\n",
+    "## names of other files that are used below\n",
+    "my_conda_file = \"deploy_conda.yml\"\n",
+    "driver_file = \"score_sparkml.py\"\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prepare Assets for the Scoring Service\n",
+    "\n",
+    "Before walking through the steps taken to create a model, it is useful to set some context. In our example, a \"scoring service\" is a function that is executed by a docker container. It takes in some number of records and produces a set of scores for each record (usually predictions of some type) based on a previously estimated model. In our case, we will take the model we estimated earlier that predicts the probability of a click based on some set of numeric and categorical features. In order to create a scoring service, we will go through several steps.\n",
+    "\n",
+    "We will:\n",
+    "\n",
+    "1. Create an Azure Machine Learning Workspace to simplify all the subsequent steps.\n",
+    "2. Make sure we have access to the previously estimated model. If we are working on a Spark system, that means we will make sure the model is on the local filesystem (**not** DBFS) and registered with the Azure Machine Learning Service.\n",
+    "3. Define a 'driver' script that specifies what the system needs to do in order to generate our predictions. This script needs to have an `init` method that does one-time initialization and a `run` method that is executed each time the service is called.\n",
+    "4. Define all the pre-requisites that the script requires.\n",
+    "5. Use the model, the driver script, and the pre-requisites to create a docker image.\n",
+    "6. We will run the docker image on a platform (in our case Azure Container Instance or ACI).\n",
+    "7. We will test our service."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Create a Workspace"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ws = Workspace.create(name = workspace_name,\n",
+    "                      subscription_id = subscription_id,\n",
+    "                      resource_group = resource_group, \n",
+    "                      location = workspace_region,\n",
+    "                      exist_ok=True)\n",
+    "\n",
+    "# persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n",
+    "ws.write_config(ws_config_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Prepare the Serialized Model\n",
+    "\n",
+    "First, we will prepare the serialized model. We will make sure the model exists, and if it doesn't, then we will run the notebook to generate the file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## if it doesn't exist, run the relevant notebook:\n",
+    "if not os.path.exists(model_path.replace('dbfs:','/dbfs')):\n",
+    "    print('Model pipeline does not exist. Creating by running {}'.format(modeling_notebook))\n",
+    "    dbutils.notebook.run(modeling_notebook, timeout_seconds=600)\n",
+    "else:\n",
+    "    print('Operationalizing model found at: {}'.format(model_path))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Copy from dbfs to local\n",
+    "\n",
+    "While you can access files on DBFS with local file APIs, it is better practice to explicitly copy saved models to and from dbfs, because the local file APIs can only access files smaller than 2 GB (see details [here](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html#access-dbfs-using-local-file-apis)). \n",
+    "\n",
+    "Model deployment will always get the model from the current working directory."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_local = \"file:\" + os.getcwd() + \"/\" + model_name\n",
+    "dbutils.fs.cp(model_path, model_local, True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Register the Model\n",
+    "\n",
+    "Next, we need to register the model in the Azure Machine Learning Workspace."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Register the model\n",
+    "mymodel = Model.register(model_path = model_name, # this points to a local file\n",
+    "                       model_name = model_name, # this is the name the model is registered as\n",
+    "                       description = \"LightGBM Criteo Model\",\n",
+    "                       workspace = ws)\n",
+    "\n",
+    "print(mymodel.name, mymodel.description, mymodel.version)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Create the Driver Script\n",
+    "\n",
+    "Next, we need to create the driver script that will be executed when the service is called. The functions that need to be defined for scoring are `init()` and `run()`. The `init()` function is run when the service is created, and the `run()` function is run each time the service is called.\n",
+    "\n",
+    "In our example, we use the `init()` function to load all the libraries, initialize the spark session, and load the model and pipeline. We use the `run()` method to parse the input json file, generate predictions (in this case the probability of a click), and format for output."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "score_sparkml = \"\"\"\n",
+    "\n",
+    "import json\n",
+    " \n",
+    "def init():\n",
+    "    # One-time initialization of PySpark and predictive model\n",
+    "    import pyspark\n",
+    "    from pyspark.ml import PipelineModel\n",
+    "    from mmlspark import LightGBMClassifier\n",
+    "    from azureml.core.model import Model\n",
+    "    from pyspark.sql.types import StructType, StructField, IntegerType, StringType\n",
+    "\n",
+    "    global trainedModel\n",
+    "    global spark\n",
+    "    global schema\n",
+    "\n",
+    "    spark = pyspark.sql.SparkSession.builder.appName(\"LightGBM Criteo Predictions\").getOrCreate()\n",
+    "    model_name = \"{model_name}\" \n",
+    "    model_path = Model.get_model_path(model_name)\n",
+    "    trainedModel = PipelineModel.load(model_path)\n",
+    "    \n",
+    "def run(input_json):\n",
+    "    if isinstance(trainedModel, Exception):\n",
+    "        return json.dumps({{\"trainedModel\":str(trainedModel)}})\n",
+    "    \n",
+    "    try:\n",
+    "        sc = spark.sparkContext\n",
+    "        input_list = json.loads(input_json)\n",
+    "        input_rdd = sc.parallelize(input_list)\n",
+    "        input_df = spark.read.json(input_rdd)\n",
+    "        \n",
+    "        # Compute prediction\n",
+    "        predictions = trainedModel.transform(input_df).collect()\n",
+    "        # Get probability of a click for each row and convert to a str\n",
+    "        click_prob = [str(x.probability[1]) for x in predictions]\n",
+    "\n",
+    "        # you can return any data type as long as it is JSON-serializable\n",
+    "        result = \",\".join(click_prob)\n",
+    "        return [result]\n",
+    "    except Exception as e:\n",
+    "        result = str(e)\n",
+    "        return result\n",
+    "\"\"\".format(model_name=model_name)\n",
+    " \n",
+    "exec(score_sparkml)\n",
+    " \n",
+    "with open(driver_file, \"w\") as file:\n",
+    "    file.write(score_sparkml)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Define Dependencies\n",
+    "\n",
+    "Next, we define the dependencies that are required by the driver script."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## azureml-sdk is required to load the registered model\n",
+    "myconda = CondaDependencies.create(pip_packages=['azureml-sdk'])\n",
+    "with open(my_conda_file,\"w\") as f:\n",
+    "    f.write(myconda.serialize_to_string())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Create the Image\n",
+    "\n",
+    "We use the `ContainerImage` class to first configure, then to create the docker image used. 
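Any extra package the driver imports at runtime must be declared here as well, or the container will fail at `init()` time — a sketch of extending the dependencies, with `numpy` as a purely hypothetical example:

```python
from azureml.core.conda_dependencies import CondaDependencies

# Sketch: declare any additional pip packages the driver imports at runtime.
myconda = CondaDependencies.create(pip_packages=["azureml-sdk"])
myconda.add_pip_package("numpy")  # hypothetical extra dependency
with open(my_conda_file, "w") as f:
    f.write(myconda.serialize_to_string())
```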
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "myimage_config = ContainerImage.image_configuration(execution_script = driver_file, \n", + " runtime = \"spark-py\",\n", + " conda_file=my_conda_file,\n", + " tags={\"runtime\":\"pyspark\", \"algorithm\":\"lightgbm\"})\n", + "\n", + "image = ContainerImage.create(name = service_name,\n", + " models = [mymodel],\n", + " image_config = myimage_config,\n", + " workspace = ws)\n", + "\n", + "image.wait_for_creation(show_output = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Create the Service\n", + "\n", + "Once we have created an image, we configure and run it on ACI.\n", + "\n", + "**NOTE** You *can* create a service directly from the registered model and image_configuration with the `Webservice.deploy_from_model()` function. We create the image here explicitly and use `deploy_from_image()` for two reasons:\n", + "\n", + "1. It provides more transparency in terms of the actual steps that are taking place\n", + "2. It has potential for faster iteration and for more portability. Once we have an image, we can create a new deployment with the exact same code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#configure ACI\n", + "myaci_config = AciWebservice.deploy_configuration(\n", + " cpu_cores = 2, \n", + " memory_gb = 2, \n", + " tags = {'name':'Azure ML ACI for LightGBM', 'algorithm':'LightGBM'}, \n", + " description = 'Light GBM ACI.')\n", + "\n", + "# Webservice creation\n", + "myservice = Webservice.deploy_from_image(\n", + " workspace=ws, \n", + " name=service_name,\n", + " image=image,\n", + " deployment_config = myaci_config\n", + " )\n", + "\n", + "myservice.wait_for_deployment(show_output=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### View the URI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#for using the Web HTTP API \n", + "print(myservice.scoring_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Test the Service\n", + "\n", + "Next, we can use data from the `test` data to test the service.\n", + "\n", + "The service expects JSON as its payload, so we take the test data, fill missing values, convert to JSON, then submit to the service endpoint.\n", + "\n", + "We have to fill in missing values here to create the data, because the webservice expects that the data coming into the webservice is well-formed. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "n_samples_to_test = 10\n", + "\n", + "## load the table created in the other notebook:\n", + "test=spark.table('test')\n", + "test_for_service_df = test.drop('features').fillna('M').fillna(0).limit(n_samples_to_test)\n", + "display(test_for_service_df)\n", + "test_json = json.dumps(test_for_service_df.toJSON().collect())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run the Service and Parse the Output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## The prediction is the predicted probability of a click for that particular record\n", + "service_out = myservice.run(input_data=test_json)\n", + "print(service_out)\n", + "values=json.loads('['+service_out[0]+']')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete the Service\n", + "\n", + "When you are done, you can delete the service to minimize costs. You can always redeploy from the image using the same command above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## Uncomment the following line to delete the web service\n", + "# myservice.delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Additional Resources\n", + "\n", + "- See the notebook for model estimation [here](https://github.com/Microsoft/Recommenders/blob/gramhagen/lgbm_scenario/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb).\n", + "- This notebook is adapted from the notebooks [here](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/azure-databricks/amlsdk/).\n", + "- See an example of leveraging the image on AKS [here](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/azure-databricks/amlsdk/deploy-to-aks-existingimage-05.ipynb).\n" + ] + } + ], + "metadata": { + "authors": [ + { + "name": "pasha" + } + ], + "kernelspec": { + "display_name": "Python (reco_base)", + "language": "python", + "name": "reco_base" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "name": "deploy-to-aci-04", + "notebookId": 2571086681627427 + }, + "nbformat": 4, + "nbformat_minor": 1 +} From 64d02b1ceb00d240794eef36cc3c7096616785d6 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 26 Mar 2019 12:15:56 +0000 Subject: [PATCH 13/30] refine lgbm notebook --- .../02_model/mmlspark_lightgbm_criteo.ipynb | 154 ++++++++++++++++-- 1 file changed, 137 insertions(+), 17 deletions(-) diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index b2ad82925d..3edea26950 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -59,7 +59,6 @@ "import sys\n", "import shutil\n", "\n", - "from tempfile import TemporaryDirectory\n", "sys.path.append(\"../../\")\n", "\n", "import pyspark\n", @@ -77,6 +76,7 @@ " from scripts.databricks_install import MMLSPARK_INFO\n", " packages = [MMLSPARK_INFO['maven']['coordinates']]\n", " spark = 
start_or_get_spark(packages=packages)\n", + " dbutils = None\n", "\n", "from mmlspark import ComputeModelStatistics\n", "from mmlspark import LightGBMClassifier\n", @@ -98,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": { "tags": [ "parameters" @@ -111,19 +111,139 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "8.79MB [00:02, 3.62MB/s] \n" + "8.79MB [00:00, 16.8MB/s] \n" ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
labelint00int01int02int03int04int05int06int07int08...cat16cat17cat18cat19cat20cat21cat22cat23cat24cat25
00115013824152181...e5ba7672f54016b921ddcdc9b1252a9d07b5194cNone3a171ecbc5c50484e8b834079727dd16
10204411028224...07c540c4b04e467021ddcdc95840adea60f6221eNone3a171ecb43f13e8be8b83407731c3655
\n", + "

2 rows × 40 columns

\n", + "
" + ], + "text/plain": [ + " label int00 int01 int02 int03 int04 int05 int06 int07 int08 ... \\\n", + "0 0 1 1 5 0 1382 4 15 2 181 ... \n", + "1 0 2 0 44 1 102 8 2 2 4 ... \n", + "\n", + " cat16 cat17 cat18 cat19 cat20 cat21 cat22 cat23 \\\n", + "0 e5ba7672 f54016b9 21ddcdc9 b1252a9d 07b5194c None 3a171ecb c5c50484 \n", + "1 07c540c4 b04e4670 21ddcdc9 5840adea 60f6221e None 3a171ecb 43f13e8b \n", + "\n", + " cat24 cat25 \n", + "0 e8b83407 9727dd16 \n", + "1 e8b83407 731c3655 \n", + "\n", + "[2 rows x 40 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "raw_data = load_spark_df(size=DATA_SIZE, spark=spark, dbutils=dbutils)" + "raw_data = load_spark_df(size=DATA_SIZE, spark=spark, dbutils=dbutils)\n", + "# visualize data\n", + "raw_data.limit(2).toPandas().head()" ] }, { @@ -138,7 +258,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -147,7 +267,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -157,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -167,7 +287,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -178,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -205,7 +325,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": { "tags": [ "parameters" @@ -222,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -250,7 +370,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -260,7 +380,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -270,7 +390,7 @@ "+---------------+------------------+\n", "|evaluation_type| AUC|\n", "+---------------+------------------+\n", - "| Classification|0.6310011615829604|\n", + "| Classification|0.6626128531787244|\n", "+---------------+------------------+\n", "\n" ] @@ -297,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ From cb3339eda635ac664a6be4e8663e48ec4c448a7c Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 26 Mar 2019 12:48:18 +0000 Subject: [PATCH 14/30] reafctor --- .../02_model/mmlspark_lightgbm_criteo.ipynb | 86 +++++++++---------- 1 file changed, 40 insertions(+), 46 deletions(-) diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index 3edea26950..6bcd643f9a 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -86,19 +86,9 @@ "print(\"MMLSpark version: {}\".format(MMLSPARK_INFO['maven']['coordinates']))" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preparation\n", - "The Criteo Display Advertising Challenge (DAC) dataset [4] is a well-known industry benchmarking dataset for developing CTR prediction models, and is used frequently by research papers. 
The original dataset contains over 45M rows, but there is also a down-sampled dataset which has 100,000 rows (this can be used by setting DATA_SIZE = 'sample').

\n", - "The dataset contains 1 label column and 38 feature columns, where 13 columns are integer values (int00-int12) and 25 columns are categorical features (cat00-cat24).

\n", - "What the columns represent is not provided, but for this case we can consider the integer and categorical values as features representing the user and / or item content. The label is binary and indicates a user interaction with an item, so this is a useful dataset to demonstrate how to build a model that will predict likelihood of a user interacting with an item based on the user and item content features.\n" - ] - }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "tags": [ "parameters" @@ -106,19 +96,41 @@ }, "outputs": [], "source": [ - "DATA_SIZE = 'sample'\n" + "# Criteo data size, it can be \"sample\" or \"ful\"\n", + "DATA_SIZE = \"sample\"\n", + "\n", + "# LightGBM parameters\n", + "# More datails on parameters: https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters-Tuning.rst\n", + "NUM_LEAVES = 32\n", + "NUM_ITERATIONS = 30\n", + "LEARNING_RATE = 0.15\n", + "FEATURE_FRACTION = 0.8\n", + "EARLY_STOPPING_ROUND = 10\n", + "\n", + "# Model name\n", + "MODEL_NAME = 'finished.model'\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preparation\n", + "The Criteo Display Advertising Challenge (DAC) dataset [4] is a well-known industry benchmarking dataset for developing CTR prediction models, and is used frequently by research papers. The original dataset contains over 45M rows, but there is also a down-sampled dataset which has 100,000 rows (this can be used by setting DATA_SIZE = 'sample').

\n", + "The dataset contains 1 label column and 38 feature columns, where 13 columns are integer values (int00-int12) and 25 columns are categorical features (cat00-cat24).

\n", + "What the columns represent is not provided, but for this case we can consider the integer and categorical values as features representing the user and / or item content. The label is binary and indicates a user interaction with an item, so this is a useful dataset to demonstrate how to build a model that will predict likelihood of a user interacting with an item based on the user and item content features.\n" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "8.79MB [00:00, 16.8MB/s] \n" + "8.79MB [00:00, 35.5MB/s] \n" ] }, { @@ -235,7 +247,7 @@ "[2 rows x 40 columns]" ] }, - "execution_count": 6, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -258,7 +270,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -267,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -277,7 +289,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -287,7 +299,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -298,7 +310,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -315,7 +327,7 @@ "In MML Spark the LightGBM implementation for binary classification is invoked using the LightGBMClassifier class and specifying the objective as 'binary'. In this instance the occurrence of positive labels is quite low, so setting the isUnbalance flag to true helps account for this imbalance.

\n", "\n", "### Hyper-parameters\n", - "Below are some of the key hyper-parameters \\[5\\] for training a LightGBM classifier on Spark\n", + "Below are some of the key [hyper-parameters](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters-Tuning.rst) for training a LightGBM classifier on Spark\n", "- numLeaves: the number of leaves in each tree\n", "- numIterations: the number of iterations to apply boosting\n", "- learningRate: the learning rate for training across trees\n", @@ -325,24 +337,7 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "NUM_LEAVES = 32\n", - "NUM_ITERATIONS = 30\n", - "LEARNING_RATE = 0.15\n", - "FEATURE_FRACTION = 0.8\n", - "EARLY_STOPPING_ROUND = 10" - ] - }, - { - "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -370,7 +365,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -380,7 +375,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -390,7 +385,7 @@ "+---------------+------------------+\n", "|evaluation_type| AUC|\n", "+---------------+------------------+\n", - "| Classification|0.6626128531787244|\n", + "| Classification|0.6591854866378591|\n", "+---------------+------------------+\n", "\n" ] @@ -417,14 +412,13 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# save model\n", - "model_name = 'finished.model'\n", "pipeline = PipelineModel(stages=[feature_processor, model])\n", - "pipeline.save(model_name)\n" + "pipeline.write().overwrite().save(MODEL_NAME)\n" ] }, { From 84bd4576ab11f6dd00c27e8540d3d38d141882e8 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Tue, 26 Mar 2019 13:11:23 +0000 Subject: [PATCH 15/30] :bug: --- notebooks/02_model/mmlspark_lightgbm_criteo.ipynb | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index 6bcd643f9a..825c84c8d4 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -57,7 +57,6 @@ "source": [ "import os\n", "import sys\n", - "import shutil\n", "\n", "sys.path.append(\"../../\")\n", "\n", @@ -96,7 +95,7 @@ }, "outputs": [], "source": [ - "# Criteo data size, it can be \"sample\" or \"ful\"\n", + "# Criteo data size, it can be \"sample\" or \"full\"\n", "DATA_SIZE = \"sample\"\n", "\n", "# LightGBM parameters\n", @@ -108,7 +107,7 @@ "EARLY_STOPPING_ROUND = 10\n", "\n", "# Model name\n", - "MODEL_NAME = 'finished.model'\n" + "MODEL_NAME = 'finished.model'" ] }, { From 3fa4bcc3300995b163246db3e775d04e18c77f44 Mon Sep 17 00:00:00 2001 From: Scott Graham Date: Tue, 26 Mar 2019 09:48:20 -0400 Subject: [PATCH 16/30] adding jar support for spark session builder and updating parameters and references in 02 lgbm notebook --- .../02_model/mmlspark_lightgbm_criteo.ipynb | 68 +++++++------------ reco_utils/common/spark_utils.py | 11 ++- 2 files changed, 33 insertions(+), 46 deletions(-) diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index 825c84c8d4..0c381ec0eb 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -47,7 +47,7 @@ "name": 
"stdout", "output_type": "stream", "text": [ - "System version: 3.6.7 | packaged by conda-forge | (default, Nov 21 2018, 03:09:43) \n", + "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n", "[GCC 7.3.0]\n", "PySpark version: 2.3.1\n", "MMLSpark version: Azure:mmlspark:0.16\n" @@ -68,11 +68,11 @@ "from reco_utils.common.notebook_utils import is_databricks\n", "from reco_utils.dataset.criteo import load_spark_df\n", "from reco_utils.dataset.spark_splitters import spark_random_split\n", + "from scripts.databricks_install import MMLSPARK_INFO\n", "\n", "# Setup MML Spark\n", "if not is_databricks():\n", " # get the maven coordinates for MML Spark from databricks_install script\n", - " from scripts.databricks_install import MMLSPARK_INFO\n", " packages = [MMLSPARK_INFO['maven']['coordinates']]\n", " spark = start_or_get_spark(packages=packages)\n", " dbutils = None\n", @@ -87,7 +87,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "tags": [ "parameters" @@ -99,10 +99,10 @@ "DATA_SIZE = \"sample\"\n", "\n", "# LightGBM parameters\n", - "# More datails on parameters: https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters-Tuning.rst\n", + "# More details on parameters: https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html\n", "NUM_LEAVES = 32\n", - "NUM_ITERATIONS = 30\n", - "LEARNING_RATE = 0.15\n", + "NUM_ITERATIONS = 50\n", + "LEARNING_RATE = 0.1\n", "FEATURE_FRACTION = 0.8\n", "EARLY_STOPPING_ROUND = 10\n", "\n", @@ -122,14 +122,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "8.79MB [00:00, 35.5MB/s] \n" + "8.79MB [00:02, 4.07MB/s] \n" ] }, { @@ -246,7 +246,7 @@ "[2 rows x 40 columns]" ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -269,7 +269,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -278,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -288,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -296,28 +296,6 @@ "test = feature_processor.transform(raw_test)" ] }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# save data to tables to finalize feature transformation\n", - "train.write.mode('overwrite').saveAsTable('train')\n", - "test.write.mode('overwrite').saveAsTable('test')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# load data from table\n", - "train = spark.table('train')\n", - "test = spark.table('test')" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -336,7 +314,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -347,11 +325,12 @@ " isUnbalance=True,\n", " boostingType='gbdt',\n", " boostFromAverage=True,\n", + " baggingSeed=42,\n", " numLeaves=NUM_LEAVES,\n", " numIterations=NUM_ITERATIONS,\n", " learningRate=LEARNING_RATE,\n", " featureFraction=FEATURE_FRACTION,\n", - " earlyStoppingRound=EARLY_STOPPING_ROUND,\n", + " earlyStoppingRound=EARLY_STOPPING_ROUND\n", ")" ] }, @@ -364,7 +343,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": 
{}, "outputs": [], "source": [ @@ -374,7 +353,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -384,7 +363,7 @@ "+---------------+------------------+\n", "|evaluation_type| AUC|\n", "+---------------+------------------+\n", - "| Classification|0.6591854866378591|\n", + "| Classification|0.6889596274427175|\n", "+---------------+------------------+\n", "\n" ] @@ -411,7 +390,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -428,16 +407,17 @@ "\\[1\\] Guolin Ke, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, and Tie-Yan Liu. 2017. LightGBM: A highly efficient gradient boosting decision tree. In Advances in Neural Information Processing Systems. 3146–3154. https://papers.nips.cc/paper/6907-lightgbm-a-highly-efficient-gradient-boosting-decision-tree.pdf
\n", "\\[2\\] MML Spark: https://mmlspark.blob.core.windows.net/website/index.html
\n", "\\[3\\] MML Spark Serving: https://github.com/Azure/mmlspark/blob/master/docs/mmlspark-serving.md
\n", - "\\[4\\] The Criteo dataset: http://labs.criteo.com/wp-content/uploads/2015/04/dac_sample.tar.gz
\n" + "\\[4\\] The Criteo dataset: http://labs.criteo.com/wp-content/uploads/2015/04/dac_sample.tar.gz
\n", + "\\[5\\] LightGBM Parameter Tuning: https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html
\n" ] } ], "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python (reco_pyspark)", + "display_name": "Python 3", "language": "python", - "name": "reco_pyspark" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -449,7 +429,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.7" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/reco_utils/common/spark_utils.py b/reco_utils/common/spark_utils.py index 2f79836c97..1e419fe7c2 100644 --- a/reco_utils/common/spark_utils.py +++ b/reco_utils/common/spark_utils.py @@ -2,6 +2,7 @@ # Licensed under the MIT License. import os +import sys try: @@ -10,7 +11,7 @@ SparkSession = None # skip this import if we are in pure python environment -def start_or_get_spark(app_name="Sample", url="local[*]", memory="10G", packages=None): +def start_or_get_spark(app_name="Sample", url="local[*]", memory="10G", packages=None, jars=None): """Start Spark if not started Args: @@ -22,8 +23,14 @@ def start_or_get_spark(app_name="Sample", url="local[*]", memory="10G", packages obj: Spark context. """ + submit_args = '' if packages is not None: - os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages {} pyspark-shell'.format(','.join(packages)) + submit_args = '--packages {} '.format(','.join(packages)) + if jars is not None: + submit_args += '--jars {} '.format(','.join(jars)) + + if submit_args: + os.environ['PYSPARK_SUBMIT_ARGS'] = '{}pyspark-shell'.format(submit_args) spark = ( SparkSession.builder.appName(app_name) From 7b3d2ca88e6bb5c7aa942a371492d87167b17122 Mon Sep 17 00:00:00 2001 From: Scott Graham Date: Tue, 26 Mar 2019 09:57:23 -0400 Subject: [PATCH 17/30] updating text on features in lgbm notebook --- notebooks/02_model/mmlspark_lightgbm_criteo.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index 0c381ec0eb..57d641e407 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -116,8 +116,8 @@ "source": [ "## Data Preparation\n", "The Criteo Display Advertising Challenge (DAC) dataset [4] is a well-known industry benchmarking dataset for developing CTR prediction models, and is used frequently by research papers. The original dataset contains over 45M rows, but there is also a down-sampled dataset which has 100,000 rows (this can be used by setting DATA_SIZE = 'sample').

\n", - "The dataset contains 1 label column and 38 feature columns, where 13 columns are integer values (int00-int12) and 25 columns are categorical features (cat00-cat24).

\n", - "What the columns represent is not provided, but for this case we can consider the integer and categorical values as features representing the user and / or item content. The label is binary and indicates a user interaction with an item, so this is a useful dataset to demonstrate how to build a model that will predict likelihood of a user interacting with an item based on the user and item content features.\n" + "The dataset contains 1 label column and 39 feature columns, where 13 columns are integer values (int00-int12) and 26 columns are categorical features (cat00-cat25).

\n", + "What the columns represent is not provided, but for this case we can consider the integer and categorical values as features representing the user and / or item content. The label is binary and is an example of implicit feedback indicating a user's interaction with an item. With this dataset we can demonstrate how to build a model that predicts the probability of a user interacting with an item based on available user and item content features.\n" ] }, { @@ -263,7 +263,7 @@ "source": [ "### Feature Processing\n", "The feature data provided has many missing values across both integer and categorical feature fields. In addition the categorical features have many distinct values, so effectively cleaning and representing the feature data is an important step prior to training a model.

\n", - "One of the simplest ways of managing both features that have missing values as well as high cardinality is to use the hashing trick. The FeatureHasher transformer will pass integer values through and will hash categorical features into a sparse vector of lower dimensionality which can be used effectively by LightGBM.

\n", + "One of the simplest ways of managing both features that have missing values as well as high cardinality is to use the hashing trick. The [FeatureHasher](http://spark.apache.org/docs/latest/ml-features.html#featurehasher) transformer will pass integer values through and will hash categorical features into a sparse vector of lower dimensionality which can be used effectively by LightGBM.

\n", "First the dataset is split randomly for training and testing and feature processing is applied to each dataset." ] }, From a7b3137d44d8336579fb33ded4a214f9fa644d29 Mon Sep 17 00:00:00 2001 From: Scott Graham Date: Tue, 26 Mar 2019 22:56:24 -0400 Subject: [PATCH 18/30] removing o16n notebook --- .../lightgbm_criteo_o16n.ipynb | 491 ------------------ 1 file changed, 491 deletions(-) delete mode 100644 notebooks/05_operationalize/lightgbm_criteo_o16n.ipynb diff --git a/notebooks/05_operationalize/lightgbm_criteo_o16n.ipynb b/notebooks/05_operationalize/lightgbm_criteo_o16n.ipynb deleted file mode 100644 index 79b1232610..0000000000 --- a/notebooks/05_operationalize/lightgbm_criteo_o16n.ipynb +++ /dev/null @@ -1,491 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Goal\n", - "\n", - "This notebook creates a real-time scoring service for the content-personalization model created in the prior [notebook](../02_model/mmlspark_lightgbm_criteo.ipynb). It is assumed that this notebook is run in an Azure Databricks environment that has had `mmlspark` installed and has been prepared for operationalization. See [Setup instructions](https://github.com/Microsoft/Recommenders/blob/master/SETUP.md) for details.\n", - "\n", - "**NOTE**: Please Register Azure Container Instance (ACI) using Azure Portal: https://docs.microsoft.com/en-us/azure/azure-resource-manager/resource-manager-supported-services#portal in your subscription before using the SDK to deploy your ML model to ACI." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Setup libraries and variables\n", - "\n", - "The next few cells initialize the environment and varibles: we import relevant libraries and set variables." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import uuid\n", - "import json\n", - "\n", - "from azureml.core import Workspace\n", - "from azureml.core import VERSION as azuremlversion\n", - "\n", - "from azureml.core.model import Model\n", - "from azureml.core.conda_dependencies import CondaDependencies \n", - "from azureml.core.webservice import Webservice, AciWebservice\n", - "from azureml.core.image import ContainerImage\n", - "\n", - "# Check core SDK version number\n", - "print(\"SDK version: {}\".format(azuremlversion))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# These variables are used to construct names of assets:\n", - "short_uuid = str(uuid.uuid4())[:4]\n", - "prefix = \"reco\" + short_uuid\n", - "data = \"criteo\"\n", - "algo = \"lgbm\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Azure subscription\n", - "subscription_id = ''\n", - "\n", - "# Resource group and workspace\n", - "resource_group = prefix + \"_\" + data\n", - "workspace_name = prefix + \"_\"+data+\"_aml\"\n", - "workspace_region = \"westus2\"\n", - "print(\"Resource group:\", resource_group)\n", - "\n", - "# AzureML\n", - "#NOTE: The name of a asset must be only letters or numerals, not contain spaces, and under 30 characters\n", - "model_name = data+\"-\"+algo+\".model\" \n", - "service_name = data + \"-\" + algo\n", - "\n", - "# add a name for the container\n", - "container_image_name = '-'.join([data, algo])\n", - "\n", - "\n", - "## locations for serializing so it persists. This is a local API URL\n", - "ws_config_path = '/dbfs/FileStore'\n", - "## location of model on **dbfs**:\n", - "model_path = os.path.join('dbfs:/FileStore/dac',model_name)\n", - "## path to the notebook for modeling. Assumes the entire repository has been imported:\n", - "modeling_notebook = '../02_model/mmlspark_lightgbm_criteo'\n", - "\n", - "## names of other files that are used below\n", - "my_conda_file = \"deploy_conda.yml\"\n", - "driver_file = \"score_sparkml.py\"\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepare Assets for the Scoring Service\n", - "\n", - "Before walking through the steps taken to create a model, it is useful to set some context. In our example, a \"scoring service\" is a function that is executed by a docker container. It takes in some number of records and produces a set of scores for each record (usually predictions of some type) based on a previously estimated model. In our case, we will take the model we estimated earlier that predicts the probability of a click based on some set of numeric and categorical features. In order to create a scoring service, we will do several steps.\n", - "\n", - "We will:\n", - "\n", - "1. Create an Azure Machine Learning Workspace to simplify all the subsequent steps.\n", - "2. Make sure we have access to the previously estimated model. If we are working on a spark system, that means we will make sure the model is on the local filesystem (**not** DBFS) and registered with the Azure Machine Learning Service.\n", - "3. Define a 'driver' script that defines what the system needs to do in order to generate our predictions. This script needs to have an `init` method that does one-time initialization and a `run` method that is executed each time the service is called.\n", - "4. 
Define all the prerequisites that the script requires.\n",
-    "5. Use the model, the driver script, and the prerequisites to create a docker image.\n",
-    "6. We will run the docker image on a platform (in our case Azure Container Instance or ACI).\n",
-    "7. We will test our service."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 1. Create a Workspace"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "ws = Workspace.create(name = workspace_name,\n",
-    "                      subscription_id = subscription_id,\n",
-    "                      resource_group = resource_group, \n",
-    "                      location = workspace_region,\n",
-    "                      exist_ok=True)\n",
-    "\n",
-    "# persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n",
-    "ws.write_config(ws_config_path)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 2. Prepare the Serialized Model\n",
-    "\n",
-    "First, we will prepare the serialized model. We will make sure the model exists, and if it doesn't, then we will run the notebook to generate the file."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "## if it doesn't exist, run the relevant notebook:\n",
-    "if not os.path.exists(model_path.replace('dbfs:','/dbfs')):\n",
-    "    print('Model pipeline does not exist. Creating by running {}'.format(modeling_notebook))\n",
-    "    dbutils.notebook.run(modeling_notebook, timeout_seconds=600)\n",
-    "else:\n",
-    "    print('Operationalizing model found at: {}'.format(model_path))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Copy from dbfs to local\n",
-    "\n",
-    "While you can access files on DBFS with local file APIs, it is better practice to explicitly copy saved models to and from dbfs, because the local file APIs can only access files smaller than 2 GB (see details [here](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html#access-dbfs-using-local-file-apis)). \n",
-    "\n",
-    "Model deployment will always get the model from the current working directory."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "model_local = \"file:\" + os.getcwd() + \"/\" + model_name\n",
-    "dbutils.fs.cp(model_path, model_local, True)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Register the Model\n",
-    "\n",
-    "Next, we need to register the model in the Azure Machine Learning Workspace."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#Register the model\n",
-    "mymodel = Model.register(model_path = model_name, # this points to a local file\n",
-    "                       model_name = model_name, # this is the name the model is registered as\n",
-    "                       description = \"LightGBM Criteo Model\",\n",
-    "                       workspace = ws)\n",
-    "\n",
-    "print(mymodel.name, mymodel.description, mymodel.version)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 3. Create the Driver Script\n",
-    "\n",
-    "Next, we need to create the driver script that will be executed when the service is called. The functions that need to be defined for scoring are `init()` and `run()`. 
The `init()` function is run when the service is created, and the `run()` function is run each time the service is called.\n",
-    "\n",
-    "In our example, we use the `init()` function to load all the libraries, initialize the spark session, and load the model and pipeline. We use the `run()` method to parse the input JSON, generate predictions (in this case the probability of a click), and format the output."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "score_sparkml = \"\"\"\n",
-    "\n",
-    "import json\n",
-    " \n",
-    "def init():\n",
-    "    # One-time initialization of PySpark and predictive model\n",
-    "    import pyspark\n",
-    "    from pyspark.ml import PipelineModel\n",
-    "    from mmlspark import LightGBMClassifier\n",
-    "    from azureml.core.model import Model\n",
-    "    from pyspark.sql.types import StructType, StructField, IntegerType, StringType\n",
-    "\n",
-    "    global trainedModel\n",
-    "    global spark\n",
-    "    global schema\n",
-    "\n",
-    "    spark = pyspark.sql.SparkSession.builder.appName(\"LightGBM Criteo Predictions\").getOrCreate()\n",
-    "    model_name = \"{model_name}\" \n",
-    "    model_path = Model.get_model_path(model_name)\n",
-    "    trainedModel = PipelineModel.load(model_path)\n",
-    "    \n",
-    "def run(input_json):\n",
-    "    if isinstance(trainedModel, Exception):\n",
-    "        return json.dumps({{\"trainedModel\":str(trainedModel)}})\n",
-    "    \n",
-    "    try:\n",
-    "        sc = spark.sparkContext\n",
-    "        input_list = json.loads(input_json)\n",
-    "        input_rdd = sc.parallelize(input_list)\n",
-    "        input_df = spark.read.json(input_rdd)\n",
-    "    \n",
-    "        # Compute prediction\n",
-    "        predictions = trainedModel.transform(input_df).collect()\n",
-    "        # Get the probability of a click for each row and convert it to a string\n",
-    "        click_prob = [str(x.probability[1]) for x in predictions]\n",
-    "\n",
-    "        # you can return any data type as long as it is JSON-serializable\n",
-    "        result = \",\".join(click_prob)\n",
-    "        return [result]\n",
-    "    except Exception as e:\n",
-    "        result = str(e)\n",
-    "        return result\n",
-    "\"\"\".format(model_name=model_name)\n",
-    " \n",
-    "exec(score_sparkml)\n",
-    " \n",
-    "with open(driver_file, \"w\") as file:\n",
-    "    file.write(score_sparkml)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 4. Define Dependencies\n",
-    "\n",
-    "Next, we define the dependencies that are required by the driver script."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "## azureml-sdk is required to load the registered model\n",
-    "myconda = CondaDependencies.create(pip_packages=['azureml-sdk'])\n",
-    "with open(my_conda_file,\"w\") as f:\n",
-    "    f.write(myconda.serialize_to_string())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 5. Create the Image\n",
-    "\n",
-    "We use the `ContainerImage` class first to configure and then to create the docker image. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "myimage_config = ContainerImage.image_configuration(execution_script = driver_file, \n", - " runtime = \"spark-py\",\n", - " conda_file=my_conda_file,\n", - " tags={\"runtime\":\"pyspark\", \"algorithm\":\"lightgbm\"})\n", - "\n", - "image = ContainerImage.create(name = service_name,\n", - " models = [mymodel],\n", - " image_config = myimage_config,\n", - " workspace = ws)\n", - "\n", - "image.wait_for_creation(show_output = True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Create the Service\n", - "\n", - "Once we have created an image, we configure and run it on ACI.\n", - "\n", - "**NOTE** You *can* create a service directly from the registered model and image_configuration with the `Webservice.deploy_from_model()` function. We create the image here explicitly and use `deploy_from_image()` for two reasons:\n", - "\n", - "1. It provides more transparency in terms of the actual steps that are taking place\n", - "2. It has potential for faster iteration and for more portability. Once we have an image, we can create a new deployment with the exact same code." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#configure ACI\n", - "myaci_config = AciWebservice.deploy_configuration(\n", - " cpu_cores = 2, \n", - " memory_gb = 2, \n", - " tags = {'name':'Azure ML ACI for LightGBM', 'algorithm':'LightGBM'}, \n", - " description = 'Light GBM ACI.')\n", - "\n", - "# Webservice creation\n", - "myservice = Webservice.deploy_from_image(\n", - " workspace=ws, \n", - " name=service_name,\n", - " image=image,\n", - " deployment_config = myaci_config\n", - " )\n", - "\n", - "myservice.wait_for_deployment(show_output=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### View the URI" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#for using the Web HTTP API \n", - "print(myservice.scoring_uri)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 7. Test the Service\n", - "\n", - "Next, we can use data from the `test` data to test the service.\n", - "\n", - "The service expects JSON as its payload, so we take the test data, fill missing values, convert to JSON, then submit to the service endpoint.\n", - "\n", - "We have to fill in missing values here to create the data, because the webservice expects that the data coming into the webservice is well-formed. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "n_samples_to_test = 10\n", - "\n", - "## load the table created in the other notebook:\n", - "test=spark.table('test')\n", - "test_for_service_df = test.drop('features').fillna('M').fillna(0).limit(n_samples_to_test)\n", - "display(test_for_service_df)\n", - "test_json = json.dumps(test_for_service_df.toJSON().collect())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run the Service and Parse the Output" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## The prediction is the predicted probability of a click for that particular record\n", - "service_out = myservice.run(input_data=test_json)\n", - "print(service_out)\n", - "values=json.loads('['+service_out[0]+']')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Delete the Service\n", - "\n", - "When you are done, you can delete the service to minimize costs. You can always redeploy from the image using the same command above." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Uncomment the following line to delete the web service\n", - "# myservice.delete()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Additional Resources\n", - "\n", - "- See the notebook for model estimation [here](https://github.com/Microsoft/Recommenders/blob/gramhagen/lgbm_scenario/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb).\n", - "- This notebook is adapted from the notebooks [here](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/azure-databricks/amlsdk/).\n", - "- See an example of leveraging the image on AKS [here](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/azure-databricks/amlsdk/deploy-to-aks-existingimage-05.ipynb).\n" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "pasha" - } - ], - "kernelspec": { - "display_name": "Python (reco_base)", - "language": "python", - "name": "reco_base" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - }, - "name": "deploy-to-aci-04", - "notebookId": 2571086681627427 - }, - "nbformat": 4, - "nbformat_minor": 1 -} From 24358912decdf7051d7446c795021a893e739b43 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Wed, 27 Mar 2019 11:02:02 +0000 Subject: [PATCH 19/30] tests --- .../02_model/mmlspark_lightgbm_criteo.ipynb | 50 ++++++++++++------- reco_utils/common/spark_utils.py | 4 +- scripts/generate_conda_file.py | 3 +- tests/conftest.py | 3 ++ tests/integration/test_notebooks_pyspark.py | 20 ++++++++ tests/smoke/test_notebooks_pyspark.py | 22 +++++++- tests/unit/test_notebooks_pyspark.py | 16 +++++- 7 files changed, 94 insertions(+), 24 deletions(-) diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index 57d641e407..4b40eac7c6 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -47,7 +47,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n", + "System version: 3.6.7 | packaged by 
conda-forge | (default, Nov 21 2018, 03:09:43) \n", "[GCC 7.3.0]\n", "PySpark version: 2.3.1\n", "MMLSpark version: Azure:mmlspark:0.16\n" @@ -63,6 +63,7 @@ "import pyspark\n", "from pyspark.ml import PipelineModel\n", "from pyspark.ml.feature import FeatureHasher\n", + "import papermill as pm\n", "\n", "from reco_utils.common.spark_utils import start_or_get_spark\n", "from reco_utils.common.notebook_utils import is_databricks\n", @@ -87,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "tags": [ "parameters" @@ -115,21 +116,21 @@ "metadata": {}, "source": [ "## Data Preparation\n", - "The Criteo Display Advertising Challenge (DAC) dataset [4] is a well-known industry benchmarking dataset for developing CTR prediction models, and is used frequently by research papers. The original dataset contains over 45M rows, but there is also a down-sampled dataset which has 100,000 rows (this can be used by setting DATA_SIZE = 'sample').

\n", + "The [Criteo Display Advertising Challenge](https://www.kaggle.com/c/criteo-display-ad-challenge) (DAC) dataset [4] is a well-known industry benchmarking dataset for developing CTR prediction models, and is used frequently by research papers. The original dataset contains over 45M rows, but there is also a down-sampled dataset which has 100,000 rows (this can be used by setting DATA_SIZE = 'sample').

\n", "The dataset contains 1 label column and 39 feature columns, where 13 columns are integer values (int00-int12) and 26 columns are categorical features (cat00-cat25).

\n", "What the columns represent is not provided, but for this case we can consider the integer and categorical values as features representing the user and / or item content. The label is binary and is an example of implicit feedback indicating a user's interaction with an item. With this dataset we can demonstrate how to build a model that predicts the probability of a user interacting with an item based on available user and item content features.\n" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "8.79MB [00:02, 4.07MB/s] \n" + "8.79MB [00:00, 35.6MB/s] \n" ] }, { @@ -246,7 +247,7 @@ "[2 rows x 40 columns]" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -269,7 +270,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -278,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -288,7 +289,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -314,7 +315,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -343,7 +344,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -353,7 +354,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -363,7 +364,7 @@ "+---------------+------------------+\n", "|evaluation_type| AUC|\n", "+---------------+------------------+\n", - "| Classification|0.6889596274427175|\n", + "| Classification|0.6609632610671168|\n", "+---------------+------------------+\n", "\n" ] @@ -377,7 +378,19 @@ " .setEvaluationMetric(\"AUC\")\n", ")\n", "\n", - "evaluator.transform(predictions).show()" + "result = evaluator.transform(predictions)\n", + "auc = result.select(\"AUC\").collect()[0][0]\n", + "result.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Record results with papermill for tests\n", + "pm.record(\"auc\", auc)" ] }, { @@ -407,17 +420,16 @@ "\\[1\\] Guolin Ke, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, and Tie-Yan Liu. 2017. LightGBM: A highly efficient gradient boosting decision tree. In Advances in Neural Information Processing Systems. 3146–3154. https://papers.nips.cc/paper/6907-lightgbm-a-highly-efficient-gradient-boosting-decision-tree.pdf
\n", "\\[2\\] MML Spark: https://mmlspark.blob.core.windows.net/website/index.html
\n", "\\[3\\] MML Spark Serving: https://github.com/Azure/mmlspark/blob/master/docs/mmlspark-serving.md
\n", - "\\[4\\] The Criteo dataset: http://labs.criteo.com/wp-content/uploads/2015/04/dac_sample.tar.gz
\n", - "\\[5\\] LightGBM Parameter Tuning: https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html
\n" + "\\[4\\] The Criteo dataset: http://labs.criteo.com/wp-content/uploads/2015/04/dac_sample.tar.gz
" ] } ], "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python (reco_pyspark)", "language": "python", - "name": "python3" + "name": "reco_pyspark" }, "language_info": { "codemirror_mode": { @@ -429,7 +441,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.7" } }, "nbformat": 4, diff --git a/reco_utils/common/spark_utils.py b/reco_utils/common/spark_utils.py index 1e419fe7c2..adb4d975b5 100644 --- a/reco_utils/common/spark_utils.py +++ b/reco_utils/common/spark_utils.py @@ -8,7 +8,7 @@ try: from pyspark.sql import SparkSession except ImportError: - SparkSession = None # skip this import if we are in pure python environment + pass # skip this import if we are in pure python environment def start_or_get_spark(app_name="Sample", url="local[*]", memory="10G", packages=None, jars=None): @@ -19,6 +19,8 @@ def start_or_get_spark(app_name="Sample", url="local[*]", memory="10G", packages url (str): URL for spark master memory (str): Size of memory for spark driver packages (list): list of packages to install + jars (list): list of jar files to add + Returns: obj: Spark context. """ diff --git a/scripts/generate_conda_file.py b/scripts/generate_conda_file.py index 6115eee69f..31cce147e3 100644 --- a/scripts/generate_conda_file.py +++ b/scripts/generate_conda_file.py @@ -68,7 +68,7 @@ "azure-storage": "azure-storage>=0.36.0", "black": "black>=18.6b4", "category_encoders": "category_encoders>=1.3.0", - "databricks-cli": "databricks-cli==0.8.4", + "databricks-cli": "databricks-cli==0.8.6", "dataclasses": "dataclasses>=0.6", "fastai": "fastai==1.0.46", "hyperopt": "hyperopt==0.1.1", @@ -78,7 +78,6 @@ "papermill": "papermill>=0.15.0", "pydocumentdb": "pydocumentdb>=2.3.3", "tqdm": "tqdm==4.31.1", - "databricks-cli": "databricks-cli>=0.8.6", } PIP_PYSPARK = {} diff --git a/tests/conftest.py b/tests/conftest.py index f64fd8a67a..3ccd16629f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -236,6 +236,9 @@ def notebooks(): "vowpal_wabbit_deep_dive": os.path.join( folder_notebooks, "02_model", "vowpal_wabbit_deep_dive.ipynb" ), + "mmlspark_lightgbm_criteo": os.path.join( + folder_notebooks, "02_model", "mmlspark_lightgbm_criteo.ipynb" + ), "evaluation": os.path.join(folder_notebooks, "03_evaluate", "evaluation.ipynb"), "spark_tuning": os.path.join( folder_notebooks, "04_model_select_and_optimize", "tuning_spark_als.ipynb" diff --git a/tests/integration/test_notebooks_pyspark.py b/tests/integration/test_notebooks_pyspark.py index d5690eefbd..5825e07799 100644 --- a/tests/integration/test_notebooks_pyspark.py +++ b/tests/integration/test_notebooks_pyspark.py @@ -33,3 +33,23 @@ def test_als_pyspark_integration(notebooks): assert results["mae"] == pytest.approx(0.68023, rel=TOL, abs=ABS_TOL) assert results["exp_var"] == pytest.approx(0.4094, rel=TOL, abs=ABS_TOL) assert results["rsquared"] == pytest.approx(0.4038, rel=TOL, abs=ABS_TOL) + + +@pytest.mark.spark +@pytest.mark.integration +@pytest.mark.skip(reason="It takes too long in the current test machine") +def test_mmlspark_lightgbm_criteo_integration(notebooks): + notebook_path = notebooks["mmlspark_lightgbm_criteo"] + pm.execute_notebook( + notebook_path, + OUTPUT_NOTEBOOK, + kernel_name=KERNEL_NAME, + parameters=dict( + DATA_SIZE="full", + NUM_ITERATIONS=50, + EARLY_STOPPING_ROUND=10 + ) + ) + nb = pm.read_notebook(OUTPUT_NOTEBOOK) + results = nb.dataframe.set_index("name")["value"] + # assert results["auc"] == 
pytest.approx(0.68895, rel=TOL, abs=ABS_TOL) \ No newline at end of file diff --git a/tests/smoke/test_notebooks_pyspark.py b/tests/smoke/test_notebooks_pyspark.py index 25468c6524..68e79759db 100644 --- a/tests/smoke/test_notebooks_pyspark.py +++ b/tests/smoke/test_notebooks_pyspark.py @@ -32,4 +32,24 @@ def test_als_pyspark_smoke(notebooks): assert results["rmse"] == pytest.approx(0.9636, rel=TOL, abs=ABS_TOL) assert results["mae"] == pytest.approx(0.7508, rel=TOL, abs=ABS_TOL) assert results["exp_var"] == pytest.approx(0.2672, rel=TOL, abs=ABS_TOL) - assert results["rsquared"] == pytest.approx(0.2611, rel=TOL, abs=ABS_TOL) \ No newline at end of file + assert results["rsquared"] == pytest.approx(0.2611, rel=TOL, abs=ABS_TOL) + + +@pytest.mark.notebooks +@pytest.mark.spark +def test_mmlspark_lightgbm_criteo_smoke(notebooks): + notebook_path = notebooks["mmlspark_lightgbm_criteo"] + pm.execute_notebook( + notebook_path, + OUTPUT_NOTEBOOK, + kernel_name=KERNEL_NAME, + parameters=dict( + DATA_SIZE="sample", + NUM_ITERATIONS=50, + EARLY_STOPPING_ROUND=10 + ) + ) + nb = pm.read_notebook(OUTPUT_NOTEBOOK) + results = nb.dataframe.set_index("name")["value"] + assert results["auc"] == pytest.approx(0.68895, rel=TOL, abs=ABS_TOL) + \ No newline at end of file diff --git a/tests/unit/test_notebooks_pyspark.py b/tests/unit/test_notebooks_pyspark.py index 995dee73e1..8c284687a7 100644 --- a/tests/unit/test_notebooks_pyspark.py +++ b/tests/unit/test_notebooks_pyspark.py @@ -34,7 +34,6 @@ def test_evaluation_runs(notebooks): pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME) - @pytest.mark.notebooks @pytest.mark.spark def test_spark_tuning(notebooks): @@ -51,3 +50,18 @@ def test_spark_tuning(notebooks): ) ) + +@pytest.mark.notebooks +@pytest.mark.spark +def test_mmlspark_lightgbm_criteo_runs(notebooks): + notebook_path = notebooks["mmlspark_lightgbm_criteo"] + pm.execute_notebook( + notebook_path, + OUTPUT_NOTEBOOK, + kernel_name=KERNEL_NAME, + parameters=dict( + DATA_SIZE="sample", + NUM_ITERATIONS=10, + EARLY_STOPPING_ROUND=2 + ) + ) \ No newline at end of file From 1fc0a1b192c935f05120438d532e46fb4bb86759 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Wed, 27 Mar 2019 11:04:39 +0000 Subject: [PATCH 20/30] :bug: --- tests/smoke/test_notebooks_pyspark.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/smoke/test_notebooks_pyspark.py b/tests/smoke/test_notebooks_pyspark.py index 68e79759db..f432d8ff5d 100644 --- a/tests/smoke/test_notebooks_pyspark.py +++ b/tests/smoke/test_notebooks_pyspark.py @@ -35,7 +35,7 @@ def test_als_pyspark_smoke(notebooks): assert results["rsquared"] == pytest.approx(0.2611, rel=TOL, abs=ABS_TOL) -@pytest.mark.notebooks +@pytest.mark.smoke @pytest.mark.spark def test_mmlspark_lightgbm_criteo_smoke(notebooks): notebook_path = notebooks["mmlspark_lightgbm_criteo"] @@ -52,4 +52,3 @@ def test_mmlspark_lightgbm_criteo_smoke(notebooks): nb = pm.read_notebook(OUTPUT_NOTEBOOK) results = nb.dataframe.set_index("name")["value"] assert results["auc"] == pytest.approx(0.68895, rel=TOL, abs=ABS_TOL) - \ No newline at end of file From 977070105209ad90014feb520191b13cfc6fda34 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Wed, 27 Mar 2019 12:10:13 +0000 Subject: [PATCH 21/30] fixed :bug: in DB --- notebooks/02_model/mmlspark_lightgbm_criteo.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index 
4b40eac7c6..9671bcc83b 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -47,10 +47,10 @@ "name": "stdout", "output_type": "stream", "text": [ + "MMLSpark version: Azure:mmlspark:0.16\n", "System version: 3.6.7 | packaged by conda-forge | (default, Nov 21 2018, 03:09:43) \n", "[GCC 7.3.0]\n", - "PySpark version: 2.3.1\n", - "MMLSpark version: Azure:mmlspark:0.16\n" + "PySpark version: 2.3.1\n" ] } ], @@ -69,21 +69,21 @@ "from reco_utils.common.notebook_utils import is_databricks\n", "from reco_utils.dataset.criteo import load_spark_df\n", "from reco_utils.dataset.spark_splitters import spark_random_split\n", - "from scripts.databricks_install import MMLSPARK_INFO\n", "\n", "# Setup MML Spark\n", "if not is_databricks():\n", " # get the maven coordinates for MML Spark from databricks_install script\n", + " from scripts.databricks_install import MMLSPARK_INFO\n", " packages = [MMLSPARK_INFO['maven']['coordinates']]\n", " spark = start_or_get_spark(packages=packages)\n", " dbutils = None\n", + " print(\"MMLSpark version: {}\".format(MMLSPARK_INFO['maven']['coordinates']))\n", "\n", "from mmlspark import ComputeModelStatistics\n", "from mmlspark import LightGBMClassifier\n", "\n", "print(\"System version: {}\".format(sys.version))\n", - "print(\"PySpark version: {}\".format(pyspark.version.__version__))\n", - "print(\"MMLSpark version: {}\".format(MMLSPARK_INFO['maven']['coordinates']))" + "print(\"PySpark version: {}\".format(pyspark.version.__version__))\n" ] }, { From 83089d840f6eb8df9fefbdde154d2b107efd7e07 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Wed, 27 Mar 2019 17:12:03 +0000 Subject: [PATCH 22/30] :bug: fix and new version of MMLSpark by @eisber --- .../02_model/mmlspark_lightgbm_criteo.ipynb | 31 +++++++++++++------ reco_utils/common/spark_utils.py | 14 +++++++-- scripts/databricks_install.py | 4 ++- 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index 9671bcc83b..68e1d328c1 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -47,10 +47,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "MMLSpark version: Azure:mmlspark:0.16\n", - "System version: 3.6.7 | packaged by conda-forge | (default, Nov 21 2018, 03:09:43) \n", + "MMLSpark version: com.microsoft.ml.spark:mmlspark_2.11:0.16.dev8+2.g6a5318b\n", + "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n", "[GCC 7.3.0]\n", - "PySpark version: 2.3.1\n" + "PySpark version: 2.3.0\n" ] } ], @@ -74,8 +74,9 @@ "if not is_databricks():\n", " # get the maven coordinates for MML Spark from databricks_install script\n", " from scripts.databricks_install import MMLSPARK_INFO\n", - " packages = [MMLSPARK_INFO['maven']['coordinates']]\n", - " spark = start_or_get_spark(packages=packages)\n", + " packages = [MMLSPARK_INFO[\"maven\"][\"coordinates\"]]\n", + " repos = [MMLSPARK_INFO[\"maven\"][\"repositories\"]]\n", + " spark = start_or_get_spark(packages=packages, repositories=repos)\n", " dbutils = None\n", " print(\"MMLSpark version: {}\".format(MMLSPARK_INFO['maven']['coordinates']))\n", "\n", @@ -130,7 +131,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "8.79MB [00:00, 35.6MB/s] \n" + "8.79MB [00:00, 32.6MB/s] \n" ] }, { @@ -364,7 +365,7 @@ "+---------------+------------------+\n", "|evaluation_type| AUC|\n", 
"+---------------+------------------+\n", - "| Classification|0.6609632610671168|\n", + "| Classification|0.6870253907336659|\n", "+---------------+------------------+\n", "\n" ] @@ -385,9 +386,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/papermill.record+json": { + "auc": 0.6870253907336659 + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# Record results with papermill for tests\n", "pm.record(\"auc\", auc)" @@ -441,7 +452,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.7" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/reco_utils/common/spark_utils.py b/reco_utils/common/spark_utils.py index adb4d975b5..f9029dca3c 100644 --- a/reco_utils/common/spark_utils.py +++ b/reco_utils/common/spark_utils.py @@ -11,7 +11,14 @@ pass # skip this import if we are in pure python environment -def start_or_get_spark(app_name="Sample", url="local[*]", memory="10G", packages=None, jars=None): +def start_or_get_spark( + app_name="Sample", + url="local[*]", + memory="10G", + packages=None, + jars=None, + repositories=None + ): """Start Spark if not started Args: @@ -30,9 +37,10 @@ def start_or_get_spark(app_name="Sample", url="local[*]", memory="10G", packages submit_args = '--packages {} '.format(','.join(packages)) if jars is not None: submit_args += '--jars {} '.format(','.join(jars)) - + if repositories is not None: + submit_args += "--repositories {}".format(",".join(repositories)) if submit_args: - os.environ['PYSPARK_SUBMIT_ARGS'] = '{}pyspark-shell'.format(submit_args) + os.environ['PYSPARK_SUBMIT_ARGS'] = '{} pyspark-shell'.format(submit_args) spark = ( SparkSession.builder.appName(app_name) diff --git a/scripts/databricks_install.py b/scripts/databricks_install.py index da753bcd70..efcdcc08e7 100644 --- a/scripts/databricks_install.py +++ b/scripts/databricks_install.py @@ -56,7 +56,9 @@ "pydocumentdb==2.3.3", ] -MMLSPARK_INFO = {"maven": {"coordinates": "Azure:mmlspark:0.16"}} +MMLSPARK_INFO = {"maven": {"coordinates": "com.microsoft.ml.spark:mmlspark_2.11:0.16.dev8+2.g6a5318b", + "repositories": "https://mmlspark.azureedge.net/maven"} + } DEFAULT_CLUSTER_CONFIG = { "cluster_name": "DB_CLUSTER", From 0d8557945b2c0ff81b27582778c626d57ecf08ed Mon Sep 17 00:00:00 2001 From: Scott Graham Date: Wed, 27 Mar 2019 15:35:58 -0400 Subject: [PATCH 23/30] updating model name and adding note about databricks autoscaling --- .../02_model/mmlspark_lightgbm_criteo.ipynb | 22 +++++++++++++++---- tests/unit/test_notebooks_pyspark.py | 4 ++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index 68e1d328c1..0e8f8c578c 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -38,6 +38,13 @@ "dependencies.
Run ```python scripts/databricks_install.py -h``` for more details." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note** - MML Spark should not be run on a cluster with autoscaling enabled. Disable the flag in the Azure Databricks Cluster configuration before running this notebook." + ] + }, { "cell_type": "code", "execution_count": 1, @@ -109,7 +116,7 @@ "EARLY_STOPPING_ROUND = 10\n", "\n", "# Model name\n", - "MODEL_NAME = 'finished.model'" + "MODEL_NAME = 'lightgbm_criteo.mml'" ] }, { @@ -427,20 +434,27 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Reference\n", + "## References\n", "\\[1\\] Guolin Ke, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, and Tie-Yan Liu. 2017. LightGBM: A highly efficient gradient boosting decision tree. In Advances in Neural Information Processing Systems. 3146–3154. https://papers.nips.cc/paper/6907-lightgbm-a-highly-efficient-gradient-boosting-decision-tree.pdf
\n", "\\[2\\] MML Spark: https://mmlspark.blob.core.windows.net/website/index.html
\n", "\\[3\\] MML Spark Serving: https://github.com/Azure/mmlspark/blob/master/docs/mmlspark-serving.md
\n", "\\[4\\] The Criteo dataset: http://labs.criteo.com/wp-content/uploads/2015/04/dac_sample.tar.gz
" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python (reco_pyspark)", + "display_name": "Python 3", "language": "python", - "name": "reco_pyspark" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/tests/unit/test_notebooks_pyspark.py b/tests/unit/test_notebooks_pyspark.py index 8c284687a7..72e9262b15 100644 --- a/tests/unit/test_notebooks_pyspark.py +++ b/tests/unit/test_notebooks_pyspark.py @@ -62,6 +62,6 @@ def test_mmlspark_lightgbm_criteo_runs(notebooks): parameters=dict( DATA_SIZE="sample", NUM_ITERATIONS=10, - EARLY_STOPPING_ROUND=2 + EARLY_STOPPING_ROUND=2, ) - ) \ No newline at end of file + ) From d23ca1d2dea5565db2c7919ce729194623ec264e Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Wed, 27 Mar 2019 21:23:33 +0000 Subject: [PATCH 24/30] docstrings --- reco_utils/common/spark_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/reco_utils/common/spark_utils.py b/reco_utils/common/spark_utils.py index f9029dca3c..2a04d975f7 100644 --- a/reco_utils/common/spark_utils.py +++ b/reco_utils/common/spark_utils.py @@ -27,6 +27,7 @@ def start_or_get_spark( memory (str): Size of memory for spark driver packages (list): list of packages to install jars (list): list of jar files to add + repositories (list): list of repositories Returns: obj: Spark context. From b10f7b6eb62d0c0c6e85c388b56ec909ad956d4d Mon Sep 17 00:00:00 2001 From: Andreas Date: Thu, 28 Mar 2019 15:27:15 +0000 Subject: [PATCH 25/30] LightGBM: unset SPARK_HOME in SETUP.md --- SETUP.md | 1 + 1 file changed, 1 insertion(+) diff --git a/SETUP.md b/SETUP.md index ea2b9575ed..773172d89c 100644 --- a/SETUP.md +++ b/SETUP.md @@ -94,6 +94,7 @@ To set these variables every time the environment is activated, we can follow th #!/bin/sh export PYSPARK_PYTHON=/anaconda/envs/reco_pyspark/bin/python export PYSPARK_DRIVER_PYTHON=/anaconda/envs/reco_pyspark/bin/python +unset SPARK_HOME ``` This will export the variables every time we do `conda activate reco_pyspark`. To unset these variables when we deactivate the environment, we create the file `/anaconda/envs/reco_pyspark/etc/conda/deactivate.d/env_vars.sh` and add: From 58e77375a349d288277aa351c4ee8770b1a06500 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20Gonz=C3=A1lez-Fierro?= <3491412+miguelgfierro@users.noreply.github.com> Date: Thu, 28 Mar 2019 19:21:45 +0000 Subject: [PATCH 26/30] Update README.md --- notebooks/02_model/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/notebooks/02_model/README.md b/notebooks/02_model/README.md index 86027a95f2..9a1897f91a 100644 --- a/notebooks/02_model/README.md +++ b/notebooks/02_model/README.md @@ -7,6 +7,7 @@ In this directory, notebooks are provided to give a deep dive into training mode | Notebook | Environment | Description | | --- | --- | --- | | [als_deep_dive](als_deep_dive.ipynb) | PySpark | Deep dive on the ALS algorithm and implementation. +| [mmlspark_lightgbm_criteo](mmlspark_lightgbm_criteo.ipynb) | PySpark | LightGBM gradient boosting tree algorithm implementation in MML Spark with Criteo dataset. | [baseline_deep_dive](baseline_deep_dive.ipynb) | --- | Deep dive on baseline performance estimation. | [ncf_deep_dive](ncf_deep_dive.ipynb) | Python CPU, GPU | Deep dive on a NCF algorithm and implementation. | [rbm_deep_dive](rbm_deep_dive.ipynb)| Python CPU, GPU | Deep dive on the rbm algorithm and its implementation. 
From 505039b2cca76ad301da0d64c94a37980720cd94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20Gonz=C3=A1lez-Fierro?= <3491412+miguelgfierro@users.noreply.github.com> Date: Thu, 28 Mar 2019 19:22:55 +0000 Subject: [PATCH 27/30] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 13a0c2e75a..5adc209476 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ The table below lists recommender algorithms available in the repository at the | [FastAI Embedding Dot Bias (FAST)](notebooks/00_quick_start/fastai_movielens.ipynb) | Python CPU / Python GPU | Collaborative Filtering | General purpose algorithm with embeddings and biases for users and items | | [Alternating Least Squares (ALS)](notebooks/00_quick_start/als_movielens.ipynb) | PySpark | Collaborative Filtering | Matrix factorization algorithm for explicit or implicit feedback in large datasets, optimized by Spark MLLib for scalability and distributed computing capability | | [Vowpal Wabbit Family (VW)*](notebooks/02_model/vowpal_wabbit_deep_dive.ipynb) | Python CPU (train online) | Collaborative, Content-Based Filtering | Fast online learning algorithms, great for scenarios where user features / context are constantly changing | -| [LightGBM/Gradient Boosting Tree*](notebooks/00_quick_start/lightgbm_tinycriteo.ipynb) | Python CPU | Content-Based Filtering | Gradient Boosting Tree algorithm for fast training and low memory usage in content-based problems | +| [LightGBM/Gradient Boosting Tree*](notebooks/00_quick_start/lightgbm_tinycriteo.ipynb) | Python CPU / PySpark | Content-Based Filtering | Gradient Boosting Tree algorithm for fast training and low memory usage in content-based problems | | [Deep Knowledge-Aware Network (DKN)*](notebooks/00_quick_start/dkn_synthetic.ipynb) | Python CPU / Python GPU | Content-Based Filtering | Deep learning algorithm incorporating a knowledge graph and article embeddings to provide powerful news or article recommendations | | [Extreme Deep Factorization Machine (xDeepFM)*](notebooks/00_quick_start/xdeepfm_synthetic.ipynb) | Python CPU / Python GPU | Hybrid | Deep learning based algorithm for implicit and explicit feedback with user/item features | | [Wide and Deep](notebooks/00_quick_start/wide_deep_movielens.ipynb) | Python CPU / Python GPU | Hybrid | Deep learning algorithm that can memorize feature interactions and generalize user features | From 8c0df8df3904e7bb5fc8375000b21c6fc2d6fe94 Mon Sep 17 00:00:00 2001 From: miguelgfierro Date: Fri, 29 Mar 2019 14:36:54 +0000 Subject: [PATCH 28/30] mmlspark lightgbm pass --- SETUP.md | 11 ++- .../02_model/mmlspark_lightgbm_criteo.ipynb | 86 ++++++++----------- reco_utils/common/spark_utils.py | 8 +- scripts/databricks_install.py | 26 +++--- 4 files changed, 58 insertions(+), 73 deletions(-) diff --git a/SETUP.md b/SETUP.md index ea2b9575ed..8dbfae5765 100644 --- a/SETUP.md +++ b/SETUP.md @@ -169,18 +169,17 @@ This option utilizes an installation script to do the setup, and it requires add > databricks clusters start --cluster-id ` > ``` - -Once you have confirmed the databricks cluster is *RUNNING*, install the modules within this repository with the following commands. +The installation script has a number of options that can also deal with different databricks-cli profiles, install a version of the mmlspark library, overwrite the libraries, or prepare the cluster for operationalization. 
For all options, please see: ```{shell} python scripts/databricks_install.py -h ``` Once you have confirmed the databricks cluster is *RUNNING*, install the modules within this repository with the following commands. ```{shell} cd Recommenders python scripts/databricks_install.py ``` **Note** If you are planning on running through the sample code for operationalization [here](notebooks/05_operationalize/als_movie_o16n.ipynb), you need to prepare the cluster for operationalization. You can do so by adding an additional option to the script run. <CLUSTER_ID> is the same as that mentioned above, and can be identified by running `databricks clusters list` and selecting the appropriate cluster. diff --git a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb index 0e8f8c578c..885e889b6e 100644 --- a/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb +++ b/notebooks/02_model/mmlspark_lightgbm_criteo.ipynb @@ -13,14 +13,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Content Based Personalization\n", - "## LightGBM on Azure Databricks<br>
\n", - "This notebook provides a quick example of how to train LightGBM model on Azure Databricks and deploy it using MML Spark for a content personalization scenario.

\n", - "[LightGBM](https://github.com/Microsoft/Lightgbm) \\[1\\] is a gradient boosting framework that uses tree-based learning algorithms.
\n", - "[MMLSpark](https://github.com/Azure/mmlspark) \\[2\\] allows LightGBM to be called in a Spark environment which provides several advantages:\n", - "- Distributed computation for model development\n", - "- Easy integration into existing Spark workflows\n", - "- Model serving through Spark Serving \\[3\\]" + "# Content-Based Personalization with LightGBM on Spark\n", + "\n", + "This notebook provides a quick example of how to train a [LightGBM](https://github.com/Microsoft/Lightgbm) model on Spark using [MMLSpark](https://github.com/Azure/mmlspark) for a content-based personalization scenario.\n", + "\n", + "We use the [CRITEO dataset](https://www.kaggle.com/c/criteo-display-ad-challenge), a well known dataset of website ads that can be used to optimize the Click-Through Rate (CTR). The dataset consists of a series of numerical and categorical features and a binary label indicating whether the add has been clicked or not.\n", + "\n", + "The model is based on [LightGBM](https://github.com/Microsoft/Lightgbm), which is a gradient boosting framework that uses tree-based learning algorithms. Finally, we take advantage of\n", + "[MMLSpark](https://github.com/Azure/mmlspark) library, which allows LightGBM to be called in a Spark environment and be computed distributely.\n", + "\n", + "This scenario is a good example of **implicit feedback**, where binary labels indicate the interaction between a user and an item. This contrasts with explicit feedback, where the user explicitely rate the content, for example from 1 to 5. \n" ] }, { @@ -34,15 +36,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "A python script is provided to simplify setting up Azure Databricks with the correct\n", - "dependencies.
Run ```python scripts/databricks_install.py -h``` for more details." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note** - MML Spark should not be run on a cluster with autoscaling enabled. Disable the flag in the Azure Databricks Cluster configuration before running this notebook." + "This notebook can be run in a Spark environment in a DSVM or in Azure Databricks. For more details about the installation process, please refer to the [setup instructions](../../SETUP.md).\n", + "\n", + "**NOTE for Azure Databricks:**\n", + "* A python script is provided to simplify setting up Azure Databricks with the correct dependencies. Run ```python scripts/databricks_install.py -h``` for more details.\n", + "* MMLSpark should not be run on a cluster with autoscaling enabled. Disable the flag in the Azure Databricks Cluster configuration before running this notebook." ] }, { @@ -82,8 +80,8 @@ " # get the maven coordinates for MML Spark from databricks_install script\n", " from scripts.databricks_install import MMLSPARK_INFO\n", " packages = [MMLSPARK_INFO[\"maven\"][\"coordinates\"]]\n", - " repos = [MMLSPARK_INFO[\"maven\"][\"repositories\"]]\n", - " spark = start_or_get_spark(packages=packages, repositories=repos)\n", + " repo = MMLSPARK_INFO[\"maven\"].get(\"repositories\", None)\n", + " spark = start_or_get_spark(packages=packages, repository=repo)\n", " dbutils = None\n", " print(\"MMLSpark version: {}\".format(MMLSPARK_INFO['maven']['coordinates']))\n", "\n", @@ -124,7 +122,8 @@ "metadata": {}, "source": [ "## Data Preparation\n", - "The [Criteo Display Advertising Challenge](https://www.kaggle.com/c/criteo-display-ad-challenge) (DAC) dataset [4] is a well-known industry benchmarking dataset for developing CTR prediction models, and is used frequently by research papers. The original dataset contains over 45M rows, but there is also a down-sampled dataset which has 100,000 rows (this can be used by setting DATA_SIZE = 'sample').

\n", + "\n", + "The [Criteo Display Advertising Challenge](https://www.kaggle.com/c/criteo-display-ad-challenge) (Criteo DAC) dataset is a well-known industry benchmarking dataset for developing CTR prediction models, and is used frequently by research papers. The original dataset contains over 45M rows, but there is also a down-sampled dataset which has 100,000 rows (this can be used by setting `DATA_SIZE = \"sample\"`). Each row corresponds to a display ad served by Criteo and the first column is indicates whether this ad has been clicked or not.

\n", "The dataset contains 1 label column and 39 feature columns, where 13 columns are integer values (int00-int12) and 26 columns are categorical features (cat00-cat25).

\n", "What the columns represent is not provided, but for this case we can consider the integer and categorical values as features representing the user and / or item content. The label is binary and is an example of implicit feedback indicating a user's interaction with an item. With this dataset we can demonstrate how to build a model that predicts the probability of a user interacting with an item based on available user and item content features.\n" ] @@ -272,8 +271,8 @@ "source": [ "### Feature Processing\n", "The feature data provided has many missing values across both integer and categorical feature fields. In addition the categorical features have many distinct values, so effectively cleaning and representing the feature data is an important step prior to training a model.

\n", - "One of the simplest ways of managing both features that have missing values as well as high cardinality is to use the hashing trick. The [FeatureHasher](http://spark.apache.org/docs/latest/ml-features.html#featurehasher) transformer will pass integer values through and will hash categorical features into a sparse vector of lower dimensionality which can be used effectively by LightGBM.

\n", - "First the dataset is split randomly for training and testing and feature processing is applied to each dataset." + "One of the simplest ways of managing both features that have missing values as well as high cardinality is to use the hashing trick. The [FeatureHasher](http://spark.apache.org/docs/latest/ml-features.html#featurehasher) transformer will pass integer values through and will hash categorical features into a sparse vector of lower dimensionality, which can be used effectively by LightGBM.

\n", + "First, the dataset is splitted randomly for training and testing and feature processing is applied to each dataset." ] }, { @@ -310,15 +309,15 @@ "metadata": {}, "source": [ "## Model Training\n", - "In MML Spark the LightGBM implementation for binary classification is invoked using the LightGBMClassifier class and specifying the objective as 'binary'. In this instance the occurrence of positive labels is quite low, so setting the isUnbalance flag to true helps account for this imbalance.

\n", + "In MMLSpark, the LightGBM implementation for binary classification is invoked using the `LightGBMClassifier` class and specifying the objective as `\"binary\"`. In this instance, the occurrence of positive labels is quite low, so setting the `isUnbalance` flag to true helps account for this imbalance.

\n", "\n", "### Hyper-parameters\n", - "Below are some of the key [hyper-parameters](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters-Tuning.rst) for training a LightGBM classifier on Spark\n", - "- numLeaves: the number of leaves in each tree\n", - "- numIterations: the number of iterations to apply boosting\n", - "- learningRate: the learning rate for training across trees\n", - "- featureFraction: the fraction of features used for training a tree\n", - "- earlyStoppingRound: round at which early stopping can be applied to avoid overfitting" + "Below are some of the key [hyper-parameters](https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters-Tuning.rst) for training a LightGBM classifier on Spark:\n", + "- `numLeaves`: the number of leaves in each tree\n", + "- `numIterations`: the number of iterations to apply boosting\n", + "- `learningRate`: the learning rate for training across trees\n", + "- `featureFraction`: the fraction of features used for training a tree\n", + "- `earlyStoppingRound`: round at which early stopping can be applied to avoid overfitting" ] }, { @@ -328,11 +327,11 @@ "outputs": [], "source": [ "lgbm = LightGBMClassifier(\n", - " labelCol='label',\n", - " featuresCol='features',\n", - " objective='binary',\n", + " labelCol=\"label\",\n", + " featuresCol=\"features\",\n", + " objective=\"binary\",\n", " isUnbalance=True,\n", - " boostingType='gbdt',\n", + " boostingType=\"gbdt\",\n", " boostFromAverage=True,\n", " baggingSeed=42,\n", " numLeaves=NUM_LEAVES,\n", @@ -415,7 +414,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Model Saving and Loading\n", + "## Model Saving \n", "The full pipeline for operating on raw data including feature processing and model prediction can be saved and reloaded for use in another workflow." ] }, @@ -427,34 +426,25 @@ "source": [ "# save model\n", "pipeline = PipelineModel(stages=[feature_processor, model])\n", - "pipeline.write().overwrite().save(MODEL_NAME)\n" + "pipeline.write().overwrite().save(MODEL_NAME)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## References\n", + "## Additional Reading\n", "\\[1\\] Guolin Ke, Qi Meng, Thomas Finley, Taifeng Wang, Wei Chen, Weidong Ma, Qiwei Ye, and Tie-Yan Liu. 2017. LightGBM: A highly efficient gradient boosting decision tree. In Advances in Neural Information Processing Systems. 3146–3154. https://papers.nips.cc/paper/6907-lightgbm-a-highly-efficient-gradient-boosting-decision-tree.pdf
\n", - "\\[2\\] MML Spark: https://mmlspark.blob.core.windows.net/website/index.html
\n", - "\\[3\\] MML Spark Serving: https://github.com/Azure/mmlspark/blob/master/docs/mmlspark-serving.md
\n", - "\\[4\\] The Criteo dataset: http://labs.criteo.com/wp-content/uploads/2015/04/dac_sample.tar.gz
" + "\\[2\\] MML Spark: https://mmlspark.blob.core.windows.net/website/index.html
\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python (reco_pyspark)", "language": "python", - "name": "python3" + "name": "reco_pyspark" }, "language_info": { "codemirror_mode": { diff --git a/reco_utils/common/spark_utils.py b/reco_utils/common/spark_utils.py index 2a04d975f7..435cf393f6 100644 --- a/reco_utils/common/spark_utils.py +++ b/reco_utils/common/spark_utils.py @@ -17,7 +17,7 @@ def start_or_get_spark( memory="10G", packages=None, jars=None, - repositories=None + repository=None ): """Start Spark if not started @@ -27,7 +27,7 @@ def start_or_get_spark( memory (str): Size of memory for spark driver packages (list): list of packages to install jars (list): list of jar files to add - repositories (list): list of repositories + repository (str): The maven repository Returns: obj: Spark context. @@ -38,8 +38,8 @@ def start_or_get_spark( submit_args = '--packages {} '.format(','.join(packages)) if jars is not None: submit_args += '--jars {} '.format(','.join(jars)) - if repositories is not None: - submit_args += "--repositories {}".format(",".join(repositories)) + if repository is not None: + submit_args += "--repositories {}".format(repository) if submit_args: os.environ['PYSPARK_SUBMIT_ARGS'] = '{} pyspark-shell'.format(submit_args) diff --git a/scripts/databricks_install.py b/scripts/databricks_install.py index efcdcc08e7..63bf020374 100644 --- a/scripts/databricks_install.py +++ b/scripts/databricks_install.py @@ -16,6 +16,7 @@ import sys import time from urllib.request import urlretrieve +from requests.exceptions import HTTPError # requires databricks-cli to be installed and authentication to be configured from databricks_cli.configure.provider import ProfileConfigProvider @@ -25,7 +26,7 @@ from databricks_cli.libraries.api import LibrariesApi from databricks_cli.dbfs.dbfs_path import DbfsPath -from requests.exceptions import HTTPError +from scripts.generate_conda_file import PIP_BASE CLUSTER_NOT_FOUND_MSG = """ Cannot find the target cluster {}. Please check if you entered the valid id. @@ -48,16 +49,16 @@ "5": "https://search.maven.org/remotecontent?filepath=com/microsoft/azure/azure-cosmosdb-spark_2.4.0_2.11/1.3.5/azure-cosmosdb-spark_2.4.0_2.11-1.3.5-uber.jar", } -PYPI_RECO_LIB_DEPS = ["tqdm==4.31.1"] +PYPI_RECO_LIB_DEPS = [PIP_BASE["tqdm"]] PYPI_O16N_LIBS = [ "azure-cli==2.0.56", "azureml-sdk[databricks]==1.0.8", - "pydocumentdb==2.3.3", + PIP_BASE["pydocumentdb"], ] MMLSPARK_INFO = {"maven": {"coordinates": "com.microsoft.ml.spark:mmlspark_2.11:0.16.dev8+2.g6a5318b", - "repositories": "https://mmlspark.azureedge.net/maven"} + "repo": "https://mmlspark.azureedge.net/maven"} } DEFAULT_CLUSTER_CONFIG = { @@ -83,8 +84,8 @@ def create_egg( Packages files in the reco_utils directory as a .egg file that can be uploaded to dbfs and installed as a library on a databricks cluster. Args: - path_to_recommenders_repo_root (String): the (relative or absolute) path to the root of the recommenders repository - local_eggname (String): the basename of the egg you want to create (NOTE: must have .egg extension) + path_to_recommenders_repo_root (str): the (relative or absolute) path to the root of the recommenders repository + local_eggname (str): the basename of the egg you want to create (NOTE: must have .egg extension) overwrite (bool): whether to overwrite local_eggname if it already exists. 
Returns: @@ -111,7 +112,7 @@ def dbfs_file_exists(api_client, dbfs_path): Args: api_client (ApiClient object): Object used for authenticating to the workspace - dbfs_path (String): Path to check + dbfs_path (str): Path to check Returns: True if file exists on dbfs, False otherwise. @@ -121,7 +122,6 @@ def dbfs_file_exists(api_client, dbfs_path): file_exists = True except: file_exists = False - pass return file_exists @@ -132,11 +132,11 @@ def prepare_for_operationalization( Installs appropriate versions of several libraries to support operationalization. Args: - cluster_id (String): cluster_id representing the cluster to prepare for operationalization + cluster_id (str): cluster_id representing the cluster to prepare for operationalization api_client (ApiClient): the ApiClient object used to authenticate to the workspace - dbfs_path (String): the path on dbfs to upload libraries to + dbfs_path (str): the path on dbfs to upload libraries to overwrite (bool): whether to overwrite existing files on dbfs with new files of the same name - spark_version (String): string version indicating which version of spark is installed on the databricks cluster + spark_version (str): str version indicating which version of spark is installed on the databricks cluster Returns: A dictionary of libraries installed @@ -207,25 +207,21 @@ def prepare_for_operationalization( "--overwrite", action="store_true", help="Whether to overwrite existing files.", - default=False, ) parser.add_argument( "--prepare-o16n", action="store_true", help="Whether to install additional libraries for operationalization.", - default=False, ) parser.add_argument( "--mmlspark", action="store_true", help="Whether to install mmlspark.", - default=False, ) parser.add_argument( "--create-cluster", action="store_true", help="Whether to create the cluster. This will create a cluster with default parameters.", - default=False, ) parser.add_argument( "cluster_id", From beda4e8acbcddbd9ac985f9a58092c83486a52c1 Mon Sep 17 00:00:00 2001 From: Andreas Argyriou Date: Fri, 29 Mar 2019 18:38:08 +0000 Subject: [PATCH 29/30] LGBM: fix databricks script --- scripts/databricks_install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/databricks_install.py b/scripts/databricks_install.py index 63bf020374..eae3eabfb2 100644 --- a/scripts/databricks_install.py +++ b/scripts/databricks_install.py @@ -26,7 +26,7 @@ from databricks_cli.libraries.api import LibrariesApi from databricks_cli.dbfs.dbfs_path import DbfsPath -from scripts.generate_conda_file import PIP_BASE +from generate_conda_file import PIP_BASE CLUSTER_NOT_FOUND_MSG = """ Cannot find the target cluster {}. Please check if you entered the valid id. From 42ce91c9c789a97fade1fd1d5c5c309da8f87e6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20Gonz=C3=A1lez-Fierro?= <3491412+miguelgfierro@users.noreply.github.com> Date: Fri, 29 Mar 2019 22:07:11 +0000 Subject: [PATCH 30/30] :bug: --- scripts/databricks_install.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/databricks_install.py b/scripts/databricks_install.py index eae3eabfb2..63bf020374 100644 --- a/scripts/databricks_install.py +++ b/scripts/databricks_install.py @@ -26,7 +26,7 @@ from databricks_cli.libraries.api import LibrariesApi from databricks_cli.dbfs.dbfs_path import DbfsPath -from generate_conda_file import PIP_BASE +from scripts.generate_conda_file import PIP_BASE CLUSTER_NOT_FOUND_MSG = """ Cannot find the target cluster {}. Please check if you entered the valid id.
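
For reference, the end-to-end flow that this patch series wires up (a Spark session started with the MMLSpark package resolved from its custom Maven repository, raw Criteo columns hashed into a feature vector, and a `LightGBMClassifier` fit on the result) can be exercised outside the notebook roughly as follows. This is a minimal sketch rather than part of the patches: the toy DataFrame is a hypothetical stand-in for the Criteo loader in `reco_utils.dataset.criteo`, while the Maven coordinates, repository, and all class and parameter names are the ones pinned in the diffs above.

```python
from pyspark.ml.feature import FeatureHasher

from reco_utils.common.spark_utils import start_or_get_spark

# Maven coordinates and repository as pinned in MMLSPARK_INFO above
packages = ["com.microsoft.ml.spark:mmlspark_2.11:0.16.dev8+2.g6a5318b"]
repo = "https://mmlspark.azureedge.net/maven"
spark = start_or_get_spark(packages=packages, repository=repo)

# mmlspark becomes importable only after the session has resolved the package
from mmlspark import LightGBMClassifier

# Toy stand-in for the Criteo sample: a label, one integer feature and one
# categorical feature (the real data is loaded with load_spark_df)
df = spark.createDataFrame(
    [(0, 1, "a"), (1, 5, "b"), (0, 2, "a"), (1, 7, "c")],
    ["label", "int00", "cat00"],
)

# Pass integer values through and hash the categorical column into a single
# sparse feature vector, as the notebook's feature processing step does
feature_processor = FeatureHasher(inputCols=["int00", "cat00"], outputCol="features")
train = feature_processor.transform(df)

lgbm = LightGBMClassifier(
    labelCol="label",
    featuresCol="features",
    objective="binary",
    isUnbalance=True,
    numIterations=10,
)
model = lgbm.fit(train)
model.transform(train).select("label", "probability", "prediction").show()
```

Note the import order: as in the notebook cells above, `from mmlspark import ...` has to run after `start_or_get_spark`, so that `PYSPARK_SUBMIT_ARGS` already carries the `--packages` and `--repositories` flags when the JVM is launched.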