Commit

Merge branch 'staging' into simonz/rectify-mmlspark-auc
anargyri authored Jan 21, 2022
2 parents 73b6a0d + 735df87 commit b305bb4
Showing 8 changed files with 28 additions and 43 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -31,7 +31,7 @@ For a more detailed overview of the repository, please see the documents on the
Please see the [setup guide](SETUP.md) for more details on setting up your machine locally, on a [data science virtual machine (DSVM)](https://azure.microsoft.com/en-gb/services/virtual-machines/data-science-virtual-machines/) or on [Azure Databricks](SETUP.md#setup-guide-for-azure-databricks).

The installation of the recommenders package has been tested with
- Python versions 3.6, 3.7 and [venv](https://docs.python.org/3/library/venv.html), [virtualenv](https://virtualenv.pypa.io/en/latest/index.html#) or [conda](https://docs.conda.io/projects/conda/en/latest/glossary.html?highlight=environment#conda-environment)
- Python versions 3.6 - 3.8 and [venv](https://docs.python.org/3/library/venv.html), [virtualenv](https://virtualenv.pypa.io/en/latest/index.html#) or [conda](https://docs.conda.io/projects/conda/en/latest/glossary.html?highlight=environment#conda-environment)

and currently does not support version 3.8 and above. It is recommended to install the package and its dependencies inside a clean environment (such as [conda](https://docs.conda.io/projects/conda/en/latest/glossary.html?highlight=environment#conda-environment), [venv](https://docs.python.org/3/library/venv.html) or [virtualenv](https://virtualenv.pypa.io/en/latest/index.html#)).
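
To make the tested-version constraint above concrete, a script that uses the package could fail fast on an unsupported interpreter. This is a minimal illustrative sketch, not code from the repository:

```python
# Illustrative sketch only: enforce the interpreter range the README
# says the package has been tested with (Python 3.6 - 3.8).
import sys

if not ((3, 6) <= sys.version_info[:2] <= (3, 8)):
    raise RuntimeError(
        "recommenders is tested on Python 3.6-3.8; "
        f"found {sys.version_info.major}.{sys.version_info.minor}"
    )
```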

@@ -48,7 +48,7 @@ To set up on your local machine:
sudo apt-get install -y build-essential libpython<version>
```

where `<version>` should be `3.6` or `3.7` as appropriate.
where `<version>` should be the Python version (e.g. `3.6`).

+ On Windows you will need [Microsoft C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/).

4 changes: 2 additions & 2 deletions SETUP.md
@@ -61,7 +61,7 @@ If using venv or virtualenv, see [these instructions](#using-a-virtual-environme

**NOTE** the `xlearn` package has a dependency on `cmake`. If you use the `xlearn`-related notebooks or scripts, make sure `cmake` is installed on the system. The easiest way to install it on Linux is with apt-get: `sudo apt-get install -y build-essential cmake`. Detailed instructions for installing `cmake` from source can be found [here](https://cmake.org/install/).

**NOTE** the models from Cornac require installation of `libpython` i.e. using `sudo apt-get install -y libpython3.6` or `libpython3.7`, depending on the version of Python.
**NOTE** the models from Cornac require installation of `libpython`, e.g. using `sudo apt-get install -y libpython3.x`, where `3.x` matches the version of Python in your environment.

**NOTE** Spark requires Java version 8 or 11. We support Spark versions 3.0 and 3.1, but versions 2.4+ with Java version 8 may also work.
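
As a quick sanity check of the Spark requirement — a minimal sketch, assuming `pyspark` is already installed and Java 8 or 11 is on the path — the running Spark version can be confirmed from Python:

```python
# Minimal sketch: confirm the local Spark version falls in the
# supported 3.0 - 3.1 range before running the PySpark notebooks.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("version-check").getOrCreate()
print("Spark version:", spark.version)  # expect 3.0.x or 3.1.x
spark.stop()
```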

@@ -235,7 +235,7 @@ sudo rm -rf Azure_mmlspark-0.12.jar com.microsoft.cntk_cntk-2.4.jar com.microsof
### Requirements

* Databricks Runtime version >= 7, <= 9 (Apache Spark >= 3.0, <= 3.1, Scala 2.12)
* Python 3.6 or 3.7
* Python 3.6 - 3.8

Earlier versions of Databricks or Spark may work but this is not guaranteed.
An example of how to create an Azure Databricks workspace and an Apache Spark cluster within the workspace can be found [here](https://docs.microsoft.com/en-us/azure/azure-databricks/quickstart-create-databricks-workspace-portal). To utilize deep learning models and GPUs, you may set up a GPU-enabled cluster. For more details, please see the [Azure Databricks deep learning guide](https://docs.azuredatabricks.net/applications/deep-learning/index.html).
47 changes: 19 additions & 28 deletions examples/00_quick_start/sasrec_amazon.ipynb
@@ -30,53 +30,41 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/recsys_data/RecSys/recommenders-tf2/myfork/recoenv_tf2p6/lib/python3.7/site-packages/papermill/iorw.py:50: FutureWarning: pyarrow.HadoopFileSystem is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n",
" from pyarrow import HadoopFileSystem\n",
"2021-12-10 14:02:54.638638: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/intel/compilers_and_libraries_2018.1.163/linux/tbb/lib/intel64_lin/gcc4.7:/opt/intel/compilers_and_libraries_2018.1.163/linux/compiler/lib/intel64_lin:/opt/intel/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64_lin::/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64/:/opt/gurobi902/linux64/lib:/opt/gurobi902/linux64/lib\n",
"2021-12-10 14:02:54.638679: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
"/anaconda/envs/py38/lib/python3.8/site-packages/papermill/iorw.py:50: FutureWarning: pyarrow.HadoopFileSystem is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n",
" from pyarrow import HadoopFileSystem\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"System version: 3.7.6 (default, Jan 8 2020, 19:59:22) \n",
"[GCC 7.3.0]\n",
"Tensorflow version: 2.6.0\n"
"System version: 3.8.12 (default, Oct 12 2021, 13:49:34) \n",
"[GCC 7.5.0]\n",
"Tensorflow version: 2.7.0\n"
]
}
],
"source": [
"import sys\n",
"import os\n",
"import logging\n",
"import papermill as pm\n",
"import scrapbook as sb\n",
"from tempfile import TemporaryDirectory\n",
"import numpy as np\n",
"from collections import defaultdict\n",
"import tensorflow as tf\n",
"tf.get_logger().setLevel('ERROR') # only show error messages\n",
"\n",
"from recommenders.utils.timer import Timer\n",
"from recommenders.utils.constants import SEED\n",
"from recommenders.models.deeprec.deeprec_utils import (\n",
" prepare_hparams\n",
")\n",
"from recommenders.datasets.amazon_reviews import download_and_extract, data_preprocessing\n",
"from recommenders.datasets.amazon_reviews import download_and_extract\n",
"from recommenders.datasets.amazon_reviews import _reviews_preprocessing\n",
"from recommenders.datasets.download_utils import maybe_download\n",
"\n",
"# Transformer Based Models\n",
"from recommenders.models.sasrec.model import SASREC\n",
"from recommenders.models.sasrec.ssept import SSEPT\n",
"\n",
"# Sampler for sequential prediction\n",
"from recommenders.models.sasrec.sampler import WarpSampler\n",
@@ -95,7 +83,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"metadata": {
"tags": [
"parameters"
@@ -107,8 +95,8 @@
"batch_size = 128\n",
"RANDOM_SEED = 100 # Set None for non-deterministic result\n",
"\n",
"# data_dir = \"/recsys_data/RecSys/SASRec-tf2/data/\"\n",
"data_dir = os.path.join(\"..\", \"..\", \"tests\", \"resources\", \"deeprec\", \"sasrec\")\n",
"# data_dir = os.path.join(\"tests\", \"recsys_data\", \"RecSys\", \"SASRec-tf2\", \"data\")\n",
"data_dir = os.path.join(\"tests\", \"resources\", \"deeprec\", \"sasrec\")\n",
"\n",
"# Amazon Electronics Data (already converted into integer user-ids and item-ids)\n",
"dataset = \"reviews_Electronics_5\"\n",
@@ -125,7 +113,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -215,14 +203,14 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 484k/484k [01:20<00:00, 6.04kKB/s]\n"
"100%|██████████| 484k/484k [02:31<00:00, 3.20kKB/s] \n"
]
},
{
@@ -234,7 +222,7 @@
"147178 users have less than 10 interactions\n",
"Total 45225 users and 35074 items\n",
"Total 36262 users, 8963 removed\n",
"Processed model input data in ../../tests/resources/deeprec/sasrec/reviews_Electronics_5.txt\n"
"Processed model input data in recsys_data/RecSys/SASRec-tf2/data/reviews_Electronics_5.txt\n"
]
}
],
@@ -889,10 +877,13 @@
],
"metadata": {
"celltoolbar": "Tags",
"interpreter": {
"hash": "74e1e608f292f7dca250d57ef9fee6215d6d7d71f8af2f8fa1898ee2c2a10179"
},
"kernelspec": {
"display_name": "recoenv_tf2p6",
"language": "python",
"name": "recoenv_tf2p6"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -904,7 +895,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.8.12"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion recommenders/README.md
@@ -9,7 +9,7 @@ Some dependencies require compilation during pip installation. On Linux this can
```bash
sudo apt-get install -y build-essential libpython<version>
```
where `<version>` should be `3.6` or `3.7` as appropriate.
where `<version>` should be the Python version (e.g. `3.6`).

On Windows you will need [Microsoft C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)

1 change: 1 addition & 0 deletions setup.py
@@ -129,6 +129,7 @@
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Operating System :: Microsoft :: Windows",
"Operating System :: POSIX :: Linux",
"Operating System :: MacOS",
2 changes: 1 addition & 1 deletion tests/integration/examples/test_notebooks_gpu.py
@@ -607,7 +607,7 @@ def test_cornac_bivae_integration(
"data_dir, num_epochs, batch_size, expected_values, seed",
[
(
"/recsys_data/RecSys/SASRec-tf2/data/",
os.path.join("tests", "recsys_data", "RecSys", "SASRec-tf2", "data"),
1,
128,
{"ndcg@10": 0.2626, "Hit@10": 0.4244},
7 changes: 0 additions & 7 deletions tests/unit/examples/test_notebooks_gpu.py
@@ -124,10 +124,3 @@ def test_dkn_quickstart(notebooks, output_notebook, kernel_name):
kernel_name=kernel_name,
parameters=dict(epochs=1, batch_size=500),
)


@pytest.mark.notebooks
@pytest.mark.gpu
def test_sasrec_single_node_runs(notebooks, output_notebook, kernel_name):
notebook_path = notebooks["sasrec_quickstart"]
pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name)
4 changes: 2 additions & 2 deletions tests/unit/recommenders/models/test_sasrec_model.py
@@ -121,7 +121,7 @@ def data_process_with_time(fname, pname, K=10, sep=" ", item_set=None, add_time=

@pytest.mark.gpu
def test_prepare_data():
data_dir = os.path.join("..", "..", "tests", "resources", "deeprec", "sasrec")
data_dir = os.path.join("tests", "resources", "deeprec", "sasrec")
dataset = "reviews_Electronics_5"
reviews_name = dataset + ".json"
outfile = os.path.join(data_dir, dataset + ".txt")
@@ -146,7 +146,7 @@ def test_sampler():
def test_sampler():
batch_size = 8
maxlen = 50
data_dir = os.path.join("..", "..", "tests", "resources", "deeprec", "sasrec")
data_dir = os.path.join("tests", "resources", "deeprec", "sasrec")
dataset = "reviews_Electronics_5"
reviews_name = dataset + ".json"
outfile = os.path.join(data_dir, dataset + ".txt")
