Merge branch 'main' into docs-calver-banner
mikemckiernan authored Nov 26, 2022
2 parents 549316f + 776af7b commit c1596c8
Showing 45 changed files with 864 additions and 661 deletions.
22 changes: 11 additions & 11 deletions .github/workflows/blossom-ci.yml
@@ -20,18 +20,18 @@ on:
workflow_dispatch:
inputs:
platform:
description: 'runs-on argument'
description: 'runs-on argument'
required: false
args:
description: 'argument'
description: 'argument'
required: false
jobs:
Authorization:
name: Authorization
runs-on: blossom
runs-on: blossom
outputs:
args: ${{ env.args }}

# This job only runs for pull request comments
if: contains( '\
albert17,\
@@ -54,7 +54,7 @@ jobs:
OPERATION: 'AUTH'
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
REPO_KEY_DATA: ${{ secrets.BLOSSOM_KEY }}

Vulnerability-scan:
name: Vulnerability scan
needs: [Authorization]
@@ -66,20 +66,20 @@ jobs:
repository: ${{ fromJson(needs.Authorization.outputs.args).repo }}
ref: ${{ fromJson(needs.Authorization.outputs.args).ref }}
lfs: 'true'
# repo specific steps

# repo specific steps
#- name: Setup java
# uses: actions/setup-java@v1
# with:
# java-version: 1.8

# add blackduck properties https://synopsys.atlassian.net/wiki/spaces/INTDOCS/pages/631308372/Methods+for+Configuring+Analysis#Using-a-configuration-file
#- name: Setup blackduck properties
# run: |
# PROJECTS=$(mvn -am dependency:tree | grep maven-dependency-plugin | awk '{ out="com.nvidia:"$(NF-1);print out }' | grep rapids | xargs | sed -e 's/ /,/g')
# echo detect.maven.build.command="-pl=$PROJECTS -am" >> application.properties
# echo detect.maven.included.scopes=compile >> application.properties

- name: Run blossom action
uses: NVIDIA/blossom-action@main
env:
@@ -89,7 +89,7 @@ jobs:
args1: ${{ fromJson(needs.Authorization.outputs.args).args1 }}
args2: ${{ fromJson(needs.Authorization.outputs.args).args2 }}
args3: ${{ fromJson(needs.Authorization.outputs.args).args3 }}

Job-trigger:
name: Start ci job
needs: [Vulnerability-scan]
@@ -101,7 +101,7 @@ jobs:
OPERATION: 'START-CI-JOB'
CI_SERVER: ${{ secrets.CI_SERVER }}
REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}

Upload-Log:
name: Upload log
runs-on: blossom
12 changes: 8 additions & 4 deletions .github/workflows/ci.yml
@@ -30,9 +30,6 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip tox
- name: Lint with flake8, black, isort
run: |
tox -e lint
- name: Checking Manifest
run: |
pip install check-manifest
@@ -42,7 +39,14 @@
python setup.py develop
- name: Run unittests
run: |
tox -e test-cpu
ref_type=${{ github.ref_type }}
branch=main
if [[ $ref_type == "tag"* ]]
then
raw=$(git branch -r --contains ${{ github.ref_name }})
branch=${raw/origin\/}
fi
tox -e test-cpu -- $branch
- name: Generate package for pypi
run: |
python setup.py sdist
17 changes: 17 additions & 0 deletions .github/workflows/lint.yml
@@ -0,0 +1,17 @@
name: lint

on:
pull_request:
push:
branches: [main]

jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
cache: 'pip'
cache-dependency-path: '**/**.txt'
- uses: pre-commit/action@v2.0.3
14 changes: 4 additions & 10 deletions .github/workflows/release-drafter.yaml
@@ -2,16 +2,10 @@ name: release-drafter

on:
push:
# branches to consider in the event; optional, defaults to all
branches:
- "main"
# pull_request event is required only for autolabeler
pull_request:
# Only following types are handled by the action, but one can default to all as well
types: [opened, reopened, synchronize]
# pull_request_target event is required for autolabeler to support PRs from forks
# pull_request_target:
# types: [opened, reopened, synchronize]
# trigger on tags only
tags:
- v*

workflow_dispatch:

jobs:
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
rev: 22.8.0
hooks:
- id: black
- repo: https://gitlab.com/pycqa/flake8
- repo: https://github.com/pycqa/flake8
rev: 3.9.2
hooks:
- id: flake8
@@ -17,7 +17,7 @@ repos:
hooks:
- id: mypy
language_version: python3
args: [--no-strict-optional, --ignore-missing-imports, --show-traceback, --install-types, --non-interactive]
args: [--non-interactive, --install-types]
- repo: https://github.com/codespell-project/codespell
rev: v2.2.1
hooks:
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -15,6 +15,7 @@ recursive-include tests *.pbtxt
recursive-include tests *.py

recursive-include transformers4rec *.parquet *.json *.py *.typed
recursive-include merlin_standard_lib *.py


# Ignore notebooks & examples
77 changes: 34 additions & 43 deletions examples/end-to-end-session-based/01-ETL-with-NVTabular.ipynb
@@ -292,7 +292,7 @@
{
"data": {
"text/plain": [
"0"
"518"
]
},
"execution_count": 8,
@@ -341,9 +341,9 @@
"source": [
"In this cell, we are defining three transformations ops: \n",
"\n",
"- 1. Encoding categorical variables using `Categorify()` op. We set `start_index` to 1, so that encoded null values start from `1` instead of `0` because we reserve `0` for padding the sequence features.\n",
"- 1. Encoding categorical variables using `Categorify()` op. We set `start_index` to 1 so that encoded null values start from `1` instead of `0` because we reserve `0` for padding the sequence features.\n",
"- 2. Deriving temporal features from timestamp and computing their cyclical representation using a custom lambda function. \n",
"- 3. Computing the item recency in days using a custom Op. Note that item recency is defined as the difference between the first occurrence of the item in dataset and the actual date of item interaction. \n",
"- 3. Computing the item recency in days using a custom op. Note that item recency is defined as the difference between the first occurrence of the item in dataset and the actual date of item interaction. \n",
"\n",
"For more ETL workflow examples, visit NVTabular [example notebooks](https://github.com/NVIDIA-Merlin/NVTabular/tree/main/examples)."
]
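As an illustrative aside (not part of the diff): a minimal sketch of why index 0 is reserved for padding when `Categorify()` is given `start_index=1`. The ids and sequence length below are hypothetical.

```python
import numpy as np

# Hypothetical ids as produced by Categorify(start_index=1): real items map to 1, 2, 3, ...
# so 0 never collides with an encoded item and can safely mark padded positions.
encoded_session = [3, 7, 2]
max_len = 5

padded = np.zeros(max_len, dtype=int)
padded[: len(encoded_session)] = encoded_session
print(padded)  # [3 7 2 0 0] -- trailing zeros are padding, not real items
```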
@@ -371,7 +371,7 @@
" nvt.ops.Rename(name ='et_dayofweek')\n",
")\n",
"\n",
"# Derive cyclical features: Defines a custom lambda function \n",
"# Derive cyclical features: Define a custom lambda function \n",
"def get_cycled_feature_value_sin(col, max_value):\n",
" value_scaled = (col + 0.000001) / max_value\n",
" value_sin = np.sin(2*np.pi*value_scaled)\n",
@@ -440,7 +440,7 @@
"id": "018872e9",
"metadata": {},
"source": [
"Once the item features are generated, the objective of this cell is grouping interactions at the session level, sorting the interactions by time. We additionally truncate all sessions to first 20 interactions and filter out sessions with less than 2 interactions."
"Once the item features are generated, the objective of this cell is to group interactions at the session level, sorting the interactions by time. We additionally truncate all sessions to first 20 interactions and filter out sessions with less than 2 interactions."
]
},
{
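For illustration only, the session-level grouping described in the cell above can be sketched in plain pandas (rather than the NVTabular ops the notebook actually uses); the toy interactions are made up.

```python
import pandas as pd

# Toy interactions, made up for illustration
df = pd.DataFrame({
    "session_id": [1, 1, 1, 2, 3, 3],
    "timestamp":  [10, 12, 11, 5, 7, 9],
    "item_id":    [42, 17, 99, 3, 8, 8],
})

sessions = (
    df.sort_values(["session_id", "timestamp"])   # sort interactions by time
      .groupby("session_id")["item_id"]
      .agg(list)                                  # one item list per session
      .apply(lambda items: items[:20])            # truncate to the first 20 interactions
      .reset_index(name="item_id-list_seq")
)
# Filter out sessions with fewer than 2 interactions
sessions = sessions[sessions["item_id-list_seq"].str.len() >= 2]
print(sessions)
```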
@@ -490,7 +490,7 @@
"id": "5eacbb6c",
"metadata": {},
"source": [
"- Avoid Numba low occupancy warnings"
"Avoid Numba low occupancy warnings:"
]
},
{
@@ -517,30 +517,21 @@
"id": "26fdbd0c",
"metadata": {},
"source": [
"Once we have defined the general workflow (`filtered_sessions`), we provide our cudf dataset to nvt.Dataset class which is optimized to split data into chunks that can fit in device memory and to handle the calculation of complex global statistics. Then, we execute the pipeline that fits and transforms data to get the desired output features."
"Once we have defined the general workflow (`filtered_sessions`), we provide our cudf dataset to `nvt.Dataset` class which is optimized to split data into chunks that can fit in device memory and to handle the calculation of complex global statistics. Then, we execute the pipeline that fits and transforms data to get the desired output features."
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "45803886",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.8/dist-packages/cudf/core/frame.py:384: UserWarning: The deep parameter is ignored and is only included for pandas compatibility.\n",
" warnings.warn(\n"
]
}
],
"outputs": [],
"source": [
"dataset = nvt.Dataset(interactions_merged_df)\n",
"workflow = nvt.Workflow(filtered_sessions)\n",
"# Learns features statistics necessary of the preprocessing workflow\n",
"# Learn features statistics necessary of the preprocessing workflow\n",
"workflow.fit(dataset)\n",
"# Apply the preprocessing workflow in the dataset and converts the resulting Dask cudf dataframe to a cudf dataframe\n",
"# Apply the preprocessing workflow in the dataset and convert the resulting Dask cudf dataframe to a cudf dataframe\n",
"sessions_gdf = workflow.transform(dataset).compute()"
]
},
@@ -596,17 +587,17 @@
" <td>[2223, 2125, 1800, 123, 3030, 1861, 1076, 1285...</td>\n",
" <td>[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...</td>\n",
" <td>[1.1285199e-06, 1.1285199e-06, 1.1285199e-06, ...</td>\n",
" <td>[-1.1126351356506348, -0.9665398597717285, -0....</td>\n",
" <td>[-1.1126341, -0.9665389, -0.1350116, -0.127809...</td>\n",
" <td>27</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>200</td>\n",
" <td>[35137, 19260, 46449, 29027, 39096, 27266, 326...</td>\n",
" <td>[34959, 24004, 32503, 39480, 28132, 47339, 351...</td>\n",
" <td>[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...</td>\n",
" <td>[0.43388295, 0.43388295, 0.43388295, 0.4338829...</td>\n",
" <td>[0.393317312002182, 0.541846752166748, -3.0278...</td>\n",
" <td>[0.3110803, 0.475488, -3.0278225, -3.0278225, ...</td>\n",
" <td>58</td>\n",
" </tr>\n",
" <tr>\n",
@@ -616,7 +607,7 @@
" <td>[23212, 30448, 16468, 2052, 22490, 31097, 6243...</td>\n",
" <td>[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...</td>\n",
" <td>[0.9749277, 0.9749277, 0.9749277, 0.9749277, 0...</td>\n",
" <td>[0.6801633834838867, 0.7174698114395142, 0.718...</td>\n",
" <td>[0.6801631, 0.7174695, 0.7185285, 0.7204116, 0...</td>\n",
" <td>71</td>\n",
" </tr>\n",
" <tr>\n",
@@ -626,7 +617,7 @@
" <td>[230, 451, 732, 1268, 2014, 567, 497, 439, 338...</td>\n",
" <td>[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, ...</td>\n",
" <td>[0.43388295, 0.43388295, 0.43388295, 0.4338829...</td>\n",
" <td>[1.3680896759033203, -0.6530488133430481, -0.6...</td>\n",
" <td>[1.3680888, -0.6530481, -0.69314253, -0.590593...</td>\n",
" <td>149</td>\n",
" </tr>\n",
" <tr>\n",
@@ -636,7 +627,7 @@
" <td>[23, 70, 160, 70, 90, 742, 851, 359, 734, 878,...</td>\n",
" <td>[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...</td>\n",
" <td>[0.43388295, 0.43388295, 0.43388295, 0.4338829...</td>\n",
" <td>[1.3714832067489624, 1.371589183807373, 1.3715...</td>\n",
" <td>[1.3714824, 1.3715883, 1.3715737, 1.3715955, 1...</td>\n",
" <td>149</td>\n",
" </tr>\n",
" </tbody>\n",
@@ -653,7 +644,7 @@
"\n",
" item_id-list_seq \\\n",
"0 [2223, 2125, 1800, 123, 3030, 1861, 1076, 1285... \n",
"1 [35137, 19260, 46449, 29027, 39096, 27266, 326... \n",
"1 [34959, 24004, 32503, 39480, 28132, 47339, 351... \n",
"2 [23212, 30448, 16468, 2052, 22490, 31097, 6243... \n",
"3 [230, 451, 732, 1268, 2014, 567, 497, 439, 338... \n",
"4 [23, 70, 160, 70, 90, 742, 851, 359, 734, 878,... \n",
Expand All @@ -673,11 +664,11 @@
"4 [0.43388295, 0.43388295, 0.43388295, 0.4338829... \n",
"\n",
" product_recency_days_log_norm-list_seq day_index \n",
"0 [-1.1126351356506348, -0.9665398597717285, -0.... 27 \n",
"1 [0.393317312002182, 0.541846752166748, -3.0278... 58 \n",
"2 [0.6801633834838867, 0.7174698114395142, 0.718... 71 \n",
"3 [1.3680896759033203, -0.6530488133430481, -0.6... 149 \n",
"4 [1.3714832067489624, 1.371589183807373, 1.3715... 149 "
"0 [-1.1126341, -0.9665389, -0.1350116, -0.127809... 27 \n",
"1 [0.3110803, 0.475488, -3.0278225, -3.0278225, ... 58 \n",
"2 [0.6801631, 0.7174695, 0.7185285, 0.7204116, 0... 71 \n",
"3 [1.3680888, -0.6530481, -0.69314253, -0.590593... 149 \n",
"4 [1.3714824, 1.3715883, 1.3715737, 1.3715955, 1... 149 "
]
},
"execution_count": 13,
@@ -720,9 +711,9 @@
"id": "9515d625",
"metadata": {},
"source": [
"In this example we are going to split the preprocessed parquet files by days, to allow for temporal training and evaluation. There will be a folder for each day and three parquet files within each day: `train.parquet`, `validation.parquet` and `test.parquet`\n",
"In this example we are going to split the preprocessed parquet files by days, to allow for temporal training and evaluation. There will be a folder for each day and three parquet files within each day: `train.parquet`, `validation.parquet` and `test.parquet`.\n",
" \n",
"P.s. It is worthwhile a note that the dataset have a single categorical feature (category), but it is inconsistent over time in the dataset. All interactions before day 84 (2014-06-23) have the same value for that feature, whereas many other categories are introduced afterwards. Thus for this example, we save only the last five days."
"P.s. It is worthwhile to note that the dataset has a single categorical feature (category), which, however, is inconsistent over time in the dataset. All interactions before day 84 (2014-06-23) have the same value for that feature, whereas many other categories are introduced afterwards. Thus for this example, we save only the last five days."
]
},
{
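A simplified sketch of the day-based split described above, written with pandas and pathlib instead of the utility the notebook calls (it assumes a parquet engine such as pyarrow is installed; the data, proportions, and paths are illustrative).

```python
from pathlib import Path
import pandas as pd

# Toy preprocessed sessions, made up for illustration
sessions_gdf = pd.DataFrame({
    "day_index": [178] * 10 + [179] * 10,
    "session_id": range(20),
})

output_dir = Path("preproc_sessions_by_day")
for day, day_df in sessions_gdf.groupby("day_index"):
    day_dir = output_dir / str(day)
    day_dir.mkdir(parents=True, exist_ok=True)
    shuffled = day_df.sample(frac=1, random_state=0)  # shuffle within the day
    n = len(shuffled)
    # Illustrative 80/10/10 split; the notebook's utility chooses its own proportions
    shuffled.iloc[: int(n * 0.8)].to_parquet(day_dir / "train.parquet")
    shuffled.iloc[int(n * 0.8): int(n * 0.9)].to_parquet(day_dir / "valid.parquet")
    shuffled.iloc[int(n * 0.9):].to_parquet(day_dir / "test.parquet")
```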
@@ -745,7 +736,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Creating time-based splits: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00, 4.73it/s]\n"
"Creating time-based splits: 100%|██████████| 5/5 [00:00<00:00, 5.99it/s]\n"
]
}
],
@@ -789,25 +780,25 @@
"output_type": "stream",
"text": [
"preproc_sessions_by_day/\n",
" 182/\n",
" test.parquet\n",
" valid.parquet\n",
" train.parquet\n",
" 179/\n",
" test.parquet\n",
" valid.parquet\n",
" test.parquet\n",
" train.parquet\n",
" 180/\n",
" test.parquet\n",
" valid.parquet\n",
" test.parquet\n",
" train.parquet\n",
" 178/\n",
" valid.parquet\n",
" test.parquet\n",
" train.parquet\n",
" 182/\n",
" valid.parquet\n",
" test.parquet\n",
" train.parquet\n",
" 181/\n",
" test.parquet\n",
" valid.parquet\n",
" test.parquet\n",
" train.parquet\n"
]
}
@@ -825,7 +816,7 @@
{
"data": {
"text/plain": [
"557"
"578"
]
},
"execution_count": 19,
@@ -844,7 +835,7 @@
"id": "64772bf1",
"metadata": {},
"source": [
"That's it! We created our sequential features, now we can go to next notebook to train a PyTorch session-based model."
"That's it! We created our sequential features, now we can go to the next notebook to train a PyTorch session-based model."
]
}
],