diff --git a/AUTHORS.md b/AUTHORS.md
index 0c480792e2..966987f430 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -41,6 +41,9 @@ To contributors: please add your name to the list when you submit a patch to the
 * **[Aaron He](https://github.com/AaronHeee)**
    * Reco utils of NCF
    * Deep dive notebook demonstrating the use of NCF
+* **[Abir Chakraborty](https://github.com/aeroabir)**
+   * Self-Attentive Sequential Recommendation (SASRec)
+   * Sequential Recommendation Via Personalized Transformer (SSEPT)
 * **[Alexandros Ioannou](https://github.com/aioannou96)**
    * Standard VAE algorithm
    * Multinomial VAE algorithm
diff --git a/examples/00_quick_start/sasrec_amazon.ipynb b/examples/00_quick_start/sasrec_amazon.ipynb
index aa3edea2c0..7df6f965bc 100644
--- a/examples/00_quick_start/sasrec_amazon.ipynb
+++ b/examples/00_quick_start/sasrec_amazon.ipynb
@@ -30,38 +30,35 @@
 },
 {
  "cell_type": "code",
- "execution_count": 1,
+ "execution_count": 8,
  "metadata": {},
  "outputs": [
-  {
-   "name": "stderr",
-   "output_type": "stream",
-   "text": [
-    "/anaconda/envs/py38/lib/python3.8/site-packages/papermill/iorw.py:50: FutureWarning: pyarrow.HadoopFileSystem is deprecated as of 2.0.0, please use pyarrow.fs.HadoopFileSystem instead.\n",
-    "  from pyarrow import HadoopFileSystem\n"
-   ]
-  },
  {
   "name": "stdout",
   "output_type": "stream",
   "text": [
-   "System version: 3.8.12 (default, Oct 12 2021, 13:49:34) \n",
+   "System version: 3.7.11 (default, Jul 27 2021, 14:32:16) \n",
    "[GCC 7.5.0]\n",
-   "Tensorflow version: 2.7.0\n"
+   "Tensorflow version: 2.7.1\n"
  ]
 }
 ],
 "source": [
+  "import re\n",
   "import sys\n",
   "import os\n",
   "import scrapbook as sb\n",
+  "from tempfile import TemporaryDirectory\n",
+  "import numpy as np\n",
+  "import pandas as pd\n",
+  "\n",
   "from collections import defaultdict\n",
   "import tensorflow as tf\n",
   "tf.get_logger().setLevel('ERROR') # only show error messages\n",
   "\n",
   "from recommenders.utils.timer import Timer\n",
-  "from recommenders.datasets.amazon_reviews import download_and_extract\n",
-  "from recommenders.datasets.amazon_reviews import _reviews_preprocessing\n",
+  "from recommenders.datasets.amazon_reviews import get_review_data\n",
+  "from recommenders.datasets.split_utils import min_rating_filter_pandas\n",
   "\n",
   "# Transformer Based Models\n",
   "from recommenders.models.sasrec.model import SASREC\n",
@@ -83,7 +80,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 5,
+ "execution_count": 9,
  "metadata": {
   "tags": [
    "parameters"
   ]
  },
@@ -96,145 +93,90 @@
   "RANDOM_SEED = 100 # Set None for non-deterministic result\n",
   "\n",
   "# data_dir = os.path.join(\"tests\", \"recsys_data\", \"RecSys\", \"SASRec-tf2\", \"data\")\n",
-  "data_dir = os.path.join(\"tests\", \"resources\", \"deeprec\", \"sasrec\")\n",
+  "data_dir = os.path.join(\"..\", \"..\", \"tests\", \"resources\", \"deeprec\", \"sasrec\")\n",
   "\n",
-  "# Amazon Electronics Data (already converted into integer user-ids and item-ids)\n",
+  "# Amazon Electronics Data\n",
   "dataset = \"reviews_Electronics_5\"\n",
   "\n",
-  "lr = 0.001 # learning rate\n",
-  "maxlen = 50 # maximum sequence length for each user\n",
-  "num_blocks = 2 # number of transformer blocks\n",
-  "hidden_units = 100 # number of units in the attention calculation\n",
-  "num_heads = 1 # number of attention heads\n",
-  "dropout_rate = 0.1 # dropout rate\n",
-  "l2_emb = 0.0 # L2 regularization coefficient\n",
-  "num_neg_test = 100 # number of negative examples per positive example"
+  "lr = 0.001  # learning rate\n",
+  "maxlen = 50  # maximum sequence length for each user\n",
+  "num_blocks = 2  # number of transformer blocks\n",
+  "hidden_units = 100  # number of units in the attention calculation\n",
+  "num_heads = 1  # number of attention heads\n",
+  "dropout_rate = 0.1  # dropout rate\n",
+  "l2_emb = 0.0  # L2 regularization coefficient\n",
+  "num_neg_test = 100  # number of negative examples per positive example"
  ]
 },
 {
  "cell_type": "code",
- "execution_count": 6,
+ "execution_count": 10,
  "metadata": {},
  "outputs": [],
  "source": [
-  "def data_process_with_time(fname, pname, K=10, sep=\" \", item_set=None, add_time=False):\n",
-  "    User = defaultdict(list)\n",
-  "    Users = set()\n",
-  "    Items = set()\n",
-  "    user_dict, item_dict = {}, {}\n",
-  "\n",
-  "    item_counter = defaultdict(lambda: 0)\n",
-  "    user_counter = defaultdict(lambda: 0)\n",
-  "    with open(fname, \"r\") as fr:\n",
-  "        for line in fr:\n",
-  "            u, i, t = line.rstrip().split(sep)\n",
-  "            User[u].append((i, t))\n",
-  "            Items.add(i)\n",
-  "            Users.add(u)\n",
-  "            item_counter[i] += 1\n",
-  "            user_counter[u] += 1\n",
-  "\n",
-  "    # remove items with less than K interactions\n",
-  "    print(f\"Read {len(User)} users and {len(Items)} items\")\n",
-  "    remove_items = set()\n",
-  "    count_remove, count_missing = 0, 0\n",
-  "    for item in Items:\n",
-  "        if item_counter[item] < K:\n",
-  "            count_remove += 1\n",
-  "            remove_items.add(item)\n",
-  "        elif item_set and item not in item_set:\n",
-  "            count_missing += 1\n",
-  "            remove_items.add(item)\n",
-  "\n",
-  "    if count_remove > 0:\n",
-  "        print(f\"{count_remove} items have less than {K} interactions\")\n",
-  "\n",
-  "    if count_missing > 0:\n",
-  "        print(f\"{count_missing} items are not in the meta data\")\n",
-  "\n",
-  "    Items = Items - remove_items\n",
-  "\n",
-  "    # remove users with less than K interactions\n",
-  "    remove_users = set()\n",
-  "    count_remove = 0\n",
-  "    # Users = set(User.keys())\n",
-  "    for user in Users:\n",
-  "        if user_counter[user] < K:\n",
-  "            remove_users.add(user)\n",
-  "            count_remove += 1\n",
-  "    if count_remove > 0:\n",
-  "        print(f\"{count_remove} users have less than {K} interactions\")\n",
-  "    Users = Users - remove_users\n",
-  "\n",
-  "    print(f\"Total {len(Users)} users and {len(Items)} items\")\n",
-  "    item_count = 1\n",
-  "    for item in Items:\n",
-  "        item_dict[item] = item_count\n",
-  "        item_count += 1\n",
-  "\n",
-  "    count_del = 0\n",
-  "    user_count = 1\n",
-  "    with open(pname, \"w\") as fw:\n",
-  "        for user in Users:\n",
-  "            items = User[user]\n",
-  "            items = [tup for tup in items if tup[0] in Items]\n",
-  "            if len(items) < K:\n",
-  "                # del User[user]\n",
-  "                count_del += 1\n",
-  "            else:\n",
-  "                user_dict[user] = user_count\n",
-  "                # sort by time\n",
-  "                items = sorted(items, key=lambda x: x[1])\n",
-  "\n",
-  "                # replace by the item-code\n",
-  "                timestamps = [x[1] for x in items]\n",
-  "                items = [item_dict[x[0]] for x in items]\n",
-  "                for i, t in zip(items, timestamps):\n",
-  "                    out_txt = [str(user_count), str(i)]\n",
-  "                    if add_time:\n",
-  "                        out_txt.append(str(t))\n",
-  "                    fw.write(sep.join(out_txt) + \"\\n\")\n",
-  "                user_count += 1\n",
-  "\n",
-  "    print(f\"Total {user_count-1} users, {count_del} removed\")\n",
-  "    print(f\"Processed model input data in {pname}\")\n",
-  "    return user_dict, item_dict\n"
+  "model_name = 'sasrec'  # 'sasrec' or 'ssept'\n",
+  "reviews_name = dataset + '.json'\n",
+  "outfile = dataset + '.txt'\n",
+  "\n",
+  "reviews_file = os.path.join(data_dir, reviews_name)\n",
+  "if not os.path.exists(reviews_file):\n",
+  "    reviews_output = get_review_data(reviews_file)\n",
+  "else:\n",
+  "    reviews_output = os.path.join(data_dir, dataset + \".json_output\")"
  ]
 },
 {
  "cell_type": "code",
- "execution_count": 7,
+ "execution_count": 11,
  "metadata": {},
- "outputs": [
-  {
-   "name": "stderr",
-   "output_type": "stream",
-   "text": [
-    "100%|██████████| 484k/484k [02:31<00:00, 3.20kKB/s] \n"
-   ]
-  },
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "Read 192403 users and 63001 items\n",
-    "27927 items have less than 10 interactions\n",
-    "147178 users have less than 10 interactions\n",
-    "Total 45225 users and 35074 items\n",
-    "Total 36262 users, 8963 removed\n",
-    "Processed model input data in recsys_data/RecSys/SASRec-tf2/data/reviews_Electronics_5.txt\n"
-   ]
-  }
- ],
+ "outputs": [],
  "source": [
-  "reviews_name = dataset + '.json'\n",
-  "outfile = dataset + '.txt'\n",
-  "\n",
-  "reviews_file = os.path.join(data_dir, reviews_name)\n",
-  "download_and_extract(reviews_name, reviews_file)\n",
-  "reviews_output = _reviews_preprocessing(reviews_file)\n",
-  "udict, idict = data_process_with_time(reviews_output,\n",
-  "                                      os.path.join(data_dir, outfile), K=10, sep=\"\\t\")\n"
+  "def filter_K_core(data, core_num=0, col_user=\"userID\", col_item=\"itemID\"):\n",
+  "    \"\"\"Filter the rating dataframe for a minimum number of interactions per user\n",
+  "    and per item, by repeatedly applying min_rating_filter_pandas until every\n",
+  "    user and item has at least core_num interactions.\n",
+  "    \"\"\"\n",
+  "    num_users, num_items = len(data[col_user].unique()), len(data[col_item].unique())\n",
+  "    print(f\"Original: {num_users} users and {num_items} items\")\n",
+  "    df = data.copy()\n",
+  "\n",
+  "    if core_num > 0:\n",
+  "        while True:\n",
+  "            df = min_rating_filter_pandas(df, min_rating=core_num, filter_by=\"item\")\n",
+  "            df = min_rating_filter_pandas(df, min_rating=core_num, filter_by=\"user\")\n",
+  "            count_u = df.groupby(col_user)[col_item].count()\n",
+  "            count_i = df.groupby(col_item)[col_user].count()\n",
+  "            if len(count_i[count_i < core_num]) == 0 and len(count_u[count_u < core_num]) == 0:\n",
+  "                break\n",
+  "        df = df.sort_values(by=[col_user])\n",
+  "    print(f\"Final: {len(df[col_user].unique())} users and {len(df[col_item].unique())} items\")\n",
+  "\n",
+  "    return df"
  ]
 },
+{
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+  "if not os.path.exists(os.path.join(data_dir, outfile)):\n",
+  "    df = pd.read_csv(reviews_output, sep=\"\\t\", names=[\"userID\", \"itemID\", \"time\"])\n",
+  "    df = filter_K_core(df, 10)\n",
+  "\n",
+  "    user_set, item_set = set(df['userID'].unique()), set(df['itemID'].unique())\n",
+  "    user_map = dict()\n",
+  "    item_map = dict()\n",
+  "    for u, user in enumerate(user_set):\n",
+  "        user_map[user] = u+1\n",
+  "    for i, item in enumerate(item_set):\n",
+  "        item_map[item] = i+1\n",
+  "\n",
+  "    df[\"userID\"] = df[\"userID\"].apply(lambda x: user_map[x])\n",
+  "    df[\"itemID\"] = df[\"itemID\"].apply(lambda x: item_map[x])\n",
+  "    df = df.sort_values(by=[\"userID\", \"time\"])\n",
+  "    df.drop(columns=[\"time\"], inplace=True)\n",
+  "    df.to_csv(os.path.join(data_dir, outfile), sep=\"\\t\", header=False, index=False)"
+ ]
+},
@@ -284,7 +226,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 6,
+ "execution_count": 13,
  "metadata": {},
  "outputs": [
   {
@@ -336,78 +278,39 @@
 },
 {
  "cell_type": "code",
- "execution_count": 5,
+ "execution_count": 15,
  "metadata": {},
- "outputs": [
-  {
-   "name": "stderr",
-   "output_type": "stream",
-   "text": [
-    "2021-10-07 10:16:04.997837: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1\n",
-    "2021-10-07 10:16:05.007240: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: \n",
-    "pciBusID: 0001:00:00.0 name: Tesla K80 computeCapability: 3.7\n",
-    "coreClock: 0.8235GHz coreCount: 13 deviceMemorySize: 11.17GiB deviceMemoryBandwidth: 223.96GiB/s\n",
-    "2021-10-07 10:16:05.007272: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1\n",
-    "2021-10-07 10:16:05.009754: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10\n",
-    "2021-10-07 10:16:05.011395: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10\n",
-    "2021-10-07 10:16:05.011757: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10\n",
-    "2021-10-07 10:16:05.013467: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10\n",
-    "2021-10-07 10:16:05.014335: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10\n",
-    "2021-10-07 10:16:05.018536: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7\n",
-    "2021-10-07 10:16:05.019445: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0\n",
-    "2021-10-07 10:16:05.020033: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA\n",
-    "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
-    "2021-10-07 10:16:05.026869: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2596990000 Hz\n",
-    "2021-10-07 10:16:05.027667: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55d8124eefb0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n",
-    "2021-10-07 10:16:05.027684: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version\n",
-    "2021-10-07 10:16:05.137748: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55d81236d2e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:\n",
-    "2021-10-07 10:16:05.137778: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Tesla K80, Compute Capability 3.7\n",
-    "2021-10-07 10:16:05.138278: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: \n",
-    "pciBusID: 0001:00:00.0 name: Tesla K80 computeCapability: 3.7\n",
-    "coreClock: 0.8235GHz coreCount: 13 deviceMemorySize: 11.17GiB deviceMemoryBandwidth: 223.96GiB/s\n",
-    "2021-10-07 10:16:05.138332: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1\n",
-    "2021-10-07 10:16:05.138373: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10\n",
-    "2021-10-07 10:16:05.138400: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10\n",
-    "2021-10-07 10:16:05.138425: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcurand.so.10\n",
-    "2021-10-07 10:16:05.138450: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusolver.so.10\n",
-    "2021-10-07 10:16:05.138475: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcusparse.so.10\n",
-    "2021-10-07 10:16:05.138502: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.7\n",
-    "2021-10-07 10:16:05.139142: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1858] Adding visible gpu devices: 0\n",
-    "2021-10-07 10:16:05.139191: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1\n",
-    "2021-10-07 10:16:05.553534: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1257] Device interconnect StreamExecutor with strength 1 edge matrix:\n",
-    "2021-10-07 10:16:05.553578: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1263]      0 \n",
-    "2021-10-07 10:16:05.553586: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1276] 0:   N \n",
-    "2021-10-07 10:16:05.554520: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1402] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 205 MB memory) -> physical GPU (device: 0, name: Tesla K80, pci bus id: 0001:00:00.0, compute capability: 3.7)\n"
-   ]
-  }
- ],
+ "outputs": [],
  "source": [
-  "model = SASREC(item_num=data.itemnum,\n",
-  "               seq_max_len=maxlen,\n",
-  "               num_blocks=num_blocks,\n",
-  "               embedding_dim=hidden_units,\n",
-  "               attention_dim=hidden_units,\n",
-  "               attention_num_heads=num_heads,\n",
-  "               dropout_rate=dropout_rate,\n",
-  "               conv_dims = [100, 100],\n",
-  "               l2_reg=l2_emb,\n",
-  "               num_neg_test=num_neg_test\n",
-  ")\n",
-  "\n",
-  "# model = SSEPT(item_num=data.itemnum,\n",
-  "#               user_num=data.usernum,\n",
-  "#               seq_max_len=maxlen,\n",
-  "#               num_blocks=num_blocks,\n",
-  "#               # embedding_dim=hidden_units,  # optional\n",
-  "#               user_embedding_dim=hidden_units,\n",
-  "#               item_embedding_dim=hidden_units,\n",
-  "#               attention_dim=hidden_units,\n",
-  "#               attention_num_heads=num_heads,\n",
-  "#               dropout_rate=dropout_rate,\n",
-  "#               conv_dims = [200, 200],\n",
-  "#               l2_reg=l2_emb,\n",
-  "#               num_neg_test=num_neg_test\n",
-  "#               )\n"
+  "if model_name == 'sasrec':\n",
+  "    model = SASREC(item_num=data.itemnum,\n",
+  "                   seq_max_len=maxlen,\n",
+  "                   num_blocks=num_blocks,\n",
+  "                   embedding_dim=hidden_units,\n",
+  "                   attention_dim=hidden_units,\n",
+  "                   attention_num_heads=num_heads,\n",
+  "                   dropout_rate=dropout_rate,\n",
+  "                   conv_dims=[100, 100],\n",
+  "                   l2_reg=l2_emb,\n",
+  "                   num_neg_test=num_neg_test\n",
+  "    )\n",
+  "elif model_name == \"ssept\":\n",
+  "    model = SSEPT(item_num=data.itemnum,\n",
+  "                  user_num=data.usernum,\n",
+  "                  seq_max_len=maxlen,\n",
+  "                  num_blocks=num_blocks,\n",
+  "                  # embedding_dim=hidden_units,  # optional\n",
+  "                  user_embedding_dim=hidden_units,\n",
+  "                  item_embedding_dim=hidden_units,\n",
+  "                  attention_dim=hidden_units,\n",
+  "                  attention_num_heads=num_heads,\n",
+  "                  dropout_rate=dropout_rate,\n",
+  "                  conv_dims=[200, 200],\n",
+  "                  l2_reg=l2_emb,\n",
+  "                  num_neg_test=num_neg_test\n",
+  "    )\n",
+  "else:\n",
+  "    raise ValueError(f\"Model {model_name} not found\")"
  ]
 },
 {
  "cell_type": "code",
- "execution_count": 6,
+ "execution_count": 16,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -423,7 +326,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 6,
+ "execution_count": 16,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -445,314 +348,9 @@
 },
 {
  "cell_type": "code",
- "execution_count": 7,
+ "execution_count": 17,
  "metadata": {},
  "outputs": [
-  {
-   "name": "stderr",
-   "output_type": "stream",
-   "text": [
-    "  0%|          | 0/493 [00:00