From 1633a4f805f6d0fabffb2b9e4968844bef4ba269 Mon Sep 17 00:00:00 2001 From: Eli Fajardo Date: Tue, 2 Jan 2024 16:06:52 -0500 Subject: [PATCH 1/6] gnn fraud detection notebook fix --- .../gnn-fraud-detection-training.ipynb | 333 +++++++----------- 1 file changed, 127 insertions(+), 206 deletions(-) diff --git a/models/training-tuning-scripts/fraud-detection-models/gnn-fraud-detection-training.ipynb b/models/training-tuning-scripts/fraud-detection-models/gnn-fraud-detection-training.ipynb index d66234974d..f58d441d58 100644 --- a/models/training-tuning-scripts/fraud-detection-models/gnn-fraud-detection-training.ipynb +++ b/models/training-tuning-scripts/fraud-detection-models/gnn-fraud-detection-training.ipynb @@ -52,6 +52,7 @@ "%autoreload 2\n", "import pandas as pd\n", "import numpy as np\n", + "import matplotlib.pylab as plt\n", "import os\n", "import dgl\n", "import numpy as np\n", @@ -60,6 +61,7 @@ "import torch.nn as nn\n", "from model import HeteroRGCN\n", "from model import HinSAGE\n", + "from model import prepare_data\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import auc\n", "from sklearn.metrics import average_precision_score\n", @@ -70,7 +72,8 @@ "from tqdm import trange\n", "from xgboost import XGBClassifier\n", "from training import (get_metrics, evaluate, init_loaders, build_fsi_graph,\n", - " map_node_id, prepare_data, save_model, train)\n" + " save_model, train)\n", + "import cudf as cf" ] }, { @@ -85,37 +88,17 @@ "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")" ] }, - { - "cell_type": "code", - "execution_count": 73, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "device(type='cuda', index=0)" - ] - }, - "execution_count": 73, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#device " - ] - }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "##### Load traing and test dataset" + "##### Load training and test dataset" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -136,30 +119,29 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Increase number of samples.\n", - "def augement_data(train_data=train_data, n=20):\n", - " max_id = inductive_data.index.max()\n", + "def augment_data(train_data=train_data, n=20):\n", + " train_data.drop(columns=['index'], inplace=True, axis=1)\n", " non_fraud = train_data[train_data['fraud_label'] == 0]\n", - " \n", - " non_fraud = non_fraud.drop(['index'], axis=1)\n", - " df_fraud = pd.concat([non_fraud for i in range(n)])\n", - " df_fraud.index = np.arange(1076, 1076 + df_fraud.shape[0])\n", - " df_fraud['index'] = df_fraud.index\n", - " \n", - " return pd.concat((train_data, df_fraud))" + " df_fraud = pd.concat([non_fraud for _ in range(n)])\n", + " df_train = pd.concat([train_data, df_fraud])\n", + " df_train.reset_index(inplace=True)\n", + " df_train['index'] = df_train.index\n", + "\n", + " return df_train\n" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "train_data = augement_data(train_data, n=20)" + "train_data = augment_data(train_data, n=20)" ] }, { @@ -173,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -196,39 +178,14 @@ "print('The distribution of fraud for the inductive data is:\\n', 
inductive_data['fraud_label'].value_counts())" ] }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# split train, test and create nodes index\n", - "def prepare_data(df_train, df_test):\n", - " \n", - " train_idx_ = df_train.shape[0]\n", - " df = pd.concat([df_train, df_test], axis=0)\n", - " df['tran_id'] = df['index']\n", - "\n", - " meta_cols = ['tran_id', 'client_node', 'merchant_node']\n", - " for col in meta_cols:\n", - " map_node_id(df, col)\n", - "\n", - " train_idx = df['tran_id'][:train_idx_]\n", - " test_idx = df['tran_id'][train_idx_:]\n", - "\n", - " df['index'] = df['tran_id']\n", - " df.index = df['index']\n", - "\n", - " return (df.iloc[train_idx, :], df.iloc[test_idx, :], train_idx, test_idx, df['fraud_label'].values, df)" - ] - }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "train_data, test_data, train_idx, inductive_idx, labels, df = prepare_data(train_data, inductive_data)" + "# train_data, test_data, train_index, test_index, labels, all_data\n", + "train_data, test_data, train_idx, inductive_idx, labels, df = prepare_data(cf.from_pandas(train_data), cf.from_pandas(inductive_data))" ] }, { @@ -236,7 +193,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 3. Construct transasction graph network" + "### 3. Construct transaction graph network" ] }, { @@ -253,45 +210,18 @@ "metadata": {}, "outputs": [], "source": [ - "meta_cols = [\"client_node\", \"merchant_node\", \"fraud_label\", \"index\", \"tran_id\"]\n", + "\n", + "meta_cols = [\"client_node\", \"merchant_node\", \"index\"]\n", "\n", "# Build graph\n", "whole_graph, feature_tensors = build_fsi_graph(df, meta_cols)\n", "train_graph, _ = build_fsi_graph(train_data, meta_cols)\n", - "whole_graph = whole_graph.to(device)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# Dataset to tensors\n", - "feature_tensors = feature_tensors.to(device)\n", - "train_idx = torch.from_numpy(train_idx.values).to(device)\n", - "inductive_idx = torch.from_numpy(inductive_idx.values).to(device)\n", - "labels = torch.LongTensor(labels).to(device)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Graph(num_nodes={'client': 623, 'merchant': 388, 'transaction': 12053},\n", - " num_edges={('client', 'buy', 'transaction'): 12053, ('merchant', 'sell', 'transaction'): 12053, ('transaction', 'bought', 'client'): 12053, ('transaction', 'issued', 'merchant'): 12053},\n", - " metagraph=[('client', 'transaction', 'buy'), ('transaction', 'client', 'bought'), ('transaction', 'merchant', 'issued'), ('merchant', 'transaction', 'sell')])\n" - ] - } - ], - "source": [ - "# Show structure of training graph.\n", - "print(train_graph)" + "\n", + "# Dataset\n", + "feature_tensors = feature_tensors.float()\n", + "train_idx = torch.from_dlpack(train_idx.values.toDlpack()).long()\n", + "inductive_idx = torch.from_dlpack(inductive_idx.values.toDlpack()).long()\n", + "labels = torch.from_dlpack(labels.toDlpack()).long()\n" ] }, { @@ -312,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -322,21 +252,24 @@ "in_size, hidden_size, out_size, n_layers,\\\n", " embedding_size = 111, 64, 2, 2, 1\n", "batch_size = 100\n", - "hyperparameters = {\"in_size\": in_size, \"hidden_size\": hidden_size,\n", - " 
\"out_size\": out_size, \"n_layers\": n_layers,\n", - " \"embedding_size\": embedding_size,\n", - " \"target_node\": target_node,\n", - " \"epoch\": epochs}\n", + "in_size, hidden_size, out_size, n_layers, embedding_size = 111, 64, 2, 2, 1\n", + "hyperparameters = {\n", + " \"in_size\": in_size,\n", + " \"hidden_size\": hidden_size,\n", + " \"out_size\": out_size,\n", + " \"n_layers\": n_layers,\n", + " \"embedding_size\": embedding_size,\n", + " \"target_node\": target_node,\n", + " \"epoch\": epochs\n", + "}\n", "\n", - "\n", - "scale_pos_weight = train_data['fraud_label'].sum() / train_data.shape[0]\n", - "scale_pos_weight = torch.tensor(\n", - " [scale_pos_weight, 1-scale_pos_weight]).to(device)" + "scale_pos_weight = (labels[train_idx].sum() / train_data.shape[0]).item()\n", + "scale_pos_weight = torch.FloatTensor([scale_pos_weight, 1 - scale_pos_weight]).to(device)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -354,314 +287,309 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - " 0%| | 0/20 [00:00#sk-container-id-2 {color: black;background-color: white;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 
[remainder of the scikit-learn estimator display styling (CSS emitted with the XGBClassifier cell output) omitted: presentation-only, no substantive content; the plain-text repr follows]
XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
        "              colsample_bylevel=None, colsample_bynode=None,\n",
        "              colsample_bytree=None, early_stopping_rounds=None,\n",
        "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
@@ -806,7 +734,7 @@
        "              max_delta_step=None, max_depth=None, max_leaves=None,\n",
        "              min_child_weight=None, missing=nan, monotone_constraints=None,\n",
        "              n_estimators=100, n_jobs=None, num_parallel_tree=None,\n",
-       "              predictor=None, random_state=None, ...)
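Note on the hunk above: it only refreshes the stored XGBClassifier signature in the cell output; the predictor line drops out, apparently because recent XGBoost releases removed that argument. In the pipeline itself, the notebook fits this classifier on the node embeddings produced by the trained GNN rather than on the raw tabular features. A minimal sketch of that handoff, using synthetic arrays and a hypothetical 64-wide embedding rather than the notebook's exact code:

import numpy as np
from xgboost import XGBClassifier

# Stand-ins for the embeddings emitted by the trained GNN model;
# random values here purely to illustrate the handoff.
rng = np.random.default_rng(0)
train_embeddings = rng.normal(size=(1000, 64))   # hypothetical embedding width
train_labels = rng.integers(0, 2, size=1000)     # binary fraud labels
test_embeddings = rng.normal(size=(200, 64))

# Same estimator family as the repr above; other arguments left at defaults.
classifier = XGBClassifier(n_estimators=100)
classifier.fit(train_embeddings, train_labels)
fraud_scores = classifier.predict_proba(test_embeddings)[:, 1]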
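More broadly, the patch moves the index and label plumbing from pandas/NumPy to cuDF and hands the results to PyTorch zero-copy via DLPack (torch.from_dlpack(train_idx.values.toDlpack()) in the hunks above). A rough sketch of that conversion pattern, assuming a CUDA-capable environment with cuDF installed; this is simplified from the diff, not a drop-in excerpt:

import cudf
import torch

# A small GPU-resident column; Series.values exposes it as a CuPy array.
gdf = cudf.DataFrame({"fraud_label": [0, 1, 0, 1]})
labels = torch.from_dlpack(gdf["fraud_label"].values.toDlpack()).long()

# The tensor aliases the same device memory, so there is no host round trip.
assert labels.is_cuda

The same pattern backs train_idx and inductive_idx in the updated cell, which is why the earlier torch.from_numpy(...).to(device) calls could be dropped.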