adap · tanertopal · Mar 23, 2023 · Mar 17, 2023 · Mar 17, 2023 · Mar 19, 2023
@@ -0,0 +1,2 @@
+dataset
+
@@ -11,22 +11,8 @@ This example demonstrates a federated XGBoost using Flower with PyTorch. This is
 
 ## Project Setup
 
-This implementation can be easily run in Google Colab with the following file structure in Google Drive, * denotes folder:
-
-```shell
-—————————————————————————————————————————————————————————————————————
-My Drive
-  XGBoost*
-      |----- code.ipynb
-      dataset*
-          binary_classifications*
-              |----- dataset file 1
-              |----- dataset file 2
-          regression*
-              |----- dataset file 1
-              |----- dataset file 2 
-—————————————————————————————————————————————————————————————————————
-```
+This implementation can be easily run in Google Colab with the button at the top of the README, or as a standalone Jupyter notebook.
+The notebook automatically downloads the example data and extracts it into a `dataset` folder with `binary_classification` and `regression` sub-folders.
 
 ## Datasets
 

@@ -29,36 +29,54 @@
    },
    "outputs": [],
    "source": [
-    "# File structure to run this implementation in Google Colab, * denotes folder:\n",
-    "#—————————————————————————————————————————————————————————————————————\n",
-    "# My Drive\n",
-    "#   XGBoost*\n",
-    "#       |----- code.ipynb (this file)\n",
-    "#       dataset*\n",
-    "#           binary_classifications*\n",
-    "#               |----- dataset file 1\n",
-    "#               |----- dataset file 2\n",
-    "#           regression*\n",
-    "#               |----- dataset file 1\n",
-    "#               |----- dataset file 2 \n",
-    "#—————————————————————————————————————————————————————————————————————\n",
-    "\n",
-    "from google.colab import drive\n",
-    "drive.mount(\"/content/drive\")\n",
-    "\n",
     "import os\n",
-    "import sys\n",
-    "GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = \"FedXGBoost\"\n",
-    "GOOGLE_DRIVE_PATH = os.path.join(\"drive\", \"My Drive\", GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)\n",
-    "print(os.listdir(GOOGLE_DRIVE_PATH))\n",
+    "import urllib.request\n",
+    "import bz2\n",
+    "import shutil\n",
+    "\n",
+    "CLASSIFICATION_PATH = os.path.join(\"dataset\", \"binary_classification\")\n",
+    "REGRESSION_PATH = os.path.join(\"dataset\", \"regression\")\n",
+    "LIBSVM_URL = \"https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets\"\n",
+    "\n",
+    "\n",
+    "def download_data(url_dir, target_dir, filenames):\n",
+    "  \"\"\"Download files from url_dir into target_dir and unpack .bz2 archives.\n",
+    "\n",
+    "  Every file is checked individually, so re-running the cell after an\n",
+    "  interrupted run only fetches or extracts what is still missing.\n",
+    "\n",
+    "  Args:\n",
+    "    url_dir: Base URL (without trailing slash) that hosts the files.\n",
+    "    target_dir: Local folder to store the files in; created if missing.\n",
+    "    filenames: Names of the files to fetch from url_dir.\n",
+    "  \"\"\"\n",
+    "  os.makedirs(target_dir, exist_ok=True)\n",
+    "  for filename in filenames:\n",
+    "    filepath = os.path.join(target_dir, filename)\n",
+    "    if not os.path.exists(filepath):\n",
+    "      # Write to a temporary name first so a failed transfer is never\n",
+    "      # mistaken for a complete file on the next run.\n",
+    "      urllib.request.urlretrieve(f\"{url_dir}/{filename}\", filepath + \".part\")\n",
+    "      os.replace(filepath + \".part\", filepath)\n",
+    "    if filename.endswith(\".bz2\"):\n",
+    "      # The archive is kept; the extracted copy drops the .bz2 suffix.\n",
+    "      extracted_path = filepath[:-len(\".bz2\")]\n",
+    "      if not os.path.exists(extracted_path):\n",
+    "        with bz2.BZ2File(filepath) as fr, open(extracted_path + \".part\", \"wb\") as fw:\n",
+    "          shutil.copyfileobj(fr, fw)\n",
+    "        os.replace(extracted_path + \".part\", extracted_path)\n",
+    "\n",
+    "\n",
+    "download_data(\n",
+    "  f\"{LIBSVM_URL}/binary\",\n",
+    "  CLASSIFICATION_PATH,\n",
+    "  [\"cod-rna\", \"cod-rna.t\", \"cod-rna.r\", \"ijcnn1.t.bz2\", \"ijcnn1.tr.bz2\"],\n",
+    ")\n",
+    "download_data(\n",
+    "  f\"{LIBSVM_URL}/regression\",\n",
+    "  REGRESSION_PATH,\n",
+    "  [\"eunite2001\", \"eunite2001.t\", \"YearPredictionMSD.bz2\", \"YearPredictionMSD.t.bz2\"],\n",
+    ")\n",
+    "\n",
     "\n",
     "# Add to sys so we can import .py files.\n",
-    "sys.path.append(GOOGLE_DRIVE_PATH)\n",
     "!nvidia-smi\n",
-    "!pip install xgboost==1.7.2\n",
     "!pip install torchmetrics\n",
     "!pip install torchsummary\n",
-    "!pip install -U flwr-nightly[simulation]"
+    "!pip install -U \"flwr-nightly[simulation,xgboost]\""
    ]
   },
   {
@@ -124,12 +142,156 @@
     "from flwr.server.strategy.fedxgb import construct_tree, construct_tree_from_loader, single_tree_prediction, tree_encoding\n",
     "from flwr.common.typing import Parameters\n",
     "from collections import OrderedDict\n",
-    "from typing import Dict, List, Optional, Tuple, Union\n",
+    "from typing import Any, Dict, List, Optional, Tuple, Union\n",
     "from flwr.common import NDArray, NDArrays\n",
     "\n",
     "print(\"Imported modules.\")"
    ]
   },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Define utility functions for xgboost trees"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from matplotlib import pyplot as plt  # pylint: disable=E0401\n",
+    "\n",
+    "\n",
+    "def plot_xgbtree(tree: Union[XGBClassifier, XGBRegressor], n_tree: int) -> None:\n",
+    "    \"\"\"Visualize one tree of a built xgboost model.\n",
+    "\n",
+    "    Args:\n",
+    "        tree: The xgboost model that holds the tree ensemble.\n",
+    "        n_tree: Index of the tree to draw (forwarded as `num_trees`).\n",
+    "    \"\"\"\n",
+    "    # Set the figure size before plotting: rcParams only affect figures\n",
+    "    # created afterwards, so setting it after plot_tree is too late for\n",
+    "    # the figure that plot_tree creates.\n",
+    "    plt.rcParams[\"figure.figsize\"] = [50, 10]\n",
+    "    xgb.plot_tree(tree, num_trees=n_tree)\n",
+    "    plt.show()\n",
+    "\n",
+    "\n",
+    "def construct_tree(\n",
+    "    dataset: Dataset, label: NDArray, n_estimators: int, tree_type: str\n",
+    ") -> Union[XGBClassifier, XGBRegressor]:\n",
+    "    \"\"\"Construct and fit an xgboost model from a tabular dataset.\n",
+    "\n",
+    "    Args:\n",
+    "        dataset: Training features, passed unchanged to `fit`.\n",
+    "        label: Training targets, passed unchanged to `fit`.\n",
+    "        n_estimators: Value forwarded as `n_estimators` to the model.\n",
+    "        tree_type: \"BINARY\" for an XGBClassifier, \"REG\" for an XGBRegressor.\n",
+    "\n",
+    "    Returns:\n",
+    "        The fitted model.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: If `tree_type` is neither \"BINARY\" nor \"REG\".\n",
+    "    \"\"\"\n",
+    "    # Hyperparameters shared by both model types.\n",
+    "    shared_params = {\n",
+    "        \"learning_rate\": 0.1,\n",
+    "        \"max_depth\": 8,\n",
+    "        \"n_estimators\": n_estimators,\n",
+    "        \"subsample\": 0.8,\n",
+    "        \"colsample_bylevel\": 1,\n",
+    "        \"colsample_bynode\": 1,\n",
+    "        \"colsample_bytree\": 1,\n",
+    "        \"alpha\": 5,\n",
+    "        \"gamma\": 5,\n",
+    "        \"num_parallel_tree\": 1,\n",
+    "        \"min_child_weight\": 1,\n",
+    "    }\n",
+    "\n",
+    "    if tree_type == \"BINARY\":\n",
+    "        tree = xgb.XGBClassifier(objective=\"binary:logistic\", **shared_params)\n",
+    "    elif tree_type == \"REG\":\n",
+    "        tree = xgb.XGBRegressor(objective=\"reg:squarederror\", **shared_params)\n",
+    "    else:\n",
+    "        # Fail with a clear message instead of an UnboundLocalError on `tree`.\n",
+    "        raise ValueError(\n",
+    "            f\"Unsupported tree_type {tree_type!r}: expected 'BINARY' or 'REG'.\"\n",
+    "        )\n",
+    "\n",
+    "    tree.fit(dataset, label)\n",
+    "    return tree\n",
+    "\n",
+    "\n",
+    "def construct_tree_from_loader(\n",
+    "    dataset_loader: DataLoader, n_estimators: int, tree_type: str\n",
+    ") -> Union[XGBClassifier, XGBRegressor]:\n",
+    "    \"\"\"Construct a xgboost tree from a tabular dataset loader.\n",
+    "\n",
+    "    Only the last batch yielded by the loader is used for training, so the\n",
+    "    loader presumably delivers the whole dataset as a single batch.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: If `dataset_loader` does not yield any batch.\n",
+    "    \"\"\"\n",
+    "    batch = None\n",
+    "    for batch in dataset_loader:\n",
+    "        pass\n",
+    "    if batch is None:\n",
+    "        # Without this check the failure would be an unbound local variable.\n",
+    "        raise ValueError(\"dataset_loader did not yield any batch.\")\n",
+    "    return construct_tree(batch[0], batch[1], n_estimators, tree_type)\n",
+    "\n",
+    "\n",
+    "def single_tree_prediction(\n",
+    "    tree: Union[XGBClassifier, XGBRegressor], n_tree: int, dataset: NDArray\n",
+    ") -> Optional[NDArray]:\n",
+    "    \"\"\"Extract the prediction result of a single tree in the xgboost tree\n",
+    "    ensemble.\n",
+    "\n",
+    "    Args:\n",
+    "        tree: The xgboost model that holds the tree ensemble.\n",
+    "        n_tree: Zero-based index of the tree to evaluate.\n",
+    "        dataset: Samples to predict on, passed unchanged to `predict`.\n",
+    "\n",
+    "    Returns:\n",
+    "        The raw margin output of tree `n_tree`, or None (after printing a\n",
+    "        message) when `n_tree` is not smaller than the number of trees.\n",
+    "    \"\"\"\n",
+    "    # How to access a single tree\n",
+    "    # https://github.com/bmreiniger/datascience.stackexchange/blob/master/57905.ipynb\n",
+    "    num_t = len(tree.get_booster().get_dump())\n",
+    "    # Valid indices are 0 .. num_t - 1, so num_t itself is out of range too.\n",
+    "    if n_tree >= num_t:\n",
+    "        print(\n",
+    "            \"The tree index to be extracted is larger than the total number of trees.\"\n",
+    "        )\n",
+    "        return None\n",
+    "\n",
+    "    return tree.predict(  # type: ignore\n",
+    "        dataset, iteration_range=(n_tree, n_tree + 1), output_margin=True\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "def tree_encoding(  # pylint: disable=R0914\n",
+    "    trainloader: DataLoader,\n",
+    "    client_trees: Union[\n",
+    "        Tuple[XGBClassifier, int],\n",
+    "        Tuple[XGBRegressor, int],\n",
+    "        List[Union[Tuple[XGBClassifier, int], Tuple[XGBRegressor, int]]],\n",
+    "    ],\n",
+    "    client_tree_num: int,\n",
+    "    client_num: int,\n",
+    ") -> Optional[Tuple[NDArray, NDArray]]:\n",
+    "    \"\"\"Encode a tabular dataset as per-tree predictions of the client models.\n",
+    "\n",
+    "    Column `i * client_tree_num + j` of the encoded features holds the output\n",
+    "    of tree `j` of the i-th model. Returns None when `trainloader` is None,\n",
+    "    otherwise the encoded features and the labels as float32 tensors.\n",
+    "    \"\"\"\n",
+    "    if trainloader is None:\n",
+    "        return None\n",
+    "\n",
+    "    # Only the last batch yielded by the loader is kept.\n",
+    "    for batch in trainloader:\n",
+    "        x_train, y_train = batch[0], batch[1]\n",
+    "\n",
+    "    x_train_enc = np.zeros((x_train.shape[0], client_num * client_tree_num))\n",
+    "\n",
+    "    models: Any = None\n",
+    "    if not isinstance(client_trees, list):\n",
+    "        # A single (model, cid) tuple: reuse its model for every client.\n",
+    "        models = [client_trees[0]] * client_num\n",
+    "    elif len(client_trees) != client_num:\n",
+    "        # List length differs from client_num: replicate the first model.\n",
+    "        models = [client_trees[0][0]] * client_num\n",
+    "    else:\n",
+    "        # One entry per client: order the models by ascending client id.\n",
+    "        client_ids = [entry[1] for entry in client_trees]  # type: ignore\n",
+    "        unordered = [entry[0] for entry in client_trees]  # type: ignore\n",
+    "        models = np.asarray(unordered)[np.argsort(np.asarray(client_ids))]\n",
+    "\n",
+    "    for client_idx, model in enumerate(models):\n",
+    "        first_col = client_idx * client_tree_num\n",
+    "        for tree_idx in range(client_tree_num):\n",
+    "            x_train_enc[:, first_col + tree_idx] = single_tree_prediction(\n",
+    "                model, tree_idx, x_train\n",
+    "            )\n",
+    "\n",
+    "    x_train_enc32: Any = np.float32(x_train_enc)\n",
+    "    y_train32: Any = np.float32(y_train)\n",
+    "\n",
+    "    # Features gain a middle axis, labels a trailing axis.\n",
+    "    features = torch.from_numpy(np.expand_dims(x_train_enc32, axis=1))  # type: ignore\n",
+    "    labels = torch.from_numpy(np.expand_dims(y_train32, axis=-1))  # type: ignore\n",
+    "    return features, labels"
+   ]
+  },
   {
    "attachments": {},
    "cell_type": "markdown",

@@ -64,15 +64,13 @@ requests = { version = "^2.28.2", optional = true }
 fastapi = { version = "^0.92.0", optional = true }
 starlette = { version = "^0.25.0", optional = true }
 uvicorn = { extras = ["standard"], version = "^0.20.0", optional = true }
-# Optional dependencies (xgboost)
-torch = { version = "^1.10.1", optional = true }
-matplotlib = { version = "^3.5.1", optional = true }
+# Optional dependency (xgboost)
 xgboost = { version = "^1.6.2", optional = true }
 
 [tool.poetry.extras]
 simulation = ["ray"]
 rest = ["fastapi", "requests", "uvicorn"]
-xgboost = ["torch", "matplotlib", "xgboost"]
+xgboost = ["xgboost"]
 
 [tool.poetry.group.dev.dependencies]
 types-dataclasses = "==0.6.5"