finished baseline model

UCSolarCarTeam · Nov 6, 2024 · 47d458d · 47d458d
1 parent 6a5789a
commit 47d458d
Show file tree

Hide file tree

Showing 2 changed files with 86 additions and 3 deletions.
diff --git a/analysis/baselineModelTesting.ipynb b/analysis/baselineModelTesting.ipynb
@@ -86,13 +86,13 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Check the Data Size ##\n",
+    "## Import Modules and Check the Data Size ##\n",
     "Should be 208x9"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -301,6 +301,10 @@
    ],
    "source": [
     "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
+    "import numpy as np\n",
     "\n",
     "packetTrainingDataPath=\"../training_data/Elysia.Laps.feather\"\n",
     "df = pd.read_feather(packetTrainingDataPath)\n",
@@ -315,6 +319,85 @@
     "\n",
     "display(df)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Building the Model #"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "RMSE (root mean squared error) is: 36.22178511618355\n",
+      "MAE (mean absolute error) is: 6.9643536149352085\n",
+      "Minimum distance: -227.01091494750978\n",
+      "Maximum distance: 77.62573291015624\n",
+      "Average distance: 3.219106722576028\n",
+      "25th percentile of distance: 3.962990234375\n",
+      "50th percentile (median) of distance: 3.983119140625\n",
+      "75th percentile of distance: 4.00380712890625\n"
+     ]
+    }
+   ],
+   "source": [
+    "#averagepackCurrent must be numeric rather than {\"$numberDouble\": \"NaN\"}; errors='coerce' converts such non-numeric entries to a numeric NaN\n",
+    "df['averagepackCurrent'] = pd.to_numeric(df['averagepackCurrent'], errors='coerce')\n",
+    "\n",
+    "#drop the 4 rows with null values\n",
+    "df = df.dropna(subset=['distance', 'averagepackCurrent', 'averagespeed'])\n",
+    "\n",
+    "#separate distance from the other features\n",
+    "X = df[['secondsdifference', 'totalpowerin', 'totalpowerout', 'netpowerout', 'amphours', \n",
+    "        'averagepackCurrent', 'batterysecondsremaining', 'averagespeed']]\n",
+    "y = df['distance']\n",
+    "\n",
+    "#split into training and test sets\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69420)\n",
+    "\n",
+    "#train the baseline model with linear regression\n",
+    "linear_regression_model = LinearRegression()\n",
+    "linear_regression_model.fit(X_train, y_train)\n",
+    "\n",
+    "#get predictions on test data\n",
+    "y_pred = linear_regression_model.predict(X_test)\n",
+    "\n",
+    "#evaluate using RMSE and MAE\n",
+    "RMSE = np.sqrt(mean_squared_error(y_test, y_pred))\n",
+    "MAE = mean_absolute_error(y_test, y_pred)\n",
+    "\n",
+    "print(f\"RMSE (root mean squared error) is: {RMSE}\")\n",
+    "print(f\"MAE (mean absolute error) is: {MAE}\")\n",
+    "\n",
+    "min_distance = df['distance'].min()\n",
+    "max_distance = df['distance'].max()\n",
+    "average_distance = df['distance'].mean()\n",
+    "percentile_25 = np.percentile(df['distance'], 25)\n",
+    "percentile_50 = np.percentile(df['distance'], 50)\n",
+    "percentile_75 = np.percentile(df['distance'], 75)\n",
+    "\n",
+    "print(\"Minimum distance:\", min_distance)\n",
+    "print(\"Maximum distance:\", max_distance)\n",
+    "print(\"Average distance:\", average_distance)\n",
+    "print(f\"25th percentile of distance: {percentile_25}\")\n",
+    "print(f\"50th percentile (median) of distance: {percentile_50}\")\n",
+    "print(f\"75th percentile of distance: {percentile_75}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Interpretation of Performance #\n",
+    "The middle 50% of distance values (the IQR) spans only about 3.96 to 4.00, yet the model has an RMSE of 36.22 and an MAE of 6.96, so this baseline model is clearly very inaccurate. Since very few distance values differ much from 4, an accurate model's predictions should rarely stray far from 4 either. The RMSE being much larger than the MAE, together with the minimum (-227.01) and maximum (77.63) distances, suggests that a few large outliers dominate the error, but we will not deal with them for the sake of this baseline model."
+   ]
   }
  ],
  "metadata": {

diff --git a/requirements.txt b/requirements.txt
@@ -6,4 +6,4 @@ pyarrow==17.0.0
 uvicorn==0.32.0
 fastapi==0.115.3
 # matplotlib==3.7.2
-# scikit-learn==1.5.2
+scikit-learn==1.5.2