diff --git a/analysis/baselineModelTesting.ipynb b/analysis/baselineModelTesting.ipynb index 4163968..febafa4 100644 --- a/analysis/baselineModelTesting.ipynb +++ b/analysis/baselineModelTesting.ipynb @@ -86,13 +86,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Check the Data Size ##\n", + "## Import Modules and Check the Data Size ##\n", "Should be 208x9" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -301,6 +301,10 @@ ], "source": [ "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import mean_squared_error, mean_absolute_error\n", + "import numpy as np\n", "\n", "packetTrainingDataPath=\"../training_data/Elysia.Laps.feather\"\n", "df = pd.read_feather(packetTrainingDataPath)\n", @@ -315,6 +319,85 @@ "\n", "display(df)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Building the Model #" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RMSE (root mean squared error) is: 36.22178511618355\n", + "MAE (mean absolute error) is: 6.9643536149352085\n", + "Minimum distance: -227.01091494750978\n", + "Maximum distance: 77.62573291015624\n", + "Average distance: 3.219106722576028\n", + "25th percentile of distance: 3.962990234375\n", + "50th percentile (median) of distance: 3.983119140625\n", + "75th percentile of distance: 4.00380712890625\n" + ] + } + ], + "source": [ + "#we need the averagepackCurrent data to be numeric instead of {\"$numberDouble\": \"NaN\"}, setting errors='coerce' sets them to numerical NaN\n", + "df['averagepackCurrent'] = pd.to_numeric(df['averagepackCurrent'], errors='coerce')\n", + "\n", + "#drop the 4 rows with null values\n", + "df = df.dropna(subset=['distance', 'averagepackCurrent', 'averagespeed'])\n", + "\n", 
+ "#separate distance from the other features\n", + "X = df[['secondsdifference', 'totalpowerin', 'totalpowerout', 'netpowerout', 'amphours', \n", + " 'averagepackCurrent', 'batterysecondsremaining', 'averagespeed']]\n", + "y = df['distance']\n", + "\n", + "#split into training and test sets\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69420)\n", + "\n", + "#train the baseline model with linear regression\n", + "linear_regression_model = LinearRegression()\n", + "linear_regression_model.fit(X_train, y_train)\n", + "\n", + "#get predictions on test data\n", + "y_pred = linear_regression_model.predict(X_test)\n", + "\n", + "#evaluate using RMSE and MAE\n", + "RMSE = np.sqrt(mean_squared_error(y_test, y_pred))\n", + "MAE = mean_absolute_error(y_test, y_pred)\n", + "\n", + "print(f\"RMSE (root mean squared error) is: {RMSE}\")\n", + "print(f\"MAE (mean absolute error) is: {MAE}\")\n", + "\n", + "min_distance = df['distance'].min()\n", + "max_distance = df['distance'].max()\n", + "average_distance = df['distance'].mean()\n", + "percentile_25 = np.percentile(df['distance'], 25)\n", + "percentile_50 = np.percentile(df['distance'], 50)\n", + "percentile_75 = np.percentile(df['distance'], 75)\n", + "\n", + "print(\"Minimum distance:\", min_distance)\n", + "print(\"Maximum distance:\", max_distance)\n", + "print(\"Average distance:\", average_distance)\n", + "print(f\"25th percentile of distance: {percentile_25}\")\n", + "print(f\"50th percentile (median) of distance: {percentile_50}\")\n", + "print(f\"75th percentile of distance: {percentile_75}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Interpretation of Performance #\n", + "The IQR (middle 50% of distance values) is very narrow, spanning only about 3.96 to 4.00, yet the model has an RMSE of 36.22 and an MAE of 6.96, so this baseline model is clearly very inaccurate. Because most distances are close to 4, an accurate model's predictions should rarely differ much from 4. 
The large gap between RMSE (36.22) and MAE (6.96) suggests that a few large outliers dominate the error (distances range from about -227 to 78), but we will not handle them for the sake of this baseline model. " + ] + } ], "metadata": { diff --git a/requirements.txt b/requirements.txt index 9689041..6c83da5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,4 +6,4 @@ pyarrow==17.0.0 uvicorn==0.32.0 fastapi==0.115.3 # matplotlib==3.7.2 -# scikit-learn==1.5.2 \ No newline at end of file +scikit-learn==1.5.2 \ No newline at end of file