From cb471c3f16659b4e763d028815d6b8398718c799 Mon Sep 17 00:00:00 2001 From: Julius Schlapbach <80708107+sjschlapbach@users.noreply.github.com> Date: Thu, 5 Dec 2024 17:17:04 +0100 Subject: [PATCH] enhance(apps/analytics): add computation logic for participant course performance (#4390) --- apps/analytics/src/modules/__init__.py | 5 + .../participant_performance/__init__.py | 3 + .../compute_performance_levels.py | 39 ++++++ .../compute_response_error_rates.py | 54 ++++++++ .../save_participant_performance.py | 30 +++++ .../notebooks/participant_performance.ipynb | 126 ++++++++++++++++++ .../migration.sql | 3 + .../prisma/src/prisma/schema/analytics.prisma | 2 + 8 files changed, 262 insertions(+) create mode 100644 apps/analytics/src/modules/participant_performance/__init__.py create mode 100644 apps/analytics/src/modules/participant_performance/compute_performance_levels.py create mode 100644 apps/analytics/src/modules/participant_performance/compute_response_error_rates.py create mode 100644 apps/analytics/src/modules/participant_performance/save_participant_performance.py create mode 100644 apps/analytics/src/notebooks/participant_performance.ipynb rename packages/prisma/src/prisma/migrations/{20241205143213_learning_analytics_performance_progress => 20241205154359_learning_analytics_performance_progress}/migration.sql (97%) diff --git a/apps/analytics/src/modules/__init__.py b/apps/analytics/src/modules/__init__.py index f478e9e6fa..3435c1fc55 100644 --- a/apps/analytics/src/modules/__init__.py +++ b/apps/analytics/src/modules/__init__.py @@ -7,3 +7,8 @@ save_participant_course_analytics, ) from .aggregated_course_analytics import compute_weekday_activity +from .participant_performance import ( + compute_response_error_rates, + compute_performance_levels, + save_participant_performance, +) diff --git a/apps/analytics/src/modules/participant_performance/__init__.py b/apps/analytics/src/modules/participant_performance/__init__.py new file mode 100644 index 
def compute_performance_levels(df_performance):
    """Classify participant error rates into HIGH / MEDIUM / LOW levels.

    For each of the three error-rate columns (``firstErrorRate``,
    ``lastErrorRate``, ``totalErrorRate``) a corresponding
    ``*Performance`` column is added, based on the quartiles of that
    column. The logic is inverse compared to activity levels: a HIGHER
    error rate means WORSE performance.

    Args:
        df_performance: dataframe containing the three ``*ErrorRate``
            columns (one row per participant).

    Returns:
        The same dataframe, mutated in place, with ``firstPerformance``,
        ``lastPerformance`` and ``totalPerformance`` columns added.
    """

    def _assign_level(rate_col, level_col):
        # Quartile thresholds of this error-rate distribution.
        q1, q3 = df_performance[rate_col].quantile([0.25, 0.75])

        # Default to MEDIUM, then mark the outer quartiles. LOW is
        # assigned last so it wins on degenerate ties (q1 == q3),
        # matching the original per-column assignment order.
        df_performance[level_col] = "MEDIUM"
        df_performance.loc[df_performance[rate_col] <= q1, level_col] = "HIGH"
        df_performance.loc[df_performance[rate_col] >= q3, level_col] = "LOW"

    # The identical classification was previously written out three
    # times; apply it once per metric instead.
    for metric in ("first", "last", "total"):
        _assign_level(f"{metric}ErrorRate", f"{metric}Performance")

    return df_performance
def compute_response_error_rates(df_responses):
    """Aggregate per-response correctness into per-participant error rates.

    Adds a ``responseErrorRate`` column to ``df_responses`` (mutated in
    place), then builds one row per participant with:

    - ``responseCount``: number of responses by the participant
    - ``wrongFirstResponseCount`` / ``wrongLastResponseCount``: responses
      whose first / last attempt was WRONG (0 if none)
    - ``totalErrorRate``: mean per-response error rate
    - ``firstErrorRate`` / ``lastErrorRate``: wrong-count ratios

    Args:
        df_responses: dataframe of question responses with
            ``participantId``, ``wrongCount``, ``trialsCount``,
            ``firstResponseCorrectness`` and ``lastResponseCorrectness``.

    Returns:
        A new per-participant dataframe with the columns listed above.
    """
    # error rate of every single response
    df_responses["responseErrorRate"] = (
        df_responses["wrongCount"] / df_responses["trialsCount"]
    )

    per_participant = df_responses.groupby("participantId")

    # total number of responses per participant
    df_performance = per_participant.size().reset_index(name="responseCount")

    # responses whose first / last attempt was wrong
    first_wrong_mask = df_responses["firstResponseCorrectness"] == "WRONG"
    wrong_first = (
        df_responses.loc[first_wrong_mask]
        .groupby("participantId")
        .size()
        .reset_index(name="wrongFirstResponseCount")
    )
    last_wrong_mask = df_responses["lastResponseCorrectness"] == "WRONG"
    wrong_last = (
        df_responses.loc[last_wrong_mask]
        .groupby("participantId")
        .size()
        .reset_index(name="wrongLastResponseCount")
    )

    # average per-response error rate per participant
    mean_rates = (
        per_participant["responseErrorRate"]
        .mean()
        .reset_index()
        .rename(columns={"responseErrorRate": "totalErrorRate"})
    )

    # left-join everything onto the response counts; participants with no
    # wrong responses are absent from the wrong-count frames, so the NaNs
    # produced by the join are filled with 0
    df_performance = (
        df_performance.merge(wrong_first, on="participantId", how="left")
        .merge(wrong_last, on="participantId", how="left")
        .merge(mean_rates, on="participantId", how="left")
        .fillna(0)
    )

    # ratios of wrong first / last attempts over all responses
    df_performance["firstErrorRate"] = (
        df_performance["wrongFirstResponseCount"] / df_performance["responseCount"]
    )
    df_performance["lastErrorRate"] = (
        df_performance["wrongLastResponseCount"] / df_performance["responseCount"]
    )

    return df_performance
def save_participant_performance(db, df_performance, course_id):
    """Persist per-participant performance analytics for one course.

    Upserts one ``participantPerformance`` record per row of
    ``df_performance``, keyed on the (participantId, courseId) unique
    constraint, so re-running the computation updates existing records.

    Args:
        db: connected Prisma client.
        df_performance: one row per participant with the ``*ErrorRate``
            and ``*Performance`` columns.
        course_id: id of the course the analytics belong to.
    """
    for _, entry in df_performance.iterrows():
        # metric fields shared between the create and update payloads
        metrics = {
            "firstErrorRate": entry["firstErrorRate"],
            "firstPerformance": entry["firstPerformance"],
            "lastErrorRate": entry["lastErrorRate"],
            "lastPerformance": entry["lastPerformance"],
            "totalErrorRate": entry["totalErrorRate"],
            "totalPerformance": entry["totalPerformance"],
        }

        db.participantperformance.upsert(
            where={
                "participantId_courseId": {
                    "participantId": entry["participantId"],
                    "courseId": course_id,
                }
            },
            data={
                # create additionally wires up the two relations
                "create": {
                    **metrics,
                    "participant": {"connect": {"id": entry["participantId"]}},
                    "course": {"connect": {"id": course_id}},
                },
                "update": dict(metrics),
            },
        )
Prisma\n", + "import pandas as pd\n", + "import sys\n", + "\n", + "# set the python path correctly for module imports to work\n", + "sys.path.append(\"../../\")\n", + "\n", + "from src.modules.participant_course_analytics.get_running_past_courses import (\n", + " get_running_past_courses,\n", + ")\n", + "from src.modules.participant_performance.compute_response_error_rates import (\n", + " compute_response_error_rates,\n", + ")\n", + "from src.modules.participant_performance.compute_performance_levels import (\n", + " compute_performance_levels,\n", + ")\n", + "from src.modules.participant_performance.save_participant_performance import (\n", + " save_participant_performance,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "db = Prisma()\n", + "\n", + "# set the environment variable DATABASE_URL to the connection string of your database\n", + "os.environ[\"DATABASE_URL\"] = \"postgresql://klicker:klicker@localhost:5432/klicker-prod\"\n", + "\n", + "db.connect()\n", + "\n", + "# Script settings\n", + "verbose = False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute Participant Performance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Fetch all courses from the database\n", + "df_courses = get_running_past_courses(db)\n", + "\n", + "# Iterate over the course and fetch all question responses linked to it\n", + "for idx, course in df_courses.iterrows():\n", + " course_id = course[\"id\"]\n", + " print(f\"Processing course\", idx, \"of\", len(df_courses), \"with id\", course_id)\n", + "\n", + " # fetch all question responses linked to this course\n", + " question_responses = db.questionresponse.find_many(where={\"courseId\": course_id})\n", + " df_responses = pd.DataFrame(list(map(lambda x: x.dict(), question_responses)))\n", + "\n", + " # if no responses are linked to the course, skip 
the iteration\n", + " if df_responses.empty:\n", + " print(\"No responses linked to course\", course_id)\n", + " continue\n", + "\n", + " df_performance = compute_response_error_rates(df_responses)\n", + " df_performance = compute_performance_levels(df_performance)\n", + "\n", + " # store computed performance analytics in the corresponding database table\n", + " save_participant_performance(db, df_performance, course_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Disconnect from the database\n", + "db.disconnect()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "analytics-fkWWeYLw-py3.12", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/packages/prisma/src/prisma/migrations/20241205143213_learning_analytics_performance_progress/migration.sql b/packages/prisma/src/prisma/migrations/20241205154359_learning_analytics_performance_progress/migration.sql similarity index 97% rename from packages/prisma/src/prisma/migrations/20241205143213_learning_analytics_performance_progress/migration.sql rename to packages/prisma/src/prisma/migrations/20241205154359_learning_analytics_performance_progress/migration.sql index 2b0f6ac7a3..9fdc277283 100644 --- a/packages/prisma/src/prisma/migrations/20241205143213_learning_analytics_performance_progress/migration.sql +++ b/packages/prisma/src/prisma/migrations/20241205154359_learning_analytics_performance_progress/migration.sql @@ -75,6 +75,9 @@ CREATE TABLE "ActivityProgress" ( CONSTRAINT "ActivityProgress_pkey" PRIMARY KEY ("id") ); +-- CreateIndex +CREATE UNIQUE INDEX "ParticipantPerformance_participantId_courseId_key" ON 
"ParticipantPerformance"("participantId", "courseId"); + -- CreateIndex CREATE UNIQUE INDEX "InstancePerformance_instanceId_key" ON "InstancePerformance"("instanceId"); diff --git a/packages/prisma/src/prisma/schema/analytics.prisma b/packages/prisma/src/prisma/schema/analytics.prisma index e3c90011d8..ed27694f09 100644 --- a/packages/prisma/src/prisma/schema/analytics.prisma +++ b/packages/prisma/src/prisma/schema/analytics.prisma @@ -188,6 +188,8 @@ model ParticipantPerformance { createdAt DateTime @default(now()) updatedAt DateTime @updatedAt + + @@unique([participantId, courseId]) } model InstancePerformance {