From cb471c3f16659b4e763d028815d6b8398718c799 Mon Sep 17 00:00:00 2001 From: Julius Schlapbach <80708107+sjschlapbach@users.noreply.github.com> Date: Thu, 5 Dec 2024 17:17:04 +0100 Subject: [PATCH] enhance(apps/analytics): add computation logic for participant course performance (#4390) --- apps/analytics/src/modules/__init__.py | 5 + .../participant_performance/__init__.py | 3 + .../compute_performance_levels.py | 39 ++++++ .../compute_response_error_rates.py | 54 ++++++++ .../save_participant_performance.py | 30 +++++ .../notebooks/participant_performance.ipynb | 126 ++++++++++++++++++ .../migration.sql | 3 + .../prisma/src/prisma/schema/analytics.prisma | 2 + 8 files changed, 262 insertions(+) create mode 100644 apps/analytics/src/modules/participant_performance/__init__.py create mode 100644 apps/analytics/src/modules/participant_performance/compute_performance_levels.py create mode 100644 apps/analytics/src/modules/participant_performance/compute_response_error_rates.py create mode 100644 apps/analytics/src/modules/participant_performance/save_participant_performance.py create mode 100644 apps/analytics/src/notebooks/participant_performance.ipynb rename packages/prisma/src/prisma/migrations/{20241205143213_learning_analytics_performance_progress => 20241205154359_learning_analytics_performance_progress}/migration.sql (97%) diff --git a/apps/analytics/src/modules/__init__.py b/apps/analytics/src/modules/__init__.py index f478e9e6fa..3435c1fc55 100644 --- a/apps/analytics/src/modules/__init__.py +++ b/apps/analytics/src/modules/__init__.py @@ -7,3 +7,8 @@ save_participant_course_analytics, ) from .aggregated_course_analytics import compute_weekday_activity +from .participant_performance import ( + compute_response_error_rates, + compute_performance_levels, + save_participant_performance, +) diff --git a/apps/analytics/src/modules/participant_performance/__init__.py b/apps/analytics/src/modules/participant_performance/__init__.py new file mode 100644 index 
def compute_performance_levels(df_performance):
    """Classify participant error rates into HIGH / MEDIUM / LOW levels.

    For each of the three error-rate columns (``firstErrorRate``,
    ``lastErrorRate``, ``totalErrorRate``) a corresponding
    ``*Performance`` column is added, based on the quartiles of that
    column. The logic is inverse compared to activity levels: a HIGHER
    error rate means WORSE performance.

    Args:
        df_performance: dataframe containing the three ``*ErrorRate``
            columns (one row per participant).

    Returns:
        The same dataframe, mutated in place, with ``firstPerformance``,
        ``lastPerformance`` and ``totalPerformance`` columns added.
    """

    def _assign_level(rate_col, level_col):
        # Quartile thresholds of this error-rate distribution.
        q1, q3 = df_performance[rate_col].quantile([0.25, 0.75])

        # Default to MEDIUM, then mark the outer quartiles. LOW is
        # assigned last so it wins on degenerate ties (q1 == q3),
        # matching the original per-column assignment order.
        df_performance[level_col] = "MEDIUM"
        df_performance.loc[df_performance[rate_col] <= q1, level_col] = "HIGH"
        df_performance.loc[df_performance[rate_col] >= q3, level_col] = "LOW"

    # The identical classification was previously written out three
    # times; apply it once per metric instead.
    for metric in ("first", "last", "total"):
        _assign_level(f"{metric}ErrorRate", f"{metric}Performance")

    return df_performance
def compute_response_error_rates(df_responses):
    """Aggregate per-response correctness into per-participant error rates.

    Adds a ``responseErrorRate`` column to ``df_responses`` (mutated in
    place), then builds one row per participant with:

    - ``responseCount``: number of responses by the participant
    - ``wrongFirstResponseCount`` / ``wrongLastResponseCount``: responses
      whose first / last attempt was WRONG (0 if none)
    - ``totalErrorRate``: mean per-response error rate
    - ``firstErrorRate`` / ``lastErrorRate``: wrong-count ratios

    Args:
        df_responses: dataframe of question responses with
            ``participantId``, ``wrongCount``, ``trialsCount``,
            ``firstResponseCorrectness`` and ``lastResponseCorrectness``.

    Returns:
        A new per-participant dataframe with the columns listed above.
    """
    # error rate of every single response
    df_responses["responseErrorRate"] = (
        df_responses["wrongCount"] / df_responses["trialsCount"]
    )

    per_participant = df_responses.groupby("participantId")

    # total number of responses per participant
    df_performance = per_participant.size().reset_index(name="responseCount")

    # responses whose first / last attempt was wrong
    first_wrong_mask = df_responses["firstResponseCorrectness"] == "WRONG"
    wrong_first = (
        df_responses.loc[first_wrong_mask]
        .groupby("participantId")
        .size()
        .reset_index(name="wrongFirstResponseCount")
    )
    last_wrong_mask = df_responses["lastResponseCorrectness"] == "WRONG"
    wrong_last = (
        df_responses.loc[last_wrong_mask]
        .groupby("participantId")
        .size()
        .reset_index(name="wrongLastResponseCount")
    )

    # average per-response error rate per participant
    mean_rates = (
        per_participant["responseErrorRate"]
        .mean()
        .reset_index()
        .rename(columns={"responseErrorRate": "totalErrorRate"})
    )

    # left-join everything onto the response counts; participants with no
    # wrong responses are absent from the wrong-count frames, so the NaNs
    # produced by the join are filled with 0
    df_performance = (
        df_performance.merge(wrong_first, on="participantId", how="left")
        .merge(wrong_last, on="participantId", how="left")
        .merge(mean_rates, on="participantId", how="left")
        .fillna(0)
    )

    # ratios of wrong first / last attempts over all responses
    df_performance["firstErrorRate"] = (
        df_performance["wrongFirstResponseCount"] / df_performance["responseCount"]
    )
    df_performance["lastErrorRate"] = (
        df_performance["wrongLastResponseCount"] / df_performance["responseCount"]
    )

    return df_performance
def save_participant_performance(db, df_performance, course_id):
    """Persist per-participant performance analytics for one course.

    Upserts one ``participantPerformance`` record per row of
    ``df_performance``, keyed on the (participantId, courseId) unique
    constraint, so re-running the computation updates existing records.

    Args:
        db: connected Prisma client.
        df_performance: one row per participant with the ``*ErrorRate``
            and ``*Performance`` columns.
        course_id: id of the course the analytics belong to.
    """
    for _, entry in df_performance.iterrows():
        # metric fields shared between the create and update payloads
        metrics = {
            "firstErrorRate": entry["firstErrorRate"],
            "firstPerformance": entry["firstPerformance"],
            "lastErrorRate": entry["lastErrorRate"],
            "lastPerformance": entry["lastPerformance"],
            "totalErrorRate": entry["totalErrorRate"],
            "totalPerformance": entry["totalPerformance"],
        }

        db.participantperformance.upsert(
            where={
                "participantId_courseId": {
                    "participantId": entry["participantId"],
                    "courseId": course_id,
                }
            },
            data={
                # create additionally wires up the two relations
                "create": {
                    **metrics,
                    "participant": {"connect": {"id": entry["participantId"]}},
                    "course": {"connect": {"id": course_id}},
                },
                "update": dict(metrics),
            },
        )
Prisma\n", + "import pandas as pd\n", + "import sys\n", + "\n", + "# set the python path correctly for module imports to work\n", + "sys.path.append(\"../../\")\n", + "\n", + "from src.modules.participant_course_analytics.get_running_past_courses import (\n", + " get_running_past_courses,\n", + ")\n", + "from src.modules.participant_performance.compute_response_error_rates import (\n", + " compute_response_error_rates,\n", + ")\n", + "from src.modules.participant_performance.compute_performance_levels import (\n", + " compute_performance_levels,\n", + ")\n", + "from src.modules.participant_performance.save_participant_performance import (\n", + " save_participant_performance,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "db = Prisma()\n", + "\n", + "# set the environment variable DATABASE_URL to the connection string of your database\n", + "os.environ[\"DATABASE_URL\"] = \"postgresql://klicker:klicker@localhost:5432/klicker-prod\"\n", + "\n", + "db.connect()\n", + "\n", + "# Script settings\n", + "verbose = False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute Participant Performance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Fetch all courses from the database\n", + "df_courses = get_running_past_courses(db)\n", + "\n", + "# Iterate over the course and fetch all question responses linked to it\n", + "for idx, course in df_courses.iterrows():\n", + " course_id = course[\"id\"]\n", + " print(f\"Processing course\", idx, \"of\", len(df_courses), \"with id\", course_id)\n", + "\n", + " # fetch all question responses linked to this course\n", + " question_responses = db.questionresponse.find_many(where={\"courseId\": course_id})\n", + " df_responses = pd.DataFrame(list(map(lambda x: x.dict(), question_responses)))\n", + "\n", + " # if no responses are linked to the course, skip 
the iteration\n", + " if df_responses.empty:\n", + " print(\"No responses linked to course\", course_id)\n", + " continue\n", + "\n", + " df_performance = compute_response_error_rates(df_responses)\n", + " df_performance = compute_performance_levels(df_performance)\n", + "\n", + " # store computed performance analytics in the corresponding database table\n", + " save_participant_performance(db, df_performance, course_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Disconnect from the database\n", + "db.disconnect()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "analytics-fkWWeYLw-py3.12", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/packages/prisma/src/prisma/migrations/20241205143213_learning_analytics_performance_progress/migration.sql b/packages/prisma/src/prisma/migrations/20241205154359_learning_analytics_performance_progress/migration.sql similarity index 97% rename from packages/prisma/src/prisma/migrations/20241205143213_learning_analytics_performance_progress/migration.sql rename to packages/prisma/src/prisma/migrations/20241205154359_learning_analytics_performance_progress/migration.sql index 2b0f6ac7a3..9fdc277283 100644 --- a/packages/prisma/src/prisma/migrations/20241205143213_learning_analytics_performance_progress/migration.sql +++ b/packages/prisma/src/prisma/migrations/20241205154359_learning_analytics_performance_progress/migration.sql @@ -75,6 +75,9 @@ CREATE TABLE "ActivityProgress" ( CONSTRAINT "ActivityProgress_pkey" PRIMARY KEY ("id") ); +-- CreateIndex +CREATE UNIQUE INDEX "ParticipantPerformance_participantId_courseId_key" ON 
"ParticipantPerformance"("participantId", "courseId"); + -- CreateIndex CREATE UNIQUE INDEX "InstancePerformance_instanceId_key" ON "InstancePerformance"("instanceId"); diff --git a/packages/prisma/src/prisma/schema/analytics.prisma b/packages/prisma/src/prisma/schema/analytics.prisma index e3c90011d8..ed27694f09 100644 --- a/packages/prisma/src/prisma/schema/analytics.prisma +++ b/packages/prisma/src/prisma/schema/analytics.prisma @@ -188,6 +188,8 @@ model ParticipantPerformance { createdAt DateTime @default(now()) updatedAt DateTime @updatedAt + + @@unique([participantId, courseId]) } model InstancePerformance {