uzh-bf · sjschlapbach · Aug 26, 2024 · Aug 26, 2024 · Aug 26, 2024 · Aug 26, 2024
diff --git a/apps/analytics/daily_participant_analytics.ipynb b/apps/analytics/daily_participant_analytics.ipynb
@@ -0,0 +1,353 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Preparation & Data Fetching"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "from datetime import datetime\n",
+    "from prisma import Prisma\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "db = Prisma()\n",
+    "\n",
+    "# set the environment variable DATABASE_URL to the connection string of your database\n",
+    "os.environ['DATABASE_URL'] = 'postgresql://klicker:klicker@localhost:5432/klicker-prod'\n",
+    "\n",
+    "db.connect()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fetch the participants, each with the question response details created on a specific day\n",
+    "# (the date filter is applied to the included responses, not to the participants themselves)\n",
+    "specific_date = '2024-06-10'\n",
+    "date_start = specific_date + 'T00:00:00.000Z'\n",
+    "date_end = specific_date + 'T23:59:59.999Z'\n",
+    "participant_response_details = db.participant.find_many(\n",
+    "    include={\n",
+    "        'detailQuestionResponses': {\n",
+    "            'where': {\n",
+    "                'createdAt': {\n",
+    "                    'gte': date_start,\n",
+    "                    'lte': date_end\n",
+    "                }\n",
+    "            },\n",
+    "            'include': {\n",
+    "                'practiceQuiz': True,\n",
+    "                'microLearning': True\n",
+    "            }\n",
+    "        },\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "# Report the number of fetched participants and preview the first one.\n",
+    "# The preview is skipped for an empty result, where indexing [0] would raise an IndexError.\n",
+    "print(\"Found {} participants for the timespan from {} to {}\".format(len(participant_response_details), date_start, date_end))\n",
+    "if participant_response_details:\n",
+    "    print(participant_response_details[0])\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compute Correctness Metrics for Responses"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert the question response details to a pandas dataframe\n",
+    "def map_details(detail, participantId):\n",
+    "    \"\"\"Return a copy of one response detail extended by participantId and courseId.\n",
+    "\n",
+    "    The course id is read from the related practice quiz if there is one and\n",
+    "    from the related microlearning otherwise. If neither relation is set, the\n",
+    "    course id is None instead of failing with a TypeError on None['courseId'].\n",
+    "    \"\"\"\n",
+    "    activity = detail['practiceQuiz'] or detail['microLearning']\n",
+    "    return {\n",
+    "        **detail,\n",
+    "        'participantId': participantId,\n",
+    "        'courseId': activity['courseId'] if activity else None\n",
+    "    }\n",
+    "\n",
+    "def map_participants(participant):\n",
+    "    \"\"\"Return the mapped response details of one participant as a list.\"\"\"\n",
+    "    record = participant.dict()\n",
+    "    return [map_details(entry, record['id']) for entry in record['detailQuestionResponses']]\n",
+    "\n",
+    "def convert_to_df(participants):\n",
+    "    \"\"\"Build a DataFrame with one row per response detail across all participants.\"\"\"\n",
+    "    rows = []\n",
+    "    for participant in participants:\n",
+    "        rows.extend(map_participants(participant))\n",
+    "    return pd.DataFrame(rows)\n",
+    "\n",
+    "df_details = convert_to_df(participant_response_details)\n",
+    "df_details = df_details[['score', 'pointsAwarded', 'xpAwarded', 'timeSpent', 'response', 'elementInstanceId', 'participantId', 'courseId']]\n",
+    "print(\"Question detail responses:\", len(df_details))\n",
+    "print(\"Columns:\", df_details.columns)\n",
+    "df_details.head()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Compute correctness of the responses and add them as a separate column\n",
+    "# Get related element instances\n",
+    "element_instance_ids = df_details['elementInstanceId'].unique()\n",
+    "element_instances = db.elementinstance.find_many(\n",
+    "    where={\n",
+    "        'id': {\n",
+    "            'in': element_instance_ids.tolist()\n",
+    "        }\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "# Map the element instances to the corresponding elementData.options entries and convert it to a dataframe\n",
+    "def map_element_instance_options(instance):\n",
+    "    \"\"\"Reduce an element instance to its id, element type and options (None if absent).\"\"\"\n",
+    "    record = instance.dict()\n",
+    "    instance_id = record['id']\n",
+    "    element_data = record['elementData']\n",
+    "    return {\n",
+    "        'elementInstanceId': instance_id,\n",
+    "        'type': element_data['type'],\n",
+    "        'options': element_data.get('options')\n",
+    "    }\n",
+    "\n",
+    "df_element_instances = pd.DataFrame(list(map(map_element_instance_options, element_instances)))\n",
+    "df_element_instances.head()\n",
+    "\n",
+    "# Compute the correctness for every response entry based on the element instance options (depending on the type of the element)\n",
+    "def compute_correctness(row):\n",
+    "    \"\"\"Classify one response as 'CORRECT', 'PARTIAL' or 'INCORRECT'.\n",
+    "\n",
+    "    The element instance belonging to the row is looked up in\n",
+    "    df_element_instances and the response is evaluated by element type.\n",
+    "\n",
+    "    Returns None for FLASHCARD and CONTENT elements.\n",
+    "    Raises NotImplementedError for FREE_TEXT elements and ValueError for\n",
+    "    any other unknown element type.\n",
+    "    \"\"\"\n",
+    "    element_instance = df_element_instances[df_element_instances['elementInstanceId'] == row['elementInstanceId']].iloc[0]\n",
+    "    element_type = element_instance['type']\n",
+    "    response = row['response']\n",
+    "    options = element_instance['options']\n",
+    "\n",
+    "    if element_type in ('FLASHCARD', 'CONTENT'):\n",
+    "        return None\n",
+    "\n",
+    "    if element_type == 'SC':\n",
+    "        selected_choice = response['choices'][0]\n",
+    "        correct_choice = next((choice['ix'] for choice in options['choices'] if choice['correct']), None)\n",
+    "        return 'CORRECT' if selected_choice == correct_choice else 'INCORRECT'\n",
+    "\n",
+    "    if element_type in ('MC', 'KPRIM'):\n",
+    "        selected_choices = response['choices']\n",
+    "        correct_choices = [choice['ix'] for choice in options['choices'] if choice['correct']]\n",
+    "        available_choices = len(options['choices'])\n",
+    "\n",
+    "        # Hamming distance: number of choices whose selection state differs from the solution\n",
+    "        hamming_distance = sum(\n",
+    "            1 for ix in range(available_choices)\n",
+    "            if (ix in selected_choices) != (ix in correct_choices)\n",
+    "        )\n",
+    "\n",
+    "        if element_type == 'KPRIM':\n",
+    "            if hamming_distance == 0:\n",
+    "                return 'CORRECT'\n",
+    "            return 'PARTIAL' if hamming_distance == 1 else 'INCORRECT'\n",
+    "\n",
+    "        # MC: linear score in [0, 1] that reaches zero once half of the choices are wrong\n",
+    "        correctness = max(-2 * hamming_distance / available_choices + 1, 0)\n",
+    "        if correctness == 1:\n",
+    "            return 'CORRECT'\n",
+    "        if correctness == 0:\n",
+    "            return 'INCORRECT'\n",
+    "        return 'PARTIAL'\n",
+    "\n",
+    "    if element_type == 'NUMERICAL':\n",
+    "        response_value = float(response['value'])\n",
+    "        for solution_range in options['solutionRanges']:\n",
+    "            lower = solution_range.get('min')\n",
+    "            upper = solution_range.get('max')\n",
+    "            # A missing or None bound is treated as unbounded instead of failing in float(None)\n",
+    "            # NOTE(review): confirm that open-ended solution ranges should be graded this way\n",
+    "            if (lower is None or float(lower) <= response_value) and (upper is None or response_value <= float(upper)):\n",
+    "                return 'CORRECT'\n",
+    "        return 'INCORRECT'\n",
+    "\n",
+    "    if element_type == 'FREE_TEXT':\n",
+    "        raise NotImplementedError(\"Free text correctness computation not implemented yet\")\n",
+    "\n",
+    "    raise ValueError(\"Unknown element type: {}\".format(element_type))\n",
+    "\n",
+    "df_details['correctness'] = df_details.apply(compute_correctness, axis=1)\n",
+    "df_details = df_details.dropna(subset=['correctness'])\n",
+    "print(\"{} question response details remaining with correctness\".format(len(df_details)))\n",
+    "df_details.head()\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Aggregate Metrics and Counts for Responses"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Aggregate the question response details for the participant and course level\n",
+    "df_analytics_counts = df_details.groupby(['participantId', 'courseId']).agg({\n",
+    "    'score': 'sum',\n",
+    "    'pointsAwarded': 'sum',\n",
+    "    'xpAwarded': 'sum',\n",
+    "    'timeSpent': 'sum',\n",
+    "    'elementInstanceId': ['count', 'nunique'] # count = trialsCount, nunique = responseCount\n",
+    "}).reset_index()\n",
+    "df_analytics_counts.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Count the 'CORRECT', 'PARTIAL', and 'INCORRECT' entries for each participantId and elementInstanceId combination.\n",
+    "# unstack() only creates columns for correctness values that actually occur, so the columns are\n",
+    "# reindexed to make sure all three exist (filled with 0) before they are summed up below.\n",
+    "correctness_levels = ['CORRECT', 'PARTIAL', 'INCORRECT']\n",
+    "df_analytics_corr_temp = (\n",
+    "    df_details.groupby(['participantId', 'elementInstanceId', 'courseId', 'correctness'])\n",
+    "    .size()\n",
+    "    .unstack(fill_value=0)\n",
+    "    .reindex(columns=correctness_levels, fill_value=0)\n",
+    "    .reset_index()\n",
+    ")\n",
+    "\n",
+    "# Divide each of the correctness columns by the sum of all and rename them to meanCorrect, meanPartial, meanIncorrect\n",
+    "df_analytics_corr_temp['sum'] = df_analytics_corr_temp[correctness_levels].sum(axis=1)\n",
+    "df_analytics_corr_temp['meanCorrect'] = df_analytics_corr_temp['CORRECT'] / df_analytics_corr_temp['sum']\n",
+    "df_analytics_corr_temp['meanPartial'] = df_analytics_corr_temp['PARTIAL'] / df_analytics_corr_temp['sum']\n",
+    "df_analytics_corr_temp['meanIncorrect'] = df_analytics_corr_temp['INCORRECT'] / df_analytics_corr_temp['sum']\n",
+    "\n",
+    "# Aggregate the correctness columns for each participantId and courseId\n",
+    "df_analytics_correctness = df_analytics_corr_temp.groupby(['participantId', 'courseId']).agg({\n",
+    "    'meanCorrect': 'sum',\n",
+    "    'meanPartial': 'sum',\n",
+    "    'meanIncorrect': 'sum'\n",
+    "}).reset_index().rename(columns={\n",
+    "    'meanCorrect': 'meanCorrectCount',\n",
+    "    'meanPartial': 'meanPartialCount',\n",
+    "    'meanIncorrect': 'meanWrongCount'\n",
+    "})\n",
+    "df_analytics_correctness.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Map the counts in the corresponding analytics dataframe to a single level\n",
+    "df_analytics_counts.columns = df_analytics_counts.columns.map('_'.join).str.strip('_')\n",
+    "df_analytics_counts = df_analytics_counts.rename(columns={\n",
+    "    'score_sum': 'totalScore',\n",
+    "    'pointsAwarded_sum': 'totalPoints',\n",
+    "    'xpAwarded_sum': 'totalXp',\n",
+    "    'timeSpent_sum': 'totalTimeSpent',\n",
+    "    'elementInstanceId_count': 'trialsCount',\n",
+    "    'elementInstanceId_nunique': 'responseCount'\n",
+    "})\n",
+    "df_analytics_counts.head()\n",
+    "\n",
+    "# Combine the analytics counts and correctness dataframes based on the unique participantId and courseId combinations\n",
+    "df_analytics = pd.merge(df_analytics_counts, df_analytics_correctness, on=['participantId', 'courseId'])\n",
+    "df_analytics.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Add Computed Metrics to the Database"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create daily analytics entries for all participants\n",
+    "day_timestamp = specific_date + 'T00:00:00.000Z'\n",
+    "\n",
+    "for _, entry in df_analytics.iterrows():\n",
+    "    participant_id = entry['participantId']\n",
+    "    course_id = entry['courseId']\n",
+    "\n",
+    "    # Lookup key passed to the where clause of the upsert\n",
+    "    entry_key = {\n",
+    "        'type': 'DAILY',\n",
+    "        'courseId': course_id,\n",
+    "        'participantId': participant_id,\n",
+    "        'timestamp': day_timestamp\n",
+    "    }\n",
+    "\n",
+    "    # Values written when the upsert has to create the entry\n",
+    "    new_entry = {\n",
+    "        'type': 'DAILY',\n",
+    "        'timestamp': day_timestamp,\n",
+    "        'trialsCount': entry['trialsCount'],\n",
+    "        'responseCount': entry['responseCount'],\n",
+    "        'totalScore': entry['totalScore'],\n",
+    "        'totalPoints': entry['totalPoints'],\n",
+    "        'totalXp': entry['totalXp'],\n",
+    "        'meanCorrectCount': entry['meanCorrectCount'],\n",
+    "        'meanPartialCorrectCount': entry['meanPartialCount'],\n",
+    "        'meanWrongCount': entry['meanWrongCount'],\n",
+    "        'participant': {'connect': {'id': participant_id}},\n",
+    "        'course': {'connect': {'id': course_id}}\n",
+    "    }\n",
+    "\n",
+    "    # The update payload is empty: no field values are passed for an already existing entry\n",
+    "    db.participantanalytics.upsert(\n",
+    "        where={'type_courseId_participantId_timestamp': entry_key},\n",
+    "        data={'create': new_entry, 'update': {}}\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cleanup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "db.disconnect()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "analytics-3uz8SvN3-py3.12",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/apps/analytics/pyproject.toml b/apps/analytics/pyproject.toml
@@ -2,7 +2,7 @@
 name = "@klicker-uzh/analytics"
 version = "0.0.1"
 description = ""
-authors = ["Roland Schlaefli <roland.schlaefli@df.uzh.ch>"]
+authors = ["Roland Schlaefli <roland.schlaefli@df.uzh.ch>", "Julius Schlapbach <julius.schlapbach@df.uzh.ch>"]
 license = "AGPL-3.0"
 readme = "README.md"
 packages = [{include = "@klicker_uzh"}]

diff --git a/...c/prisma/migrations/20240826113537_participant_analytics_correctness_counts/migration.sql b/...c/prisma/migrations/20240826113537_participant_analytics_correctness_counts/migration.sql
@@ -0,0 +1,16 @@
+/*
+  Warnings:
+
+  - You are about to drop the column `meanFirstCorrectCount` on the `ParticipantAnalytics` table. All the data in the column will be lost.
+  - You are about to drop the column `meanLastCorrectCount` on the `ParticipantAnalytics` table. All the data in the column will be lost.
+  - You are about to drop the column `collectedAchievements` on the `ParticipantAnalytics` table. All the data in the column will be lost.
+
+*/
+-- AlterTable
+ALTER TABLE "ParticipantAnalytics" DROP COLUMN "meanFirstCorrectCount",
+DROP COLUMN "meanLastCorrectCount",
+DROP COLUMN "collectedAchievements",
+ADD COLUMN     "firstCorrectCount" REAL,
+ADD COLUMN     "firstWrongCount" REAL,
+ADD COLUMN     "lastCorrectCount" REAL,
+ADD COLUMN     "lastWrongCount" REAL;