
Commit 2948359

Merge b04c32d into 70c203f

sjschlapbach authored Aug 26, 2024
2 parents 70c203f + b04c32d
Showing 4 changed files with 382 additions and 12 deletions.
353 changes: 353 additions & 0 deletions apps/analytics/daily_participant_analytics.ipynb
@@ -0,0 +1,353 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preparation & Data Fetching"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"from datetime import datetime\n",
"from prisma import Prisma\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"db = Prisma()\n",
"\n",
"# set the environment variable DATABASE_URL to the connection string of your database\n",
"os.environ['DATABASE_URL'] = 'postgresql://klicker:klicker@localhost:5432/klicker-prod'\n",
"\n",
"db.connect()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fetch all question response detail entries for a specific day\n",
"specific_date = '2024-06-10'\n",
"date_start = specific_date + 'T00:00:00.000Z'\n",
"date_end = specific_date + 'T23:59:59.999Z'\n",
"participant_response_details = db.participant.find_many(\n",
" include={\n",
" 'detailQuestionResponses': {\n",
" 'where': {\n",
" 'createdAt': {\n",
" 'gte': date_start,\n",
" 'lte': date_end\n",
" }\n",
" },\n",
" 'include': {\n",
" 'practiceQuiz': True,\n",
" 'microLearning': True\n",
" }\n",
" },\n",
" }\n",
")\n",
"\n",
"# Print the first 5 question response details\n",
"print(\"Found {} participants for the timespan from {} to {}\".format(len(participant_response_details), date_start, date_end))\n",
"print(participant_response_details[0])\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compute Correctness Metrics for Responses"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convert the question response details to a pandas dataframe\n",
"def map_details(detail, participantId):\n",
" courseId = detail['practiceQuiz']['courseId'] if detail['practiceQuiz'] else detail['microLearning']['courseId']\n",
" return {\n",
" **detail,\n",
" 'participantId': participantId,\n",
" 'courseId': courseId\n",
" }\n",
"\n",
"def map_participants(participant):\n",
" participant_dict = participant.dict()\n",
" return list(map(lambda detail: map_details(detail, participant_dict['id']), participant_dict['detailQuestionResponses']))\n",
"\n",
"def convert_to_df(participants):\n",
" return pd.DataFrame([item for sublist in list(map(map_participants, participants)) for item in sublist])\n",
"\n",
"df_details = convert_to_df(participant_response_details)\n",
"df_details = df_details[['score', 'pointsAwarded', 'xpAwarded', 'timeSpent', 'response', 'elementInstanceId', 'participantId', 'courseId']]\n",
"print(\"Question detail responses:\", len(df_details))\n",
"print(\"Columns:\", df_details.columns)\n",
"df_details.head()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Compute correctness of the responses and add them as a separate column\n",
"# Get related element instances\n",
"element_instance_ids = df_details['elementInstanceId'].unique()\n",
"element_instances = db.elementinstance.find_many(\n",
" where={\n",
" 'id': {\n",
" 'in': element_instance_ids.tolist()\n",
" }\n",
" }\n",
")\n",
"\n",
"# Map the element instances to the corresponding elementData.options entries and convert it to a dataframe\n",
"def map_element_instance_options(instance):\n",
" instance_dict = instance.dict()\n",
" return {\n",
" 'elementInstanceId': instance_dict['id'],\n",
" 'type': instance_dict['elementData']['type'],\n",
" 'options': instance_dict['elementData']['options'] if 'options' in instance_dict['elementData'] else None\n",
" }\n",
"\n",
"df_element_instances = pd.DataFrame(list(map(map_element_instance_options, element_instances)))\n",
"df_element_instances.head()\n",
"\n",
"# Compute the correctness for every response entry based on the element instance options (depending on the type of the element)\n",
"def compute_correctness(row):\n",
" element_instance = df_element_instances[df_element_instances['elementInstanceId'] == row['elementInstanceId']].iloc[0]\n",
" response = row['response']\n",
" options = element_instance['options']\n",
"\n",
" if element_instance['type'] == 'FLASHCARD' or element_instance['type'] == 'CONTENT':\n",
" return None\n",
"\n",
" elif element_instance['type'] == 'SC':\n",
" selected_choice = response['choices'][0]\n",
" correct_choice = next((choice['ix'] for choice in options['choices'] if choice['correct']), None)\n",
" return 'CORRECT' if selected_choice == correct_choice else 'INCORRECT'\n",
"\n",
" elif element_instance['type'] == 'MC' or element_instance['type'] == 'KPRIM':\n",
" selected_choices = response['choices']\n",
" correct_choices = [choice['ix'] for choice in options['choices'] if choice['correct']]\n",
" available_choices = len(options['choices'])\n",
" \n",
" selected_choices_array = [1 if ix in selected_choices else 0 for ix in range(available_choices)]\n",
" correct_choices_array = [1 if ix in correct_choices else 0 for ix in range(available_choices)]\n",
" hamming_distance = sum([1 for i in range(available_choices) if selected_choices_array[i] != correct_choices_array[i]])\n",
"\n",
" if element_instance['type'] == 'MC':\n",
" correctness = max(-2 * hamming_distance / available_choices + 1, 0)\n",
" if correctness == 1:\n",
" return 'CORRECT'\n",
" elif correctness == 0:\n",
" return 'INCORRECT'\n",
" else:\n",
" return 'PARTIAL'\n",
" elif element_instance['type'] == 'KPRIM':\n",
" return 'CORRECT' if hamming_distance == 0 else 'PARTIAL' if hamming_distance == 1 else 'INCORRECT'\n",
"\n",
" elif element_instance['type'] == 'NUMERICAL':\n",
" response_value = float(response['value'])\n",
" within_range = list(map(lambda range: float(range['min']) <= response_value <= float(range['max']), options['solutionRanges']))\n",
" if any(within_range):\n",
" return 'CORRECT'\n",
"\n",
" return 'INCORRECT'\n",
"\n",
" elif element_instance['type'] == 'FREE_TEXT':\n",
" raise NotImplementedError(\"Free text correctness computation not implemented yet\")\n",
"\n",
" else:\n",
" raise ValueError(\"Unknown element type: {}\".format(element_instance['type']))\n",
"\n",
"df_details['correctness'] = df_details.apply(compute_correctness, axis=1)\n",
"df_details = df_details.dropna(subset=['correctness'])\n",
"print(\"{} question response details remaining with correctness\".format(len(df_details)))\n",
"df_details.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Aggregate Metrics and Counts for Responses"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aggregate the question response details for the participant and course level\n",
"df_analytics_counts = df_details.groupby(['participantId', 'courseId']).agg({\n",
" 'score': 'sum',\n",
" 'pointsAwarded': 'sum',\n",
" 'xpAwarded': 'sum',\n",
" 'timeSpent': 'sum',\n",
" 'elementInstanceId': ['count', 'nunique'] # count = trialsCount, nunique = responseCount\n",
"}).reset_index()\n",
"df_analytics_counts.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Count the 'CORRECT', 'PARTIAL', and 'INCORRECT' entries for each participantId and elementInstanceId combination\n",
"df_analytics_corr_temp = df_details.groupby(['participantId', 'elementInstanceId', 'courseId', 'correctness']).size().unstack(fill_value=0).reset_index()\n",
"\n",
"# Divide each of the correctness columns by the sum of all and rename them to meanCorrect, meanPartial, meanIncorrect\n",
"df_analytics_corr_temp['sum'] = df_analytics_corr_temp['CORRECT'] + df_analytics_corr_temp['PARTIAL'] + df_analytics_corr_temp['INCORRECT']\n",
"df_analytics_corr_temp['meanCorrect'] = df_analytics_corr_temp['CORRECT'] / df_analytics_corr_temp['sum']\n",
"df_analytics_corr_temp['meanPartial'] = df_analytics_corr_temp['PARTIAL'] / df_analytics_corr_temp['sum']\n",
"df_analytics_corr_temp['meanIncorrect'] = df_analytics_corr_temp['INCORRECT'] / df_analytics_corr_temp['sum']\n",
"\n",
"# Aggregate the correctness columns for each participantId and courseId\n",
"df_analytics_correctness = df_analytics_corr_temp.groupby(['participantId', 'courseId']).agg({\n",
" 'meanCorrect': 'sum',\n",
" 'meanPartial': 'sum',\n",
" 'meanIncorrect': 'sum'\n",
"}).reset_index().rename(columns={\n",
" 'meanCorrect': 'meanCorrectCount',\n",
" 'meanPartial': 'meanPartialCount',\n",
" 'meanIncorrect': 'meanWrongCount'\n",
"})\n",
"df_analytics_correctness.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Map the counts in the corresponding analytics dataframe to a single level\n",
"df_analytics_counts.columns = df_analytics_counts.columns.map('_'.join).str.strip('_')\n",
"df_analytics_counts = df_analytics_counts.rename(columns={\n",
" 'score_sum': 'totalScore',\n",
" 'pointsAwarded_sum': 'totalPoints',\n",
" 'xpAwarded_sum': 'totalXp',\n",
" 'timeSpent_sum': 'totalTimeSpent',\n",
" 'elementInstanceId_count': 'trialsCount',\n",
" 'elementInstanceId_nunique': 'responseCount'\n",
"})\n",
"df_analytics_counts.head()\n",
"\n",
"# Combine the analytics counts and correctness dataframes based on the unique participantId and courseId combinations\n",
"df_analytics = pd.merge(df_analytics_counts, df_analytics_correctness, on=['participantId', 'courseId'])\n",
"df_analytics.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Add Computed Metrics to the Database"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create daily analytics entries for all participants\n",
"for index, row in df_analytics.iterrows():\n",
" db.participantanalytics.upsert(\n",
" where={\n",
" 'type_courseId_participantId_timestamp': {\n",
" 'type': 'DAILY',\n",
" 'courseId': row['courseId'],\n",
" 'participantId': row['participantId'],\n",
" 'timestamp': specific_date + 'T00:00:00.000Z'\n",
" }\n",
" },\n",
" data={\n",
" 'create': {\n",
" 'type': 'DAILY',\n",
" 'timestamp': specific_date + 'T00:00:00.000Z',\n",
" 'trialsCount': row['trialsCount'],\n",
" 'responseCount': row['responseCount'],\n",
" 'totalScore': row['totalScore'],\n",
" 'totalPoints': row['totalPoints'],\n",
" 'totalXp': row['totalXp'],\n",
" 'meanCorrectCount': row['meanCorrectCount'],\n",
" 'meanPartialCorrectCount': row['meanPartialCount'],\n",
" 'meanWrongCount': row['meanWrongCount'],\n",
" 'participant': {\n",
" 'connect': {\n",
" 'id': row['participantId']\n",
" }\n",
" },\n",
" 'course': {\n",
" 'connect': {\n",
" 'id': row['courseId']\n",
" }\n",
" }\n",
" },\n",
" 'update': {}\n",
" }\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cleanup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"db.disconnect()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "analytics-3uz8SvN3-py3.12",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
2 changes: 1 addition & 1 deletion apps/analytics/pyproject.toml
@@ -2,7 +2,7 @@
name = "@klicker-uzh/analytics"
version = "0.0.1"
description = ""
-authors = ["Roland Schlaefli <roland.schlaefli@df.uzh.ch>"]
+authors = ["Roland Schlaefli <roland.schlaefli@df.uzh.ch>", "Julius Schlapbach <julius.schlapbach@df.uzh.ch>"]
license = "AGPL-3.0"
readme = "README.md"
packages = [{include = "@klicker_uzh"}]
@@ -0,0 +1,16 @@
/*
Warnings:
- You are about to drop the column `meanFirstCorrectCount` on the `ParticipantAnalytics` table. All the data in the column will be lost.
- You are about to drop the column `meanLastCorrectCount` on the `ParticipantAnalytics` table. All the data in the column will be lost.
- You are about to drop the column `collectedAchievements` on the `ParticipantAnalytics` table. All the data in the column will be lost.
*/
-- AlterTable
ALTER TABLE "ParticipantAnalytics" DROP COLUMN "meanFirstCorrectCount",
DROP COLUMN "meanLastCorrectCount",
DROP COLUMN "collectedAchievements",
ADD COLUMN "firstCorrectCount" REAL,
ADD COLUMN "firstWrongCount" REAL,
ADD COLUMN "lastCorrectCount" REAL,
ADD COLUMN "lastWrongCount" REAL;
