
Commit

Merge 7113565 into 51bbba9
sjschlapbach authored Dec 3, 2024
2 parents 51bbba9 + 7113565 commit cfe3c16
Showing 57 changed files with 2,783 additions and 2,725 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -22,4 +22,5 @@ packages/prisma/src/seed
 .turbo
 
 out/
+!out/.gitkeep
 .rollup.cache/
24 changes: 24 additions & 0 deletions apps/analytics/README.md
@@ -0,0 +1,24 @@
# KlickerUZH Analytics

This service computes learning analytics for KlickerUZH, providing insights into student learning patterns and performance metrics.

## Requirements

- Python 3.12.x (e.g., installed through `asdf`)
- Node.js 20.x.x
- Poetry

## Setup

- The project uses Poetry for dependency management and environment isolation. Make sure you have Poetry installed before proceeding. Then run `poetry install` in this folder to prepare the virtual environment.
- The project uses PNPM to simplify script execution and to provide a watch mode during development. Make sure that you have executed `pnpm install` in the repository before trying to run the commands below.
- Make sure that all `.prisma` files are available in `prisma/`. If this is not the case, run the `util/sync-schema.sh` script first.
- Make sure that a valid Python environment (3.12) is used. If Poetry tries to use an environment that does not match this specification, the install command or script execution might fail. The Python binary to use can be set explicitly with `poetry env use /Users/.../bin/python` (after which `poetry install` has to be run again). Tools like `asdf` allow clean management of multiple Python versions on a single machine.

## Available Commands

The following commands are available through PNPM:

- `pnpm generate` - Generate the Prisma client for database access in Python
- `pnpm main` - Run the analytics service
- `pnpm dev` - Start the service in watch mode for development
13 changes: 13 additions & 0 deletions apps/analytics/package.json
@@ -0,0 +1,13 @@
{
  "name": "@klicker-uzh/analytics",
  "version": "3.3.0-alpha.8",
  "license": "AGPL-3.0",
  "devDependencies": {
    "nodemon": "~3.1.7"
  },
  "scripts": {
    "dev": "doppler run --config dev -- nodemon --exec 'poetry run poe main' --watch src,prisma --ext py,prisma",
    "generate": "poetry run poe generate",
    "main": "doppler run --config dev -- poetry run poe main"
  }
}
1,053 changes: 548 additions & 505 deletions apps/analytics/poetry.lock

Large diffs are not rendered by default.

16 changes: 11 additions & 5 deletions apps/analytics/pyproject.toml
@@ -2,25 +2,31 @@
 name = "@klicker-uzh/analytics"
 version = "0.0.1"
 description = ""
-authors = ["Roland Schlaefli <roland.schlaefli@df.uzh.ch>"]
+authors = ["Roland Schlaefli <roland.schlaefli@df.uzh.ch>", "Julius Schlapbach <julius.schlapbach@df.uzh.ch>"]
 license = "AGPL-3.0"
 readme = "README.md"
-packages = [{include = "@klicker_uzh"}]
+package-mode = false
 
 [tool.poetry.dependencies]
 python = "^3.12"
 pandas = "2.2.2"
-prisma = "0.14.0"
-xlsxwriter = "^3.2.0"
+prisma = "0.15.0"
+xlsxwriter = "3.2.0"
 
 [tool.poetry.dev-dependencies]
 poethepoet = "0.27.0"
 ipykernel = "6.29.5"
 
+[tool.poetry.group.dev.dependencies]
+pyright = "1.1.376"
+
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.poe.tasks]
 generate = "prisma generate"
-main = "doppler run --config dev -- python main.py"
+main = "doppler run --config dev -- python -m src.main"
+
+[tool.pyright]
+typeCheckingMode = "strict"
2 changes: 2 additions & 0 deletions apps/analytics/src/__init__.py
@@ -0,0 +1,2 @@
from .modules import *
from .notebooks import *
File renamed without changes.
2 changes: 2 additions & 0 deletions apps/analytics/src/modules/__init__.py
@@ -0,0 +1,2 @@
from .participant_analytics import compute_correctness, get_participant_responses
from .aggregated_analytics import compute_aggregated_analytics
4 changes: 4 additions & 0 deletions apps/analytics/src/modules/aggregated_analytics/__init__.py
@@ -0,0 +1,4 @@
from .compute_aggregated_analytics import compute_aggregated_analytics
from .load_participant_analytics import load_participant_analytics
from .aggregate_participant_analytics import aggregate_participant_analytics
from .save_aggregated_analytics import save_aggregated_analytics
29 changes: 29 additions & 0 deletions apps/analytics/src/modules/aggregated_analytics/aggregate_participant_analytics.py
@@ -0,0 +1,29 @@
def aggregate_participant_analytics(df_participant_analytics, verbose=False):
    # if the dataframe is empty, return None
    if df_participant_analytics.empty:
        if verbose:
            print("No participant analytics to aggregate")

        return None

    # aggregate all participant analytics for the specified time range, separately for each course
    df_aggregated_analytics = (
        df_participant_analytics.groupby("courseId")
        .agg(
            {
                "id": "count",
                "responseCount": "sum",
                "totalScore": "sum",
                "totalPoints": "sum",
                "totalXp": "sum",
            }
        )
        .reset_index()
        .rename(
            columns={
                "id": "participantCount",
            }
        )
    )

    return df_aggregated_analytics
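For illustration, a minimal sketch of this aggregation on hypothetical input (the column names follow the `groupby`/`agg` calls above; the import path assumes the package layout of this diff and execution from `apps/analytics`):

```python
import pandas as pd

from src.modules.aggregated_analytics.aggregate_participant_analytics import (
    aggregate_participant_analytics,
)

# hypothetical analytics: two participants in course "c1", one in "c2"
df = pd.DataFrame(
    [
        {"id": 1, "courseId": "c1", "responseCount": 10, "totalScore": 50, "totalPoints": 80, "totalXp": 120},
        {"id": 2, "courseId": "c1", "responseCount": 5, "totalScore": 20, "totalPoints": 30, "totalXp": 40},
        {"id": 3, "courseId": "c2", "responseCount": 7, "totalScore": 35, "totalPoints": 50, "totalXp": 70},
    ]
)

print(aggregate_participant_analytics(df))
# "c1" ends up with participantCount 2 and summed metrics, "c2" with participantCount 1
```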
34 changes: 34 additions & 0 deletions apps/analytics/src/modules/aggregated_analytics/compute_aggregated_analytics.py
@@ -0,0 +1,34 @@
from .load_participant_analytics import load_participant_analytics
from .aggregate_participant_analytics import aggregate_participant_analytics
from .save_aggregated_analytics import save_aggregated_analytics


def compute_aggregated_analytics(
    db, start_date, end_date, timestamp, analytics_type="DAILY", verbose=False
):
    # load all participant analytics for the given timestamp and analytics time range
    df_participant_analytics = load_participant_analytics(
        db, timestamp, analytics_type, verbose
    )

    # aggregate all participant analytics values by course
    df_aggregated_analytics = aggregate_participant_analytics(
        df_participant_analytics, verbose
    )

    if df_aggregated_analytics is not None and verbose:
        print(f"Aggregated analytics for time range: {start_date} to {end_date}")
        print(df_aggregated_analytics.head())
    elif df_aggregated_analytics is None:
        print(
            f"No aggregated analytics to compute for time range: {start_date} to {end_date}"
        )

    # store the computed aggregated analytics in the database
    if df_aggregated_analytics is not None:
        save_aggregated_analytics(
            db, df_aggregated_analytics, timestamp, analytics_type
        )
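A hedged usage sketch (assuming the generated Prisma Python client and the ISO-8601 timestamp strings used elsewhere in this service; the date values are placeholders):

```python
from prisma import Prisma

from src.modules.aggregated_analytics.compute_aggregated_analytics import (
    compute_aggregated_analytics,
)

db = Prisma()
db.connect()

# aggregate one day of participant analytics into daily course-level entries
compute_aggregated_analytics(
    db,
    start_date="2024-12-02T00:00:00.000Z",
    end_date="2024-12-03T00:00:00.000Z",
    timestamp="2024-12-02T00:00:00.000Z",
    analytics_type="DAILY",
    verbose=True,
)

db.disconnect()
```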
30 changes: 30 additions & 0 deletions apps/analytics/src/modules/aggregated_analytics/load_participant_analytics.py
@@ -0,0 +1,30 @@
import pandas as pd


def convert_to_df(analytics):
    # convert the database query result into a pandas dataframe
    rows = []
    for item in analytics:
        rows.append(dict(item))

    return pd.DataFrame(rows)


def load_participant_analytics(db, timestamp, analytics_type, verbose=False):
    participant_analytics = db.participantanalytics.find_many(
        where={"timestamp": timestamp, "type": analytics_type},
    )

    if verbose:
        # print the number of matching analytics and the first entry
        print(
            "Found {} analytics for timestamp {} and type {}".format(
                len(participant_analytics), timestamp, analytics_type
            )
        )
        if participant_analytics:
            print(participant_analytics[0])

    # convert the analytics to a dataframe
    df_loaded_analytics = convert_to_df(participant_analytics)

    return df_loaded_analytics
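`convert_to_df` only needs dict-convertible records, so it can be sanity-checked without a database (a sketch with hypothetical rows):

```python
import pandas as pd


def convert_to_df(analytics):
    # same behavior as above, written as a comprehension
    return pd.DataFrame([dict(item) for item in analytics])


records = [
    {"id": 1, "courseId": "c1", "responseCount": 10},
    {"id": 2, "courseId": "c2", "responseCount": 5},
]
print(convert_to_df(records))
# a 2x3 dataframe with columns id, courseId, responseCount
```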
108 changes: 108 additions & 0 deletions apps/analytics/src/modules/aggregated_analytics/save_aggregated_analytics.py
@@ -0,0 +1,108 @@
from datetime import datetime


def save_aggregated_analytics(db, df_analytics, timestamp, analytics_type="DAILY"):
    computedAt = datetime.now().strftime("%Y-%m-%d") + "T00:00:00.000Z"

    # create daily / weekly / monthly analytics entries for all participants
    if analytics_type in ["DAILY", "WEEKLY", "MONTHLY"]:
        for _, row in df_analytics.iterrows():
            db.aggregatedanalytics.upsert(
                where={
                    "type_courseId_timestamp": {
                        "type": analytics_type,
                        "courseId": row["courseId"],
                        "timestamp": timestamp,
                    }
                },
                data={
                    "create": {
                        "type": analytics_type,
                        "timestamp": timestamp,
                        "computedAt": computedAt,
                        "participantCount": row["participantCount"],
                        "responseCount": row["responseCount"],
                        "totalScore": row["totalScore"],
                        "totalPoints": row["totalPoints"],
                        "totalXp": row["totalXp"],
                        # TODO: set this value correctly for rolling updates in production code
                        # (cannot be computed for past learning analytics -> therefore set to invalid value)
                        "totalElementsAvailable": -1,
                        "course": {"connect": {"id": row["courseId"]}},
                    },
                    "update": {},
                },
            )

    # create or update course-wide analytics entries (unique per course for the COURSE analytics type)
    elif analytics_type == "COURSE":
        for _, row in df_analytics.iterrows():
            course = db.course.find_unique_or_raise(
                where={"id": row["courseId"]},
                include={
                    "practiceQuizzes": {
                        "include": {
                            "stacks": {
                                "include": {"elements": True},
                            }
                        }
                    },
                    "microLearnings": {
                        "include": {
                            "stacks": {
                                "include": {"elements": True},
                            }
                        }
                    },
                },
            )
            course = dict(course)

            # add the number of elements in all practice quizzes and microlearnings together
            totalElementsAvailable = 0
            for practice_quiz in course["practiceQuizzes"]:
                pq_dict = dict(practice_quiz)
                for stack in pq_dict["stacks"]:
                    stack_dict = dict(stack)
                    totalElementsAvailable += len(stack_dict["elements"])
            for microlearning in course["microLearnings"]:
                ml_dict = dict(microlearning)
                for stack in ml_dict["stacks"]:
                    stack_dict = dict(stack)
                    totalElementsAvailable += len(stack_dict["elements"])

            db.aggregatedanalytics.upsert(
                where={
                    "type_courseId_timestamp": {
                        "type": analytics_type,
                        "courseId": row["courseId"],
                        "timestamp": timestamp,
                    }
                },
                data={
                    "create": {
                        "type": analytics_type,
                        "timestamp": timestamp,
                        "computedAt": computedAt,
                        "participantCount": row["participantCount"],
                        "responseCount": row["responseCount"],
                        "totalScore": row["totalScore"],
                        "totalPoints": row["totalPoints"],
                        "totalXp": row["totalXp"],
                        "totalElementsAvailable": totalElementsAvailable,
                        "course": {"connect": {"id": row["courseId"]}},
                    },
                    "update": {
                        "computedAt": computedAt,
                        "participantCount": row["participantCount"],
                        "responseCount": row["responseCount"],
                        "totalScore": row["totalScore"],
                        "totalPoints": row["totalPoints"],
                        "totalXp": row["totalXp"],
                        "totalElementsAvailable": totalElementsAvailable,
                    },
                },
            )

    else:
        raise ValueError("Unknown analytics type: {}".format(analytics_type))
5 changes: 5 additions & 0 deletions apps/analytics/src/modules/participant_analytics/__init__.py
@@ -0,0 +1,5 @@
from .compute_correctness import compute_correctness
from .get_participant_responses import get_participant_responses
from .aggregate_analytics import aggregate_analytics
from .save_participant_analytics import save_participant_analytics
from .compute_participant_course_analytics import compute_participant_course_analytics