diff --git a/AUTHORS.md b/AUTHORS.md index 85b0208..42d33e3 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -16,4 +16,7 @@ [Fangzhou Yao](https://github.com/fannazya) +[Longhu Qin](https://github.com/KenelmQLH) + The stared contributors are the main authors. + diff --git a/EduData/DataSet/download_data/download_data.py b/EduData/DataSet/download_data/download_data.py index 5285903..7f87413 100644 --- a/EduData/DataSet/download_data/download_data.py +++ b/EduData/DataSet/download_data/download_data.py @@ -70,7 +70,9 @@ "ktbd-ednet": "http://base.ustc.edu.cn/data/ktbd/EdNet/", "math23k": - "http://base.ustc.edu.cn/data/math23k.zip" + "http://base.ustc.edu.cn/data/math23k.zip", + "OLI-Fall-2011": + "http://base.ustc.edu.cn/data/OLI_data.zip" } diff --git a/README.md b/README.md index 56d17fc..550c923 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ The dataset includes: * [ASSISTments](https://sites.google.com/site/assistmentsdata/) [[Analysis]](docs/ASSISTments) -* [OLI Engineering Statics 2011](https://pslcdatashop.web.cmu.edu/DatasetInfo?datasetId=507) [Analysis] (TBA) +* [OLI Engineering Statics 2011](https://pslcdatashop.web.cmu.edu/DatasetInfo?datasetId=507) [[Analysis]](docs/OLI_Fall2011) * [JunyiAcademy Math Practicing Log](https://pslcdatashop.web.cmu.edu/DatasetInfo?datasetId=1198) [[Analysis]](docs/junyi.md) diff --git a/docs/ASSISTments/ASSISTments2009-2010.ipynb b/docs/ASSISTments/ASSISTments2009-2010.ipynb index 78bbcde..04c5b47 100644 --- a/docs/ASSISTments/ASSISTments2009-2010.ipynb +++ b/docs/ASSISTments/ASSISTments2009-2010.ipynb @@ -1321,7 +1321,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.6" + "version": "3.6.3" } }, "nbformat": 4, diff --git a/docs/OLI_Fall2011/OLI_2011F_problem.ipynb b/docs/OLI_Fall2011/OLI_2011F_problem.ipynb new file mode 100644 index 0000000..83581f5 --- /dev/null +++ b/docs/OLI_Fall2011/OLI_2011F_problem.ipynb @@ -0,0 +1,1708 @@ +{ + "cells": [ + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "# OLI data in Fall 2011 (problem)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import pandas as pd\n", + "import numpy as np\n", + "# global configuration: show every rows and cols\n", + "pd.set_option('display.max_rows', None)\n", + "pd.set_option('max_colwidth',None)\n", + "pd.set_option('display.max_columns', None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Data Description\n", + "## 1.1 Column Description" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FieldAnnotation
0RowA row counter.
1SampleThe sample that includes this problem. If you select more than one sample to export, problems that occur in more than one sample will be duplicated in the export.
2Anon Student IDThe student that worked on the problem.
3Problem HierarchyThe location in the curriculum hierarchy where this problem occurs.
4Problem NameThe name of the problem.
5Problem ViewThe number of times the student encountered the problem so far. This counter increases with each instance of the same problem. See \"Problem View\" in the \"By Student-Step\" table above.
6Problem Start TimeIf the problem start time is not given in the original log data, then it is set to the time of the last transaction of the prior problem. If there is no prior problem for the session, the time of the earliest transaction is used. Earliest transaction time is equivalent to the minimum transaction time for the earliest step of the problem. For more detail on how problem start time is determined, see Determining Problem Start Time.
7Problem End TimeDerived from the maximum transaction time of the latest step of the problem.
8Latency (sec)The amount of time the student spent on this problem. Specifically, the difference between the problem start time and the last transaction on this problem.
9Steps Missing Start TimesThe number of steps (from the student-step table) with \"Step Start Time\" values of \"null\".
10HintsTotal number of hints the student requested for this problem.
11IncorrectsTotal number of incorrect attempts the student made on this problem.
12CorrectsTotal number of correct attempts the student made for this problem.
13Avg CorrectsThe total number of correct attempts / total number of steps in the problem.
14StepsTotal number of steps the student took while working on the problem.
15Avg Assistance ScoreCalculated as (total hints requested + total incorrect attempts) / total steps.
16Correct First AttemptsTotal number of correct first attempts made by the student for this problem.
17ConditionThe name and type of the condition the student is assigned to. In the case of a student assigned to multiple conditions (factors in a factorial design), condition names are separated by a comma and space. This differs from the transaction format, which optionally has \"Condition Name\" and \"Condition Type\" columns.
18KCsTotal number of KCs practiced by the student for this problem.
19Steps without KCsTotal number of steps in this problem (performed by the student) without an assigned KC.
20KC ListComma-delimited list of KCs practiced by the student for this problem.
\n", + "
" + ], + "text/plain": [ + " Field \\\n", + "0 Row \n", + "1 Sample \n", + "2 Anon Student ID \n", + "3 Problem Hierarchy \n", + "4 Problem Name \n", + "5 Problem View \n", + "6 Problem Start Time \n", + "7 Problem End Time \n", + "8 Latency (sec) \n", + "9 Steps Missing Start Times \n", + "10 Hints \n", + "11 Incorrects \n", + "12 Corrects \n", + "13 Avg Corrects \n", + "14 Steps \n", + "15 Avg Assistance Score \n", + "16 Correct First Attempts \n", + "17 Condition \n", + "18 KCs \n", + "19 Steps without KCs \n", + "20 KC List \n", + "\n", + " Annotation \n", + "0 A row counter. \n", + "1 The sample that includes this problem. If you select more than one sample to export, problems that occur in more than one sample will be duplicated in the export. \n", + "2 The student that worked on the problem. \n", + "3 The location in the curriculum hierarchy where this problem occurs. \n", + "4 The name of the problem. \n", + "5 The number of times the student encountered the problem so far. This counter increases with each instance of the same problem. See \"Problem View\" in the \"By Student-Step\" table above. \n", + "6 If the problem start time is not given in the original log data, then it is set to the time of the last transaction of the prior problem. If there is no prior problem for the session, the time of the earliest transaction is used. Earliest transaction time is equivalent to the minimum transaction time for the earliest step of the problem. For more detail on how problem start time is determined, see Determining Problem Start Time. \n", + "7 Derived from the maximum transaction time of the latest step of the problem. \n", + "8 The amount of time the student spent on this problem. Specifically, the difference between the problem start time and the last transaction on this problem. \n", + "9 The number of steps (from the student-step table) with \"Step Start Time\" values of \"null\". \n", + "10 Total number of hints the student requested for this problem. 
\n", + "11 Total number of incorrect attempts the student made on this problem. \n", + "12 Total number of correct attempts the student made for this problem. \n", + "13 The total number of correct attempts / total number of steps in the problem. \n", + "14 Total number of steps the student took while working on the problem. \n", + "15 Calculated as (total hints requested + total incorrect attempts) / total steps. \n", + "16 Total number of correct first attempts made by the student for this problem. \n", + "17 The name and type of the condition the student is assigned to. In the case of a student assigned to multiple conditions (factors in a factorial design), condition names are separated by a comma and space. This differs from the transaction format, which optionally has \"Condition Name\" and \"Condition Type\" columns. \n", + "18 Total number of KCs practiced by the student for this problem. \n", + "19 Total number of steps in this problem (performed by the student) without an assigned KC. \n", + "20 Comma-delimited list of KCs practiced by the student for this problem. " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# help_table3: the description for data by problems\n", + "df3 = pd.read_csv('OLI_data/help_table3.csv',sep=',',encoding=\"gbk\")\n", + "df3 = df3.loc[:, ['Field', 'Annotation']]\n", + "df3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.2 Summarization of Data\n", + "\n", + "**This table organizes the data as student-problem**" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RowSampleAnon Student IdProblem HierarchyProblem NameProblem ViewProblem Start TimeProblem End TimeLatency (sec)Steps Missing Start TimesHintsIncorrectsCorrectsAvg CorrectsStepsAvg Assistance ScoreCorrect First AttemptsConditionKCs (F2011)Steps without KCs (F2011)KC List (F2011)KCs (Single-KC)Steps without KCs (Single-KC)KC List (Single-KC)KCs (Unique-step)Steps without KCs (Unique-step)KC List (Unique-step)
01All DataStu_00b2b35fd027e7891e8a1a527125dd65sequence Statics, unit Concentrated Forces and Their Effects, module Introduction to Free Body Diagrams_m2_assess12011/9/21 17:352011/9/21 17:350009120.571210.42912NaN50gravitational_forces, identify_interaction, represent_interaction_cord, represent_interaction_spring, simple_step10Single-KC021.
12All DataStu_00b2b35fd027e7891e8a1a527125dd65sequence Statics, unit Concentrated Forces and Their Effects, module Effects of Forcetutor_03_0112011/9/21 17:492011/9/21 17:49900031.00030.0003NaN10distinguish_rotation_translation10Single-KC30KC523, KC680, KC768
\n", + "
" + ], + "text/plain": [ + " Row Sample Anon Student Id \\\n", + "0 1 All Data Stu_00b2b35fd027e7891e8a1a527125dd65 \n", + "1 2 All Data Stu_00b2b35fd027e7891e8a1a527125dd65 \n", + "\n", + " Problem Hierarchy \\\n", + "0 sequence Statics, unit Concentrated Forces and Their Effects, module Introduction to Free Body Diagrams \n", + "1 sequence Statics, unit Concentrated Forces and Their Effects, module Effects of Force \n", + "\n", + " Problem Name Problem View Problem Start Time Problem End Time \\\n", + "0 _m2_assess 1 2011/9/21 17:35 2011/9/21 17:35 \n", + "1 tutor_03_01 1 2011/9/21 17:49 2011/9/21 17:49 \n", + "\n", + " Latency (sec) Steps Missing Start Times Hints Incorrects Corrects \\\n", + "0 0 0 0 9 12 \n", + "1 9 0 0 0 3 \n", + "\n", + " Avg Corrects Steps Avg Assistance Score Correct First Attempts \\\n", + "0 0.571 21 0.429 12 \n", + "1 1.000 3 0.000 3 \n", + "\n", + " Condition KCs (F2011) Steps without KCs (F2011) \\\n", + "0 NaN 5 0 \n", + "1 NaN 1 0 \n", + "\n", + " KC List (F2011) \\\n", + "0 gravitational_forces, identify_interaction, represent_interaction_cord, represent_interaction_spring, simple_step \n", + "1 distinguish_rotation_translation \n", + "\n", + " KCs (Single-KC) Steps without KCs (Single-KC) KC List (Single-KC) \\\n", + "0 1 0 Single-KC \n", + "1 1 0 Single-KC \n", + "\n", + " KCs (Unique-step) Steps without KCs (Unique-step) KC List (Unique-step) \n", + "0 0 21 . \n", + "1 3 0 KC523, KC680, KC768 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_problem = pd.read_csv('OLI_data/AllData_problem_2011F.csv',low_memory=False) # sep=\"\\t\"\n", + "df_problem.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Data Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RowProblem ViewLatency (sec)Steps Missing Start TimesHintsIncorrectsCorrectsAvg CorrectsStepsAvg Assistance ScoreCorrect First AttemptsConditionKCs (F2011)Steps without KCs (F2011)KCs (Single-KC)Steps without KCs (Single-KC)KCs (Unique-step)Steps without KCs (Unique-step)
count45002.00000045002.00000045002.00000045002.00000045002.00000045002.00000045002.00000045002.00000045002.00000045002.00000045002.0000000.045002.00000045002.00000045002.045002.045002.00000045002.000000
mean22501.5000001.22114685.6398830.0070000.6202171.6444604.1763250.9595714.3319630.9280143.219479NaN1.2239231.7989201.00.04.2896540.042309
std12991.1027441.140622301.8953740.1067481.9563023.3782115.1257420.3588505.0794842.2219074.603916NaN1.7338563.8304710.00.05.0844900.557118
min1.0000001.0000000.0000000.0000000.0000000.0000000.0000000.0000001.0000000.0000000.000000NaN0.0000000.0000001.00.00.0000000.000000
25%11251.2500001.0000000.0000000.0000000.0000000.0000001.0000001.0000001.0000000.0000001.000000NaN0.0000000.0000001.00.01.0000000.000000
50%22501.5000001.00000020.0000000.0000000.0000001.0000003.0000001.0000003.0000000.2500002.000000NaN1.0000000.0000001.00.03.0000000.000000
75%33751.7500001.00000073.0000000.0000000.0000002.0000005.0000001.0000005.0000001.0000004.000000NaN2.0000002.0000001.00.05.0000000.000000
max45002.00000032.00000020426.0000008.00000050.000000413.000000232.00000019.33300032.000000210.50000032.000000NaN9.00000032.0000001.00.032.00000029.000000
\n", + "
" + ], + "text/plain": [ + " Row Problem View Latency (sec) Steps Missing Start Times \\\n", + "count 45002.000000 45002.000000 45002.000000 45002.000000 \n", + "mean 22501.500000 1.221146 85.639883 0.007000 \n", + "std 12991.102744 1.140622 301.895374 0.106748 \n", + "min 1.000000 1.000000 0.000000 0.000000 \n", + "25% 11251.250000 1.000000 0.000000 0.000000 \n", + "50% 22501.500000 1.000000 20.000000 0.000000 \n", + "75% 33751.750000 1.000000 73.000000 0.000000 \n", + "max 45002.000000 32.000000 20426.000000 8.000000 \n", + "\n", + " Hints Incorrects Corrects Avg Corrects Steps \\\n", + "count 45002.000000 45002.000000 45002.000000 45002.000000 45002.000000 \n", + "mean 0.620217 1.644460 4.176325 0.959571 4.331963 \n", + "std 1.956302 3.378211 5.125742 0.358850 5.079484 \n", + "min 0.000000 0.000000 0.000000 0.000000 1.000000 \n", + "25% 0.000000 0.000000 1.000000 1.000000 1.000000 \n", + "50% 0.000000 1.000000 3.000000 1.000000 3.000000 \n", + "75% 0.000000 2.000000 5.000000 1.000000 5.000000 \n", + "max 50.000000 413.000000 232.000000 19.333000 32.000000 \n", + "\n", + " Avg Assistance Score Correct First Attempts Condition KCs (F2011) \\\n", + "count 45002.000000 45002.000000 0.0 45002.000000 \n", + "mean 0.928014 3.219479 NaN 1.223923 \n", + "std 2.221907 4.603916 NaN 1.733856 \n", + "min 0.000000 0.000000 NaN 0.000000 \n", + "25% 0.000000 1.000000 NaN 0.000000 \n", + "50% 0.250000 2.000000 NaN 1.000000 \n", + "75% 1.000000 4.000000 NaN 2.000000 \n", + "max 210.500000 32.000000 NaN 9.000000 \n", + "\n", + " Steps without KCs (F2011) KCs (Single-KC) \\\n", + "count 45002.000000 45002.0 \n", + "mean 1.798920 1.0 \n", + "std 3.830471 0.0 \n", + "min 0.000000 1.0 \n", + "25% 0.000000 1.0 \n", + "50% 0.000000 1.0 \n", + "75% 2.000000 1.0 \n", + "max 32.000000 1.0 \n", + "\n", + " Steps without KCs (Single-KC) KCs (Unique-step) \\\n", + "count 45002.0 45002.000000 \n", + "mean 0.0 4.289654 \n", + "std 0.0 5.084490 \n", + "min 0.0 0.000000 \n", + "25% 0.0 1.000000 
\n", + "50% 0.0 3.000000 \n", + "75% 0.0 5.000000 \n", + "max 0.0 32.000000 \n", + "\n", + " Steps without KCs (Unique-step) \n", + "count 45002.000000 \n", + "mean 0.042309 \n", + "std 0.557118 \n", + "min 0.000000 \n", + "25% 0.000000 \n", + "50% 0.000000 \n", + "75% 0.000000 \n", + "max 29.000000 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_problem.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## (1)Analysis for Null and Unique value of column attributes" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------------num_unique_toal and num_nonull_toal----------------------\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col_namenum_nonullnum_nullnum_unique
0Row45002045002
1Sample4500201
2Anon Student Id450020333
3Problem Hierarchy45002027
4Problem Name450020300
5Problem View45002032
6Problem Start Time45002025983
7Problem End Time45002025884
8Latency (sec)4500201290
9Steps Missing Start Times4500208
10Hints45002035
11Incorrects45002037
12Corrects45002051
13Avg Corrects450020195
14Steps45002031
15Avg Assistance Score450020335
16Correct First Attempts45002033
17Condition0450021
18KCs (F2011)45002010
19Steps without KCs (F2011)45002031
20KC List (F2011)450020170
21KCs (Single-KC)4500201
22Steps without KCs (Single-KC)4500201
23KC List (Single-KC)4500201
24KCs (Unique-step)45002032
25Steps without KCs (Unique-step)45002016
26KC List (Unique-step)4500201470
\n", + "
" + ], + "text/plain": [ + " col_name num_nonull num_null num_unique\n", + "0 Row 45002 0 45002\n", + "1 Sample 45002 0 1\n", + "2 Anon Student Id 45002 0 333\n", + "3 Problem Hierarchy 45002 0 27\n", + "4 Problem Name 45002 0 300\n", + "5 Problem View 45002 0 32\n", + "6 Problem Start Time 45002 0 25983\n", + "7 Problem End Time 45002 0 25884\n", + "8 Latency (sec) 45002 0 1290\n", + "9 Steps Missing Start Times 45002 0 8\n", + "10 Hints 45002 0 35\n", + "11 Incorrects 45002 0 37\n", + "12 Corrects 45002 0 51\n", + "13 Avg Corrects 45002 0 195\n", + "14 Steps 45002 0 31\n", + "15 Avg Assistance Score 45002 0 335\n", + "16 Correct First Attempts 45002 0 33\n", + "17 Condition 0 45002 1\n", + "18 KCs (F2011) 45002 0 10\n", + "19 Steps without KCs (F2011) 45002 0 31\n", + "20 KC List (F2011) 45002 0 170\n", + "21 KCs (Single-KC) 45002 0 1\n", + "22 Steps without KCs (Single-KC) 45002 0 1\n", + "23 KC List (Single-KC) 45002 0 1\n", + "24 KCs (Unique-step) 45002 0 32\n", + "25 Steps without KCs (Unique-step) 45002 0 16\n", + "26 KC List (Unique-step) 45002 0 1470" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def work_col_analysis(df_work):\n", + " num_nonull_toal = df_work.notnull().sum() # Not Null\n", + " dict_col_1 = {'col_name':num_nonull_toal.index,'num_nonull':num_nonull_toal.values}\n", + " df_work_col_1 = pd.DataFrame(dict_col_1)\n", + "\n", + " num_null_toal = df_work.isnull().sum() # Null\n", + " dict_col_2 = {'col_name':num_null_toal.index,'num_null':num_null_toal.values}\n", + " df_work_col_2 = pd.DataFrame(dict_col_2)\n", + "\n", + " num_unique_toal = df_work.apply(lambda col: len(col.unique())) # axis=0\n", + " print(type(num_unique_toal))\n", + " dict_col_3 = {'col_name':num_unique_toal.index,'num_unique':num_unique_toal.values}\n", + " df_work_col_3 = pd.DataFrame(dict_col_3)\n", + "\n", + " df_work_col = pd.merge(df_work_col_1, df_work_col_2, on=['col_name'])\n", + " df_work_col = 
pd.merge(df_work_col, df_work_col_3, on=['col_name'])\n", + " return df_work_col\n", + "print(\"-------------------num_unique_toal and num_nonull_toal----------------------\")\n", + "df_result = work_col_analysis(df_problem)\n", + "df_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## (2)Analysis for Discrete value of column attributes\n", + "> Columns with a small number of discrete values may be very informative, so identify these columns first and analyze them one by one" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Steps Missing Start Times : [0, 1, 2, 5, 7, 6, 3, 8]\n", + "--------------------------------------------------------------------------------\n", + "KCs (F2011) : [5, 1, 4, 2, 3, 9, 0, 8, 6, 7]\n", + "--------------------------------------------------------------------------------\n", + "Steps without KCs (Unique-step) : [21, 0, 17, 15, 9, 2, 5, 1, 4, 3, 12, 10, 8, 11, 14, 29]\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "discrete_cols = []\n", + "series = []\n", + "cols = list(df_problem.columns.values)\n", + "\n", + "for col in cols:\n", + " if len(df_problem[col].unique().tolist()) <= 20 and len(df_problem[col].unique().tolist()) >= 2:\n", + " discrete_cols.append(col)\n", + " series.append(df_problem[col].unique().tolist())\n", + "\n", + "for a,b in zip(discrete_cols,series):\n", + " print(a,\" : \",b)\n", + " print(\"-\"*80)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## (3)Data Cleaning \n", + "> **Data Cleaning Suggestions**\n", + "> - Redundant columns: columns that are entirely NULL or contain a single value.\n", + "> - Others" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df_problem_clear = df_problem.copy(deep=True) # deep copy" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "the cols num before clear: 27\n", + "the cols num after clear: 22\n", + "drop:--- Sample\n", + "drop:--- Condition\n", + "drop:--- KCs (Single-KC)\n", + "drop:--- Steps without KCs (Single-KC)\n", + "drop:--- KC List (Single-KC)\n" + ] + } + ], + "source": [ + "# Clear all redundant columns directly.\n", + "cols = list(df_problem.columns.values)\n", + "drop_cols = []\n", + "for col in cols:\n", + " if len(df_problem_clear[col].unique().tolist()) == 1:\n", + " df_problem_clear.drop(col,axis =1,inplace=True)\n", + " drop_cols.append(col)\n", + "\n", + "print(\"the cols num before clear: \",len(df_problem.columns.to_list()))\n", + "print(\"the cols num after clear:\",len(df_problem_clear.columns.to_list()))\n", + "for col in drop_cols:\n", + " print(\"drop:---\",col)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RowAnon Student IdProblem HierarchyProblem NameProblem ViewProblem Start TimeProblem End TimeLatency (sec)Steps Missing Start TimesHintsIncorrectsCorrectsAvg CorrectsStepsAvg Assistance ScoreCorrect First AttemptsKCs (F2011)Steps without KCs (F2011)KC List (F2011)KCs (Unique-step)Steps without KCs (Unique-step)KC List (Unique-step)
01Stu_00b2b35fd027e7891e8a1a527125dd65sequence Statics, unit Concentrated Forces and Their Effects, module Introduction to Free Body Diagrams_m2_assess12011/9/21 17:352011/9/21 17:350009120.571210.4291250gravitational_forces, identify_interaction, represent_interaction_cord, represent_interaction_spring, simple_step021.
12Stu_00b2b35fd027e7891e8a1a527125dd65sequence Statics, unit Concentrated Forces and Their Effects, module Effects of Forcetutor_03_0112011/9/21 17:492011/9/21 17:49900031.00030.000310distinguish_rotation_translation30KC523, KC680, KC768
\n", + "
" + ], + "text/plain": [ + " Row Anon Student Id \\\n", + "0 1 Stu_00b2b35fd027e7891e8a1a527125dd65 \n", + "1 2 Stu_00b2b35fd027e7891e8a1a527125dd65 \n", + "\n", + " Problem Hierarchy \\\n", + "0 sequence Statics, unit Concentrated Forces and Their Effects, module Introduction to Free Body Diagrams \n", + "1 sequence Statics, unit Concentrated Forces and Their Effects, module Effects of Force \n", + "\n", + " Problem Name Problem View Problem Start Time Problem End Time \\\n", + "0 _m2_assess 1 2011/9/21 17:35 2011/9/21 17:35 \n", + "1 tutor_03_01 1 2011/9/21 17:49 2011/9/21 17:49 \n", + "\n", + " Latency (sec) Steps Missing Start Times Hints Incorrects Corrects \\\n", + "0 0 0 0 9 12 \n", + "1 9 0 0 0 3 \n", + "\n", + " Avg Corrects Steps Avg Assistance Score Correct First Attempts \\\n", + "0 0.571 21 0.429 12 \n", + "1 1.000 3 0.000 3 \n", + "\n", + " KCs (F2011) Steps without KCs (F2011) \\\n", + "0 5 0 \n", + "1 1 0 \n", + "\n", + " KC List (F2011) \\\n", + "0 gravitational_forces, identify_interaction, represent_interaction_cord, represent_interaction_spring, simple_step \n", + "1 distinguish_rotation_translation \n", + "\n", + " KCs (Unique-step) Steps without KCs (Unique-step) KC List (Unique-step) \n", + "0 0 21 . \n", + "1 3 0 KC523, KC680, KC768 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_problem_clear.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------------num_unique_toal and num_nonull_toal----------------------\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col_namenum_nonullnum_nullnum_unique
0Row45002045002
1Anon Student Id450020333
2Problem Hierarchy45002027
3Problem Name450020300
4Problem View45002032
5Problem Start Time45002025983
6Problem End Time45002025884
7Latency (sec)4500201290
8Steps Missing Start Times4500208
9Hints45002035
10Incorrects45002037
11Corrects45002051
12Avg Corrects450020195
13Steps45002031
14Avg Assistance Score450020335
15Correct First Attempts45002033
16KCs (F2011)45002010
17Steps without KCs (F2011)45002031
18KC List (F2011)450020170
19KCs (Unique-step)45002032
20Steps without KCs (Unique-step)45002016
21KC List (Unique-step)4500201470
\n", + "
" + ], + "text/plain": [ + " col_name num_nonull num_null num_unique\n", + "0 Row 45002 0 45002\n", + "1 Anon Student Id 45002 0 333\n", + "2 Problem Hierarchy 45002 0 27\n", + "3 Problem Name 45002 0 300\n", + "4 Problem View 45002 0 32\n", + "5 Problem Start Time 45002 0 25983\n", + "6 Problem End Time 45002 0 25884\n", + "7 Latency (sec) 45002 0 1290\n", + "8 Steps Missing Start Times 45002 0 8\n", + "9 Hints 45002 0 35\n", + "10 Incorrects 45002 0 37\n", + "11 Corrects 45002 0 51\n", + "12 Avg Corrects 45002 0 195\n", + "13 Steps 45002 0 31\n", + "14 Avg Assistance Score 45002 0 335\n", + "15 Correct First Attempts 45002 0 33\n", + "16 KCs (F2011) 45002 0 10\n", + "17 Steps without KCs (F2011) 45002 0 31\n", + "18 KC List (F2011) 45002 0 170\n", + "19 KCs (Unique-step) 45002 0 32\n", + "20 Steps without KCs (Unique-step) 45002 0 16\n", + "21 KC List (Unique-step) 45002 0 1470" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# the remaining columns\n", + "print(\"-------------------num_unique_toal and num_nonull_toal----------------------\")\n", + "df_result = work_col_analysis(df_problem_clear)\n", + "df_result " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. 
Data Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.express as px\n", + "from plotly.subplots import make_subplots\n", + "import plotly.graph_objs as go\n", + "import matplotlib.pyplot as plt " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# The distribution of continuous values\n", + "def show_value_counts_histogram(colname, sort = True):\n", + " # create the bins\n", + " start = int(df_problem_clear[colname].min()/10)*10\n", + " end = int(df_problem_clear[colname].quantile(q=0.95)/10+1)*10\n", + " problem = int((end - start)/20)\n", + " print(start, end, problem)\n", + " counts, bins = np.histogram(df_problem_clear[colname],bins=range(start, end, problem))\n", + " bins = 0.5 * (bins[:-1] + bins[1:])\n", + "\n", + " fig = px.bar(x=bins, y=counts, labels={'x': colname, 'y':'count'})\n", + " fig.show(\"svg\")\n", + "\n", + "# Box distribution of continuous values\n", + "def show_value_counts_box(colname, sort = True):\n", + " # way1: plotly (too costly for box-plot)\n", + " # fig = px.box(df_problem_clear, y=colname)\n", + " # fig.show(\"svg\")\n", + " # way2: matplotlib\n", + " plt.figure(figsize=(10,5))\n", + " plt.title('Box-plot for '+ colname,fontsize=20)# title, with the font size set\n", + " plt.boxplot([df_problem_clear[colname].tolist()])\n", + " plt.show(\"svg\")\n", + "\n", + "# Histogram of discrete values\n", + "def show_value_counts_bar(colname, sort = True):\n", + " ds = df_problem_clear[colname].value_counts().reset_index()\n", + " ds.columns = [\n", + " colname,\n", + " 'Count'\n", + " ]\n", + " if sort:\n", + " ds = ds.sort_values(by='Count', ascending=False)\n", + " # histogram\n", + " fig = px.bar(\n", + " ds,\n", + " x = colname,\n", + " y = 'Count',\n", + " title = colname + ' distribution'\n", + " )\n", + " fig.show(\"svg\")\n", + "\n", + "\n", + "# Pie of discrete values\n", + "def 
show_value_counts_pie(colname, sort = True):\n", + " ds = df_problem_clear[colname].value_counts().reset_index()\n", + " ds.columns = [\n", + " colname,\n", + " 'percent'\n", + " ]\n", + " ds['percent'] /= len(df_problem_clear)\n", + " if sort:\n", + " ds = ds.sort_values(by='percent', ascending=False)\n", + " fig = px.pie(\n", + " ds,\n", + " names = colname,\n", + " values = 'percent',\n", + " title = colname+ 'Percentage',\n", + " )\n", + " fig.show(\"svg\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## (1)sort by single attributes" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "0246805k10k15k20kKCs (F2011) distributionKCs (F2011)Count" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/svg+xml": [ + "sequence Statics, unit Engineering Systems - Single Body Equilibrium, module Equilibrium of a Single Subsystemsequence Statics, unit Complex Interactions Between Bodies, module Statically Equivalent Loadssequence Statics, unit Friction, module Frictionsequence Statics, unit Engineering Systems - Single Body Equilibrium, module Choosing a Solvable Subsystemsequence Statics, unit Concentrated Forces and Their Effects, module Representing Interactions Between Bodiessequence Statics, unit Complex Interactions Between Bodies, module Applications of Static Equivalency to Distributed Forces, section1 Simplifying 3D loadings to 2D or 1D loadingsequence Statics, unit Complex Interactions Between Bodies, module Couplessequence Statics, unit Multiple Body Equilibrium - Frames, module Solving Multiple Subsystemssequence Statics, unit Multiple Body Equilibrium - Trusses, module Method of Jointssequence Statics, unit Concentrated Forces and Their Effects, module Effects of Forcesequence Statics, unit Concentrated Forces and Their Effects, module Equilibrium Under 2D Concentrated Forces, section1 Applying Force and Moment 
Equilibriumsequence Statics, unit Complex Interactions Between Bodies, module Representing Engineering Connections, section1 Pin Connectionssequence Statics, unit Concentrated Forces and Their Effects, module Effects of Multiple Forces, section1 Combining Momentssequence Statics, unit Concentrated Forces and Their Effects, module Effects of Multiple Forces, section1 Combining Concurrent Forcessequence Statics, unit Complex Interactions Between Bodies, module Applications of Static Equivalency to Distributed Forces, section1 Center of Gravity and Centroidsequence Statics, unit Concentrated Forces and Their Effects, module Equilibrium Under 2D Concentrated Forces, section1 Applying Force Equilibriumsequence Statics, unit Multiple Body Equilibrium - Trusses, module Method of Sectionssequence Statics, unit Multiple Body Equilibrium - Frames, module Drawing FBDs of Multiple Subsystemssequence Statics, unit Moments of Inertia, module Second Moment of Areasequence Statics, unit Moments of Inertia, module Mass Moment of Inertiasequence Statics, unit Complex Interactions Between Bodies, module Representing Engineering Connectionssequence Statics, unit Complex Interactions Between Bodies, module Representing Engineering Connections, section1 Other Connectionssequence Statics, unit Concentrated Forces and Their Effects, module Introduction to Free Body Diagramssequence Statics, unit Concentrated Forces and Their Effects, module Equilibrium Under 2D Concentrated Forcessequence Statics, unit Concentrated Forces and Their Effects, module Effects of Multiple Forcessequence Statics, unit Engineering Systems - Single Body Equilibrium, module Drawing FBDs of a Single Subsystemsequence Statics, unit Complex Interactions Between Bodies, module Representing Engineering Connections, section1 Fixed Connections020004000Problem Hierarchy distributionProblem HierarchyCount" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Bar\n", + 
"show_value_counts_bar('KCs (F2011)')\n", + "show_value_counts_bar('Problem Hierarchy')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 340 17\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "05010015020025030005k10k15k20kLatency (sec)count" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# analysis for \"duration\" \n", + "# It is obvious that there are unreasonable outliers\n", + "\n", + "show_value_counts_box('Latency (sec)') \n", + "show_value_counts_histogram('Latency (sec)')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## (2)group by Problem Name, sorted by meam(avg corrects) " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "tutor_21_4_2tutor_05_11tutor_11_15tutor_09_13tutor_08_18tutor_22_02tutor_09_09tutor_11_25tutor_15_07tutor_08_02tutor_6_03c_m1_tutor9tutor_11_07tutor_17_06tutor_13_02tutor_17_08tutor_04_09tutor_6_10tutor_11_02tutor_03_13tutor_09_08tutor_7_14tutor_15_02tutor_07_18tutor_03_15tutor_08_01tutor_08_08tutor_08_06tutor_08_07tutor_22_11tutor_07_25_m6_assess_m8_assesstutor_22_20tutor_22_16tutor_20_3_4_m20_assesstutor_08_2400.511.5Questions sorted by the average accuracyProblem NameAvg Corrects" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Classification Statistic\n", + "\n", + "# Problem Name,Avg Corrects, Avg Assistance Score\n", + "df_problem_group1 = df_problem_clear.groupby(['Problem Name'])['Avg Corrects'].mean().reset_index()\n", + "df_problem_group1.columns = [\"Problem Name\",\"Avg Corrects\"]\n", + "df_problem_group1 = df_problem_group1.sort_values(by='Avg Corrects', ascending=False)\n", + "fig = px.bar(df_problem_group1, x=\"Problem Name\", y=\"Avg Corrects\", title=\"Questions sorted by the average accuracy\")\n", + "fig.show(\"svg\")" + ] + } + ], + "metadata": { + "celltoolbar": "原始单元格格式", + "kernelspec": { + 
"display_name": "Data", + "language": "python", + "name": "data" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/OLI_Fall2011/OLI_2011F_step.ipynb b/docs/OLI_Fall2011/OLI_2011F_step.ipynb new file mode 100644 index 0000000..4331a9f --- /dev/null +++ b/docs/OLI_Fall2011/OLI_2011F_step.ipynb @@ -0,0 +1,1657 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# OLI data in fall, 2011(step)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import pandas as pd\n", + "import numpy as np\n", + "# global configuration: show every rows and cols\n", + "pd.set_option('display.max_rows', None)\n", + "pd.set_option('max_colwidth',None)\n", + "pd.set_option('display.max_columns', None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Data Description\n", + "## 1.1 Column Description" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FieldAnnotation
0RowA row counter.
1SampleThe sample that includes this step. If you select more than one sample to export, steps that occur in more than one sample will be duplicated in the export.
2Anon Student IDThe student that performed the step.
3Problem HierarchyThe location in the curriculum hierarchy where this step occurs.
4Problem NameThe name of the problem in which the step occurs.
5Problem ViewThe number of times the student encountered the problem so far. This counter increases with each instance of the same problem. Note that problem view increases regardless of whether or not the step was encountered in previous problem views. For example, a step can have a \"Problem View\" of \"3\", indicating the problem was viewed three times by this student, but that same step need not have been encountered by that student in all instances of the problem. If this number does not increase as you expect it to, it might be that DataShop has identified similar problems as distinct: two problems with the same \"Problem Name\" are considered different \"problems\" by DataShop if the following logged values are not identical: problem name, context, tutor_flag (whether or not the problem or activity is tutored) and \"other\" field. For more on the logging of these fields, see the description of the \"problem\" element in the Guide to the Tutor Message Format. For more detail on how problem view is determined, see Determining Problem View.
6Step NameFormed by concatenating the \"selection\" and \"action\". Also see the glossary entry for \"step\".
7Step Start TimeThe step start time is determined one of three ways: If it's the first step of the problem, the step start time is the same as the problem start time If it's a subsequent step, then the step start time is the time of the preceding transaction, if that transaction is within 10 minutes. If it's a subsequent step and the elapsed time between the previous transaction and the first transaction of this step is more than 10 minutes, then the step start time is set to null as it's considered an unreliable value. For a visual example, see the Examples page.
8First Transaction TimeThe time of the first transaction toward the step.
9Correct Transaction TimeThe time of the correct attempt toward the step, if there was one.
10Step End TimeThe time of the last transaction toward the step.
11Step Duration (sec)The elapsed time of the step in seconds, calculated by adding all of the durations for transactions that were attributed to the step. See the glossary entry for more detail. This column was previously labeled \"Assistance Time\". It differs from \"Assistance Time\" in that its values are derived by summing transaction durations, not finding the difference between only two points in time (step start time and the last correct attempt).
12Correct Step Duration (sec)The step duration if the first attempt for the step was correct. This might also be described as \"reaction time\" since it's the duration of time from the previous transaction or problem start event to the correct attempt. See the glossary entry for more detail. This column was previously labeled \"Correct Step Time (sec)\".
13Error Step Duration (sec)The step duration if the first attempt for the step was an error (incorrect attempt or hint request).
14First AttemptThe tutor's response to the student's first attempt on the step. Example values are \"hint\", \"correct\", and \"incorrect\".
15IncorrectsTotal number of incorrect attempts by the student on the step.
16HintsTotal number of hints requested by the student for the step.
17CorrectsTotal correct attempts by the student for the step. (Only increases if the step is encountered more than once.)
18ConditionThe name and type of the condition the student is assigned to. In the case of a student assigned to multiple conditions (factors in a factorial design), condition names are separated by a comma and space. This differs from the transaction format, which optionally has \"Condition Name\" and \"Condition Type\" columns.
19KC (model_name)(Only shown when the \"Knowledge Components\" option is selected.) Knowledge component(s) associated with the correct performance of this step. In the case of multiple KCs assigned to a single step, KC names are separated by two tildes (\"~~\").
20Opportunity (model_name)(Only shown when the \"Knowledge Components\" option is selected.) An opportunity is the first chance on a step for a student to demonstrate whether he or she has learned the associated knowledge component. Opportunity number is therefore a count that increases by one each time the student encounters a step with the listed knowledge component. In the case of multiple KCs assigned to a single step, opportunity number values are separated by two tildes (\"~~\") and are given in the same order as the KC names. Check here to see how opportunity count is computed when Event Type column is present in transaction data.
21Predicted Error Rate (model_name)A hypothetical error rate based on the Additive Factor Model (AFM) algorithm. A value of \"1\" is a prediction that a student's first attempt will be an error (incorrect attempt or hint request); a value of \"0\" is a prediction that the student's first attempt will be correct. For specifics, see below \"Predicted Error Rate\" and how it's calculated. In the case of multiple KCs assigned to a single step, Datashop implements a compensatory sum across all of the KCs, thus a single value of predicted error rate is provided (i.e., the same predicted error rate for each KC assigned to a step). For more detail on Datashop's implementation for multi-skilled step, see Model Values page.
\n", + "
" + ], + "text/plain": [ + " Field \\\n", + "0 Row \n", + "1 Sample \n", + "2 Anon Student ID \n", + "3 Problem Hierarchy \n", + "4 Problem Name \n", + "5 Problem View \n", + "6 Step Name \n", + "7 Step Start Time \n", + "8 First Transaction Time \n", + "9 Correct Transaction Time \n", + "10 Step End Time \n", + "11 Step Duration (sec) \n", + "12 Correct Step Duration (sec) \n", + "13 Error Step Duration (sec) \n", + "14 First Attempt \n", + "15 Incorrects \n", + "16 Hints \n", + "17 Corrects \n", + "18 Condition \n", + "19 KC (model_name) \n", + "20 Opportunity (model_name) \n", + "21 Predicted Error Rate (model_name) \n", + "\n", + " Annotation \n", + "0 A row counter. \n", + "1 The sample that includes this step. If you select more than one sample to export, steps that occur in more than one sample will be duplicated in the export. \n", + "2 The student that performed the step. \n", + "3 The location in the curriculum hierarchy where this step occurs. \n", + "4 The name of the problem in which the step occurs. \n", + "5 The number of times the student encountered the problem so far. This counter increases with each instance of the same problem. Note that problem view increases regardless of whether or not the step was encountered in previous problem views. For example, a step can have a \"Problem View\" of \"3\", indicating the problem was viewed three times by this student, but that same step need not have been encountered by that student in all instances of the problem. If this number does not increase as you expect it to, it might be that DataShop has identified similar problems as distinct: two problems with the same \"Problem Name\" are considered different \"problems\" by DataShop if the following logged values are not identical: problem name, context, tutor_flag (whether or not the problem or activity is tutored) and \"other\" field. 
For more on the logging of these fields, see the description of the \"problem\" element in the Guide to the Tutor Message Format. For more detail on how problem view is determined, see Determining Problem View. \n", + "6 Formed by concatenating the \"selection\" and \"action\". Also see the glossary entry for \"step\". \n", + "7 The step start time is determined one of three ways: If it's the first step of the problem, the step start time is the same as the problem start time If it's a subsequent step, then the step start time is the time of the preceding transaction, if that transaction is within 10 minutes. If it's a subsequent step and the elapsed time between the previous transaction and the first transaction of this step is more than 10 minutes, then the step start time is set to null as it's considered an unreliable value. For a visual example, see the Examples page. \n", + "8 The time of the first transaction toward the step. \n", + "9 The time of the correct attempt toward the step, if there was one. \n", + "10 The time of the last transaction toward the step. \n", + "11 The elapsed time of the step in seconds, calculated by adding all of the durations for transactions that were attributed to the step. See the glossary entry for more detail. This column was previously labeled \"Assistance Time\". It differs from \"Assistance Time\" in that its values are derived by summing transaction durations, not finding the difference between only two points in time (step start time and the last correct attempt). \n", + "12 The step duration if the first attempt for the step was correct. This might also be described as \"reaction time\" since it's the duration of time from the previous transaction or problem start event to the correct attempt. See the glossary entry for more detail. This column was previously labeled \"Correct Step Time (sec)\". \n", + "13 The step duration if the first attempt for the step was an error (incorrect attempt or hint request). 
\n", + "14 The tutor's response to the student's first attempt on the step. Example values are \"hint\", \"correct\", and \"incorrect\". \n", + "15 Total number of incorrect attempts by the student on the step. \n", + "16 Total number of hints requested by the student for the step. \n", + "17 Total correct attempts by the student for the step. (Only increases if the step is encountered more than once.) \n", + "18 The name and type of the condition the student is assigned to. In the case of a student assigned to multiple conditions (factors in a factorial design), condition names are separated by a comma and space. This differs from the transaction format, which optionally has \"Condition Name\" and \"Condition Type\" columns. \n", + "19 (Only shown when the \"Knowledge Components\" option is selected.) Knowledge component(s) associated with the correct performance of this step. In the case of multiple KCs assigned to a single step, KC names are separated by two tildes (\"~~\"). \n", + "20 (Only shown when the \"Knowledge Components\" option is selected.) An opportunity is the first chance on a step for a student to demonstrate whether he or she has learned the associated knowledge component. Opportunity number is therefore a count that increases by one each time the student encounters a step with the listed knowledge component. In the case of multiple KCs assigned to a single step, opportunity number values are separated by two tildes (\"~~\") and are given in the same order as the KC names. Check here to see how opportunity count is computed when Event Type column is present in transaction data. \n", + "21 A hypothetical error rate based on the Additive Factor Model (AFM) algorithm. A value of \"1\" is a prediction that a student's first attempt will be an error (incorrect attempt or hint request); a value of \"0\" is a prediction that the student's first attempt will be correct. For specifics, see below \"Predicted Error Rate\" and how it's calculated. 
In the case of multiple KCs assigned to a single step, Datashop implements a compensatory sum across all of the KCs, thus a single value of predicted error rate is provided (i.e., the same predicted error rate for each KC assigned to a step). For more detail on Datashop's implementation for multi-skilled step, see Model Values page. " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# help_table2: the description for data by steps\n", + "df2 = pd.read_csv('OLI_data/help_table2.csv',sep=',',encoding=\"gbk\")\n", + "df2 = df2.loc[:, ['Field', 'Annotation']]\n", + "df2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.2 Summarization of Data\n", + "\n", + "**This table organizes the data as student-problem-step**" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RowSampleAnon Student IdProblem HierarchyProblem NameProblem ViewStep NameStep Start TimeFirst Transaction TimeCorrect Transaction TimeStep End TimeStep Duration (sec)Correct Step Duration (sec)Error Step Duration (sec)First AttemptIncorrectsHintsCorrectsConditionKC (F2011)Opportunity (F2011)Predicted Error Rate (F2011)KC (Single-KC)Opportunity (Single-KC)Predicted Error Rate (Single-KC)KC (Unique-step)Opportunity (Unique-step)Predicted Error Rate (Unique-step)
01All DataStu_00b2b35fd027e7891e8a1a527125dd65sequence Statics, unit Concentrated Forces and Their Effects, module Introduction to Free Body Diagrams_m2_assess1q1_point1i1 UpdateComboBox2011/9/21 17:352011/9/21 17:352011/9/21 17:352011/9/21 17:3523.1323.13.correct001.identify_interaction10.3991Single-KC10.4373NaNNaNNaN
12All DataStu_00b2b35fd027e7891e8a1a527125dd65sequence Statics, unit Concentrated Forces and Their Effects, module Introduction to Free Body Diagrams_m2_assess1q1_point3i3 UpdateComboBox2011/9/21 17:352011/9/21 17:352011/9/21 17:352011/9/21 17:3523.1323.13.correct001.gravitational_forces10.1665Single-KC20.4373NaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " Row Sample Anon Student Id \\\n", + "0 1 All Data Stu_00b2b35fd027e7891e8a1a527125dd65 \n", + "1 2 All Data Stu_00b2b35fd027e7891e8a1a527125dd65 \n", + "\n", + " Problem Hierarchy \\\n", + "0 sequence Statics, unit Concentrated Forces and Their Effects, module Introduction to Free Body Diagrams \n", + "1 sequence Statics, unit Concentrated Forces and Their Effects, module Introduction to Free Body Diagrams \n", + "\n", + " Problem Name Problem View Step Name Step Start Time \\\n", + "0 _m2_assess 1 q1_point1i1 UpdateComboBox 2011/9/21 17:35 \n", + "1 _m2_assess 1 q1_point3i3 UpdateComboBox 2011/9/21 17:35 \n", + "\n", + " First Transaction Time Correct Transaction Time Step End Time \\\n", + "0 2011/9/21 17:35 2011/9/21 17:35 2011/9/21 17:35 \n", + "1 2011/9/21 17:35 2011/9/21 17:35 2011/9/21 17:35 \n", + "\n", + " Step Duration (sec) Correct Step Duration (sec) Error Step Duration (sec) \\\n", + "0 23.13 23.13 . \n", + "1 23.13 23.13 . \n", + "\n", + " First Attempt Incorrects Hints Corrects Condition KC (F2011) \\\n", + "0 correct 0 0 1 . identify_interaction \n", + "1 correct 0 0 1 . gravitational_forces \n", + "\n", + " Opportunity (F2011) Predicted Error Rate (F2011) KC (Single-KC) \\\n", + "0 1 0.3991 Single-KC \n", + "1 1 0.1665 Single-KC \n", + "\n", + " Opportunity (Single-KC) Predicted Error Rate (Single-KC) KC (Unique-step) \\\n", + "0 1 0.4373 NaN \n", + "1 2 0.4373 NaN \n", + "\n", + " Opportunity (Unique-step) Predicted Error Rate (Unique-step) \n", + "0 NaN NaN \n", + "1 NaN NaN " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_step = pd.read_csv('OLI_data/AllData_student_step_2011F.csv',low_memory=False) # sep=\"\\t\"\n", + "df_step.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. 
Data Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RowProblem ViewIncorrectsHintsCorrectsPredicted Error Rate (F2011)Opportunity (Single-KC)Predicted Error Rate (Single-KC)Opportunity (Unique-step)Predicted Error Rate (Unique-step)
count194947.000000194947.000000194947.000000194947.000000194947.000000113992.000000194947.000000194947.000000193043.0000000.0
mean97474.0000001.1331540.3796110.1431720.9640720.237508419.7510660.2522331.035971NaN
std56276.4958010.7605151.3737970.8525200.4803460.158128288.3658620.0864060.384182NaN
min1.0000001.0000000.0000000.0000000.0000000.0029001.0000000.0386001.000000NaN
25%48737.5000001.0000000.0000000.0000001.0000000.117900171.0000000.1881001.000000NaN
50%97474.0000001.0000000.0000000.0000001.0000000.201400382.0000000.2405001.000000NaN
75%146210.5000001.0000000.0000000.0000001.0000000.319500635.0000000.2947001.000000NaN
max194947.00000032.000000413.00000043.00000086.0000000.9693001410.0000000.77360024.000000NaN
\n", + "
" + ], + "text/plain": [ + " Row Problem View Incorrects Hints \\\n", + "count 194947.000000 194947.000000 194947.000000 194947.000000 \n", + "mean 97474.000000 1.133154 0.379611 0.143172 \n", + "std 56276.495801 0.760515 1.373797 0.852520 \n", + "min 1.000000 1.000000 0.000000 0.000000 \n", + "25% 48737.500000 1.000000 0.000000 0.000000 \n", + "50% 97474.000000 1.000000 0.000000 0.000000 \n", + "75% 146210.500000 1.000000 0.000000 0.000000 \n", + "max 194947.000000 32.000000 413.000000 43.000000 \n", + "\n", + " Corrects Predicted Error Rate (F2011) Opportunity (Single-KC) \\\n", + "count 194947.000000 113992.000000 194947.000000 \n", + "mean 0.964072 0.237508 419.751066 \n", + "std 0.480346 0.158128 288.365862 \n", + "min 0.000000 0.002900 1.000000 \n", + "25% 1.000000 0.117900 171.000000 \n", + "50% 1.000000 0.201400 382.000000 \n", + "75% 1.000000 0.319500 635.000000 \n", + "max 86.000000 0.969300 1410.000000 \n", + "\n", + " Predicted Error Rate (Single-KC) Opportunity (Unique-step) \\\n", + "count 194947.000000 193043.000000 \n", + "mean 0.252233 1.035971 \n", + "std 0.086406 0.384182 \n", + "min 0.038600 1.000000 \n", + "25% 0.188100 1.000000 \n", + "50% 0.240500 1.000000 \n", + "75% 0.294700 1.000000 \n", + "max 0.773600 24.000000 \n", + "\n", + " Predicted Error Rate (Unique-step) \n", + "count 0.0 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_step.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "num_total: 194947\n", + "num_students: 333\n", + "num_problems: 300\n", + "num_kcs: 98\n", + "num_null_condition: 0\n", + "\n", + " ****************************** \n", + "\n", + "74004 27911 187943\n", + "0.6483968011923079\n" + ] + } + ], + "source": [ + 
"num_total = len(df_step)\n", + "num_students = len(df_step['Anon Student Id'].unique())\n", + "num_problems = len(df_step['Problem Name'].unique())\n", + "num_kcs = len(df_step['KC (F2011)'].unique())\n", + "num_null_condition = df_step['Condition'].isnull().sum() # 空值可不要\n", + "print(\"num_total:\",num_total)\n", + "print(\"num_students:\",num_students)\n", + "print(\"num_problems:\",num_problems)\n", + "print(\"num_kcs:\",num_kcs)\n", + "print(\"num_null_condition:\",num_null_condition)\n", + "\n", + "n_incorrects = df_step['Incorrects'].sum()\n", + "n_hints = df_step['Hints'].sum()\n", + "n_corrects = df_step['Corrects'].sum()\n", + "print(\"\\n\",\"*\"*30,\"\\n\")\n", + "print(n_incorrects,n_hints,n_corrects)\n", + "print(n_corrects / (n_incorrects + n_hints + n_corrects))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## (1)Analysis for Null and Unique value of column attributes" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------------num_unique_toal and num_nonull_toal----------------------\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col_namenum_nonullnum_nullnum_unique
0Row1949470194947
1Sample19494701
2Anon Student Id1949470333
3Problem Hierarchy194947027
4Problem Name1949470300
5Problem View194947032
6Step Name1949470382
7Step Start Time19463231533098
8First Transaction Time194947034578
9Correct Transaction Time1821321281533501
10Step End Time194947034351
11Step Duration (sec)19494702521
12Correct Step Duration (sec)19494702187
13Error Step Duration (sec)19494702105
14First Attempt19494703
15Incorrects194947032
16Hints194947030
17Corrects194947017
18Condition19494701
19KC (F2011)1139928095598
20Opportunity (F2011)113992809551206
21Predicted Error Rate (F2011)113992809557623
22KC (Single-KC)19494701
23Opportunity (Single-KC)19494701410
24Predicted Error Rate (Single-KC)1949470317
25KC (Unique-step)19304319041179
26Opportunity (Unique-step)193043190425
27Predicted Error Rate (Unique-step)01949471
\n", + "
" + ], + "text/plain": [ + " col_name num_nonull num_null num_unique\n", + "0 Row 194947 0 194947\n", + "1 Sample 194947 0 1\n", + "2 Anon Student Id 194947 0 333\n", + "3 Problem Hierarchy 194947 0 27\n", + "4 Problem Name 194947 0 300\n", + "5 Problem View 194947 0 32\n", + "6 Step Name 194947 0 382\n", + "7 Step Start Time 194632 315 33098\n", + "8 First Transaction Time 194947 0 34578\n", + "9 Correct Transaction Time 182132 12815 33501\n", + "10 Step End Time 194947 0 34351\n", + "11 Step Duration (sec) 194947 0 2521\n", + "12 Correct Step Duration (sec) 194947 0 2187\n", + "13 Error Step Duration (sec) 194947 0 2105\n", + "14 First Attempt 194947 0 3\n", + "15 Incorrects 194947 0 32\n", + "16 Hints 194947 0 30\n", + "17 Corrects 194947 0 17\n", + "18 Condition 194947 0 1\n", + "19 KC (F2011) 113992 80955 98\n", + "20 Opportunity (F2011) 113992 80955 1206\n", + "21 Predicted Error Rate (F2011) 113992 80955 7623\n", + "22 KC (Single-KC) 194947 0 1\n", + "23 Opportunity (Single-KC) 194947 0 1410\n", + "24 Predicted Error Rate (Single-KC) 194947 0 317\n", + "25 KC (Unique-step) 193043 1904 1179\n", + "26 Opportunity (Unique-step) 193043 1904 25\n", + "27 Predicted Error Rate (Unique-step) 0 194947 1" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def work_col_analysis(df_work):\n", + " num_nonull_toal = df_work.notnull().sum() # Not Null\n", + " dict_col_1 = {'col_name':num_nonull_toal.index,'num_nonull':num_nonull_toal.values}\n", + " df_work_col_1 = pd.DataFrame(dict_col_1)\n", + "\n", + " num_null_toal = df_work.isnull().sum() # Null\n", + " dict_col_2 = {'col_name':num_null_toal.index,'num_null':num_null_toal.values}\n", + " df_work_col_2 = pd.DataFrame(dict_col_2)\n", + "\n", + " num_unique_toal = df_work.apply(lambda col: len(col.unique())) # axis=0\n", + " print(type(num_unique_toal))\n", + " dict_col_3 = {'col_name':num_unique_toal.index,'num_unique':num_unique_toal.values}\n", + " 
df_work_col_3 = pd.DataFrame(dict_col_3)\n", + "\n", + " # df_work_col = pd.concat([df_work_col_1, df_work_col_2], axis=1)\n", + " df_work_col = pd.merge(df_work_col_1, df_work_col_2, on=['col_name'])\n", + " df_work_col = pd.merge(df_work_col, df_work_col_3, on=['col_name'])\n", + " return df_work_col\n", + "print(\"-------------------num_unique_toal and num_nonull_toal----------------------\")\n", + "df_result = work_col_analysis(df_step)\n", + "df_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## (3)Data Cleaning \n", + "### Data Cleaning Suggestions\n", + "> - Redundant columns: Columns that are all NULL or Single value.\n", + "> - rows that KC (F2011) == null(Do not know the knowledge source)\n", + "> - rows that Step Start Time == null(This step is too short or more than 10mins, so the data is not reliable)\n", + "> - Others\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df_step_clear = df_step.copy(deep=True) # deep copy" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "the cols num before clear: 28\n", + "the cols num after clear: 24\n", + "drop:--- Sample\n", + "drop:--- Condition\n", + "drop:--- KC (Single-KC)\n", + "drop:--- Predicted Error Rate (Unique-step)\n" + ] + } + ], + "source": [ + "# 直接清除所有”冗余列“\n", + "cols = list(df_step.columns.values)\n", + "drop_cols = []\n", + "for col in cols:\n", + " if len(df_step_clear[col].unique().tolist()) == 1:\n", + " df_step_clear.drop(col,axis =1,inplace=True)\n", + " drop_cols.append(col)\n", + "\n", + "print(\"the cols num before clear: \",len(df_step.columns.to_list()))\n", + "print(\"the cols num after clear:\",len(df_step_clear.columns.to_list()))\n", + "for col in drop_cols:\n", + " print(\"drop:---\",col)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": 
[], + "source": [ + "# Others:'KC (F2011)','Step Start Time' with null value\n", + "df_step_clear.dropna(axis=0, how='any', subset=['KC (F2011)','Step Start Time'],inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------------num_unique_toal and num_nonull_toal----------------------\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col_namenum_nonullnum_nullnum_unique
0Row1138170113817
1Anon Student Id1138170331
2Problem Hierarchy113817026
3Problem Name1138170154
4Problem View113817032
5Step Name1138170240
6Step Start Time113817018856
7First Transaction Time113817019745
8Correct Transaction Time1034541036319146
9Step End Time113817019623
10Step Duration (sec)11381702382
11Correct Step Duration (sec)11381702093
12Error Step Duration (sec)11381701949
13First Attempt11381703
14Incorrects113817025
15Hints113817025
16Corrects113817015
17KC (F2011)113817097
18Opportunity (F2011)11381701205
19Predicted Error Rate (F2011)11381707622
20Opportunity (Single-KC)11381701164
21Predicted Error Rate (Single-KC)1138170315
22KC (Unique-step)112869948625
23Opportunity (Unique-step)11286994825
\n", + "
" + ], + "text/plain": [ + " col_name num_nonull num_null num_unique\n", + "0 Row 113817 0 113817\n", + "1 Anon Student Id 113817 0 331\n", + "2 Problem Hierarchy 113817 0 26\n", + "3 Problem Name 113817 0 154\n", + "4 Problem View 113817 0 32\n", + "5 Step Name 113817 0 240\n", + "6 Step Start Time 113817 0 18856\n", + "7 First Transaction Time 113817 0 19745\n", + "8 Correct Transaction Time 103454 10363 19146\n", + "9 Step End Time 113817 0 19623\n", + "10 Step Duration (sec) 113817 0 2382\n", + "11 Correct Step Duration (sec) 113817 0 2093\n", + "12 Error Step Duration (sec) 113817 0 1949\n", + "13 First Attempt 113817 0 3\n", + "14 Incorrects 113817 0 25\n", + "15 Hints 113817 0 25\n", + "16 Corrects 113817 0 15\n", + "17 KC (F2011) 113817 0 97\n", + "18 Opportunity (F2011) 113817 0 1205\n", + "19 Predicted Error Rate (F2011) 113817 0 7622\n", + "20 Opportunity (Single-KC) 113817 0 1164\n", + "21 Predicted Error Rate (Single-KC) 113817 0 315\n", + "22 KC (Unique-step) 112869 948 625\n", + "23 Opportunity (Unique-step) 112869 948 25" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# the remaining columns\n", + "print(\"-------------------num_unique_toal and num_nonull_toal----------------------\")\n", + "df_result = work_col_analysis(df_step_clear)\n", + "df_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Outlier Analysis\n", + "> -

The duration columns contain a non-numeric placeholder '.', which should be treated as 0

\n", + "> - In addition, box diagrams can be used to analyze whether some outliers need to be removed" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Row', 'Anon Student Id', 'Problem Hierarchy', 'Problem Name', 'Problem View', 'Step Name', 'Step Start Time', 'First Transaction Time', 'Correct Transaction Time', 'Step End Time', 'Step Duration (sec)', 'Correct Step Duration (sec)', 'Error Step Duration (sec)', 'First Attempt', 'Incorrects', 'Hints', 'Corrects', 'KC (F2011)', 'Opportunity (F2011)', 'Predicted Error Rate (F2011)', 'Opportunity (Single-KC)', 'Predicted Error Rate (Single-KC)', 'KC (Unique-step)', 'Opportunity (Unique-step)']\n", + "----------------------------------------------------------------------------------------------------\n", + "['Row', 'Problem View', 'Incorrects', 'Hints', 'Corrects', 'Predicted Error Rate (F2011)', 'Opportunity (Single-KC)', 'Predicted Error Rate (Single-KC)', 'Opportunity (Unique-step)']\n", + "----------------------------------------------------------------------------------------------------\n", + "Row int64\n", + "Anon Student Id object\n", + "Problem Hierarchy object\n", + "Problem Name object\n", + "Problem View int64\n", + "Step Name object\n", + "Step Start Time object\n", + "First Transaction Time object\n", + "Correct Transaction Time object\n", + "Step End Time object\n", + "Step Duration (sec) object\n", + "Correct Step Duration (sec) object\n", + "Error Step Duration (sec) object\n", + "First Attempt object\n", + "Incorrects int64\n", + "Hints int64\n", + "Corrects int64\n", + "KC (F2011) object\n", + "Opportunity (F2011) object\n", + "Predicted Error Rate (F2011) float64\n", + "Opportunity (Single-KC) int64\n", + "Predicted Error Rate (Single-KC) float64\n", + "KC (Unique-step) object\n", + "Opportunity (Unique-step) float64\n", + "dtype: object\n" + ] + } + ], + "source": [ + 
"print(df_step_clear.columns.tolist())\n", + "print(\"-\"*100)\n", + "print(df_step_clear.describe().columns.tolist()) #有许多object类无法统计分析\n", + "print(\"-\"*100)\n", + "print(df_step_clear.dtypes)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step Duration (sec) float64\n", + "Correct Step Duration (sec) float64\n", + "Error Step Duration (sec) float64\n", + "dtype: object\n" + ] + } + ], + "source": [ + "# Change . to 0 in \"xxx-duration\"\n", + "rectify_cols = ['Step Duration (sec)', 'Correct Step Duration (sec)', 'Error Step Duration (sec)']\n", + "for col in rectify_cols:\n", + " df_step_clear[col] = df_step_clear[col].apply(lambda x: 0 if x=='.' else x)\n", + " df_step_clear[col] = df_step_clear[col].astype(float)\n", + "print(df_step_clear[rectify_cols].dtypes)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Data Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.express as px\n", + "from plotly.subplots import make_subplots\n", + "import plotly.graph_objs as go\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "D:\\MySoftwares\\Anaconda\\envs\\data\\lib\\site-packages\\ipykernel_launcher.py:8: UserWarning:\n", + "\n", + "Matplotlib is currently using module://ipykernel.pylab.backend_inline, which is a non-GUI backend, so cannot show the figure.\n", + "\n" + ] + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAARFElEQVR4nO3dX4hdZ7nH8e/jdHDCsRVD0pJMEkekyo6DR+lQBXNhKBzjVb2xdIRjpRsCYqcKuUh0LmwvAu2NFwaPEEhoBc+Uglp7kR5PLVtkwD+dEaFJ51RDbe2YYFOHaqmMTuNzLrISZpJJMmuSWXt23u8Hhr33s9fa6xkYfvPyrr3eFZmJJKkM7+p2A5Kk5hj6klQQQ1+SCmLoS1JBDH1JKshN3W7gajZt2pRDQ0PdbkO6YG5ujtOnTzM/P8/AwABbtmxh48aN3W5LWmJ6evqNzNx8cX3dh/7Q0BBTU1PdbkMCYGJigvHxcY4dO8auXbuYnJyk3W6zb98+RkdHu92edEFEvLpsfb1/T39kZCQNfa0Xw8PDHDp0iN27d1+odTodxsbGOH78eBc7k5aKiOnMHLmkbuhLK9fX18f8/Dz9/f0XagsLCwwMDHD27NkudiYtdbnQ90SuVEOr1WJycnJJbXJyklar1aWOpHoMfamG8fFx2u02nU6HhYUFOp0O7Xab8fHxbrcmrci6P5ErrSfnT9aOjY0xMzNDq9Xi4MGDnsRVz3BOX5JuQM7pS5IMfUkqiaEvSQUx9CWpIIa+JBXE0Jekghj6klQQQ1+SCmLoS1JBDH1JKoihL0kFMfSlmiYmJhgeHqavr4/h4WEmJia63ZK0Yq6yKdVw/naJR44cWXK7RMCVNtUTXGVTqsHbJapXeLtE6TrwdonqFS6tLF0H3i5Rve6qoR8R2yOiExEzEXEiIr5a1TdGxLMR8fvq8X2L9vl6RJyMiJci4jOL6ndExAvVe9+OiFibX0taG94uUb1uJSdy3wH2ZeZvIuJmYDoingW+BDyXmY9ExAHgALA/InYC9wIfAbYCP42ID2XmWeC7wF7gl8AxYA/wzPX+paS14u0S1euuGvqZeRo4XT1/KyJmgEHgbuDT1WaPAz8D9lf1JzLzH8AfIuIkcGdEvALckpm/AIiI7wGfw9BXjxkdHTXk1bNqzelHxBDwceBXwG3VP4Tz/xhurTYbBF5btNtsVRusnl9cX+44eyNiKiKmzpw5U6dFSdIVrDj0I+I9wA+Ar2Xm36606TK1vEL90mLm4cwcycyRzZs3r7RFSdJVrCj0I6Kfc4H//cz8YVX+c0Rsqd7fArxe1WeB7Yt23wacqurblqlLkhqykm/vBHAEmMnMby1662ngvur5fcCPF9XvjYh3R8QHgNuBX1dTQG9FxCerz/zion0kSQ1Yybd3PgX8J/BCRPy2qn0DeAR4MiLawB+BzwNk5omIeBJ4kXPf/PlK9c0dgC8DjwEbOHcC15O4ktSgq470M3MyMyMzP5qZH6t+jmXmXzLzrsy8vXqcW7TPwcz8YGZ+ODOfWVSfyszh6r0Hcr1fDiwtwwXX1MtccE2qwQXX1Otce0eqwQXX1Ctce0e6DmZmZpidnV0yvTM7O8vMzEy3W5NWxNCXati6dSsPPvggb7/9NpnJ22+/zYMPPsjWrVu73Zq0Is7pSzX8/e9/58033+TNN98E4JVXXgHgXe9y/KTe4F+qVMPc3FyturTeGPqSVBBDX5IKYuhLUkEMfUkqiKEvSQUx9CWpIIa+JBXE0Jekghj6klQQQ1+SCmLoS1JBDH1JKoihL0kFMfQlqSCGviQVxNCXpIIY+pJUEENfkgpi6EtSQRoP/YjYExEvRcTJiDjQ9PElqWSNhn5E9AHfAT4L7ARGI2Jnkz1IUsmaHunfCZzMzJcz85/AE8DdDfcgScW6qeHjDQKvLXo9C3zi4o0iYi+wF2DHjh3NdKYby0PvXZOPzW/e0vgxeeiva/O5KlLToR/L1PKSQuZh4DDAyMj
IJe9LV7VGQRmx3J/wOZn+qWr9a3p6ZxbYvuj1NuBUwz1IUrGiydFJRNwE/A64C/gT8Dzwhcw8cYV9zgCvNtOhtCJ3LFObbrwL6cren5mbLy42Or2Tme9ExAPAT4A+4OiVAr/a55KmpfUgIqYyc6TbfUh1NDrSl24khr56kVfkSlJBDH1p9Q53uwGpLqd3JKkgjvQlqSCGviQVxNCXaoqIoxHxekQc73YvUl2GvlTfY8CebjchrYahL9WUmT8H5rrdh7Qahr4kFcTQl6SCNL20cm2bNm3KoaGhbrchXTA3N8fAwADz8/Ns2LAht2zZwsaNG7vdlrTE9PT0G11fcG01hoaGmJqa6nYbEgATExOMj49z7Ngxdu3axeTkJO12m3379jE6Otrt9qQLImLZ1YnX/RW5IyMjaehrvRgeHubQoUPs3r37Qq3T6TA2Nsbx436DU+tHREwvtyCgoS/V0NfXx/z8PP39/RdqCwsLDAwMcPbs2S52Ji11udD3RK5UQ6vV4uGHH2Z4eJi+vj6Gh4d5+OGHabVa3W5NWhFDX6ph9+7dPProo9x///289dZb3H///Tz66KNLpnuk9czQl2rodDrs37+fo0ePcvPNN3P06FH2799Pp9PpdmvSijinL9XgnL56hXP60nXQarWYnJxcUpucnHROXz3D0JdqGB8fp91u0+l0WFhYoNPp0G63GR8f73Zr0oqs+4uzpPXk/AVYY2NjzMzM0Gq1OHjwoBdmqWc4py9JNyDn9CVJhr5U18TExJKLsyYmJrrdkrRizulLNZxfcO3IkSNLFlwDnNdXT3BOX6rBBdfUK1xwTboOvDhLvcITudJ10Gq1GBwcJCIu/AwODnpxlnqGc/pSDadPn2Zubo7+/n4WFhbo7+/nzJkzjvLVMxzpSzXMzc0B56Z0Fj+er0vrnaEvSQUx9CWpIIa+JBXkmkI/Il6JiBci4rcRMVXVNkbEsxHx++rxfYu2/3pEnIyIlyLiM9favCSpnusx0t+dmR9b9H3QA8BzmXk78Fz1mojYCdwLfATYA/xXRPRdh+NLklZoLaZ37gYer54/DnxuUf2JzPxHZv4BOAncuQbHlyRdxrWGfgL/GxHTEbG3qt2WmacBqsdbq/og8NqifWermiSpIdd6cdanMvNURNwKPBsR/3eFbWOZ2rJrQFT/QPYC7Nix4xpblCSdd00j/cw8VT2+DvyIc9M1f46ILQDV4+vV5rPA9kW7bwNOXeZzD2fmSGaObN68+VpalCQtsurQj4h/i4ibzz8H/gM4DjwN3Fdtdh/w4+r508C9EfHuiPgAcDvw69UeX5JU37VM79wG/Cgizn/Of2fm/0TE88CTEdEG/gh8HiAzT0TEk8CLwDvAVzLTBUskqUGrDv3MfBn492XqfwHuusw+B4GDqz2mJOnaeEWuJBXE0Jekghj6klQQQ1+SCmLoS1JBDH1JKoihL0kFMfQlqSCGviQVxNCXpIIY+pJUEENfkgpi6EtSQQx9SSqIoS9JBTH0Jakghr4kFcTQl6SCGPqSVBBDX5IKYuhLUkEMfUkqiKEvSQUx9CWpIIa+JBXE0JekgjQe+hGxJyJeioiTEXGg6eNLUskaDf2I6AO+A3wW2AmMRsTOJnuQpJLd1PDx7gROZubLABHxBHA38GLDfUiXiIhG9s/MazqOdC2aDv1B4LVFr2eBT1y8UUTsBfYC7Nixo5nOdGN56L21d8lv3rIGjSyjbm8P/XVt+lCRmg795YZClwx7MvMwcBhgZGTEYZHqW6OgvNJo3hG8ekHTJ3Jnge2LXm8DTjXcg7Rqlwt2A1+9Ipr8Y42Im4DfAXcBfwKeB76QmSeusM8Z4NVmOpRq2QS80e0mpMt4f2ZuvrjY6PROZr4TEQ8APwH6gKNXCvxqn0ualtaDiJjKzJFu9yHV0ehIX7qRGPrqRV6RK0kFMfSl1Tvc7QakupzekaSCONKXpIIY+pJUEENfqikijkbE6xFxvNu9SHUZ+lJ9jwF
7ut2EtBqGvlRTZv4cmOt2H9JqGPqSVBBDX5IK0vTSyrVt2rQph4aGut2GdMHc3BwDAwPMz8+zYcOG3LJlCxs3bux2W9IS09PTb3R9wbXVGBoaYmpqqtttSABMTEwwPj7OsWPH2LVrF5OTk7Tbbfbt28fo6Gi325MuiIhlVyde91fkjoyMpKGv9WJ4eJhDhw6xe/fuC7VOp8PY2BjHj/sNTq0fETG93IKAhr5UQ19fH/Pz8/T391+oLSwsMDAwwNmzZ7vYmbTU5ULfE7lSDa1Wi8nJySW1yclJWq1WlzqS6jH0pRrGx8dpt9t0Oh0WFhbodDq0223Gx8e73Zq0Iuv+RK60npw/WTs2NsbMzAytVouDBw96Elc9wzl9SboBOacvSTL0Jakkhr4kFcTQl6SCGPqSVBBDX5IKYuhLUkEMfUkqiKEvSQUx9CWpIIa+VNPExATDw8P09fUxPDzMxMREt1uSVswF16Qazt8568iRI0vunAW46Jp6gguuSTV45yz1Cu+cJV0H3jlLvcJVNqXrwDtnqdddNfQjYntEdCJiJiJORMRXq/rGiHg2In5fPb5v0T5fj4iTEfFSRHxmUf2OiHiheu/bERFr82tJa8M7Z6nXreRE7jvAvsz8TUTcDExHxLPAl4DnMvORiDgAHAD2R8RO4F7gI8BW4KcR8aHMPAt8F9gL/BI4BuwBnrnev5S0VrxzlnrdVUM/M08Dp6vnb0XEDDAI3A18utrsceBnwP6q/kRm/gP4Q0ScBO6MiFeAWzLzFwAR8T3gcxj66jGjo6OGvHpWrTn9iBgCPg78Crit+odw/h/DrdVmg8Bri3abrWqD1fOL65Kkhqw49CPiPcAPgK9l5t+utOkytbxCfblj7Y2IqYiYOnPmzEpblCRdxYpCPyL6ORf438/MH1blP0fElur9LcDrVX0W2L5o923Aqaq+bZn6JTLzcGaOZObI5s2bV/q7SJKuYiXf3gngCDCTmd9a9NbTwH3V8/uAHy+q3xsR746IDwC3A7+upoDeiohPVp/5xUX7SJIasJJv73wK+E/ghYj4bVX7BvAI8GREtIE/Ap8HyMwTEfEk8CLnvvnzleqbOwBfBh4DNnDuBK4ncSWpQVcd6WfmZGZGZn40Mz9W/RzLzL9k5l2ZeXv1OLdon4OZ+cHM/HBmPrOoPpWZw9V7D+R6vxxYWoYLrqmXueCaVIMLrqnXufaOVIMLrqlXuOCadB244Jp6hQuuSddBq9XinnvuYWBggIhgYGCAe+65xwXX1DMMfamGwcFBnnrqKRYWFoBzo/ynnnqKwUEvLldvMPSlGp577jkA/vWvfy15PF+X1jtDX6rhcvP2zuerVxj6klQQQ1+SCmLoS1JBDH1JKoihL0kFMfQlqSCGviQVxNCXpIIY+pJUEENfkgpi6EtSQQx9SSqIoS9JBTH0Jakghr4kFcTQl6SCGPqSVBBDX5IKYuhLUkEaD/2I2BMRL0XEyYg40PTxJalkjYZ+RPQB3wE+C+wERiNiZ5M9SFLJmh7p3wmczMyXM/OfwBPA3Q33IEnFuqnh4w0Cry16PQt84uKNImIvsBdgx44dzXSmG8tD712Tj81v3tL4MXnor2vzuSpS06Efy9TykkLmYeAwwMjIyCXvS1e1RkEZsdyf8DmZ/qlq/Wt6emcW2L7o9TbgVMM9SFKxosnRSUTcBPwOuAv4E/A88IXMPHGFfc4ArzbTobQidyxTm268C+nK3p+Zmy8uNjq9k5nvRMQDwE+APuDolQK/2ueSpqX1ICKmMnOk231IdTQ60pduJIa+epFX5EpSQQx9afUOd7sBqS6ndySpII70Jakghr4kFcTQl2qKiKMR8XpEHO92L1Jdhr5U32PAnm43Ia2GoS/VlJk/B+a63Ye0Goa+JBXE0Jekghj6klQQQ1+SCmLoSzVFxATwC+DDETEbEe1u9yStlMswSFJBHOlLUkEMfUkqiKEvSQUx9CWpIIa+JBXE0Jekghj6klSQ/wf
i4zFtWDl6swAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Outlier analysis for each column\n", + "\n", + "fig=plt.figure()\n", + "box_cols = ['Step Duration (sec)', 'Correct Step Duration (sec)','Error Step Duration (sec)']\n", + "for i, col in enumerate(box_cols):\n", + " ax=fig.add_subplot(3, 1, i+1)\n", + " ax.boxplot(df_step_clear[df_step_clear[col].notnull()][col].tolist())\n", + "fig.show(\"svg\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# The distribution of continuous values\n", + "def show_value_counts_histogram(colname, sort = True):\n", + " # create the bins\n", + " start = int(df_step_clear[colname].min()/10)*10\n", + " end = int(df_step_clear[colname].quantile(q=0.95)/10+1)*10\n", + " step = int((end - start)/20)\n", + " print(start, end, step)\n", + " counts, bins = np.histogram(df_step_clear[colname],bins=range(start, end, step))\n", + " bins = 0.5 * (bins[:-1] + bins[1:])\n", + "\n", + " fig = px.bar(x=bins, y=counts, labels={'x': colname, 'y':'count'})\n", + " fig.show(\"svg\")\n", + "\n", + "\n", + "# Box distribution of continuous values\n", + "def show_value_counts_box(colname, sort = True):\n", + "# fig = px.box(df_step_clear, y=colname)\n", + "# fig.show(\"svg\")\n", + " plt.figure(figsize=(10,5))\n", + " plt.title('Box-plot for '+ colname,fontsize=20)#标题,并设定字号大小\n", + " plt.boxplot([df_step_clear[colname].tolist()])\n", + " plt.show(\"svg\")\n", + " \n", + "\n", + "# Histogram of discrete values\n", + "def show_value_counts_bar(colname, sort = True):\n", + " ds = df_step_clear[colname].value_counts().reset_index()\n", + " ds.columns = [\n", + " colname,\n", + " 'Count'\n", + " ]\n", + " if sort:\n", + " ds = ds.sort_values(by='Count', ascending=False)\n", + " # histogram\n", + " fig = px.bar(\n", + " ds,\n", + " x = colname,\n", + " y = 'Count',\n", + " title = colname + ' distribution'\n", + " )\n", + " 
fig.show(\"svg\")\n", + " \n", + "\n", + "# Pie of discrete values\n", + "def show_value_counts_pie(colname, sort = True):\n", + " ds = df_step_clear[colname].value_counts().reset_index()\n", + " ds.columns = [\n", + " colname,\n", + " 'percent'\n", + " ]\n", + " ds['percent'] /= len(df_step_clear)\n", + " if sort:\n", + " ds = ds.sort_values(by='percent', ascending=False)\n", + " fig = px.pie(\n", + " ds,\n", + " names = colname,\n", + " values = 'percent',\n", + " title = colname+ ' Percentage',\n", + " )\n", + " fig.update_traces(textposition='inside', textinfo='percent+label',showlegend=False)\n", + " fig.show(\"svg\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "correctincorrecthint020k40k60k80kFirst Attempt distributionFirst AttemptCount" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 70 3\n" + ] + }, + { + "data": { + "image/svg+xml": [ + "010203040506005k10k15k20k25k30k35k40kStep Duration (sec)count" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Bar\n", + "show_value_counts_bar('First Attempt')\n", + "show_value_counts_histogram('Step Duration (sec)')\n", + "show_value_counts_box('Step Duration (sec)')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "sequence Statics, unit Complex Interactions Between Bodies, module Statically Equivalent Loads13.4%sequence Statics, unit Concentrated Forces and Their Effects, module Representing Interactions Between Bodies12.4%sequence Statics, unit Complex Interactions Between Bodies, module Couples9.62%sequence Statics, unit Complex Interactions Between Bodies, module Applications of Static Equivalency to Distributed Forces, section1 Simplifying 3D loadings to 2D or 1D loading7.34%sequence Statics, unit Concentrated Forces and Their Effects, module Effects of Force6.14%sequence Statics, unit Complex Interactions Between Bodies, module Representing Engineering Connections6.07%sequence Statics, unit Concentrated Forces and Their Effects, module Introduction to Free Body Diagrams5.11%sequence Statics, unit Concentrated Forces and Their Effects, module Equilibrium Under 2D Concentrated Forces, section1 Applying Force Equilibrium5.07%sequence Statics, unit Concentrated Forces and Their Effects, module Equilibrium Under 2D Concentrated Forces, section1 Applying Force and Moment Equilibrium4.4%sequence Statics, unit Engineering Systems - Single Body Equilibrium, module Equilibrium of a Single Subsystem4.1%sequence Statics, unit Multiple Body Equilibrium - Trusses, module Method of Joints3.64%sequence Statics, unit Concentrated Forces and Their Effects, module Effects of Multiple Forces, section1 Combining Moments3.45%sequence Statics, unit Concentrated Forces and Their Effects, module Equilibrium Under 2D Concentrated Forces2.88%sequence Statics, unit Complex Interactions 
Between Bodies, module Applications of Static Equivalency to Distributed Forces, section1 Center of Gravity and Centroid2.58%sequence Statics, unit Concentrated Forces and Their Effects, module Effects of Multiple Forces2.58%sequence Statics, unit Concentrated Forces and Their Effects, module Effects of Multiple Forces, section1 Combining Concurrent Forces2.24%sequence Statics, unit Multiple Body Equilibrium - Trusses, module Method of Sections1.57%sequence Statics, unit Engineering Systems - Single Body Equilibrium, module Choosing a Solvable Subsystem1.54%sequence Statics, unit Engineering Systems - Single Body Equilibrium, module Drawing FBDs of a Single Subsystem1.53%sequence Statics, unit Multiple Body Equilibrium - Frames, module Drawing FBDs of Multiple Subsystems1.17%sequence Statics, unit Multiple Body Equilibrium - Frames, module Solving Multiple Subsystems1.15%sequence Statics, unit Complex Interactions Between Bodies, module Representing Engineering Connections, section1 Pin Connections0.92%sequence Statics, unit Complex Interactions Between Bodies, module Representing Engineering Connections, section1 Other Connections0.459%sequence Statics, unit Moments of Inertia, module Second Moment of Area0.327%sequence Statics, unit Complex Interactions Between Bodies, module Representing Engineering Connections, section1 Fixed Connections0.174%sequence Statics, unit Moments of Inertia, module Mass Moment of Inertia0.144%Problem Hierarchy Percentage" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/svg+xml": [ + 
"_m9_assess5.86%_m2_assess5.11%tutor_05_034.35%_m11_assess4.1%_m1_assess4.01%_m17_assess3.64%_m3_assess3.04%_m5_assess2.88%_m7_assess2.84%_m4_assess2.58%_m6_assess2.35%tutor_05_072.22%_m18_assess1.57%_m12_assess1.54%_m10_assess1.53%_m8_assess1.53%tutor_6_051.41%_m1_tutor11.25%_m14_assess1.17%_m15_assess1.15%tutor_08_011.13%tutor_04_100.978%tutor_6_070.973%tutor_7_160.962%_m1_tutor130.959%tutor_6_040.95%tutor_04_050.939%_m1_tutor110.908%tutor_04_130.892%_m1_tutor120.843%_m1_tutor140.843%tutor_03_110.838%_m8_assess_part20.831%tutor_6_060.807%tutor_07_230.798%tutor_07_240.775%tutor_04_080.694%tutor_7_100.693%tutor_08_040.686%tutor_6_100.658%tutor_6_03c0.655%tutor_07_210.636%tutor_7_020.604%tutor_08_060.601%_m1_tutor20.598%tutor_7_030.597%tutor_08_100.583%tutor_07_200.568%tutor_6_11b0.552%tutor_07_190.539%tutor_08_210.533%tutor_6_090.518%tutor_7_08a0.517%tutor_05_120.505%tutor_05_180.503%tutor_04_020.499%tutor_05_170.474%tutor_08_030.462%tutor_08_090.444%tutor_07_250.438%tutor_04_150.43%tutor_05_010.427%tutor_07_220.417%tutor_6_030.403%tutor_04_010.399%_m1_tutor40.397%_m1_tutor100.369%tutor_08_050.369%_m1_tutor90.362%tutor_7_060.359%tutor_08_110.35%tutor_7_110.342%tutor_7_120.337%_m20_assess0.327%tutor_03_120.322%tutor_03_150.313%tutor_03_130.312%tutor_7_140.309%tutor_07_170.303%tutor_09_190.274%tutor_08_170.265%tutor_08_180.257%tutor_08_080.249%tutor_08_020.233%tutor_08_070.231%tutor_07_270.228%tutor_03_010.225%_m1_tutor150.221%tutor_03_020.214%tutor_09_010.214%_m1_tutor50.213%_m1_tutor30.21%tutor_7_050.209%_m1_tutor60.205%_m1_tutor70.199%tutor_7_070.197%_m1_tutor160.197%tutor_7_040.197%_m1_tutor80.194%_m1_tutor190.194%_m1_tutor180.192%tutor_7_08b0.191%tutor_6_080.187%tutor_05_190.184%tutor_07_260.181%tutor_09_030.174%tutor_04_090.172%tutor_6_11a0.163%tutor_09_070.162%tutor_08_240.155%tutor_03_140.155%tutor_03_030.153%tutor_05_050.152%tutor_04_040.15%tutor_07_180.146%_m21_assess0.144%tutor_05_020.143%tutor_05_090.14%tutor_09_120.136%tutor_08_190.134%tutor_09_130.132%tu
tor_04_120.131%tutor_08_200.129%tutor_05_100.129%tutor_08_230.128%tutor_04_030.127%tutor_05_110.124%tutor_04_060.121%tutor_05_150.119%tutor_09_080.102%tutor_08_120.101%tutor_09_090.1%tutor_09_100.0993%tutor_09_180.0966%tutor_08_160.0958%tutor_08_220.0949%tutor_09_140.0949%tutor_09_150.094%tutor_08_130.0931%tutor_08_150.0914%tutor_08_140.0914%tutor_09_200.0879%tutor_03_060.0817%tutor_03_080.0817%tutor_03_070.0817%tutor_03_050.0817%tutor_03_100.0817%tutor_03_090.08%tutor_03_040.0773%tutor_08_250.0545%tutor_04_190.0518%_m1_tutor170.051%tutor_04_170.0492%tutor_04_180.0492%Problem Name Percentage" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Pie\n", + "# show_value_counts_pie('KC (F2011)')\n", + "show_value_counts_pie('Problem Hierarchy')\n", + "show_value_counts_pie('Problem Name')\n", + "# show_value_counts_pie('Step Name')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + 
"resolve_into_componentsidentify_interactioncouple_represents_net_zero_forcerepresent_interaction_springsimple_stepreplace_general_loads_with_force_and_couplerepresent_interaction_cordrepresent_interaction_pin_connectionrepresent_interaction_contacting_bodystatics_problem_force_and_momentcouple_related_to_forcesfind_moment_armfind_symmetry_planereplace_forces_in_opposite_sense_with_force_and_couplejudge_equilibrium_qualitativelyanticipate_solved_variables~~determine_joint_is_solvable~~recognize_variable_solvable_from_subsystemrotation_sense_of_forcemoving_force_perpendicular_to_line_of_actionfind_linear_force_per_lengthrepresent_interaction_roller_connectionrecognize_equivalence_of_translated_forcesmoment_sign_sense_relationmoving_force_to_general_pointidentify_interaction~~body_draw_force_onfind_moment_arm~~moment_sign_sense_relation~~rotation_sense_of_forceidentify_two-force_memberforce_at_joint_implied_by_previous_analysis~~sense_if_assuming_tensionmoment_sign_sense_relation~~rotation_sense_of_forceequivalence_of_couplespossible_interaction_for_nonuniform_contactrepresent_interaction_pin_in_slot_connectionrecognize_equivalence_from_motioninterpret_equationdetermine_subsystem_is_solvable~~identify_external_load_points_on_section~~identify_internal_load_points_on_sectiongravitational_forcesidentify_forces_in_symmetry_planerotation_sense_of_force~~motion_dependence_on_forcemotion_dependence_on_forcebody_draw_force_oncouple_represents_net_zero_force~~couple_related_to_forcesmoment_about_point_due_to_couplerepresent_interaction_rigid_sliding_connectionrepresent_interaction_fixed_connectionrecognize_knowns_vs_unknownsfind_angle_given_componentsrecognize_conditions_for_full_equivalencesense_if_assuming_tension~~identify_external_load_points_on_section~~identify_internal_load_points_on_sectionfind_net_force_for_linear_distributionstatics_problem_collinearcentroid_of_composite_area020400200040006000sequence Statics, unit Engineering Systems - Single Body Equilibrium, 
module Choosing a Solvable Subsystemsequence Statics, unit Engineering Systems - Single Body Equilibrium, module Equilibrium of a Single Subsystemsequence Statics, unit Multiple Body Equilibrium - Trusses, module Method of Jointssequence Statics, unit Complex Interactions Between Bodies, module Statically Equivalent Loadssequence Statics, unit Concentrated Forces and Their Effects, module Representing Interactions Between Bodiessequence Statics, unit Friction, module Frictionsequence Statics, unit Multiple Body Equilibrium - Frames, module Solving Multiple Subsystemssequence Statics, unit Complex Interactions Between Bodies, module Couplessequence Statics, unit Complex Interactions Between Bodies, module Applications of Static Equivalency to Distributed Forces, section1 Simplifying 3D loadings to 2D or 1D loadingsequence Statics, unit Concentrated Forces and Their Effects, module Effects of Forcesequence Statics, unit Complex Interactions Between Bodies, module Representing Engineering Connectionssequence Statics, unit Concentrated Forces and Their Effects, module Introduction to Free Body Diagramssequence Statics, unit Concentrated Forces and Their Effects, module Equilibrium Under 2D Concentrated Forces, section1 Applying Force Equilibriumsequence Statics, unit Concentrated Forces and Their Effects, module Equilibrium Under 2D Concentrated Forces, section1 Applying Force and Moment Equilibriumsequence Statics, unit Concentrated Forces and Their Effects, module Effects of Multiple Forces, section1 Combining Momentssequence Statics, unit Multiple Body Equilibrium - Frames, module Drawing FBDs of Multiple Subsystemssequence Statics, unit Multiple Body Equilibrium - Trusses, module Method of Sectionssequence Statics, unit Concentrated Forces and Their Effects, module Equilibrium Under 2D Concentrated Forcessequence Statics, unit Concentrated Forces and Their Effects, module Effects of Multiple Forcessequence Statics, unit Complex Interactions Between Bodies, module 
Applications of Static Equivalency to Distributed Forces, section1 Center of Gravity and Centroidsequence Statics, unit Concentrated Forces and Their Effects, module Effects of Multiple Forces, section1 Combining Concurrent Forcessequence Statics, unit Engineering Systems - Single Body Equilibrium, module Drawing FBDs of a Single Subsystemsequence Statics, unit Moments of Inertia, module Second Moment of Areasequence Statics, unit Complex Interactions Between Bodies, module Representing Engineering Connections, section1 Pin Connectionssequence Statics, unit Moments of Inertia, module Mass Moment of Inertiasequence Statics, unit Complex Interactions Between Bodies, module Representing Engineering Connections, section1 Other Connectionssequence Statics, unit Complex Interactions Between Bodies, module Representing Engineering Connections, section1 Fixed Connections0102005k10k15k20k_m9_assess_m2_assesstutor_17_09tutor_05_03tutor_12_10_m11_assess_m1_assess_m3_assess_m17_assess_m5_assesstutor_17_08_m7_assesstutor_15_14_m4_assess_m6_assesstutor_05_07_m22_assess_m18_assess_m12_assess_m8_assess_m10_assesstutor_12_17tutor_6_05tutor_15_11tutor_12_01_m1_tutor1tutor_17_03tutor_11_03_m14_assesstutor_12_05_m15_assesstutor_08_01tutor_12_06tutor_11_04tutor_12_11tutor_13_02tutor_12_18tutor_15_08tutor_11_27tutor_11_10tutor_04_05tutor_22_06tutor_15_04tutor_04_10tutor_22_07tutor_6_07tutor_17_06tutor_7_16tutor_22_21_m1_tutor13020400200040006000q1_A UpdateComboBoxq1_A UpdateRadioButtonq2_B UpdateComboBoxq1_i1 UpdateRadioButtonq2_B UpdateRadioButtonq2_i1 UpdateRadioButtonq3_i1 UpdateRadioButtonq3_C UpdateRadioButtonq1_i1 UpdateHotspotSingleq1_A UpdateShortAnswerq1_i1 UpdateComboBoxq1_A UpdateNumberFieldq5_E UpdateRadioButtonq1_B UpdateComboBoxq4_i1 UpdateRadioButtonq4_D UpdateRadioButtonq3_C UpdateComboBoxq6_F UpdateRadioButtonq2_A UpdateComboBoxq3_i1 UpdateHotspotSingleq1_i1 UpdateHotspotMultipleq1_C UpdateComboBoxq7_G UpdateRadioButtonq2_A UpdateRadioButtonq2_i1 
UpdateHotspotSingleq4_i1 UpdateHotspotSingleq2_B UpdateNumberFieldq6_i1 UpdateRadioButtonq1_i2 UpdateComboBoxq1_i3 UpdateComboBoxq1_input1 UpdateComboBoxq2_C UpdateComboBoxq5_i1 UpdateRadioButtonq2_i2 UpdateComboBoxq2_i2 UpdateHotspotSingleq3_A UpdateRadioButtonq2_input2 UpdateComboBoxq7_i1 UpdateRadioButtonq2_i1 UpdateNumberFieldq4_A UpdateRadioButtonq3_A UpdateComboBoxq8_H UpdateRadioButtonq3_i3 UpdateHotspotSingleq1_i6 UpdateComboBoxq1_i4 UpdateComboBoxq1_i5 UpdateComboBoxq8_i1 UpdateRadioButtonq2_D UpdateComboBoxq3_i1 UpdateNumberFieldq3_B UpdateComboBox0204005k10kType: KC (F2011)Type: Problem HierarchyType: Problem NameType: Step NameBar of top 50 distributions for each typeKC (F2011)Problem HierarchyProblem NameStep Name" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# four column labels are individually distributed as follows \n", + "\n", + "topnum_max = 50 # show top 50 for each type\n", + "fig = make_subplots(rows=2, cols=2, # 2*2\n", + " start_cell=\"top-left\", \n", + " subplot_titles=('KC (F2011)','Problem Hierarchy','Problem Name','Step Name'), \n", + " column_widths=[0.5, 0.5]) \n", + "traces = [\n", + " go.Bar(\n", + " x = df_step[colname].value_counts().reset_index().index.tolist()[:topnum_max],\n", + " y = df_step[colname].value_counts().reset_index()[colname].tolist()[:topnum_max],\n", + " name = 'Type: ' + str(colname),\n", + " text = df_step[colname].value_counts().reset_index()['index'].tolist()[:topnum_max], \n", + " textposition = 'auto',\n", + " ) for colname in ['KC (F2011)','Problem Hierarchy','Problem Name','Step Name']\n", + "]\n", + "for i in range(len(traces)):\n", + " fig.append_trace(\n", + " traces[i],\n", + " (i //2) + 1, # pos_row\n", + " (i % 2) + 1 # pos_col\n", + " )\n", + " \n", + "fig.update_layout(\n", + " title_text = 'Bar of top 50 distributions for each type ',\n", + ")\n", + "\n", + "fig.show(\"svg\")\n" + ] + } + ], + "metadata": { + "celltoolbar": "原始单元格格式", + "kernelspec": { + 
"display_name": "Data", + "language": "python", + "name": "data" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/OLI_Fall2011/OLI_2011F_transaction.ipynb b/docs/OLI_Fall2011/OLI_2011F_transaction.ipynb new file mode 100644 index 0000000..34521ee --- /dev/null +++ b/docs/OLI_Fall2011/OLI_2011F_transaction.ipynb @@ -0,0 +1,2616 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# OLI data in fall, 2011(transaction)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import pandas as pd\n", + "import numpy as np\n", + "# global configuration: show every rows and cols\n", + "pd.set_option('display.max_rows', None)\n", + "pd.set_option('max_colwidth',None)\n", + "pd.set_option('display.max_columns', None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. Data Description\n", + "## 1.1 Column Description" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FieldAnnotation
0RowA row counter
1Sample NameThe sample that contains the transaction. If a transaction appears in multiple samples, the transaction will be repeated, but with a different sample name.
2Transaction IdA unique ID that identifies the transaction. Currently used for annotating transactions with custom fields via web services.
3Anon Student IdDataShop-generated anonymous student ID. To obtain original student identifiers or to learn more about data anonymization, see About data anonymization below.
4Session IdA dataset-unique string that identifies the user's session with the tutor.
5TimeTime the transaction occurred. For instance, if a student types \"25\" and presses return, the transaction time is at the point in which they press return.
6Time ZoneThe local time zone (e.g., EST, PST, US/Eastern).
7Duration (sec)Duration of the transaction in seconds. This is the time of the current transaction minus that of the preceding transaction or problem start event—whichever is closer in time to the current transaction. If this difference is greater than 10 minutes, or if the prior transaction occurred during a different user session, DataShop reports the duration as null (a dot). If the current transaction is preceded by neither another transaction or a problem start event, duration is shown as null. The duration is formatted without decimal places if the two times used in the calculation were without millisecond precision.
8Student Response TypeThe type of attempt made by the student (e.g., \"ATTEMPT\" or \"HINT_REQUEST\"). This is logged in the semantic_event element.
9Student Response SubtypeA more detailed classification of the student attempt. For example, the CTAT software describes actions taken by the tutor on behalf of the student as having subtype \"tutor-performed\".
10Tutor Response TypeThe type of response made by the tutor (e.g., \"RESULT\" or \"HINT_MSG\").
11Tutor Response SubtypeA more detailed classification of the tutor response.
12Level (level_type)The problem hierarchy name (e.g., \"Understanding Fractions\") of the type specified in the column header (e.g., \"Unit\"). There may be multiple \"Level\" columns if the problem hierarchy is more than one level deep. Level is logged in the level element.
13Problem NameThe name of the problem. Two problems with the same \"Problem Name\" are considered different \"problems\" by DataShop if the following logged values are not identical: problem name, context, tutor_flag (whether or not the problem or activity is tutored) and \"other\" field. These fields are logged in the problem element.
14Problem ViewThe number of times the student encountered the problem so far. This counter increases with each instance of the same problem. See \"Problem View\" in the \"By Student-Step\" table below.
15Problem Start TimeIf the problem start time is not given in the original log data, then it is set to the time of the last transaction of the prior problem. If there is no prior problem for the session, the time of the earliest transaction is used. Earliest transaction time is equivalent to the minimum transaction time for the earliest step of the problem. For more detail on how problem start time is determined, see Determining Problem Start Time.
16Step NameFormed by concatenating the \"selection\" and \"action\". Also see the glossary entry for \"step\".
17Attempt at StepAs of this transaction, the current number of attempts toward the identified step.
18OutcomeThe tutor's evaluation of the student's attempt. For example, \"CORRECT\", \"INCORRECT\", or \"HINT\". This is logged in the action_evaluation element.
19SelectionA description of the interface element(s) that the student selected or interacted with (for example, \"LowestCommonDenominatorCell\"). This is logged in the event_descriptor element.
20ActionA description of the manipulation applied to the selection.
21InputThe input the student submitted (e.g., the text entered, the text of a menu item or a combobox entry).
22Feedback TextThe body of a hint, success, or incorrect action message shown to the student. It is generally a text value, logged in the tutor_advice element.
23Feedback ClassificationThe type of error (e.g., \"sign error\") or type of hint.
24Help LevelIn the case of hierarchical hints, this is the depth of the hint. \"1\", for example, is an initial hint, while \"3\" is the third hint.
25Total Num HintsThe total number of hints available. This is logged in the action_evaluation element.
26Condition NameThe name of the condition (e.g., \"Unworked\").
27Condition TypeA condition classification (e.g., \"Experimental\", \"Control\"); optional at the time of logging.
28KC (model_name)The knowledge component for this transaction. It is a member of the knowledge component model named in the column header. One \"KC (model_name)\" column should appear in the export for each KC model in the dataset.
29KC Category (model_name)The knowledge component \"category\" logged by some tutors. It is a member of the knowledge component model named in the column header. One \"KC Category (model_name)\" column should appear in the export for each KC model in the dataset.
30SchoolThe name of the school where the student used the tutor to create this transaction.
31ClassThe name of the class the student was in when he or she used the tutor to create this transaction.
32CF (custom_field_name)The value of a custom field. This is usually information that did not fit into any of the other logging fields (i.e., any of the other columns), and so was logged in this special container.
33Event TypeAllowed values are \"assess\", \"instruct\" and \"assess_instruct\". Blank is also allowed. Only \"instruct\" and \"assess_instruct\" values are treated as learning opportunities.
\n", + "
" + ], + "text/plain": [ + " Field \\\n", + "0 Row \n", + "1 Sample Name \n", + "2 Transaction Id \n", + "3 Anon Student Id \n", + "4 Session Id \n", + "5 Time \n", + "6 Time Zone \n", + "7 Duration (sec) \n", + "8 Student Response Type \n", + "9 Student Response Subtype \n", + "10 Tutor Response Type \n", + "11 Tutor Response Subtype \n", + "12 Level (level_type) \n", + "13 Problem Name \n", + "14 Problem View \n", + "15 Problem Start Time \n", + "16 Step Name \n", + "17 Attempt at Step \n", + "18 Outcome \n", + "19 Selection \n", + "20 Action \n", + "21 Input \n", + "22 Feedback Text \n", + "23 Feedback Classification \n", + "24 Help Level \n", + "25 Total Num Hints \n", + "26 Condition Name \n", + "27 Condition Type \n", + "28 KC (model_name) \n", + "29 KC Category (model_name) \n", + "30 School \n", + "31 Class \n", + "32 CF (custom_field_name) \n", + "33 Event Type \n", + "\n", + " Annotation \n", + "0 A row counter \n", + "1 The sample that contains the transaction. If a transaction appears in multiple samples, the transaction will be repeated, but with a different sample name. \n", + "2 A unique ID that identifies the transaction. Currently used for annotating transactions with custom fields via web services. \n", + "3 DataShop-generated anonymous student ID. To obtain original student identifiers or to learn more about data anonymization, see About data anonymization below. \n", + "4 A dataset-unique string that identifies the user's session with the tutor. \n", + "5 Time the transaction occurred. For instance, if a student types \"25\" and presses return, the transaction time is at the point in which they press return. \n", + "6 The local time zone (e.g., EST, PST, US/Eastern). \n", + "7 Duration of the transaction in seconds. This is the time of the current transaction minus that of the preceding transaction or problem start event—whichever is closer in time to the current transaction. 
If this difference is greater than 10 minutes, or if the prior transaction occurred during a different user session, DataShop reports the duration as null (a dot). If the current transaction is preceded by neither another transaction or a problem start event, duration is shown as null. The duration is formatted without decimal places if the two times used in the calculation were without millisecond precision. \n", + "8 The type of attempt made by the student (e.g., \"ATTEMPT\" or \"HINT_REQUEST\"). This is logged in the semantic_event element. \n", + "9 A more detailed classification of the student attempt. For example, the CTAT software describes actions taken by the tutor on behalf of the student as having subtype \"tutor-performed\". \n", + "10 The type of response made by the tutor (e.g., \"RESULT\" or \"HINT_MSG\"). \n", + "11 A more detailed classification of the tutor response. \n", + "12 The problem hierarchy name (e.g., \"Understanding Fractions\") of the type specified in the column header (e.g., \"Unit\"). There may be multiple \"Level\" columns if the problem hierarchy is more than one level deep. Level is logged in the level element. \n", + "13 The name of the problem. Two problems with the same \"Problem Name\" are considered different \"problems\" by DataShop if the following logged values are not identical: problem name, context, tutor_flag (whether or not the problem or activity is tutored) and \"other\" field. These fields are logged in the problem element. \n", + "14 The number of times the student encountered the problem so far. This counter increases with each instance of the same problem. See \"Problem View\" in the \"By Student-Step\" table below. \n", + "15 If the problem start time is not given in the original log data, then it is set to the time of the last transaction of the prior problem. If there is no prior problem for the session, the time of the earliest transaction is used. 
Earliest transaction time is equivalent to the minimum transaction time for the earliest step of the problem. For more detail on how problem start time is determined, see Determining Problem Start Time. \n", + "16 Formed by concatenating the \"selection\" and \"action\". Also see the glossary entry for \"step\". \n", + "17 As of this transaction, the current number of attempts toward the identified step. \n", + "18 The tutor's evaluation of the student's attempt. For example, \"CORRECT\", \"INCORRECT\", or \"HINT\". This is logged in the action_evaluation element. \n", + "19 A description of the interface element(s) that the student selected or interacted with (for example, \"LowestCommonDenominatorCell\"). This is logged in the event_descriptor element. \n", + "20 A description of the manipulation applied to the selection. \n", + "21 The input the student submitted (e.g., the text entered, the text of a menu item or a combobox entry). \n", + "22 The body of a hint, success, or incorrect action message shown to the student. It is generally a text value, logged in the tutor_advice element. \n", + "23 The type of error (e.g., \"sign error\") or type of hint. \n", + "24 In the case of hierarchical hints, this is the depth of the hint. \"1\", for example, is an initial hint, while \"3\" is the third hint. \n", + "25 The total number of hints available. This is logged in the action_evaluation element. \n", + "26 The name of the condition (e.g., \"Unworked\"). \n", + "27 A condition classification (e.g., \"Experimental\", \"Control\"); optional at the time of logging. \n", + "28 The knowledge component for this transaction. It is a member of the knowledge component model named in the column header. One \"KC (model_name)\" column should appear in the export for each KC model in the dataset. \n", + "29 The knowledge component \"category\" logged by some tutors. It is a member of the knowledge component model named in the column header. 
One \"KC Category (model_name)\" column should appear in the export for each KC model in the dataset. \n", + "30 The name of the school where the student used the tutor to create this transaction. \n", + "31 The name of the class the student was in when he or she used the tutor to create this transaction. \n", + "32 The value of a custom field. This is usually information that did not fit into any of the other logging fields (i.e., any of the other columns), and so was logged in this special container. \n", + "33 Allowed values are \"assess\", \"instruct\" and \"assess_instruct\". Blank is also allowed. Only \"instruct\" and \"assess_instruct\" values are treated as learning opportunities. " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# help_table1: the description for data by transactions\n", + "df1 = pd.read_csv('OLI_data/help_table1.csv',sep=',',encoding=\"gbk\")\n", + "df1 = df1.loc[:, ['Field', 'Annotation']]\n", + "df1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.2 Summarization of Data\n", + "\n", + "**This table organizes the data as student-problem-step-transaction**" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RowSample NameTransaction IdAnon Student IdSession IdTimeTime ZoneDuration (sec)Student Response TypeStudent Response SubtypeTutor Response TypeTutor Response SubtypeLevel (Sequence)Level (Unit)Level (Module)Level (Section1)Problem NameProblem ViewProblem Start TimeStep NameAttempt At StepIs Last AttemptOutcomeSelectionActionInputInput.1Feedback TextFeedback ClassificationHelp LevelTotal Num HintsKC (Single-KC)KC Category (Single-KC)KC (Unique-step)KC Category (Unique-step)KC (F2011)KC Category (F2011)KC (F2011).1KC Category (F2011).1KC (F2011).2KC Category (F2011).2SchoolClassCF (oli:activityGuid)CF (oli:highStakes)CF (oli:purpose)CF (oli:resourceType)
01All Data2adbe4abefd649d48862d3f62b1abf5eStu_00b2b35fd027e7891e8a1a527125dd658dd109e680020ca6016f8e64290b56102011-09-21 17:26:36US/Eastern1VIEW_PAGEUI EventNaNNaNStaticsConcentrated Forces and Their EffectsIntroduction to Free Body DiagramsNaN_m2_assess12011-09-21 17:26:35NaNNaNNaNNaNNavigationSelectPageNumber1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNMarion Technical CollegeMET2010B-01NaNNaNNaNNaN
12All Data4393251e32a6f00502f3f1ef894af8feStu_00b2b35fd027e7891e8a1a527125dd658dd109e680020ca6016f8e64290b56102011-09-21 17:35:28US/Eastern23.13ATTEMPTNaNRESULTNaNStaticsConcentrated Forces and Their EffectsIntroduction to Free Body DiagramsNaN_m2_assess12011-09-21 17:26:35q1_point1i1 UpdateComboBox1.01.0CORRECTq1_point1i1UpdateComboBox<material>cord c</material>NaNNaNNaNNaNNaNSingle-KCNaNNaNNaNidentify_interactionNaNNaNNaNNaNNaNMarion Technical CollegeMET2010B-01NaNNaNNaNNaN
23All Datae2fb2cb788d10ebaa6f288e0757d1b09Stu_00b2b35fd027e7891e8a1a527125dd658dd109e680020ca6016f8e64290b56102011-09-21 17:35:28US/Eastern23.13ATTEMPTNaNRESULTNaNStaticsConcentrated Forces and Their EffectsIntroduction to Free Body DiagramsNaN_m2_assess12011-09-21 17:26:35q1_point3i3 UpdateComboBox1.01.0CORRECTq1_point3i3UpdateComboBox<material>120 lb</material>NaNNaNNaNNaNNaNSingle-KCNaNNaNNaNgravitational_forcesNaNNaNNaNNaNNaNMarion Technical CollegeMET2010B-01NaNNaNNaNNaN
34All Datae7e150d423862e346dc7e36a95e394e4Stu_00b2b35fd027e7891e8a1a527125dd658dd109e680020ca6016f8e64290b56102011-09-21 17:35:28US/Eastern23.13ATTEMPTNaNRESULTNaNStaticsConcentrated Forces and Their EffectsIntroduction to Free Body DiagramsNaN_m2_assess12011-09-21 17:26:35q1_point6i2 UpdateComboBox1.01.0INCORRECTq1_point6i2UpdateComboBox<material>no interaction</material>NaNNaNNaNNaNNaNSingle-KCNaNNaNNaNrepresent_interaction_springNaNNaNNaNNaNNaNMarion Technical CollegeMET2010B-01NaNNaNNaNNaN
45All Data684b1f770a225f21745c6c4c977ddc32Stu_00b2b35fd027e7891e8a1a527125dd658dd109e680020ca6016f8e64290b56102011-09-21 17:35:28US/Eastern23.13ATTEMPTNaNRESULTNaNStaticsConcentrated Forces and Their EffectsIntroduction to Free Body DiagramsNaN_m2_assess12011-09-21 17:26:35q1_point1i2 UpdateComboBox1.01.0CORRECTq1_point1i2UpdateComboBox<material>up</material>NaNNaNNaNNaNNaNSingle-KCNaNNaNNaNrepresent_interaction_cordNaNNaNNaNNaNNaNMarion Technical CollegeMET2010B-01NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " Row Sample Name Transaction Id \\\n", + "0 1 All Data 2adbe4abefd649d48862d3f62b1abf5e \n", + "1 2 All Data 4393251e32a6f00502f3f1ef894af8fe \n", + "2 3 All Data e2fb2cb788d10ebaa6f288e0757d1b09 \n", + "3 4 All Data e7e150d423862e346dc7e36a95e394e4 \n", + "4 5 All Data 684b1f770a225f21745c6c4c977ddc32 \n", + "\n", + " Anon Student Id Session Id \\\n", + "0 Stu_00b2b35fd027e7891e8a1a527125dd65 8dd109e680020ca6016f8e64290b5610 \n", + "1 Stu_00b2b35fd027e7891e8a1a527125dd65 8dd109e680020ca6016f8e64290b5610 \n", + "2 Stu_00b2b35fd027e7891e8a1a527125dd65 8dd109e680020ca6016f8e64290b5610 \n", + "3 Stu_00b2b35fd027e7891e8a1a527125dd65 8dd109e680020ca6016f8e64290b5610 \n", + "4 Stu_00b2b35fd027e7891e8a1a527125dd65 8dd109e680020ca6016f8e64290b5610 \n", + "\n", + " Time Time Zone Duration (sec) Student Response Type \\\n", + "0 2011-09-21 17:26:36 US/Eastern 1 VIEW_PAGE \n", + "1 2011-09-21 17:35:28 US/Eastern 23.13 ATTEMPT \n", + "2 2011-09-21 17:35:28 US/Eastern 23.13 ATTEMPT \n", + "3 2011-09-21 17:35:28 US/Eastern 23.13 ATTEMPT \n", + "4 2011-09-21 17:35:28 US/Eastern 23.13 ATTEMPT \n", + "\n", + " Student Response Subtype Tutor Response Type Tutor Response Subtype \\\n", + "0 UI Event NaN NaN \n", + "1 NaN RESULT NaN \n", + "2 NaN RESULT NaN \n", + "3 NaN RESULT NaN \n", + "4 NaN RESULT NaN \n", + "\n", + " Level (Sequence) Level (Unit) \\\n", + "0 Statics Concentrated Forces and Their Effects \n", + "1 Statics Concentrated Forces and Their Effects \n", + "2 Statics Concentrated Forces and Their Effects \n", + "3 Statics Concentrated Forces and Their Effects \n", + "4 Statics Concentrated Forces and Their Effects \n", + "\n", + " Level (Module) Level (Section1) Problem Name \\\n", + "0 Introduction to Free Body Diagrams NaN _m2_assess \n", + "1 Introduction to Free Body Diagrams NaN _m2_assess \n", + "2 Introduction to Free Body Diagrams NaN _m2_assess \n", + "3 Introduction to Free Body Diagrams NaN _m2_assess \n", + "4 Introduction to Free 
Body Diagrams NaN _m2_assess \n", + "\n", + " Problem View Problem Start Time Step Name \\\n", + "0 1 2011-09-21 17:26:35 NaN \n", + "1 1 2011-09-21 17:26:35 q1_point1i1 UpdateComboBox \n", + "2 1 2011-09-21 17:26:35 q1_point3i3 UpdateComboBox \n", + "3 1 2011-09-21 17:26:35 q1_point6i2 UpdateComboBox \n", + "4 1 2011-09-21 17:26:35 q1_point1i2 UpdateComboBox \n", + "\n", + " Attempt At Step Is Last Attempt Outcome Selection Action \\\n", + "0 NaN NaN NaN Navigation SelectPageNumber \n", + "1 1.0 1.0 CORRECT q1_point1i1 UpdateComboBox \n", + "2 1.0 1.0 CORRECT q1_point3i3 UpdateComboBox \n", + "3 1.0 1.0 INCORRECT q1_point6i2 UpdateComboBox \n", + "4 1.0 1.0 CORRECT q1_point1i2 UpdateComboBox \n", + "\n", + " Input Input.1 Feedback Text \\\n", + "0 1 NaN NaN \n", + "1 cord c NaN NaN \n", + "2 120 lb NaN NaN \n", + "3 no interaction NaN NaN \n", + "4 up NaN NaN \n", + "\n", + " Feedback Classification Help Level Total Num Hints KC (Single-KC) \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN Single-KC \n", + "2 NaN NaN NaN Single-KC \n", + "3 NaN NaN NaN Single-KC \n", + "4 NaN NaN NaN Single-KC \n", + "\n", + " KC Category (Single-KC) KC (Unique-step) KC Category (Unique-step) \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + " KC (F2011) KC Category (F2011) KC (F2011).1 \\\n", + "0 NaN NaN NaN \n", + "1 identify_interaction NaN NaN \n", + "2 gravitational_forces NaN NaN \n", + "3 represent_interaction_spring NaN NaN \n", + "4 represent_interaction_cord NaN NaN \n", + "\n", + " KC Category (F2011).1 KC (F2011).2 KC Category (F2011).2 \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + " School Class CF (oli:activityGuid) \\\n", + "0 Marion Technical College MET2010B-01 NaN \n", + "1 Marion Technical College MET2010B-01 NaN \n", + "2 Marion Technical College MET2010B-01 NaN \n", + "3 Marion Technical College 
MET2010B-01 NaN \n", + "4 Marion Technical College MET2010B-01 NaN \n", + "\n", + " CF (oli:highStakes) CF (oli:purpose) CF (oli:resourceType) \n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_transaction = pd.read_csv('OLI_data/AllData_transaction_2011F.csv',low_memory=False) # sep=\"\\t\"\n", + "df_transaction.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Data Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RowTutor Response SubtypeProblem ViewAttempt At StepIs Last AttemptFeedback ClassificationHelp LevelTotal Num HintsKC Category (Single-KC)KC Category (Unique-step)KC Category (F2011)KC Category (F2011).1KC Category (F2011).2
count361092.0000000.0361092.000000289858.000000289858.0000000.00.00.00.00.00.00.00.0
mean180546.500000NaN1.1801922.3828670.658678NaNNaNNaNNaNNaNNaNNaNNaN
std104238.426039NaN0.9071729.9489410.474154NaNNaNNaNNaNNaNNaNNaNNaN
min1.000000NaN1.0000001.0000000.000000NaNNaNNaNNaNNaNNaNNaNNaN
25%90273.750000NaN1.0000001.0000000.000000NaNNaNNaNNaNNaNNaNNaNNaN
50%180546.500000NaN1.0000001.0000001.000000NaNNaNNaNNaNNaNNaNNaNNaN
75%270819.250000NaN1.0000002.0000001.000000NaNNaNNaNNaNNaNNaNNaNNaN
max361092.000000NaN32.000000427.0000001.000000NaNNaNNaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " Row Tutor Response Subtype Problem View Attempt At Step \\\n", + "count 361092.000000 0.0 361092.000000 289858.000000 \n", + "mean 180546.500000 NaN 1.180192 2.382867 \n", + "std 104238.426039 NaN 0.907172 9.948941 \n", + "min 1.000000 NaN 1.000000 1.000000 \n", + "25% 90273.750000 NaN 1.000000 1.000000 \n", + "50% 180546.500000 NaN 1.000000 1.000000 \n", + "75% 270819.250000 NaN 1.000000 2.000000 \n", + "max 361092.000000 NaN 32.000000 427.000000 \n", + "\n", + " Is Last Attempt Feedback Classification Help Level Total Num Hints \\\n", + "count 289858.000000 0.0 0.0 0.0 \n", + "mean 0.658678 NaN NaN NaN \n", + "std 0.474154 NaN NaN NaN \n", + "min 0.000000 NaN NaN NaN \n", + "25% 0.000000 NaN NaN NaN \n", + "50% 1.000000 NaN NaN NaN \n", + "75% 1.000000 NaN NaN NaN \n", + "max 1.000000 NaN NaN NaN \n", + "\n", + " KC Category (Single-KC) KC Category (Unique-step) \\\n", + "count 0.0 0.0 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " KC Category (F2011) KC Category (F2011).1 KC Category (F2011).2 \n", + "count 0.0 0.0 0.0 \n", + "mean NaN NaN NaN \n", + "std NaN NaN NaN \n", + "min NaN NaN NaN \n", + "25% NaN NaN NaN \n", + "50% NaN NaN NaN \n", + "75% NaN NaN NaN \n", + "max NaN NaN NaN " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_transaction.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## (1)Analysis for Null and Unique value of column attributes" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------------num_unique_toal and num_nonull_toal----------------------\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col_namenum_nonullnum_nullnum_unique
0Row3610920361092
1Sample Name36109201
2Transaction Id3610920361092
3Anon Student Id3610920335
4Session Id36109206656
5Time3610920263172
6Time Zone36109201
7Duration (sec)36109202565
8Student Response Type36109205
9Student Response Subtype712342898582
10Tutor Response Type289858712343
11Tutor Response Subtype03610921
12Level (Sequence)36109201
13Level (Unit)36109207
14Level (Module)361092019
15Level (Section1)5948030161210
16Problem Name3610920300
17Problem View361092032
18Problem Start Time361092046473
19Step Name28985871234383
20Attempt At Step28985871234428
21Is Last Attempt289858712343
22Outcome289858712344
23Selection36108210287
24Action3610821010
25Input302086590066827
26Input.113610912
27Feedback Text2310631300291579
28Feedback Classification03610921
29Help Level03610921
30Total Num Hints03610921
31KC (Single-KC)289858712342
32KC Category (Single-KC)03610921
33KC (Unique-step)283336777561179
34KC Category (Unique-step)03610921
35KC (F2011)15259220850081
36KC Category (F2011)03610921
37KC (F2011).11690434418819
38KC Category (F2011).103610921
39KC (F2011).266903544029
40KC Category (F2011).203610921
41School36109207
42Class36109209
43CF (oli:activityGuid)450023160901244
44CF (oli:highStakes)450023160903
45CF (oli:purpose)445163165764
46CF (oli:resourceType)450023160903
\n", + "
" + ], + "text/plain": [ + " col_name num_nonull num_null num_unique\n", + "0 Row 361092 0 361092\n", + "1 Sample Name 361092 0 1\n", + "2 Transaction Id 361092 0 361092\n", + "3 Anon Student Id 361092 0 335\n", + "4 Session Id 361092 0 6656\n", + "5 Time 361092 0 263172\n", + "6 Time Zone 361092 0 1\n", + "7 Duration (sec) 361092 0 2565\n", + "8 Student Response Type 361092 0 5\n", + "9 Student Response Subtype 71234 289858 2\n", + "10 Tutor Response Type 289858 71234 3\n", + "11 Tutor Response Subtype 0 361092 1\n", + "12 Level (Sequence) 361092 0 1\n", + "13 Level (Unit) 361092 0 7\n", + "14 Level (Module) 361092 0 19\n", + "15 Level (Section1) 59480 301612 10\n", + "16 Problem Name 361092 0 300\n", + "17 Problem View 361092 0 32\n", + "18 Problem Start Time 361092 0 46473\n", + "19 Step Name 289858 71234 383\n", + "20 Attempt At Step 289858 71234 428\n", + "21 Is Last Attempt 289858 71234 3\n", + "22 Outcome 289858 71234 4\n", + "23 Selection 361082 10 287\n", + "24 Action 361082 10 10\n", + "25 Input 302086 59006 6827\n", + "26 Input.1 1 361091 2\n", + "27 Feedback Text 231063 130029 1579\n", + "28 Feedback Classification 0 361092 1\n", + "29 Help Level 0 361092 1\n", + "30 Total Num Hints 0 361092 1\n", + "31 KC (Single-KC) 289858 71234 2\n", + "32 KC Category (Single-KC) 0 361092 1\n", + "33 KC (Unique-step) 283336 77756 1179\n", + "34 KC Category (Unique-step) 0 361092 1\n", + "35 KC (F2011) 152592 208500 81\n", + "36 KC Category (F2011) 0 361092 1\n", + "37 KC (F2011).1 16904 344188 19\n", + "38 KC Category (F2011).1 0 361092 1\n", + "39 KC (F2011).2 6690 354402 9\n", + "40 KC Category (F2011).2 0 361092 1\n", + "41 School 361092 0 7\n", + "42 Class 361092 0 9\n", + "43 CF (oli:activityGuid) 45002 316090 1244\n", + "44 CF (oli:highStakes) 45002 316090 3\n", + "45 CF (oli:purpose) 44516 316576 4\n", + "46 CF (oli:resourceType) 45002 316090 3" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def 
work_col_analysis(df_work):\n", + " num_nonull_toal = df_work.notnull().sum() # Not Null\n", + " dict_col_1 = {'col_name':num_nonull_toal.index,'num_nonull':num_nonull_toal.values}\n", + " df_work_col_1 = pd.DataFrame(dict_col_1)\n", + "\n", + " num_null_toal = df_work.isnull().sum() # Null\n", + " dict_col_2 = {'col_name':num_null_toal.index,'num_null':num_null_toal.values}\n", + " df_work_col_2 = pd.DataFrame(dict_col_2)\n", + "\n", + " num_unique_toal = df_work.apply(lambda col: len(col.unique())) # axis=0\n", + " print(type(num_unique_toal))\n", + " dict_col_3 = {'col_name':num_unique_toal.index,'num_unique':num_unique_toal.values}\n", + " df_work_col_3 = pd.DataFrame(dict_col_3)\n", + "\n", + " df_work_col = pd.merge(df_work_col_1, df_work_col_2, on=['col_name'])\n", + " df_work_col = pd.merge(df_work_col, df_work_col_3, on=['col_name'])\n", + " return df_work_col\n", + "print(\"-------------------num_unique_toal and num_nonull_toal----------------------\")\n", + "df_result = work_col_analysis(df_transaction)\n", + "df_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## (2)Analysis for Discrete value of column attributes\n", + "> Columns with a small number of discrete values may represent very informative, so identify these columns first and analyze them one by one" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Student Response Type : ['VIEW_PAGE', 'ATTEMPT', 'SAVE_ATTEMPT', 'SUBMIT_ATTEMPT', 'HINT_REQUEST']\n", + "--------------------------------------------------------------------------------\n", + "Student Response Subtype : ['UI Event', nan]\n", + "--------------------------------------------------------------------------------\n", + "Tutor Response Type : [nan, 'RESULT', 'HINT_MSG']\n", + "--------------------------------------------------------------------------------\n", + "Level (Unit) : ['Concentrated 
Forces and Their Effects', 'Engineering Systems - Single Body Equilibrium', 'Complex Interactions Between Bodies', 'Multiple Body Equilibrium - Frames', 'Multiple Body Equilibrium - Trusses', 'Friction', 'Moments of Inertia']\n", + "--------------------------------------------------------------------------------\n", + "Level (Module) : ['Introduction to Free Body Diagrams', 'Effects of Force', 'Representing Interactions Between Bodies', 'Effects of Multiple Forces', 'Equilibrium Under 2D Concentrated Forces', 'Equilibrium of a Single Subsystem', 'Couples', 'Statically Equivalent Loads', 'Applications of Static Equivalency to Distributed Forces', 'Representing Engineering Connections', 'Drawing FBDs of a Single Subsystem', 'Choosing a Solvable Subsystem', 'Drawing FBDs of Multiple Subsystems', 'Solving Multiple Subsystems', 'Method of Joints', 'Method of Sections', 'Friction', 'Second Moment of Area', 'Mass Moment of Inertia']\n", + "--------------------------------------------------------------------------------\n", + "Level (Section1) : [nan, 'Combining Concurrent Forces', 'Combining Moments', 'Applying Force Equilibrium', 'Applying Force and Moment Equilibrium', 'Simplifying 3D loadings to 2D or 1D loading', 'Fixed Connections', 'Pin Connections', 'Other Connections', 'Center of Gravity and Centroid']\n", + "--------------------------------------------------------------------------------\n", + "Is Last Attempt : [nan, 1.0, 0.0]\n", + "--------------------------------------------------------------------------------\n", + "Outcome : [nan, 'CORRECT', 'INCORRECT', 'HINT']\n", + "--------------------------------------------------------------------------------\n", + "Action : ['SelectPageNumber', 'UpdateComboBox', 'Click', 'UpdateRadioButton', 'UpdateCheckbox', 'UpdateNumberField', 'UpdateShortAnswer', 'UpdateHotspotSingle', 'UpdateHotspotMultiple', nan]\n", + "--------------------------------------------------------------------------------\n", + "Input.1 : [nan, 'No, 
the forces of B on A and A on B shown on the diagram on the right are not correct because body B and body A are interacting on one another when ???F??? is applied to the body ???B??? but A opposite senses on each other. In this case B will push A in a']\n", + "--------------------------------------------------------------------------------\n", + "KC (Single-KC) : [nan, 'Single-KC']\n", + "--------------------------------------------------------------------------------\n", + "KC (F2011).1 : [nan, 'rotation_sense_of_force', 'identify_interaction', 'motion_dependence_on_force', 'couple_represents_net_zero_force', 'recognize_equivalence_from_motion', 'relate_direction_normal_force_and_contact', 'moment_sign_sense_relation', 'possible_interaction_for_nonuniform_contact', 'represent_interaction_contacting_body', 'represent_forces_two-force_member', 'represent_interaction_cord', 'identify_enabling_unknown', 'identify_equation_isolates_specific_unknown', 'sense_if_assuming_tension', 'determine_joint_is_solvable', 'judge_force_sense_based_on_sign', 'identify_internal_load_points_on_section', 'identify_external_load_points_on_section']\n", + "--------------------------------------------------------------------------------\n", + "KC (F2011).2 : [nan, 'rotation_sense_of_force', 'statics_problem_force_and_moment', 'represent_interaction_cord', 'represent_interaction_pin_connection', 'recognize_variable_solvable_from_subsystem', 'tension_vs_compression_given_force_senses', 'sense_if_assuming_tension', 'identify_internal_load_points_on_section']\n", + "--------------------------------------------------------------------------------\n", + "School : ['Marion Technical College', 'Sinclair Community College', 'Carnegie Mellon University', 'Kettering University', 'Miami University', 'University of Maryland Eastern Shore', 'University of Mississippi']\n", + "--------------------------------------------------------------------------------\n", + "Class : ['MET2010B-01', 'F11-E213-01', 
'24-261Fall11', 'FEA-Fall11', 'DesignFall11', 'F11-E213-50', 'F11-MME-211', 'ENGE260-F11', 'ENGR309H-F11']\n", + "--------------------------------------------------------------------------------\n", + "CF (oli:highStakes) : [nan, False, True]\n", + "--------------------------------------------------------------------------------\n", + "CF (oli:purpose) : [nan, 'quiz', 'didigetthis', 'learnbydoing']\n", + "--------------------------------------------------------------------------------\n", + "CF (oli:resourceType) : [nan, 'x-oli-assessment2', 'x-oli-inline-assessment']\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "discrete_cols = []\n", + "series = []\n", + "cols = list(df_transaction.columns.values)\n", + "\n", + "for col in cols:\n", + " if len(df_transaction[col].unique().tolist()) <= 20 and len(df_transaction[col].unique().tolist()) >= 2:\n", + " discrete_cols.append(col)\n", + " series.append(df_transaction[col].unique().tolist())\n", + "\n", + "for a,b in zip(discrete_cols,series):\n", + " print(a,\" : \",b)\n", + " print(\"-\"*80)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## (3)Data Cleaning \n", + "> **Data Cleaning Suggestions**\n", + "> - Redundant columns: Columns that are all NULL or Single value.\n", + "> - Others" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df_transaction_clear = df_transaction.copy(deep=True) # deep copy" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "the cols num before clear: 47\n", + "the cols num after clear: 35\n", + "drop:--- Sample Name\n", + "drop:--- Time Zone\n", + "drop:--- Tutor Response Subtype\n", + "drop:--- Level (Sequence)\n", + "drop:--- Feedback Classification\n", + "drop:--- Help Level\n", + "drop:--- Total Num Hints\n", + 
"drop:--- KC Category (Single-KC)\n", + "drop:--- KC Category (Unique-step)\n", + "drop:--- KC Category (F2011)\n", + "drop:--- KC Category (F2011).1\n", + "drop:--- KC Category (F2011).2\n" + ] + } + ], + "source": [ + "# 直接清除所有”冗余列“\n", + "cols = list(df_transaction.columns.values)\n", + "drop_cols = []\n", + "for col in cols:\n", + " if len(df_transaction_clear[col].unique().tolist()) == 1:\n", + " df_transaction_clear.drop(col,axis =1,inplace=True)\n", + " drop_cols.append(col)\n", + "\n", + "print(\"the cols num before clear: \",len(df_transaction.columns.to_list()))\n", + "print(\"the cols num after clear:\",len(df_transaction_clear.columns.to_list()))\n", + "for col in drop_cols:\n", + " print(\"drop:---\",col)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RowTransaction IdAnon Student IdSession IdTimeDuration (sec)Student Response TypeStudent Response SubtypeTutor Response TypeLevel (Unit)Level (Module)Level (Section1)Problem NameProblem ViewProblem Start TimeStep NameAttempt At StepIs Last AttemptOutcomeSelectionActionInputInput.1Feedback TextKC (Single-KC)KC (Unique-step)KC (F2011)KC (F2011).1KC (F2011).2SchoolClassCF (oli:activityGuid)CF (oli:highStakes)CF (oli:purpose)CF (oli:resourceType)
012adbe4abefd649d48862d3f62b1abf5eStu_00b2b35fd027e7891e8a1a527125dd658dd109e680020ca6016f8e64290b56102011-09-21 17:26:361VIEW_PAGEUI EventNaNConcentrated Forces and Their EffectsIntroduction to Free Body DiagramsNaN_m2_assess12011-09-21 17:26:35NaNNaNNaNNaNNavigationSelectPageNumber1NaNNaNNaNNaNNaNNaNNaNMarion Technical CollegeMET2010B-01NaNNaNNaNNaN
124393251e32a6f00502f3f1ef894af8feStu_00b2b35fd027e7891e8a1a527125dd658dd109e680020ca6016f8e64290b56102011-09-21 17:35:2823.13ATTEMPTNaNRESULTConcentrated Forces and Their EffectsIntroduction to Free Body DiagramsNaN_m2_assess12011-09-21 17:26:35q1_point1i1 UpdateComboBox1.01.0CORRECTq1_point1i1UpdateComboBox<material>cord c</material>NaNNaNSingle-KCNaNidentify_interactionNaNNaNMarion Technical CollegeMET2010B-01NaNNaNNaNNaN
23e2fb2cb788d10ebaa6f288e0757d1b09Stu_00b2b35fd027e7891e8a1a527125dd658dd109e680020ca6016f8e64290b56102011-09-21 17:35:2823.13ATTEMPTNaNRESULTConcentrated Forces and Their EffectsIntroduction to Free Body DiagramsNaN_m2_assess12011-09-21 17:26:35q1_point3i3 UpdateComboBox1.01.0CORRECTq1_point3i3UpdateComboBox<material>120 lb</material>NaNNaNSingle-KCNaNgravitational_forcesNaNNaNMarion Technical CollegeMET2010B-01NaNNaNNaNNaN
34e7e150d423862e346dc7e36a95e394e4Stu_00b2b35fd027e7891e8a1a527125dd658dd109e680020ca6016f8e64290b56102011-09-21 17:35:2823.13ATTEMPTNaNRESULTConcentrated Forces and Their EffectsIntroduction to Free Body DiagramsNaN_m2_assess12011-09-21 17:26:35q1_point6i2 UpdateComboBox1.01.0INCORRECTq1_point6i2UpdateComboBox<material>no interaction</material>NaNNaNSingle-KCNaNrepresent_interaction_springNaNNaNMarion Technical CollegeMET2010B-01NaNNaNNaNNaN
45684b1f770a225f21745c6c4c977ddc32Stu_00b2b35fd027e7891e8a1a527125dd658dd109e680020ca6016f8e64290b56102011-09-21 17:35:2823.13ATTEMPTNaNRESULTConcentrated Forces and Their EffectsIntroduction to Free Body DiagramsNaN_m2_assess12011-09-21 17:26:35q1_point1i2 UpdateComboBox1.01.0CORRECTq1_point1i2UpdateComboBox<material>up</material>NaNNaNSingle-KCNaNrepresent_interaction_cordNaNNaNMarion Technical CollegeMET2010B-01NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " Row Transaction Id \\\n", + "0 1 2adbe4abefd649d48862d3f62b1abf5e \n", + "1 2 4393251e32a6f00502f3f1ef894af8fe \n", + "2 3 e2fb2cb788d10ebaa6f288e0757d1b09 \n", + "3 4 e7e150d423862e346dc7e36a95e394e4 \n", + "4 5 684b1f770a225f21745c6c4c977ddc32 \n", + "\n", + " Anon Student Id Session Id \\\n", + "0 Stu_00b2b35fd027e7891e8a1a527125dd65 8dd109e680020ca6016f8e64290b5610 \n", + "1 Stu_00b2b35fd027e7891e8a1a527125dd65 8dd109e680020ca6016f8e64290b5610 \n", + "2 Stu_00b2b35fd027e7891e8a1a527125dd65 8dd109e680020ca6016f8e64290b5610 \n", + "3 Stu_00b2b35fd027e7891e8a1a527125dd65 8dd109e680020ca6016f8e64290b5610 \n", + "4 Stu_00b2b35fd027e7891e8a1a527125dd65 8dd109e680020ca6016f8e64290b5610 \n", + "\n", + " Time Duration (sec) Student Response Type \\\n", + "0 2011-09-21 17:26:36 1 VIEW_PAGE \n", + "1 2011-09-21 17:35:28 23.13 ATTEMPT \n", + "2 2011-09-21 17:35:28 23.13 ATTEMPT \n", + "3 2011-09-21 17:35:28 23.13 ATTEMPT \n", + "4 2011-09-21 17:35:28 23.13 ATTEMPT \n", + "\n", + " Student Response Subtype Tutor Response Type \\\n", + "0 UI Event NaN \n", + "1 NaN RESULT \n", + "2 NaN RESULT \n", + "3 NaN RESULT \n", + "4 NaN RESULT \n", + "\n", + " Level (Unit) Level (Module) \\\n", + "0 Concentrated Forces and Their Effects Introduction to Free Body Diagrams \n", + "1 Concentrated Forces and Their Effects Introduction to Free Body Diagrams \n", + "2 Concentrated Forces and Their Effects Introduction to Free Body Diagrams \n", + "3 Concentrated Forces and Their Effects Introduction to Free Body Diagrams \n", + "4 Concentrated Forces and Their Effects Introduction to Free Body Diagrams \n", + "\n", + " Level (Section1) Problem Name Problem View Problem Start Time \\\n", + "0 NaN _m2_assess 1 2011-09-21 17:26:35 \n", + "1 NaN _m2_assess 1 2011-09-21 17:26:35 \n", + "2 NaN _m2_assess 1 2011-09-21 17:26:35 \n", + "3 NaN _m2_assess 1 2011-09-21 17:26:35 \n", + "4 NaN _m2_assess 1 2011-09-21 17:26:35 \n", + "\n", + " Step Name Attempt At Step Is Last 
Attempt Outcome \\\n", + "0 NaN NaN NaN NaN \n", + "1 q1_point1i1 UpdateComboBox 1.0 1.0 CORRECT \n", + "2 q1_point3i3 UpdateComboBox 1.0 1.0 CORRECT \n", + "3 q1_point6i2 UpdateComboBox 1.0 1.0 INCORRECT \n", + "4 q1_point1i2 UpdateComboBox 1.0 1.0 CORRECT \n", + "\n", + " Selection Action Input Input.1 \\\n", + "0 Navigation SelectPageNumber 1 NaN \n", + "1 q1_point1i1 UpdateComboBox cord c NaN \n", + "2 q1_point3i3 UpdateComboBox 120 lb NaN \n", + "3 q1_point6i2 UpdateComboBox no interaction NaN \n", + "4 q1_point1i2 UpdateComboBox up NaN \n", + "\n", + " Feedback Text KC (Single-KC) KC (Unique-step) KC (F2011) \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN Single-KC NaN identify_interaction \n", + "2 NaN Single-KC NaN gravitational_forces \n", + "3 NaN Single-KC NaN represent_interaction_spring \n", + "4 NaN Single-KC NaN represent_interaction_cord \n", + "\n", + " KC (F2011).1 KC (F2011).2 School Class \\\n", + "0 NaN NaN Marion Technical College MET2010B-01 \n", + "1 NaN NaN Marion Technical College MET2010B-01 \n", + "2 NaN NaN Marion Technical College MET2010B-01 \n", + "3 NaN NaN Marion Technical College MET2010B-01 \n", + "4 NaN NaN Marion Technical College MET2010B-01 \n", + "\n", + " CF (oli:activityGuid) CF (oli:highStakes) CF (oli:purpose) \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + " CF (oli:resourceType) \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_transaction_clear.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------------num_unique_toal and num_nonull_toal----------------------\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
col_namenum_nonullnum_nullnum_unique
0Row3610920361092
1Transaction Id3610920361092
2Anon Student Id3610920335
3Session Id36109206656
4Time3610920263172
5Duration (sec)36109202565
6Student Response Type36109205
7Student Response Subtype712342898582
8Tutor Response Type289858712343
9Level (Unit)36109207
10Level (Module)361092019
11Level (Section1)5948030161210
12Problem Name3610920300
13Problem View361092032
14Problem Start Time361092046473
15Step Name28985871234383
16Attempt At Step28985871234428
17Is Last Attempt289858712343
18Outcome289858712344
19Selection36108210287
20Action3610821010
21Input302086590066827
22Input.113610912
23Feedback Text2310631300291579
24KC (Single-KC)289858712342
25KC (Unique-step)283336777561179
26KC (F2011)15259220850081
27KC (F2011).11690434418819
28KC (F2011).266903544029
29School36109207
30Class36109209
31CF (oli:activityGuid)450023160901244
32CF (oli:highStakes)450023160903
33CF (oli:purpose)445163165764
34CF (oli:resourceType)450023160903
\n", + "
" + ], + "text/plain": [ + " col_name num_nonull num_null num_unique\n", + "0 Row 361092 0 361092\n", + "1 Transaction Id 361092 0 361092\n", + "2 Anon Student Id 361092 0 335\n", + "3 Session Id 361092 0 6656\n", + "4 Time 361092 0 263172\n", + "5 Duration (sec) 361092 0 2565\n", + "6 Student Response Type 361092 0 5\n", + "7 Student Response Subtype 71234 289858 2\n", + "8 Tutor Response Type 289858 71234 3\n", + "9 Level (Unit) 361092 0 7\n", + "10 Level (Module) 361092 0 19\n", + "11 Level (Section1) 59480 301612 10\n", + "12 Problem Name 361092 0 300\n", + "13 Problem View 361092 0 32\n", + "14 Problem Start Time 361092 0 46473\n", + "15 Step Name 289858 71234 383\n", + "16 Attempt At Step 289858 71234 428\n", + "17 Is Last Attempt 289858 71234 3\n", + "18 Outcome 289858 71234 4\n", + "19 Selection 361082 10 287\n", + "20 Action 361082 10 10\n", + "21 Input 302086 59006 6827\n", + "22 Input.1 1 361091 2\n", + "23 Feedback Text 231063 130029 1579\n", + "24 KC (Single-KC) 289858 71234 2\n", + "25 KC (Unique-step) 283336 77756 1179\n", + "26 KC (F2011) 152592 208500 81\n", + "27 KC (F2011).1 16904 344188 19\n", + "28 KC (F2011).2 6690 354402 9\n", + "29 School 361092 0 7\n", + "30 Class 361092 0 9\n", + "31 CF (oli:activityGuid) 45002 316090 1244\n", + "32 CF (oli:highStakes) 45002 316090 3\n", + "33 CF (oli:purpose) 44516 316576 4\n", + "34 CF (oli:resourceType) 45002 316090 3" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# the remaining columns\n", + "print(\"-------------------num_unique_toal and num_nonull_toal----------------------\")\n", + "df_result = work_col_analysis(df_transaction_clear)\n", + "df_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Outlier Analysis\n", + "> -

The 'Duration (sec)' column contains a non-numeric value, '.', which should represent 0

" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Duration (sec) float64\n", + "dtype: object\n" + ] + } + ], + "source": [ + "# Change . to 0 in \"duration\"\n", + "rectify_cols = ['Duration (sec)']\n", + "for col in rectify_cols:\n", + " df_transaction_clear[col] = df_transaction_clear[col].apply(lambda x: 0 if x=='.' else x)\n", + " df_transaction_clear[col] = df_transaction_clear[col].astype(float)\n", + "print(df_transaction_clear[rectify_cols].dtypes)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Data Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.express as px\n", + "from plotly.subplots import make_subplots\n", + "import plotly.graph_objs as go" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Histogram of discrete values\n", + "def show_value_counts_bar(colname, sort = True):\n", + " ds = df_transaction[colname].value_counts().reset_index()\n", + " ds.columns = [\n", + " colname,\n", + " 'Count'\n", + " ]\n", + " if sort:\n", + " ds = ds.sort_values(by='Count', ascending=False)\n", + " # histogram\n", + " fig = px.bar(\n", + " ds,\n", + " x = colname,\n", + " y = 'Count',\n", + " title = colname + ' distribution'\n", + " )\n", + " fig.show(\"svg\")\n", + "\n", + "\n", + "# Pie of discrete values\n", + "def show_value_counts_pie(colname, sort = True):\n", + " ds = df_transaction[colname].value_counts().reset_index()\n", + " ds.columns = [\n", + " colname,\n", + " 'percent'\n", + " ]\n", + " ds['percent'] /= len(df_transaction)\n", + " if sort:\n", + " ds = ds.sort_values(by='percent', ascending=False)\n", + " fig = px.pie(\n", + " ds,\n", + " names = colname,\n", + " values = 'percent',\n", + " title = colname+ ' Percentage',\n", + " )\n", + " 
fig.show(\"svg\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "72.5%11.2%7.73%7.27%1.21%ATTEMPTVIEW_PAGEHINT_REQUESTSAVE_ATTEMPTSUBMIT_ATTEMPTStudent Response Type Percentage" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/svg+xml": [ + "90.4%9.63%RESULTHINT_MSGTutor Response Type Percentage" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/svg+xml": [ + "64.8%25.5%9.63%CORRECTINCORRECTHINTOutcome Percentage" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "col_pies = ['Student Response Type','Tutor Response Type','Outcome']\n", + "for col in col_pies:\n", + " show_value_counts_pie(col)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Analysis by label description:**\n", + "> - If the Student Response Type == ATTEMPT, then the Tutor Response Type == Result, then the Student Response Type => correct or incorrect\n", + "\n", + "> - If the Student Response Type == HINT_REQUEST, then the Tutor Response Type == HINT_MSG, then the outCome => hint\n", + "\n", + "> - If Student Response Type == other,then the Tutor Response Type == NaN, then the outCome => NaN" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "71.7%28.3%CORRECTINCORRECTOutcome Percentage when Student Response Type ==ATTEMPT" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/svg+xml": [ + "100%HINTOutcome Percentage when Student Response Type ==HINT_REQUEST" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "def show_value_counts_pie2(col1,type1,col2, sort = True):\n", + " df_tmp = df_transaction[df_transaction[col1] == type1]\n", + " ds = df_tmp[col2].value_counts().reset_index()\n", + " 
ds.columns = [\n", + " col2,\n", + " 'percent'\n", + " ]\n", + " ds['percent'] /= len(df_tmp)\n", + " if sort:\n", + " ds = ds.sort_values(by='percent', ascending=False)\n", + " fig = px.pie(\n", + " ds,\n", + " names = col2,\n", + " values = 'percent',\n", + " title = col2+ ' Percentage when ' + col1 + ' =='+ type1,\n", + " )\n", + " fig.show(\"svg\")\n", + "\n", + "# Take Student Response Type as an example\n", + "col1 = 'Student Response Type'\n", + "col2 = 'Outcome'\n", + "# col1 = 'Tutor Response Type'\n", + "# col2 = 'Outcome'\n", + "\n", + "show_value_counts_pie2(col1,\"ATTEMPT\",col2)\n", + "show_value_counts_pie2(col1,\"HINT_REQUEST\",col2)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "Complex Interactions Between BodiesConcentrated Forces and Their EffectsEngineering Systems - Single Body EquilibriumMultiple Body Equilibrium - TrussesFrictionMultiple Body Equilibrium - FramesMoments of Inertia0246050k100kChoosing a Solvable SubsystemEquilibrium of a Single SubsystemStatically Equivalent LoadsApplications of Static Equivalency to Distributed ForcesFrictionMethod of JointsEquilibrium Under 2D Concentrated ForcesRepresenting Interactions Between BodiesRepresenting Engineering ConnectionsCouplesEffects of Multiple ForcesSolving Multiple SubsystemsEffects of ForceMethod of SectionsIntroduction to Free Body DiagramsDrawing FBDs of Multiple SubsystemsSecond Moment of AreaDrawing FBDs of a Single SubsystemMass Moment of Inertia051015010k20k30k40kSimplifying 3D loadings to 2D or 1D loadingApplying Force EquilibriumCenter of Gravity and CentroidApplying Force and Moment EquilibriumCombining MomentsCombining Concurrent ForcesPin ConnectionsOther ConnectionsFixed 
Connections0246805k10k15k20kresolve_into_componentscouple_represents_net_zero_forcereplace_general_loads_with_force_and_coupleidentify_interactionsimple_stepcouple_related_to_forcesfind_moment_armrepresent_interaction_springstatics_problem_force_and_momentfind_linear_force_per_lengthrepresent_interaction_cordjudge_equilibrium_qualitativelyfind_symmetry_planerepresent_interaction_contacting_bodyrepresent_interaction_pin_connectionreplace_forces_in_opposite_sense_with_force_and_couplemoment_sign_sense_relationbody_draw_force_onmoving_force_perpendicular_to_line_of_actioncentroid_of_composite_arearotation_sense_of_forceanticipate_solved_variablesmoving_force_to_general_pointmotion_dependence_on_forcerecognize_equivalence_of_translated_forcesinterpret_equationrepresent_interaction_roller_connectionequivalence_of_couplesidentify_forces_in_symmetry_planerecognize_conditions_for_full_equivalenceforce_at_joint_implied_by_previous_analysisrecognize_equivalence_from_motionstatics_problem_collineardetermine_subsystem_is_solvablejudge_force_sense_based_on_signidentify_two-force_memberpossible_interaction_for_nonuniform_contactfind_net_force_for_linear_distributionfind_pressure_under_linear_distributionrepresent_interaction_pin_in_slot_connectiongravitational_forcesfind_net_force_position_for_linear_distributionrecognize_knowns_vs_unknownsmoment_about_point_due_to_couplefind_uniform_force_per_lengthrepresent_interaction_fixed_connectionNewtons_third_lawrepresent_interaction_rigid_sliding_connectionreplace_general_loads_with_single_forcefind_angle_given_componentsreplace_forces_in_same_sense_with_one_forcefind_uniform_pressure_under_weightidentify_external_load_points_on_sectioncount_independent_equationscan_connection_be_modeled_in_2Drecognize_forces_concurrentfind_magnitude_given_componentsimpose_solve_concurrent_equilibriumdistinguish_rotation_translationrecognize_forces_collinearrepresent_forces_two-force_memberis_net_moment_sense_obviousdetermine_momentfind_net_force_for_uni
form_distributionidentify_centroidrelate_direction_normal_force_and_contactfind_net_force_position_for_uniform_distributionchoose_subsystemcount_unknownschoose_moment_methoddistinguish_fixed_pin_connectionsidentify_enabling_unknownidentify_equation_isolates_specific_unknownconditions_equal_force_pulleysecond_moment_of_area_tabulated_shaperelative_magnitudes_moment_of_areasecond_moment_of_area_parallel_axis_theorempolar_moment_of_areamass_moment_of_inertia_parallel_axis_theoremrelative_magnitudes_mass_moment_of_inertia020406005k10kType: Level (Unit)Type: Level (Module)Type: Level (Section1)Type: KC (F2011)Bar of distributions for every typeLevel (Unit)Level (Module)Level (Section1)KC (F2011)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "col_bars = ['Level (Unit)','Level (Module)','Level (Section1)','KC (F2011)']\n", + "\n", + "fig = make_subplots(rows=3, cols=2, # 2*2\n", + " start_cell=\"top-left\", \n", + " subplot_titles=col_bars, \n", + " column_widths=[0.5, 0.5]) \n", + "traces = [\n", + " go.Bar(\n", + " x = df_transaction[colname].value_counts().reset_index().index.tolist(),\n", + " y = df_transaction[colname].value_counts().reset_index()[colname].tolist(),\n", + " name = 'Type: ' + str(colname),\n", + " text = df_transaction[colname].value_counts().reset_index()['index'].tolist(),\n", + " textposition = 'auto',\n", + " ) for colname in col_bars\n", + "]\n", + "for i in range(len(traces)):\n", + " fig.append_trace(\n", + " traces[i],\n", + " (i //2) + 1, # pos_row\n", + " (i % 2) + 1 # pos_col\n", + " )\n", + " \n", + "fig.update_layout(\n", + " title_text = 'Bar of distributions for every type',\n", + ")\n", + "\n", + "fig.show(\"svg\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> According to the chart below, there are 3 schools with a smaller sample of students." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "image/svg+xml": [ + "Marion Technical CollegeSinclair Community CollegeCarnegie Mellon UniversityKettering UniversityMiami UniversityUniversity of Maryland Eastern ShoreUniversity of MississippiMarion Technical CollegeSinclair Community CollegeCarnegie Mellon UniversityKettering UniversityMiami UniversityUniversity of Maryland Eastern ShoreUniversity of Mississippi050100150" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/svg+xml": [ + "Carnegie Mellon University44.2%Miami University23.9%Sinclair Community College11.3%Kettering University11.3%University of Maryland Eastern Shore4.78%Marion Technical College2.69%University of Mississippi1.79%Carnegie Mellon UniversityMiami UniversitySinclair Community CollegeKettering UniversityUniversity of Maryland Eastern ShoreMarion Technical CollegeUniversity of Mississippi" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# 按学校统计学生人数\n", + "schools = [item for item in df_transaction_clear['School'].unique().tolist()]\n", + "students = [len(df_transaction_clear[df_transaction_clear['School'] == sch]['Anon Student Id'].unique()) for sch in schools]\n", + "fig = go.Figure(data=[go.Bar(\n", + " x = schools,\n", + " y = students,\n", + " name = 'The number of students is counted by school',\n", + " text = schools,\n", + " textposition = 'auto',\n", + " )])\n", + "fig.show(\"svg\")\n", + "fig = go.Figure(data=[go.Pie(\n", + " labels = schools,\n", + " values = students,\n", + " name = 'The number of students is counted by school',\n", + " text = schools, \n", + " textposition = 'auto',\n", + " )])\n", + "fig.show(\"svg\")" + ] + } + ], + "metadata": { + "celltoolbar": "原始单元格格式", + "kernelspec": { + "display_name": "Data", + "language": "python", + "name": "data" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 
+ }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}