diff --git a/assets/scripts/1_row_count_check.sql b/assets/scripts/1_row_count_check.sql
new file mode 100644
index 0000000..8c874c8
--- /dev/null
+++ b/assets/scripts/1_row_count_check.sql
@@ -0,0 +1,12 @@
+/*
+
+# 1. Row count check
+
+Count the total number of records (rows) in the SQL view.
+
+*/
+
+SELECT
+    COUNT(*) AS no_of_rows
+FROM
+    view_uk_youtubers_2024;
diff --git a/assets/scripts/2_column_count_check.sql b/assets/scripts/2_column_count_check.sql
new file mode 100644
index 0000000..49912ee
--- /dev/null
+++ b/assets/scripts/2_column_count_check.sql
@@ -0,0 +1,14 @@
+/*
+# 2. Column count check
+
+Count the total number of columns (fields) in the SQL view.
+
+*/
+
+
+SELECT
+    COUNT(*) AS column_count
+FROM
+    INFORMATION_SCHEMA.COLUMNS
+WHERE
+    TABLE_NAME = 'view_uk_youtubers_2024';
\ No newline at end of file
diff --git a/assets/scripts/3_data_type_check.sql b/assets/scripts/3_data_type_check.sql
new file mode 100644
index 0000000..2f56b88
--- /dev/null
+++ b/assets/scripts/3_data_type_check.sql
@@ -0,0 +1,15 @@
+/*
+# 3. Data type check
+
+List the data type of each column in the view by querying INFORMATION_SCHEMA.COLUMNS.
+
+*/
+
+
+SELECT
+    COLUMN_NAME,
+    DATA_TYPE
+FROM
+    INFORMATION_SCHEMA.COLUMNS
+WHERE
+    TABLE_NAME = 'view_uk_youtubers_2024';
\ No newline at end of file
diff --git a/assets/scripts/4_duplicate_records_check.sql b/assets/scripts/4_duplicate_records_check.sql
new file mode 100644
index 0000000..9040627
--- /dev/null
+++ b/assets/scripts/4_duplicate_records_check.sql
@@ -0,0 +1,25 @@
+/*
+
+# 4. Duplicate records check
+
+-- 1. Check for duplicate rows in the view
+-- 2. Group by the channel name
+-- 3. Filter for groups with more than one row
+
+*/
+
+
+-- 1. Count the rows recorded for each channel name
+SELECT
+    channel_name,
+    COUNT(*) AS duplicate_count
+FROM
+    view_uk_youtubers_2024
+
+-- 2. One group per channel name
+GROUP BY
+    channel_name
+
+-- 3. Keep only the groups that contain more than one row
+HAVING
+    COUNT(*) > 1;
\ No newline at end of file
diff --git a/assets/scripts/data_quality_tests_full.sql b/assets/scripts/data_quality_tests_full.sql
new file mode 100644
index 0000000..68ff6f4
--- /dev/null
+++ b/assets/scripts/data_quality_tests_full.sql
@@ -0,0 +1,70 @@
+/*
+
+# Data quality tests
+
+1. The data needs to be 100 records of YouTube channels (row count test) --- (passed!!!)
+2. The data needs 4 fields (column count test) --- (passed!!!)
+3. The channel name column must be string format, and the other columns must be numerical data types (data type check) --- (passed!!!)
+4. Each record must be unique in the dataset (duplicate count check) --- (passed!!!)
+
+
+Row count - 100
+Column count - 4
+
+
+Data types
+
+channel_name = VARCHAR
+total_subscribers = INTEGER
+total_views = INTEGER
+total_videos = INTEGER
+
+Duplicate count = 0
+
+
+*/
+
+
+-- 1. Row count check (expected: 100)
+
+SELECT
+    COUNT(*) AS no_of_rows
+FROM
+    view_uk_youtubers_2024;
+
+
+-- 2. Column count check (expected: 4)
+
+SELECT
+    COUNT(*) AS column_count
+FROM
+    INFORMATION_SCHEMA.COLUMNS
+WHERE
+    TABLE_NAME = 'view_uk_youtubers_2024';
+
+
+
+-- 3. Data type check (expected: channel_name VARCHAR, the other three columns INTEGER)
+
+
+SELECT
+    COLUMN_NAME,
+    DATA_TYPE
+FROM
+    INFORMATION_SCHEMA.COLUMNS
+WHERE
+    TABLE_NAME = 'view_uk_youtubers_2024';
+
+
+
+-- 4. Duplicate records check (expected: no rows returned)
+
+SELECT
+    channel_name,
+    COUNT(*) AS duplicate_count
+FROM
+    view_uk_youtubers_2024
+GROUP BY
+    channel_name
+HAVING
+    COUNT(*) > 1;
\ No newline at end of file