Skip to content

Commit

Permalink
Add SQL data quality checks
Browse files Browse the repository at this point in the history
  • Loading branch information
JohnOlutubo authored May 20, 2024
1 parent 7fd5d9a commit 2bbc034
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 0 deletions.
12 changes: 12 additions & 0 deletions assets/scripts/1_row_count_check.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/*
# 1. Row count check
Return the total number of rows (records) in the SQL view.
*/

SELECT COUNT(*) AS no_of_rows
FROM view_uk_youtubers_2024;
14 changes: 14 additions & 0 deletions assets/scripts/2_column_count_check.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/*
# 2. Column count check
Count the total number of columns (fields) in the SQL view by counting the
view's entries in INFORMATION_SCHEMA.COLUMNS.

NOTE(review): the filter matches on TABLE_NAME only, so an object with the
same name in another schema would be counted as well -- confirm the name is
unique in this database.
*/

SELECT
    COUNT(*) AS column_count
FROM
    INFORMATION_SCHEMA.COLUMNS
WHERE
    TABLE_NAME = 'view_uk_youtubers_2024';
15 changes: 15 additions & 0 deletions assets/scripts/3_data_type_check.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/*
# 3. Data type check
List every column of the view together with its data type, as reported by
INFORMATION_SCHEMA.COLUMNS.
*/

SELECT
    COLUMN_NAME,
    DATA_TYPE
FROM INFORMATION_SCHEMA.COLUMNS
WHERE TABLE_NAME = 'view_uk_youtubers_2024';
25 changes: 25 additions & 0 deletions assets/scripts/4_duplicate_records_check.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
# 4. Duplicate records check
Find channel names that occur more than once in the view:
  - group the rows by channel name
  - count the rows in each group
  - keep only the groups that contain more than one row

An empty result means no channel name is repeated.
*/

SELECT
    channel_name,
    COUNT(*) AS duplicate_count      -- number of rows sharing this channel name
FROM view_uk_youtubers_2024
GROUP BY channel_name                -- one group per channel name
HAVING COUNT(*) > 1;                 -- keep only names that appear more than once
70 changes: 70 additions & 0 deletions assets/scripts/data_quality_tests_full.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*
# Data quality tests

1. The data needs to be 100 records of YouTube channels (row count test) --- (passed!!!)
2. The data needs 4 fields (column count test) --- (passed!!!)
3. The channel name column must be string format, and the other columns must be numerical data types (data type check) --- (passed!!!)
4. Each record must be unique in the dataset (duplicate count check) --- (passed!!!)

Row count - 100
Column count - 4

Data types
    channel_name = VARCHAR
    total_subscribers = INTEGER
    total_views = INTEGER
    total_videos = INTEGER

Duplicate count = 0

Each check below is a separate statement terminated with a semicolon so the
whole script can be run in one go.
*/


-- 1. Row count check
-- Total number of rows in the view.

SELECT
    COUNT(*) AS no_of_rows
FROM
    view_uk_youtubers_2024;


-- 2. Column count check
-- Number of columns the catalog lists for the view.

SELECT
    COUNT(*) AS column_count
FROM
    INFORMATION_SCHEMA.COLUMNS
WHERE
    TABLE_NAME = 'view_uk_youtubers_2024';


-- 3. Data type check
-- Name and data type of each column of the view.

SELECT
    COLUMN_NAME,
    DATA_TYPE
FROM
    INFORMATION_SCHEMA.COLUMNS
WHERE
    TABLE_NAME = 'view_uk_youtubers_2024';


-- 4. Duplicate records check
-- Channel names that appear in more than one row; an empty result means
-- no channel name is repeated.

SELECT
    channel_name,
    COUNT(*) AS duplicate_count
FROM
    view_uk_youtubers_2024
GROUP BY
    channel_name
HAVING
    COUNT(*) > 1;

0 comments on commit 2bbc034

Please sign in to comment.