-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data _Structures.sql
312 lines (250 loc) · 8.9 KB
/
Data _Structures.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
use role sysadmin;
-- Create a new database and set the context to use the new database
CREATE DATABASE LIBRARY_CARD_CATALOG COMMENT = 'DWW Lesson 9 ';
USE DATABASE LIBRARY_CARD_CATALOG;
-- Create Author table
CREATE OR REPLACE TABLE AUTHOR (
AUTHOR_UID NUMBER
,FIRST_NAME VARCHAR(50)
,MIDDLE_NAME VARCHAR(50)
,LAST_NAME VARCHAR(50)
);
-- Insert the first two authors into the Author table
INSERT INTO AUTHOR(AUTHOR_UID,FIRST_NAME,MIDDLE_NAME, LAST_NAME)
Values
(1, 'Fiona', '','Macdonald')
,(2, 'Gian','Paulo','Faleschini');
-- Look at your table with it's new rows
SELECT *
FROM AUTHOR;
use role sysadmin;
-- See how the nextval function works
SELECT SEQ_AUTHOR_UID.nextval;
show sequences;
use role sysadmin;
-- Drop and recreate the counter (sequence) so that it starts at 3
-- then we'll add the other author records to our author table
CREATE OR REPLACE SEQUENCE "LIBRARY_CARD_CATALOG"."PUBLIC"."SEQ_AUTHOR_UID"
START 3
INCREMENT 1
COMMENT = 'Use this to fill in the AUTHOR_UID every time you add a row';
-- Add the remaining author records and use the nextval function instead
-- of putting in the numbers
INSERT INTO AUTHOR(AUTHOR_UID,FIRST_NAME,MIDDLE_NAME, LAST_NAME)
Values
(SEQ_AUTHOR_UID.nextval, 'Laura', 'K','Egendorf')
,(SEQ_AUTHOR_UID.nextval, 'Jan', '','Grover')
,(SEQ_AUTHOR_UID.nextval, 'Jennifer', '','Clapp')
,(SEQ_AUTHOR_UID.nextval, 'Kathleen', '','Petelinsek');
select * from author;
USE DATABASE LIBRARY_CARD_CATALOG;
-- Create a new sequence, this one will be a counter for the book table
CREATE OR REPLACE SEQUENCE "LIBRARY_CARD_CATALOG"."PUBLIC"."SEQ_BOOK_UID"
START 1
INCREMENT 1
COMMENT = 'Use this to fill in the BOOK_UID everytime you add a row';
-- Create the book table and use the NEXTVAL as the
-- default value each time a row is added to the table
CREATE OR REPLACE TABLE BOOK
( BOOK_UID NUMBER DEFAULT SEQ_BOOK_UID.nextval
,TITLE VARCHAR(50)
,YEAR_PUBLISHED NUMBER(4,0)
);
-- Insert records into the book table
-- You don't have to list anything for the
-- BOOK_UID field because the default setting
-- will take care of it for you
INSERT INTO BOOK(TITLE,YEAR_PUBLISHED)
VALUES
('Food',2001)
,('Food',2006)
,('Food',2008)
,('Food',2016)
,('Food',2015);
-- Create the relationships table
-- this is sometimes called a "Many-to-Many table"
CREATE TABLE BOOK_TO_AUTHOR
( BOOK_UID NUMBER
,AUTHOR_UID NUMBER
);
-- Insert rows of the known relationships
INSERT INTO BOOK_TO_AUTHOR(BOOK_UID,AUTHOR_UID)
VALUES
(1,1) // This row links the 2001 book to Fiona Macdonald
,(1,2) // This row links the 2001 book to Gian Paulo Faleschini
,(2,3) // Links 2006 book to Laura K Egendorf
,(3,4) // Links 2008 book to Jan Grover
,(4,5) // Links 2016 book to Jennifer Clapp
,(5,6); // Links 2015 book to Kathleen Petelinsek
-- Check your work by joining the 3 tables together
-- You should get 1 row for every author
select *
from book_to_author ba
join author a
on ba.author_uid = a.author_uid
join book b
on b.book_uid=ba.book_uid;
-- // JSON DDL Scripts
USE LIBRARY_CARD_CATALOG;
-- // Create an Ingestion Table for JSON Data
CREATE TABLE LIBRARY_CARD_CATALOG.PUBLIC.AUTHOR_INGEST_JSON
(
RAW_AUTHOR VARIANT
);
-- //Create File Format for JSON Data
CREATE or replace FILE FORMAT LIBRARY_CARD_CATALOG.PUBLIC.JSON_FILE_FORMAT
TYPE = 'JSON'
COMPRESSION = 'AUTO'
ENABLE_OCTAL = FALSE
ALLOW_DUPLICATE = FALSE
STRIP_OUTER_ARRAY = TRUE
STRIP_NULL_VALUES = FALSE
IGNORE_UTF8_ERRORS = FALSE;
list @like_a_window_into_an_s3_bucket;
copy into AUTHOR_INGEST_JSON
from @util_db.public.like_a_window_into_an_s3_bucket
files = ( 'author_with_header.json')
file_format = ( format_name=LIBRARY_CARD_CATALOG.PUBLIC.JSON_FILE_FORMAT);
select * from author_ingest_json;
-- //returns AUTHOR_UID value from top-level object's attribute
select raw_author:AUTHOR_UID
from author_ingest_json;
-- //returns the data in a way that makes it look like a normalized table
SELECT
raw_author:AUTHOR_UID
,raw_author:FIRST_NAME::STRING as FIRST_NAME
,raw_author:MIDDLE_NAME::STRING as MIDDLE_NAME
,raw_author:LAST_NAME::STRING as LAST_NAME
FROM AUTHOR_INGEST_JSON;
select GRADER(step, (actual = expected), actual, expected, description) as graded_results from
(
SELECT 'DWW16' as step
,( select row_count
from LIBRARY_CARD_CATALOG.INFORMATION_SCHEMA.TABLES
where table_name = 'AUTHOR_INGEST_JSON') as actual
,6 as expected
,'Check number of rows' as description
);
-- // Create an Ingestion Table for the NESTED JSON Data
CREATE OR REPLACE TABLE LIBRARY_CARD_CATALOG.PUBLIC.NESTED_INGEST_JSON
(
"RAW_NESTED_BOOK" VARIANT
);
copy into NESTED_INGEST_JSON
from @util_db.public.like_a_window_into_an_s3_bucket
files = ( 'json_book_author_nested.json')
file_format = ( format_name=LIBRARY_CARD_CATALOG.PUBLIC.JSON_FILE_FORMAT);
-- //a few simple queries
SELECT RAW_NESTED_BOOK
FROM NESTED_INGEST_JSON;
SELECT RAW_NESTED_BOOK:year_published
FROM NESTED_INGEST_JSON;
SELECT RAW_NESTED_BOOK:authors
FROM NESTED_INGEST_JSON;
-- //Use these example flatten commands to explore flattening the nested book and author data
SELECT value:first_name
FROM NESTED_INGEST_JSON
,LATERAL FLATTEN(input => RAW_NESTED_BOOK:authors);
SELECT value:first_name
FROM NESTED_INGEST_JSON
,table(flatten(RAW_NESTED_BOOK:authors));
-- //Add a CAST command to the fields returned
SELECT value:first_name::VARCHAR, value:last_name::VARCHAR
FROM NESTED_INGEST_JSON
,LATERAL FLATTEN(input => RAW_NESTED_BOOK:authors);
-- //Assign new column names to the columns using "AS"
SELECT value:first_name::VARCHAR AS FIRST_NM
, value:last_name::VARCHAR AS LAST_NM
FROM NESTED_INGEST_JSON
,LATERAL FLATTEN(input => RAW_NESTED_BOOK:authors);
-- Set your worksheet drop lists. DO NOT EDIT THE DORA CODE.
select GRADER(step, (actual = expected), actual, expected, description) as graded_results from (
SELECT 'DWW17' as step
,( select row_count
from LIBRARY_CARD_CATALOG.INFORMATION_SCHEMA.TABLES
where table_name = 'NESTED_INGEST_JSON') as actual
, 5 as expected
,'Check number of rows' as description
);
-- //Create a new database to hold the Twitter file
CREATE DATABASE SOCIAL_MEDIA_FLOODGATES
COMMENT = 'Theres so much data from social media - flood warning';
USE DATABASE SOCIAL_MEDIA_FLOODGATES;
-- //Create a table in the new database
CREATE TABLE SOCIAL_MEDIA_FLOODGATES.PUBLIC.TWEET_INGEST
("RAW_STATUS" VARIANT)
COMMENT = 'Bring in tweets, one row per tweet or status entity';
-- //Create a JSON file format in the new database
CREATE FILE FORMAT SOCIAL_MEDIA_FLOODGATES.PUBLIC.JSON_FILE_FORMAT
TYPE = 'JSON'
COMPRESSION = 'AUTO'
ENABLE_OCTAL = FALSE
ALLOW_DUPLICATE = FALSE
STRIP_OUTER_ARRAY = TRUE
STRIP_NULL_VALUES = FALSE
IGNORE_UTF8_ERRORS = FALSE;
copy into TWEET_INGEST
from @util_db.public.like_a_window_into_an_s3_bucket
files = ( 'nutrition_tweets.json')
file_format = ( format_name=SOCIAL_MEDIA_FLOODGATES.PUBLIC.JSON_FILE_FORMAT);
select * from tweet_ingest;
-- //select statements as seen in the video
SELECT RAW_STATUS
FROM TWEET_INGEST;
SELECT RAW_STATUS:entities
FROM TWEET_INGEST;
SELECT RAW_STATUS:entities:hashtags
FROM TWEET_INGEST;
-- //Explore looking at specific hashtags by adding bracketed numbers
-- //This query returns just the first hashtag in each tweet
SELECT RAW_STATUS:entities:hashtags[0].text
FROM TWEET_INGEST;
-- //This version adds a WHERE clause to get rid of any tweet that
-- //doesn't include any hashtags
SELECT RAW_STATUS:entities:hashtags[0].text
FROM TWEET_INGEST
WHERE RAW_STATUS:entities:hashtags[0].text is not null;
-- //Perform a simple CAST on the created_at key
-- //Add an ORDER BY clause to sort by the tweet's creation date
SELECT RAW_STATUS:created_at::DATE
FROM TWEET_INGEST
ORDER BY RAW_STATUS:created_at::DATE;
-- Flatten statements that return the whole hashtag entity
SELECT value
FROM TWEET_INGEST
,LATERAL FLATTEN
(input => RAW_STATUS:entities:hashtags);
SELECT value
FROM TWEET_INGEST
,TABLE(FLATTEN(RAW_STATUS:entities:hashtags));
-- //Flatten statement that restricts the value to just the TEXT of the hashtag
SELECT value:text
FROM TWEET_INGEST
,LATERAL FLATTEN
(input => RAW_STATUS:entities:hashtags);
-- //Flatten and return just the hashtag text, CAST the text as VARCHAR
SELECT value:text::VARCHAR
FROM TWEET_INGEST
,LATERAL FLATTEN
(input => RAW_STATUS:entities:hashtags);
-- //Flatten and return just the hashtag text, CAST the text as VARCHAR
-- // Use the AS command to name the column
SELECT value:text::VARCHAR AS THE_HASHTAG
FROM TWEET_INGEST
,LATERAL FLATTEN
(input => RAW_STATUS:entities:hashtags);
-- Add the Tweet ID and User ID to the returned table
SELECT RAW_STATUS:user:id AS USER_ID
,RAW_STATUS:id AS TWEET_ID
,value:text::VARCHAR AS HASHTAG_TEXT
FROM TWEET_INGEST
,LATERAL FLATTEN
(input => RAW_STATUS:entities:hashtags);
create or replace view SOCIAL_MEDIA_FLOODGATES.PUBLIC.HASHTAGS_NORMALIZED as
(SELECT RAW_STATUS:user:id AS USER_ID
,RAW_STATUS:id AS TWEET_ID
,value:text::VARCHAR AS HASHTAG_TEXT
FROM TWEET_INGEST
,LATERAL FLATTEN
(input => RAW_STATUS:entities:hashtags)
);