-
Notifications
You must be signed in to change notification settings - Fork 13
/
08.softmax.sql
388 lines (388 loc) · 15.5 KB
/
08.softmax.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
-- Runs a transformer-style forward pass over a fixed token sequence, takes the
-- top_n highest-scoring next-token candidates for the last position, and shows
-- how a softmax over those scores changes with temperature (0.5, 1, 2).
--
-- Relies on objects defined outside this file: the weight tables (wte, wpe,
-- ln_1_g, ln_1_b, c_attn_w, c_attn_b, c_proj_w, c_proj_b, ln_2_g, ln_2_b,
-- mlp_c_fc_w, mlp_c_fc_b, mlp_c_proj_w, mlp_c_proj_b, ln_f_g, ln_f_b), the
-- tokenizer table, and the VECTOR type with INNER_PRODUCT and element-wise
-- + / * (presumably the pgvector extension -- confirm).
WITH RECURSIVE
-- Input sequence as an array of token ids.
initial AS
(
    SELECT ARRAY[6307, 47701, 318, 1049] AS input
),
-- Hyper-parameters: number of transformer blocks to apply, number of candidate
-- tokens to keep, and the length of the input sequence.
hparams AS
(
    SELECT 12 AS n_block,
           5 AS top_n,
           ARRAY_LENGTH(input, 1) AS n_seq
    FROM initial
),
-- One row per input position (0-based place): token vector from wte plus
-- position vector from wpe.
embeddings AS
(
    SELECT place, values
    FROM initial
    CROSS JOIN
        hparams
    CROSS JOIN LATERAL
        UNNEST(input) WITH ORDINALITY AS tokens (token, ordinality)
    CROSS JOIN LATERAL
        (
            -- WITH ORDINALITY is 1-based; places are 0-based.
            SELECT ordinality - 1 AS place
        ) o
    CROSS JOIN LATERAL
        (
            SELECT wte.values + wpe.values AS values
            FROM wte
            CROSS JOIN
                wpe
            WHERE wte.token = tokens.token
              AND wpe.place = o.place
        ) embedding
),
-- Recursive part: block 0 is the embeddings; each iteration applies one
-- transformer block (attention + feed-forward) to the previous block's rows.
transform AS
(
    SELECT 0 AS block, place, values
    FROM embeddings
    UNION ALL
    (
        -- The recursive reference is wrapped in a CTE so that the previous
        -- block's rows can be referenced more than once below.
        WITH previous AS
        (
            SELECT *
            FROM transform
        )
        SELECT block + 1 AS block, transformed_layer.*
        FROM hparams
        CROSS JOIN LATERAL
            (
                -- Yields the current block number, or no row once n_block
                -- blocks have been applied, which ends the recursion.
                -- The bound comes from hparams so that it stays in sync with
                -- the filter in block_output.
                SELECT block
                FROM previous
                WHERE previous.block < hparams.n_block
                LIMIT 1
            ) q
        CROSS JOIN LATERAL
            (
                -- Each CTE below shadows the weight table of the same name,
                -- restricted to the rows of the current block.
                WITH ln_2_b AS
                (
                    SELECT *
                    FROM ln_2_b
                    WHERE block = q.block
                ),
                ln_2_g AS
                (
                    SELECT *
                    FROM ln_2_g
                    WHERE block = q.block
                ),
                c_proj_w AS
                (
                    SELECT *
                    FROM c_proj_w
                    WHERE block = q.block
                ),
                c_proj_b AS
                (
                    SELECT *
                    FROM c_proj_b
                    WHERE block = q.block
                ),
                mlp_c_fc_w AS
                (
                    SELECT *
                    FROM mlp_c_fc_w
                    WHERE block = q.block
                ),
                mlp_c_fc_b AS
                (
                    SELECT *
                    FROM mlp_c_fc_b
                    WHERE block = q.block
                ),
                mlp_c_proj_w AS
                (
                    SELECT *
                    FROM mlp_c_proj_w
                    WHERE block = q.block
                ),
                mlp_c_proj_b AS
                (
                    SELECT *
                    FROM mlp_c_proj_b
                    WHERE block = q.block
                ),
                c_attn_w AS
                (
                    SELECT *
                    FROM c_attn_w
                    WHERE block = q.block
                ),
                c_attn_b AS
                (
                    SELECT *
                    FROM c_attn_b
                    WHERE block = q.block
                ),
                ln_1_g AS
                (
                    SELECT *
                    FROM ln_1_g
                    WHERE block = q.block
                ),
                ln_1_b AS
                (
                    SELECT *
                    FROM ln_1_b
                    WHERE block = q.block
                ),
                -- Layer-normalize the previous block's vectors (scale ln_1_g,
                -- shift ln_1_b), then project each to 2304 values with
                -- c_attn_w and add the bias c_attn_b.
                mha_norm AS
                (
                    SELECT place, mm.values + c_attn_b.values AS values
                    FROM (
                            -- One inner product per c_attn_w row, collected in y order.
                            -- NOTE(review): y is not defined in this query; presumably a
                            -- column of c_attn_w holding the output index -- confirm.
                            SELECT place, ARRAY_AGG(INNER_PRODUCT(c_attn_w.values, layer_norm.values) ORDER BY y)::VECTOR(2304) AS values
                            FROM (
                                    SELECT place, agg.values * ln_1_g.values + ln_1_b.values AS values
                                    FROM (
                                            SELECT place, norm.values
                                            FROM previous
                                            CROSS JOIN LATERAL
                                                (
                                                    SELECT AVG(value) AS mean,
                                                           VAR_POP(value) AS variance
                                                    FROM UNNEST(values::REAL[]) value
                                                ) agg
                                            CROSS JOIN LATERAL
                                                (
                                                    -- Zero mean, unit variance; 1E-5 keeps the divisor non-zero.
                                                    SELECT ARRAY_AGG((value - mean) / SQRT(variance + 1E-5) ORDER BY ordinality)::VECTOR(768) AS values
                                                    FROM UNNEST(values::REAL[]) WITH ORDINALITY AS n(value, ordinality)
                                                ) norm
                                         ) agg
                                    CROSS JOIN
                                        ln_1_b
                                    CROSS JOIN
                                        ln_1_g
                                 ) layer_norm
                            CROSS JOIN
                                c_attn_w
                            GROUP BY
                                place
                         ) mm
                    CROSS JOIN
                        c_attn_b
                ),
                -- Split every 2304-value vector into 12 heads. For each head,
                -- q, k and v are 64-value slices taken at offsets 0, 768 and 1536.
                heads AS
                (
                    SELECT place, head,
                           (values::REAL[])[(head * 64 + 1):(head * 64 + 64)]::VECTOR(64) AS q,
                           (values::REAL[])[(head * 64 + 1 + 768):(head * 64 + 64 + 768)]::VECTOR(64) AS k,
                           (values::REAL[])[(head * 64 + 1 + 1536):(head * 64 + 64 + 1536)]::VECTOR(64) AS v
                    FROM mha_norm
                    CROSS JOIN
                        GENERATE_SERIES(0, 11) head
                ),
                -- Attention scores per head for every (x, y) pair of positions:
                -- q(x) . k(y) / 8, where 8 is the square root of the 64-value head size.
                -- Pairs where y comes after x get -1E10 added, so they end up
                -- with zero weight after the softmax below.
                sm_input AS
                (
                    SELECT head, h1.place AS x, h2.place AS y, INNER_PRODUCT(h1.q, h2.k) / 8 + CASE WHEN h2.place > h1.place THEN -1E10 ELSE 0 END AS value
                    FROM heads h1
                    JOIN heads h2
                        USING (head)
                ),
                -- Softmax step 1: subtract the row maximum so that every
                -- exponent is <= 0.
                sm_diff AS
                (
                    SELECT head, x, y, value - MAX(value) OVER (PARTITION BY head, x) AS diff
                    FROM sm_input
                ),
                -- Softmax step 2: exponentiate. Inputs below -745.13 are mapped
                -- straight to 0 -- presumably because EXP underflows for
                -- double precision arguments below that point (confirm).
                sm_exp AS
                (
                    SELECT head, x, y, CASE WHEN diff < -745.13 THEN 0 ELSE EXP(diff) END AS e
                    FROM sm_diff
                ),
                -- Softmax step 3: normalize so that weights for each (head, x) sum to 1.
                -- y is renamed to place so it can be joined to heads below.
                softmax AS
                (
                    SELECT head, x, y AS place, e / SUM(e) OVER (PARTITION BY head, x) AS value
                    FROM sm_exp
                ),
                -- Weighted sum of v vectors per (head, x), then the 12 head
                -- results are concatenated in head order into one 768-value
                -- vector per position.
                attention AS
                (
                    SELECT place, ARRAY_AGG(value ORDER BY head * 64 + ordinality)::VECTOR(768) AS values
                    FROM (
                            -- ARRAY_FILL broadcasts the scalar weight to a 64-value
                            -- vector so it can be multiplied element-wise with v.
                            SELECT head, x AS place, SUM(ARRAY_FILL(softmax.value, ARRAY[64])::VECTOR(64) * heads.v) AS values
                            FROM softmax
                            JOIN heads
                                USING (head, place)
                            GROUP BY
                                head, x
                         ) q
                    CROSS JOIN LATERAL
                        UNNEST(values::REAL[]) WITH ORDINALITY v (value, ordinality)
                    GROUP BY
                        place
                ),
                -- Project the attention output with c_proj_w, add the bias
                -- c_proj_b, and add the block input back (residual connection).
                mha AS
                (
                    SELECT place, w.values + c_proj_b.values + previous.values AS values
                    FROM (
                            SELECT attention.place, ARRAY_AGG(INNER_PRODUCT(attention.values, c_proj_w.values) ORDER BY c_proj_w.place)::VECTOR(768) AS values
                            FROM attention
                            CROSS JOIN
                                c_proj_w
                            GROUP BY
                                attention.place
                         ) w
                    CROSS JOIN
                        c_proj_b
                    JOIN previous
                        USING (place)
                ),
                -- Layer-normalize the attention result (scale ln_2_g, shift ln_2_b).
                ffn_norm AS
                (
                    SELECT place, agg.values * ln_2_g.values + ln_2_b.values AS values
                    FROM (
                            SELECT place, norm.values
                            FROM mha
                            CROSS JOIN LATERAL
                                (
                                    SELECT AVG(value) AS mean,
                                           VAR_POP(value) AS variance
                                    FROM UNNEST(values::REAL[]) value
                                ) agg
                            CROSS JOIN LATERAL
                                (
                                    SELECT ARRAY_AGG((value - mean) / SQRT(variance + 1E-5) ORDER BY ordinality)::VECTOR(768) AS values
                                    FROM UNNEST(values::REAL[]) WITH ORDINALITY AS n(value, ordinality)
                                ) norm
                         ) agg
                    CROSS JOIN
                        ln_2_b
                    CROSS JOIN
                        ln_2_g
                ),
                -- Expand to 3072 values with mlp_c_fc_w / mlp_c_fc_b, then apply
                -- the tanh-based GELU activation to every element.
                ffn_a AS
                (
                    SELECT gelu.place, gelu.values
                    FROM (
                            SELECT place, w.values + mlp_c_fc_b.values AS values
                            FROM (
                                    SELECT ffn_norm.place, ARRAY_AGG(INNER_PRODUCT(ffn_norm.values, mlp_c_fc_w.values) ORDER BY mlp_c_fc_w.place)::VECTOR(3072) AS values
                                    FROM ffn_norm
                                    CROSS JOIN
                                        mlp_c_fc_w
                                    GROUP BY
                                        ffn_norm.place
                                 ) w
                            CROSS JOIN
                                mlp_c_fc_b
                         ) v
                    CROSS JOIN LATERAL
                        (
                            -- 0.797884560802 is approximately sqrt(2 / pi).
                            SELECT place, ARRAY_AGG(0.5 * value * (1 + TANH(0.797884560802 * (value + 0.044715 * value*value*value))) ORDER BY ordinality)::VECTOR(3072) AS values
                            FROM UNNEST(values::REAL[]) WITH ORDINALITY n (value, ordinality)
                            GROUP BY
                                place
                        ) gelu
                ),
                -- Project back to 768 values with mlp_c_proj_w / mlp_c_proj_b and
                -- add the attention result back (second residual connection).
                ffn AS
                (
                    SELECT place, w.values + mlp_c_proj_b.values + mha.values AS values
                    FROM (
                            SELECT ffn_a.place, ARRAY_AGG(INNER_PRODUCT(ffn_a.values, mlp_c_proj_w.values) ORDER BY mlp_c_proj_w.place)::VECTOR(768) AS values
                            FROM ffn_a
                            CROSS JOIN
                                mlp_c_proj_w
                            GROUP BY
                                ffn_a.place
                         ) w
                    CROSS JOIN
                        mlp_c_proj_b
                    JOIN mha
                        USING (place)
                )
                SELECT *
                FROM ffn
            ) transformed_layer
    )
),
-- Rows produced by the last block only.
block_output AS
(
    SELECT *
    FROM hparams
    JOIN transform
        ON transform.block = n_block
),
-- Final layer normalization (scale ln_f_g, shift ln_f_b).
ln_f AS
(
    SELECT place, norm.values * ln_f_g.values + ln_f_b.values AS values
    FROM block_output
    CROSS JOIN LATERAL
        (
            SELECT AVG(value) AS mean,
                   VAR_POP(value) AS variance
            FROM UNNEST(values::REAL[]) AS n(value)
        ) agg
    CROSS JOIN LATERAL
        (
            SELECT ARRAY_AGG((value - mean) / SQRT(variance + 1E-5) ORDER BY ordinality)::VECTOR(768) AS values
            FROM UNNEST(values::REAL[]) WITH ORDINALITY AS n (value, ordinality)
        ) norm
    CROSS JOIN
        ln_f_b
    CROSS JOIN
        ln_f_g
),
-- Score every wte row against the vector of the last input position and keep
-- the top_n highest scores.
logits AS
(
    SELECT logits.*
    FROM hparams
    CROSS JOIN LATERAL
        (
            SELECT token, INNER_PRODUCT(ln_f.values, wte.values) AS value
            FROM ln_f
            CROSS JOIN
                wte
            WHERE ln_f.place = n_seq - 1
            ORDER BY
                value DESC
            LIMIT (top_n)
        ) logits
),
-- Temperatures to compare.
temperatures (temperature) AS
(
    VALUES
        (0.5),
        (1),
        (2)
),
-- For each temperature: softmax over the top_n scores, computed as
-- exp((value - max) / temperature) normalized to sum to 1.
tokens AS
(
    SELECT token, value, softmax, temperature
    FROM temperatures
    CROSS JOIN LATERAL
        (
            SELECT *, (e / SUM(e) OVER ()) AS softmax
            FROM (
                    SELECT *,
                           (value - MAX(value) OVER ()) / temperature AS diff
                    FROM logits
                 ) exp_x
            CROSS JOIN LATERAL
                (
                    -- Same underflow guard as in the attention softmax.
                    SELECT CASE WHEN diff < -745.13 THEN 0 ELSE EXP(diff) END AS e
                ) exp
        ) q
)
-- Pivot the three temperatures into columns, one row per candidate token.
-- cluster is not produced by any CTE above; presumably it comes from the
-- tokenizer table -- confirm.
SELECT token,
       cluster,
       TO_CHAR(t1.value, 'S00.000') AS score,
       TO_CHAR(t1.softmax, '0.00') AS "temperature = 0.5",
       TO_CHAR(t2.softmax, '0.00') AS "temperature = 1",
       TO_CHAR(t3.softmax, '0.00') AS "temperature = 2"
FROM (
        SELECT *
        FROM tokens
        WHERE temperature = 0.5
     ) t1
JOIN (
        SELECT *
        FROM tokens
        WHERE temperature = 1
     ) t2
    USING (token)
JOIN (
        SELECT *
        FROM tokens
        WHERE temperature = 2
     ) t3
    USING (token)
JOIN tokenizer
    USING (token)
-- Highest score first; token breaks ties so the row order is deterministic.
ORDER BY
    t1.value DESC,
    token