%\documentclass[mathserif]{beamer}
\documentclass[handout]{beamer}
%\usetheme{Goettingen}
%\usetheme{Warsaw}
\usetheme{Singapore}
%\usetheme{Frankfurt}
%\usetheme{Copenhagen}
%\usetheme{Szeged}
%\usetheme{Montpellier}
%\usetheme{CambridgeUS}
%\usecolortheme{}
%\setbeamercovered{transparent}
\usepackage[english, activeacute]{babel}
\usepackage[utf8]{inputenc}
\usepackage{amsmath, amssymb}
\usepackage{dsfont}
\usepackage{cases}
\usepackage{graphicx}
\usepackage{pgf}
\usepackage{epsfig}
\usepackage{multirow}
\usepackage{amstext}
\usepackage[ruled,vlined,lined]{algorithm2e}
\usepackage{epic}
\usepackage{fontenc}
\usepackage{framed,color}
\usepackage{palatino, url, multicol}
%\algsetup{indent=2em}
\newcommand{\factorial}{\ensuremath{\mbox{\sc Factorial}}}
\newcommand{\BIGOP}[1]{\mathop{\mathchoice%
{\raise-0.22em\hbox{\huge $#1$}}%
{\raise-0.05em\hbox{\Large $#1$}}{\hbox{\large $#1$}}{#1}}}
\newcommand{\bigtimes}{\BIGOP{\times}}
\vspace{-0.5cm}
\title{Natural Language Processing \\ Linear Models}
\vspace{-0.5cm}
\author[Felipe Bravo Márquez]{\footnotesize
%\author{\footnotesize
\textcolor[rgb]{0.00,0.00,1.00}{Felipe Bravo-Marquez}}
\date{\today}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
\begin{frame}{Supervised Learning}
\begin{scriptsize}
\begin{itemize}
\item The essence of supervised machine learning is the creation of mechanisms that can look at examples and produce generalizations. \cite{goldberg2017neural}
\item We design an algorithm whose input is a set of labeled examples, and
its output is a function (or a program) that receives an instance and produces the desired label.
\item Example: if the task is to distinguish spam from non-spam email, the labeled examples are emails labeled as spam and emails labeled as non-spam.
\item It is expected that the resulting function will also produce correct label predictions for instances it has not seen during training.
\item This approach differs from designing an algorithm to perform the task (e.g., manually designed rule-based systems).
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Parameterized Functions}
\begin{scriptsize}
\begin{itemize}
\item Searching over the set of all possible functions is a very hard (and rather ill-defined) problem. \cite{goldberg2017neural}
\item We often restrict ourselves to search over specific families of functions.
\item Example: the space of all linear functions with $d_{in}$ inputs and $d_{out}$ outputs.
\item Such families of functions are called \textbf{hypothesis classes}.
\item By restricting ourselves to a specific hypothesis class, we are injecting the learner with \textbf{inductive bias}.
\item Inductive bias: a set of assumptions about the form of the desired solution.
\item Some hypothesis classes facilitate efficient procedures for searching for the solution. \cite{goldberg2017neural}
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Linear Models}
\begin{scriptsize}
\begin{itemize}
\item One common hypothesis class is that of high-dimensional linear functions:
\begin{equation}
\begin{split}
f(\vec{x}) = \vec{x} \cdot W + \vec{b}\\
\vec{x} \in \mathcal{R}^{d_{in}} & \quad W \in \mathcal{R}^{d_{in}\times d_{out}} \quad \vec{b} \in \mathcal{R}^{d_{out}}
\end{split}
\end{equation}
\item The vector $\vec{x}$ is the input to the function.
\item The matrix $W$ and the vector $\vec{b}$ are the \textbf{parameters}.
\item The goal of the learner is to set the values of the parameters $W$ and $\vec{b}$ such that the function behaves as intended on a collection of input values $\vec{x}_{1:k} = \vec{x}_1,\dots,\vec{x}_k$ and the corresponding desired outputs $\vec{y}_{1:k} = \vec{y}_1,\dots,\vec{y}_k$.
\item The task of searching over the space of functions is thus reduced to one of searching over the space of parameters. \cite{goldberg2017neural}
\end{itemize}
\end{scriptsize}
\end{frame}
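\begin{frame}[fragile]{Linear Models: A Code Sketch}
\begin{scriptsize}
A minimal NumPy sketch of the linear function $f(\vec{x}) = \vec{x} \cdot W + \vec{b}$. It is only illustrative (not code from \cite{goldberg2017neural}); the toy dimensions and the name \texttt{predict} are arbitrary choices.
\begin{verbatim}
import numpy as np

def predict(x, W, b):
    """Linear model f(x) = x . W + b for one input vector x."""
    return x @ W + b

# toy dimensions: d_in = 4 input features, d_out = 3 output scores
rng = np.random.default_rng(0)
W = rng.normal(size=(4, 3))   # parameters to be learned
b = np.zeros(3)
x = rng.normal(size=4)        # one input instance
print(predict(x, W, b))       # a vector of 3 scores
\end{verbatim}
\end{scriptsize}
\end{frame}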
\begin{frame}{Example: Language Detection}
\begin{scriptsize}
\begin{itemize}
\item Consider the task of distinguishing documents written in English from documents written in German.
\item This is a binary classification problem
\begin{equation}
\begin{split}
f(\vec{x}) = \vec{x} \cdot \vec{w} + b\\
\end{split}
\end{equation}
Here $d_{out}=1$, $\vec{w}$ is a vector, and $b$ is a scalar.
\item The range of the linear function is $(-\infty,\infty)$.
\item In order to use it for binary classification, it is common to pass the output of $f(\vec{x})$ through the $sign$ function, mapping negative values to $-1$ (the negative class) and non-negative values to $+1$ (the positive class).
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Example: Language Detection}
\begin{scriptsize}
\begin{itemize}
\item Letter frequencies make for quite good predictors (features) for this task.
\item Even more informative are counts of letter bigrams, i.e., pairs of consecutive letters.
\item One may think that words would also be good predictors, e.g., using a bag-of-words representation of documents.
\item However, letters or letter bigrams are far more robust.
\item We are likely to encounter a new document without any of the words we observed in the training set, whereas a document without any of the distinctive letter bigrams is significantly less likely. \cite{goldberg2017neural}
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Example: Language Detection}
\begin{scriptsize}
\begin{itemize}
\item We assume an alphabet of 28 letters (a--z, space, and a special symbol for all other characters, including digits, punctuation, etc.).
\item Documents are represented as $28\times 28 = 784$-dimensional vectors $\vec{x} \in \mathcal{R}^{784}$.
\item Each entry $\vec{x}_{[i]}$ represents a count of a particular letter combination in the document, normalized by the document's length.
\item For example, denoting by $\vec{x}_{ab}$ the entry of $\vec{x}$ corresponding to the letter bigram $ab$:
\begin{equation}
\vec{x}_{ab}= \frac{\#ab}{|D|}
\end{equation}
where $\#ab$ is the number of times the bigram $ab$ appears in the document, and $|D|$ is the total number of bigrams in the document (the document’s length).
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Example: Language Detection}
\begin{figure}[htb]
\centering
\includegraphics[scale=0.26]{pics/langbigrams.png}
\end{figure}
\scriptsize{Character-bigram histograms for documents in English (left, blue) and German (right, green). Underscores denote spaces. Only the most frequent character bigrams are shown.}
\footnotetext{Source:\cite{goldberg2017neural}}
\end{frame}
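\begin{frame}[fragile]{Bigram Features: A Code Sketch}
\begin{scriptsize}
A possible way to compute the normalized letter-bigram vector $\vec{x}$ described above, sketched in NumPy. It is illustrative only; the alphabet handling (mapping every unknown character to \texttt{\#}) and the function names are assumptions, not code from \cite{goldberg2017neural}.
\begin{verbatim}
import numpy as np
from itertools import product

ALPHABET = "abcdefghijklmnopqrstuvwxyz #"   # 28 symbols
BIGRAMS = {a + b: i
           for i, (a, b) in enumerate(product(ALPHABET, repeat=2))}

def bigram_features(text):
    """784-dim vector of letter-bigram counts, normalized by |D|."""
    text = "".join(c if c in ALPHABET else "#" for c in text.lower())
    x = np.zeros(len(BIGRAMS))               # 28 * 28 = 784
    for a, b in zip(text, text[1:]):
        x[BIGRAMS[a + b]] += 1
    return x / max(1, len(text) - 1)         # |D| = number of bigrams

x = bigram_features("the quick brown fox")
print(x.shape, x.sum())   # (784,) and the entries sum to ~1
\end{verbatim}
\end{scriptsize}
\end{frame}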
\begin{frame}{Example: Language Detection}
\begin{scriptsize}
\begin{itemize}
\item The previous figure shows clear patterns in the data. Given a new item, such as:
\begin{figure}[htb]
\centering
\includegraphics[scale=0.4]{pics/langbigramstest.png}
\end{figure}
\item We could probably tell that it is more similar to the German group than to the English one (observe the frequency of ``th'' and ``ie'').
\item We can't use a single definite rule such as ``if it contains th it's English'' or ``if it contains ie it's German''.
\item While German texts have considerably fewer occurrences of ``th'' than English texts, ``th'' may and does occur in German, and similarly the ``ie'' combination does occur in English.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Example: Language Detection}
\begin{scriptsize}
\begin{itemize}
\item The decision requires weighting different factors relative to each other.
\item We can formalize the problem in a machine-learning setup using a linear model:
\begin{equation}
\begin{split}
\hat{y} \quad = sign(f(\vec{x})) & = sign(\vec{x}\cdot \vec{w} + b) \\
& = sign(\vec{x}_{aa}\times \vec{w}_{aa}+ \vec{x}_{ab}\times \vec{w}_{ab}+ \vec{x}_{ac}\times \vec{w}_{ac} \dots +b)
\end{split}
\end{equation}
\item A document will be considered English if $f(\vec{x}) \geq 0$ and as German otherwise.
\end{itemize}
\begin{block}{Intuition}
\begin{enumerate}
\item Learning should assign large positive values to $\vec{w}$ entries associated with letter pairs that are much more common in English than in German (e.g., ``th'').
\item It should also assign negative values to letter pairs that are much more common in German than in English (``ie'', ``en'').
\item Finally, it should assign values around zero to letter pairs that are either common or rare in both languages.
\end{enumerate}
\end{block}
\end{scriptsize}
\end{frame}
\begin{frame}{Log-linear Binary classification}
\begin{scriptsize}
\begin{itemize}
\item The output $f(\vec{x})$ is in the range $(-\infty,\infty)$, and we map it to one of two classes $\{-1,+1\}$ using the $sign$ function.
\item This is a good fit if all we care about is the assigned class.
\item We may be interested also in the confidence of the decision, or the probability that the classifier assigns to the class.
\item An alternative that facilitates this is to map instead to the range $[0,1]$, by pushing the output through a squashing function such as the sigmoid $\sigma(x)$:
\begin{equation}
\sigma(x) = \frac{1}{1+e^{-x}}
\end{equation}
resulting in:
\begin{equation}
\hat{y}=\sigma(f(\vec{x})) = \frac{1}{1+e^{-(\vec{x}\cdot \vec{w}+b)}}
\end{equation}
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{The Sigmoid function}
\begin{figure}[htb]
\centering
\includegraphics[scale=0.5]{pics/sigmoid.png}
\end{figure}
\end{frame}
\begin{frame}{The Sigmoid function}
\begin{scriptsize}
\begin{itemize}
\item The sigmoid function is monotonically increasing, and maps values to the range $[0, 1]$, with $0$ being mapped to $\frac{1}{2}$.
\item When used with a suitable loss function (discussed later) the binary predictions made through the log-linear model can be interpreted as class membership probability estimates:
\begin{equation}
\sigma(f(\vec{x})) = P(\hat{y} = 1| \vec{x}) \quad \text{of $\vec{x}$ belonging to the positive class.}
\end{equation}
\item We also get $P(\hat{y} = 0| \vec{x}) = 1 - P(\hat{y} = 1| \vec{x}) = 1 - \sigma(f(\vec{x}))$
\item The closer the value is to $0$ or $1$ the more certain the model is in its class membership prediction, with the value of $0.5$ indicating model uncertainty.
\end{itemize}
\end{scriptsize}
\end{frame}
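\begin{frame}[fragile]{Log-linear Binary Classification: A Code Sketch}
\begin{scriptsize}
A small NumPy sketch of binary prediction through the sigmoid, following the equations above. It is illustrative only; the weights, input, and the $0.5$ decision threshold are arbitrary example values.
\begin{verbatim}
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def predict_proba(x, w, b):
    """P(y = 1 | x) for a log-linear binary classifier."""
    return sigmoid(x @ w + b)

def predict_class(x, w, b):
    return 1 if predict_proba(x, w, b) >= 0.5 else 0

w = np.array([2.0, -1.0, 0.5])   # toy parameters
b = -0.1
x = np.array([0.2, 0.4, 0.1])    # toy input
print(predict_proba(x, w, b), predict_class(x, w, b))
\end{verbatim}
\end{scriptsize}
\end{frame}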
\begin{frame}{Multi-class Classification}
\begin{scriptsize}
\begin{itemize}
\item Most classification problems are of a multi-class nature: examples are assigned to one of $k$ different classes.
\item Example: we are given a document and asked to classify it into one of six possible languages: English, French, German, Italian, Spanish, Other.
\item Possible solution: consider six weight vectors $\vec{w}_{EN}$,$\vec{w}_{FR},\dots$ and biases (one for each language).
\item Predict the language resulting in the highest score:
\begin{equation}
\hat{y} = f(\vec{x}) = \operatorname{argmax}_{L \in \{ EN,FR,GR,IT,SP,O \}} \quad \vec{x}\cdot \vec{w}_L+ b_{L}
\end{equation}
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Multi-class Classification}
\begin{scriptsize}
\begin{itemize}
\item The six sets of parameters $\vec{w}_L \in \mathcal{R}^{784}$ and $b_L$ can be arranged as a matrix $W \in \mathcal{R}^{784\times6}$ and vector $\vec{b} \in \mathcal{R}^6$ , and the equation re-written as:
\begin{equation}
\begin{split}
\vec{\hat{y}} = f(\vec{x}) = \quad & \vec{x} \cdot W + \vec{b}\\
\text{prediction} = \hat{y} = \quad & \operatorname{argmax}_i \vec{\hat{y}}_{[i]}
\end{split}
\end{equation}
\item Here $\vec{\hat{y}} \in \mathcal{R}^6$ is a vector of the scores assigned by the model to each language, and we again determine the predicted language by taking the argmax over the entries of $\vec{\hat{y}}$ (the column with the highest value).
\end{itemize}
\end{scriptsize}
\end{frame}
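\begin{frame}[fragile]{Multi-class Prediction: A Code Sketch}
\begin{scriptsize}
A NumPy sketch of the argmax prediction rule $\hat{y} = \operatorname{argmax}_i (\vec{x} \cdot W + \vec{b})_{[i]}$ for the six-language example. The random parameters stand in for trained ones; this is an illustration, not trained code.
\begin{verbatim}
import numpy as np

LANGS = ["EN", "FR", "GR", "IT", "SP", "O"]

def predict_language(x, W, b):
    """Return the language whose score in x.W + b is highest."""
    scores = x @ W + b                 # vector of 6 scores
    return LANGS[int(np.argmax(scores))], scores

rng = np.random.default_rng(1)
W = rng.normal(size=(784, 6))          # one column per language
b = np.zeros(6)
x = rng.random(784)
x /= x.sum()                           # fake normalized bigram counts
lang, scores = predict_language(x, W, b)
print(lang, scores.round(2))
\end{verbatim}
\end{scriptsize}
\end{frame}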
\begin{frame}{Representations}
\begin{scriptsize}
\begin{itemize}
\item Consider the vector $\vec{\hat{y}}$ resulting from applying a trained model to a document.
\item The vector can be considered as a representation of the document.
\item It captures the properties of the document that are important to us: the scores of the different languages.
\item The representation $\vec{\hat{y}}$ contains strictly more information than the prediction $\operatorname{argmax}_i \vec{\hat{y}}_{[i]} $.
\item Example: $\vec{\hat{y}}$ can be used to distinguish documents in which the main language is German, but which also contain a sizeable amount of French words.
\item Clustering documents based on $\vec{\hat{y}}$ could help to discover documents written in regional dialects, or by multilingual authors.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Representations}
\begin{scriptsize}
\begin{itemize}
\item The vectors $\vec{x}$ containing the normalized letter-bigram counts for the documents are also representations of the documents.
\item Arguably containing a similar kind of information to the vectors $\vec{\hat{y}}$.
\item However, the representation $\vec{\hat{y}}$ is more compact (6 entries instead of 784) and more specialized for the language prediction objective.
\item Clustering by the vectors $\vec{x}$ would likely reveal document similarities that are not due to a particular mix of languages, but perhaps due to the documents' topics or writing styles.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Representations}
\begin{scriptsize}
\begin{itemize}
\item The trained matrix $W \in \mathcal{R}^{784 \times 6}$ can also be considered as containing learned representations.
\item We can consider two views of $W$, as rows or as columns.
\begin{figure}[htb]
\centering
\includegraphics[scale=0.35]{pics/2rep.png}
\end{figure}
Two views of the $W$ matrix. (a) Each column corresponds to a language. (b) Each row
corresponds to a letter bigram. Source: \cite{goldberg2017neural}.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Representations}
\begin{scriptsize}
\begin{itemize}
\item A column of $W$ can be taken to be a $784$-dimensional vector representation of a language in terms of its characteristic letter-bigram patterns.
\item We can then cluster the 6 language vectors according to their similarity.
\item Each of the 784 rows of $W$ provides a 6-dimensional vector representation of the corresponding bigram in terms of the languages it prompts.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Representations}
\begin{scriptsize}
\begin{itemize}
\item Representations are central to deep learning.
\item One could argue that the main power of deep learning is the ability to learn good representations.
\item In the linear case, the representations are interpretable.
\item We can assign a meaningful interpretation to each dimension in the
representation vector.
\item For example: each dimension corresponds to a particular language or letter-bigram.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Representations}
\begin{scriptsize}
\begin{itemize}
\item Deep learning models, on the other hand, often learn a cascade of representations of the input that build on top of each other.
\item These representations are often not interpretable.
\item We do not know which properties of the input they capture.
\item However, they are still very useful for making predictions.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{One-Hot Vector Representation}
\begin{scriptsize}
\begin{itemize}
\item The input vector $\vec{x}$ in our language classification example contains the normalized bigram counts in the document $D$.
\item This vector can be decomposed into an average of $|D|$ vectors, each corresponding to a particular document position $i$:
\begin{equation}
\vec{x} = \frac{1}{|D|} \sum_{i=1}^{|D|} \vec{x}^{D_{[i]}}
\end{equation}
\item Here, $D_{[i]}$ is the bigram at document position $i$.
\item Each vector $\vec{x}^{D_{[i]}} \in \mathcal{R}^{784}$ is a one-hot vector.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{One-Hot Vector Representation}
\begin{scriptsize}
\begin{itemize}
\item A one-hot vector: all entries are zero except the single entry corresponding to the letter bigram $D_{[i]}$, which is 1.
\item The resulting vector $\vec{x}$ is commonly referred to as an averaged bag of bigrams (more generally, an averaged bag of words, or just a bag of words).
\item Bag-of-words (BOW) representations contain information about the identities of all the ``words'' (here, bigrams) of the document, without considering their order.
\item A one-hot representation can be considered as a bag-of-a-single-word.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{One-Hot Vector Representation}
\begin{scriptsize}
\begin{figure}[htb]
\centering
\includegraphics[scale=0.35]{pics/onehot.png}
\end{figure}
One-hot vectors of words. Source: \url{https://medium.com/@athif.shaffy/one-hot-encoding-of-text-b69124bef0a7}.
\end{scriptsize}
\end{frame}
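\begin{frame}[fragile]{One-Hot and Bag-of-Bigrams: A Code Sketch}
\begin{scriptsize}
A NumPy sketch of one-hot vectors and of the averaged bag-of-bigrams decomposition $\vec{x} = \frac{1}{|D|}\sum_i \vec{x}^{D_{[i]}}$. The toy document is given directly as vocabulary indices, an assumption made only to keep the example short.
\begin{verbatim}
import numpy as np

def one_hot(index, dim=784):
    """All zeros except a single 1 at `index`."""
    v = np.zeros(dim)
    v[index] = 1.0
    return v

def averaged_bag(bigram_indices, dim=784):
    """Average of the one-hot vectors of all bigrams in the document."""
    return np.mean([one_hot(i, dim) for i in bigram_indices], axis=0)

doc = [3, 17, 3, 42]        # a 4-bigram "document" as vocabulary indices
x = averaged_bag(doc)
print(x[3], x[17], x[42])   # 0.5 0.25 0.25 -- normalized counts
\end{verbatim}
\end{scriptsize}
\end{frame}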
\begin{frame}{Log-linear Multi-class Classification}
\begin{scriptsize}
\begin{itemize}
\item In the binary case, we transformed the linear prediction into a probability estimate by passing it through the sigmoid function, resulting in a log-linear model.
\item The analog for the multi-class case is passing the score vector through the \textbf{softmax} function:
\begin{equation}
\operatorname{softmax}(\vec{x})_{[i]} = \frac{e^{\vec{x}_{[i]}}}{\sum_j e^{\vec{x}_{[j]}}}
\end{equation}
Resulting in:
\begin{equation}
\begin{split}
\vec{\hat{y}} \quad & = \operatorname{softmax}(\vec{x} \cdot W + \vec{b}) \\
\vec{\hat{y}}_{[i]} \quad & = \frac{e^{(\vec{x} \cdot W + \vec{b})_{[i]}}}{\sum_j e^{(\vec{x} \cdot W + \vec{b})_{[j]}}}
\end{split}
\end{equation}
\item The softmax transformation forces the values in $\hat{\vec{y}}$ to be positive and sum to 1, making them interpretable as a probability distribution.
\end{itemize}
\end{scriptsize}
\end{frame}
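\begin{frame}[fragile]{Softmax: A Code Sketch}
\begin{scriptsize}
A NumPy sketch of the softmax transformation above. Subtracting the maximum before exponentiating is a standard numerical-stability trick added here; it does not change the result.
\begin{verbatim}
import numpy as np

def softmax(z):
    """exp(z_i) / sum_j exp(z_j), computed in a stable way."""
    z = z - np.max(z)
    e = np.exp(z)
    return e / e.sum()

def predict_distribution(x, W, b):
    """Class-membership probabilities of a log-linear model."""
    return softmax(x @ W + b)

scores = np.array([2.0, 1.0, 0.1])
p = softmax(scores)
print(p, p.sum())   # positive values that sum to 1
\end{verbatim}
\end{scriptsize}
\end{frame}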
\begin{frame}{Training}
\begin{scriptsize}
\begin{itemize}
\item When training a parameterized function (e.g., a linear model, a neural network) one defines a loss function $L(\hat{y}, y)$, stating the loss of predicting $\hat{y}$ when the true output is $y$:
\begin{displaymath}
L(f(\vec{x};\Theta), y)
\end{displaymath}
\item We use the symbol $\Theta$ to denote all the parameters of the model (e.g., $W, \vec{b}$)
\item The training objective is then to minimize the loss across the different training examples.
\item Formally, a loss function $L(\hat{y},y)$ assigns a numerical score (a scalar) to a predicted output $\hat{y}$ given the true expected output $y$.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Training}
\begin{scriptsize}
\begin{itemize}
\item The loss function should attain its minimum value for cases where the prediction is correct.
\item We can also define a corpus-wide loss with respect to the parameters $\Theta$ as the average loss over all training examples:
\begin{displaymath}
\mathcal{L}(\Theta) = \frac{1}{n} \sum_{i=1}^n L(f(\vec{x}_i;\Theta), y_i)
\end{displaymath}
\item The goal of the training algorithm is then to set the values of the parameters $\Theta$ such that the value of $\mathcal{L}$ is minimized.
\begin{displaymath}
\hat{\Theta} = \operatorname{argmin}_{\Theta} \mathcal{L}(\Theta) = \operatorname{argmin}_{\Theta} \frac{1}{n} \sum_{i=1}^n L(f(\vec{x}_i;\Theta), y_i)
\end{displaymath}
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Gradient-based Optimization}
\begin{scriptsize}
\begin{itemize}
\item The parameterized functions are trained using gradient-based methods.
\item These methods work by repeatedly computing an estimate of the loss $\mathcal{L}$ over the training set.
\item They compute the gradients of the loss estimate with respect to the parameters ($\Theta$), and move the parameters in the opposite direction of the gradient.
\item Different optimization methods differ in how the error estimate is computed, and in how moving in the opposite direction of the gradient is defined.
\item If the function is convex, the optimum will be a global one.
\item Otherwise, the process is only guaranteed to find a local optimum.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Gradient Descent}
\begin{figure}[htb]
\centering
\includegraphics[scale=0.15]{pics/sgd.png}
\end{figure}
\footnotetext{Source: \url{https://sebastianraschka.com/images/faq/closed-form-vs-gd/ball.png}}
\end{frame}
\begin{frame}{Gradient Descent}
\begin{figure}[htb]
\centering
\includegraphics[scale=0.45]{pics/gradientdescent.png}
\end{figure}
\footnotetext{\cite{goodfellow2016deep}}
\end{frame}
\begin{frame}{Online Stochastic Gradient Descent}
\begin{scriptsize}
\begin{itemize}
\item All the parameters ($\Theta$) are initialized with random values.
\item For each training example $(\vec{x},y)$ we calculate the loss $L$ with the current values of $\Theta$.
\item Then we update the parameters with the following rule until convergence:
\item $\Theta_i \leftarrow \Theta_i - \eta \frac{\partial L(\hat{y},y)}{\partial \Theta_i}$ (for all parameters $\Theta_i$)
\begin{figure}[htb]
\centering
\includegraphics[scale=0.3]{pics/Online-SGD.png}
\end{figure}
\end{itemize}
\footnotetext{Source:\cite{goldberg2017neural}}
\end{scriptsize}
\end{frame}
\begin{frame}{Online Stochastic Gradient Descent}
\begin{scriptsize}
\begin{itemize}
\item The learning rate can either be fixed throughout the
training process, or decay as a function of the time step $t$.
\item The error calculated in line 3 is based on a single training example, and is thus just a rough estimate of the corpus-wide loss $\mathcal{L}$ that we are aiming to minimize.
\item The noise in the loss computation may result in inaccurate gradients (single examples may provide noisy information).
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Mini-batch Stochastic Gradient Descent}
\begin{scriptsize}
\begin{itemize}
\item A common way of reducing this noise is
to estimate the error and the gradients based on a sample of $m$ examples.
\item This gives rise to the minibatch SGD algorithm
\begin{figure}[htb]
\centering
\includegraphics[scale=0.25]{pics/minibatch-SGD.png}
\end{figure}
\item Higher values of $m$ provide better estimates of the corpus-wide gradients, while smaller values allow more updates and, in turn, faster convergence.
\item For modest sizes of $m$, some computing architectures (e.g., GPUs) allow an efficient parallel implementation of the computation in lines 3--6.
\end{itemize}
\end{scriptsize}
\footnotetext{Source:\cite{goldberg2017neural}}
\end{frame}
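\begin{frame}[fragile]{Minibatch SGD: A Code Sketch}
\begin{scriptsize}
A NumPy sketch of minibatch SGD for a log-linear binary classifier with the logistic loss (setting $m=1$ recovers the online algorithm). It is a simplified illustration, not the pseudocode from \cite{goldberg2017neural}; the learning rate, batch size, and number of epochs are arbitrary defaults.
\begin{verbatim}
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def minibatch_sgd(X, y, lr=0.1, m=16, epochs=50, seed=0):
    """X: (n, d) feature matrix, y: vector of 0/1 labels."""
    rng = np.random.default_rng(seed)
    n, d = X.shape
    w, b = np.zeros(d), 0.0
    for _ in range(epochs):
        order = rng.permutation(n)
        for start in range(0, n, m):
            idx = order[start:start + m]
            Xb, yb = X[idx], y[idx]
            y_hat = sigmoid(Xb @ w + b)
            grad_w = Xb.T @ (y_hat - yb) / len(idx)  # dL/dw
            grad_b = np.mean(y_hat - yb)             # dL/db
            w -= lr * grad_w      # move against the gradient
            b -= lr * grad_b
    return w, b
\end{verbatim}
\end{scriptsize}
\end{frame}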
\begin{frame}{Loss Functions}
\begin{scriptsize}
\begin{itemize}
\item Hinge (or SVM loss): for binary classification problems, the classifier's output is a single scalar $\tilde{y}$ and the intended output y is in $\{+1,-1\}$. The classification rule is $\hat{y} = sign(\tilde{y})$, and a classification is considered correct if $y \cdot \tilde{y} > 0$.
\begin{displaymath}
L_{\text{hinge}(\text{binary})}(\tilde{y},y) = \max(0,1-y \cdot \tilde{y})
\end{displaymath}
\item Binary cross entropy (or logistic loss): used in binary classification with conditional probability outputs. The classifier's output $\tilde{y}$ is transformed using the sigmoid function into $\hat{y} = \sigma(\tilde{y}) \in [0,1]$, which is interpreted as the conditional probability $P(y=1|x)$.
\begin{displaymath}
L_{\text{logistic}}(\hat{y},y) = -y \log \hat{y} - (1-y) \log(1-\hat{y})
\end{displaymath}
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Loss Functions}
\begin{scriptsize}
\begin{itemize}
%\hat{y}=\sigma(f(\vec{x})) = \frac{1}{1+e^{-\vec{x}\cdot \vec{w}+b}}
\item The logistic loss has a probabilistic interpretation:
\item We assume that $P(y =1 | \vec{x} ; \Theta) = \sigma(f(\vec{x})) = \frac{1}{1+e^{-(\vec{x}\cdot \vec{w}+b)}}$ and $P(y = 0 | \vec{x} ; \Theta) = 1 - \sigma(f(\vec{x}))$.
\item We can write this in a more compact way:
\begin{displaymath}
P(y | \vec{x} ; \Theta) = \sigma(f(\vec{x}))^y\times(1-\sigma(f(\vec{x})))^{1-y}
\end{displaymath}
\item The above expression is the probability mass function of the Bernoulli distribution.
\item Now we replace $\sigma(f(\vec{x}))$ by $\hat{y}$:
\begin{displaymath}
P(y | \vec{x} ; \Theta) = \hat{y}^y\times(1-\hat{y})^{1-y}
\end{displaymath}
\item If we perform maximum likelihood estimation, we take the logarithm of this expression and maximize it with respect to the parameters $\Theta$:
\begin{displaymath}
y \log \hat{y} + (1-y) \log(1-\hat{y})
\end{displaymath}
\item Maximizing this expression is equivalent to minimizing the logistic loss!
\item Many loss functions correspond to the negative log-likelihood of probabilistic models!
\end{itemize}
\end{scriptsize}
\end{frame}
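\begin{frame}[fragile]{Binary Losses: A Code Sketch}
\begin{scriptsize}
Direct NumPy translations of the binary hinge and logistic losses defined above. The clipping constant \texttt{eps} is an implementation detail added here to avoid $\log(0)$.
\begin{verbatim}
import numpy as np

def hinge_loss(y_tilde, y):
    """Binary hinge loss; y in {-1, +1}, y_tilde is the raw score."""
    return max(0.0, 1.0 - y * y_tilde)

def logistic_loss(y_hat, y, eps=1e-12):
    """Binary cross-entropy; y in {0, 1}, y_hat = sigmoid(score)."""
    y_hat = np.clip(y_hat, eps, 1.0 - eps)
    return -(y * np.log(y_hat) + (1.0 - y) * np.log(1.0 - y_hat))

print(hinge_loss(0.3, +1))    # 0.7: correct side, inside the margin
print(logistic_loss(0.9, 1))  # small loss: confident and correct
\end{verbatim}
\end{scriptsize}
\end{frame}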
\begin{frame}{Loss Functions}
\begin{scriptsize}
\begin{itemize}
\item Categorical cross-entropy loss: used when a probabilistic interpretation of multi-class scores is desired. It measures the dissimilarity between the true label distribution $\vec{y}$ and the predicted label distribution $\vec{\hat{y}}$.
\begin{displaymath}
L_{\text{cross-entropy}}(\vec{\hat{y}},\vec{y}) = - \sum_{i} \vec{y}_{[i]} \log(\vec{\hat{y}}_{[i]})
\end{displaymath}
\item When using the cross-entropy loss, it is assumed that the classifier's output is transformed using the softmax transformation.
\item The softmax function squashes the $k$-dimensional output to values in the range $(0,1)$, with all entries adding up to 1. Hence, $\vec{\hat{y}}_{[i]} = P(y = i | \vec{x})$ represents the conditional class-membership distribution.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Loss Functions}
\begin{scriptsize}
\begin{itemize}
\item For hard-classification problems in which each training example has a single correct class assignment, $\vec{y}$ is a one-hot vector representing the true class.
\item In such cases, the cross entropy can be simplified to:
\begin{displaymath}
L_{\text{cross-entropy(hard classification)}}(\vec{\hat{y}},\vec{y}) = -\log(\vec{\hat{y}}_{[t]})
\end{displaymath}
where $t$ is the correct class assignment.
\end{itemize}
\end{scriptsize}
\end{frame}
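\begin{frame}[fragile]{Cross-Entropy: A Code Sketch}
\begin{scriptsize}
A NumPy sketch of the categorical cross-entropy in its general and hard-classification forms; with a one-hot $\vec{y}$ the two coincide. The clipping is an added safeguard against $\log(0)$, not part of the definition.
\begin{verbatim}
import numpy as np

def cross_entropy(y_hat, y, eps=1e-12):
    """General form: y and y_hat are distributions over the classes."""
    return -np.sum(y * np.log(np.clip(y_hat, eps, 1.0)))

def cross_entropy_hard(y_hat, t, eps=1e-12):
    """Hard classification: t is the index of the correct class."""
    return -np.log(max(y_hat[t], eps))

y_hat = np.array([0.7, 0.2, 0.1])   # softmax output
y = np.array([1.0, 0.0, 0.0])       # one-hot true label (t = 0)
print(cross_entropy(y_hat, y), cross_entropy_hard(y_hat, 0))  # equal
\end{verbatim}
\end{scriptsize}
\end{frame}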
\begin{frame}{Regularization}
\begin{scriptsize}
\begin{itemize}
\item Our optimization problem may admit multiple solutions and, especially in higher dimensions, it can also overfit.
\item Scenario: In our language identification problem one of the documents in the training set ($\vec{x}_O$) is an outlier.
\item The document is actually in German, but is labeled as French.
\item In order to drive the loss down, the learner can identify features (letter bigrams) in $\vec{x}_O$ that occur in only a few other documents.
\item The learner will give these features very strong weights toward the (incorrect) French class.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Regularization}
\begin{scriptsize}
\begin{itemize}
\item This is a bad solution to the learning problem.
\item The model is learning something incorrect.
\item German test documents that share many of these letter bigrams with $\vec{x}_O$ could be mistakenly classified as French.
\item We would like to control for such cases by driving the learner away from such misguided solutions.
\item Idea: it is OK to misclassify a few examples if they don't fit well with the rest.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Regularization}
\begin{scriptsize}
\begin{itemize}
\item Regularization: we add a regularization term $R$ to the optimization objective.
\item The goal of this term: to control the complexity (e.g., large weights) of the parameter values ($\Theta$) and avoid overfitting:
\begin{equation}
\begin{split}
\hat{\Theta} \quad & = \operatorname{argmin}_{\Theta} \mathcal{L}(\Theta) + \lambda R(\Theta) \\
\quad & = \operatorname{argmin}_{\Theta} \frac{1}{n} \sum_{i=1}^n L(f(\vec{x}_i;\Theta), y_i) + \lambda R(\Theta) \\
\end{split}
\end{equation}
\item The regularization term considers the parameter values, and scores their complexity.
\item The value of the hyperparameter $\lambda$ has to be set manually, based on the classification performance on a development set.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Regularization}
\begin{scriptsize}
\begin{itemize}
\item In practice, the regularizers $R$ equate complexity with large weights.
\item They work to keep the parameter values ($\Theta$) low.
\item They drive the learner toward solutions with low norms of the parameter matrices ($W$).
\item Common choices for $R$ are the $L_2$ norm, the $L_1$ norm, and the elastic-net.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{L$_2$ Regularization}
\begin{scriptsize}
\begin{itemize}
\item In $L_2$ regularization, $R$ takes the form of the squared $L_2$ norm of the parameters.
\item Goal: try to keep the sum of the squares of the parameter values low.
\begin{displaymath}
R_{L_{2}}(W) = ||W||^{2}_{2} = \sum_{i,j}(W_{[i,j]})^2
\end{displaymath}
\item The $L_2$ regularizer is also called a Gaussian prior or weight decay.
\item $L_2$-regularized models are punished severely for high parameter weights.
\item Once a value is close enough to zero, its effect becomes negligible.
\item The model will prefer to decrease the value of one parameter with a high weight by 1 rather than to decrease the values of ten parameters that already have relatively low weights by 0.1 each.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{L$_1$ Regularization}
\begin{scriptsize}
\begin{itemize}
\item In $L_1$ regularization, $R$ takes the form of the $L_1$ norm of the parameters.
\item Goal: try to keep the sum of the absolute values of the parameters low.
\begin{displaymath}
R_{L_{1}}(W) = ||W||_{1} = \sum_{i,j} |W_{[i,j]}|
\end{displaymath}
\item In contrast to $L_2$, the $L_1$ regularizer penalizes low and high parameter values uniformly.
\item It has an incentive to decrease all the non-zero parameter values toward zero.
\item It thus encourages sparse solutions: models in which many parameters have a value of zero.
\item The $L_1$ regularizer is also called a sparse prior or lasso \cite{tibshirani1996regression}.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Elastic-Net}
\begin{scriptsize}
\begin{itemize}
\item The elastic-net regularization \cite{zou2005regularization} combines both $L_1$ and $L_2$ regularization:
\begin{displaymath}
R_{\text{elastic-net}}(W) = \lambda_{1}R_{L_{1}}(W) + \lambda_{2}R_{L_{2}}(W)
\end{displaymath}
\end{itemize}
\end{scriptsize}
\end{frame}
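\begin{frame}[fragile]{Regularizers: A Code Sketch}
\begin{scriptsize}
NumPy versions of the three regularizers and of the regularized objective $\mathcal{L}(\Theta) + \lambda R(\Theta)$. The example weight matrix and the $\lambda$ values are arbitrary illustrations.
\begin{verbatim}
import numpy as np

def l2(W):
    """Squared L2 norm: sum of squared parameter values."""
    return np.sum(W ** 2)

def l1(W):
    """L1 norm: sum of absolute parameter values."""
    return np.sum(np.abs(W))

def elastic_net(W, lam1, lam2):
    return lam1 * l1(W) + lam2 * l2(W)

def regularized_objective(data_loss, W, lam):
    """Average data loss plus lambda * R(W), here with R = L2."""
    return data_loss + lam * l2(W)

W = np.array([[0.5, -2.0], [1.0, 0.0]])
print(l2(W), l1(W), elastic_net(W, 0.01, 0.01))
\end{verbatim}
\end{scriptsize}
\end{frame}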
\begin{frame}{Beyond SGD}
\begin{scriptsize}
\begin{itemize}
\item While the SGD algorithm can and often does produce good results, more advanced algorithms are also available.
\item The SGD+Momentum \cite{polyak1964some} and Nesterov Momentum \cite{nesterov2018lectures,sutskever2013importance} algorithms are variants of SGD in which previous gradients are accumulated and affect the current update.
\item Adaptive learning rate algorithms including AdaGrad \cite{duchi2011adaptive}, AdaDelta \cite{zeiler2012adadelta}, RMSProp \cite{tieleman2012lecture}, and Adam \cite{kingma2014adam} are designed to select the learning rate for each minibatch.
\item This is sometimes done on a per-coordinate basis, potentially alleviating the need to fiddle with learning-rate scheduling.
\item For details of these algorithms, see the original papers or \cite{goodfellow2016deep} (Sections 8.3 and 8.4).
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Train, Test, and Validation Sets}
\begin{scriptsize}
\begin{itemize}
\item When training a model our goal is to produce a function $f(\vec{x})$ that correctly maps inputs $\vec{x}$ to outputs $\hat{y}$ as evidenced by the training set.
\item Performance on training data can be misleading: our goal is to train a function capable of generalizing to unseen examples.
\item Held-out set: split the training set into training and testing subsets (e.g., an 80\%/20\% split). Train on the training subset and compute accuracy on the testing subset.
\item Problem: in practice you often train several models, compare their quality, and select the best one.
\item Selecting the best model according to the held-out set's accuracy will result in an overly optimistic estimate of the model's quality.
\item You don't know if the chosen settings of the final classifier are good in general, or are just good for the particular examples in the held-out set.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Train, Test, and Validation Sets}
\begin{scriptsize}
\begin{itemize}
\item The accepted methodology is to use a three-way split of the data into train, validation (also called development), and test sets\footnote{An alternative approach is cross-validation, but it doesn't scale well for training deep neural networks.}.
\item This gives you two held-out sets: a validation set (also called a development set) and a test set.
\item All the experiments, tweaks, error analysis, and model selection should be performed based on the validation set.
\item Then, a single run of the final model over the test set will give a good estimate of its expected quality on unseen examples.
\item It is important to keep the test set as pristine as possible, running as few experiments as possible on it.
\item Some even advocate that you should not look at the examples in the test set at all, so as not to bias the way you design your model.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Train, Test, and Validation Sets}
\begin{scriptsize}
\begin{figure}[htb]
\centering
\includegraphics[scale=0.55]{pics/validation.png}
\end{figure}
\footnotetext{source: \url{https://www.codeproject.com/KB/AI/1146582/validation.PNG}}
\end{scriptsize}
\end{frame}
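\begin{frame}[fragile]{Three-Way Split: A Code Sketch}
\begin{scriptsize}
A NumPy sketch of a random train/validation/test split. The 80/10/10 proportions and the function name are illustrative defaults, not a prescribed recipe.
\begin{verbatim}
import numpy as np

def train_val_test_split(X, y, val=0.1, test=0.1, seed=0):
    """Shuffle once and cut the data into three disjoint parts."""
    rng = np.random.default_rng(seed)
    idx = rng.permutation(len(X))
    n_test, n_val = int(len(X) * test), int(len(X) * val)
    test_idx = idx[:n_test]
    val_idx = idx[n_test:n_test + n_val]
    train_idx = idx[n_test + n_val:]
    return ((X[train_idx], y[train_idx]),
            (X[val_idx], y[val_idx]),
            (X[test_idx], y[test_idx]))

# model selection uses the validation part;
# the test part is evaluated only once, at the very end
\end{verbatim}
\end{scriptsize}
\end{frame}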
\begin{frame}{A limitation of linear models: the XOR problem}
\begin{scriptsize}
\begin{itemize}
\item The hypothesis class of linear (and log-linear) models is severely restricted.
\item For example, it cannot represent the XOR function, defined as:
\begin{equation}
\begin{split}
\operatorname{xor}(0,0) \quad & = 0 \\
\operatorname{xor}(1,0) \quad & = 1 \\
\operatorname{xor}(0,1) \quad & = 1 \\
\operatorname{xor}(1,1) \quad & = 0 \\
\end{split}
\end{equation}
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{A limitation of linear models: the XOR problem}
\begin{scriptsize}
\begin{itemize}
\item There is no parameterization $\vec{w} \in \mathcal{R}^2, b \in \mathcal{R}$ such that:
\begin{equation}
\begin{split}
(0,0) \cdot \vec{w} +b \quad & < 0 \\
(0,1) \cdot \vec{w} +b \quad & \geq 0 \\
(1,0) \cdot \vec{w} +b \quad & \geq 0 \\
(1,1) \cdot \vec{w} +b \quad & < 0 \\
\end{split}
\end{equation}
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{A limitation of linear models: the XOR problem}
\begin{scriptsize}
\begin{itemize}
\item To see why, consider the following plot of the XOR function, where blue Os denote the positive class and green Xs the negative class.
\begin{figure}[htb]
\centering
\includegraphics[scale=0.35]{pics/xor.png}
\end{figure}
\item It is clear that no straight line can separate the two classes.
\end{itemize}
\end{scriptsize}
\end{frame}
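\begin{frame}[fragile]{XOR: A Code Sketch}
\begin{scriptsize}
A brute-force NumPy check that no linear classifier on a small grid of candidate parameters separates the XOR points. This only illustrates the claim; it is not a proof (the argument above considers all of $\mathcal{R}^2 \times \mathcal{R}$).
\begin{verbatim}
import numpy as np
from itertools import product

X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([-1, +1, +1, -1])        # XOR labels in {-1, +1}

def separates(w, b):
    """True if sign(x.w + b) matches the label on all four points."""
    return bool(np.all(y * (X @ w + b) > 0))

grid = np.linspace(-3, 3, 25)
found = any(separates(np.array([w1, w2]), b)
            for w1, w2, b in product(grid, repeat=3))
print(found)   # False: no (w, b) on this grid separates XOR
\end{verbatim}
\end{scriptsize}
\end{frame}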