# -*- coding: utf-8 -*-
"""v7
#name: v7_*
# Code which takes uspantekan or spanish small data (100 sent) about two classes
education and dancing, runs it through a QNLP model, which is supported by a fasttext model,
and two neural network models to learn and make prediction.
#to get blow by blow details of what this code does, refer to a section named
"how this code runs" inside the project plan
https://github.com/ua-datalab/QNLP/blob/main/Project-Plan.md
4 major models used in this code.
1. QNLP model, called model1
2. Fast text embedding model , called model 2
3. NN model that learns mapping between fast text embedding and QNLP trained model's weights
4. Prediction model - which is use dto predict on test set.
#todo, find why not just do model1.predict?
"""
from lambeq import RemoveCupsRewriter
from tqdm import tqdm
import tensorflow as tf  # needed for tf.keras.callbacks.EarlyStopping below
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import torch
from torch import nn
import spacy
from lambeq import SpacyTokeniser
import numpy as np
import fasttext as ft
from lambeq import PytorchTrainer
from lambeq.backend.tensor import Dim
from lambeq import AtomicType
from lambeq import Dataset
from lambeq import PytorchModel, NumpyModel, TketModel, PennyLaneModel
from lambeq import TensorAnsatz,SpiderAnsatz
from lambeq import BobcatParser,spiders_reader
parser_to_use = spiders_reader #[BobcatParser(verbose='text'), spiders_reader]
ansatz_to_use = SpiderAnsatz #[IQP, Sim14, Sim15,TensorAnsatz ]
model_to_use = PytorchModel #[numpy, pytorch]
trainer_to_use= PytorchTrainer #[PytorchTrainer, QuantumTrainer]
embedding_model = ft.load_model('./embeddings-l-model.bin')
# MAXPARAMS is the maximum number of qubits (or dimensions of the tensor, as the case may be); it is also the fasttext embedding dimension (300), see the OOV NN model below
MAXPARAMS = 300
BATCH_SIZE = 32
EPOCHS = 2
LEARNING_RATE = 0.1
SEED = 43434
DATA_BASE_FOLDER= "data"
USE_SPANISH_DATA=True
USE_USP_DATA=False
USE_FOOD_IT_DATA = False
USE_MRPC_DATA=False
#setting a flag for TESTING so that it is done only once.
# Everything else is done on train and dev
TESTING = False
if(USE_USP_DATA):
TRAIN="uspantek_train.txt"
DEV="uspantek_dev.txt"
TEST="uspantek_test.txt"
if(USE_SPANISH_DATA):
TRAIN="spanish_train.txt"
DEV="spanish_dev.txt"
TEST="spanish_test.txt"
# #todo: actual MRPC is an NLI-style task - the MRPC data below is a hack which has only the
# premise mapped to a label of standard MRPC
# # Use the 2 classes of the information-technology-and-food
# # dataset instead if you want something for testing one class alone
if(USE_MRPC_DATA):
TRAIN="mrpc_train_80_sent.txt"
DEV="mrpc_dev_10_sent.txt"
TEST="mrpc_test_10sent.txt"
if(USE_FOOD_IT_DATA):
TRAIN="mc_train_data.txt"
DEV="mc_dev_data.txt"
TEST="mc_test_data.txt"
# loss = lambda y_hat, y: -np.sum(y * np.log(y_hat)) / len(y) # binary cross-entropy loss
# acc = lambda y_hat, y: np.sum(np.round(y_hat) == y) / len(y) / 2 # half due to double-counting
sig = torch.sigmoid
def accuracy(y_hat, y):
assert type(y_hat)== type(y)
# half due to double-counting (labels are [t, 1-t] pairs, so each sentence is matched twice)
#todo: confirm this is what is meant by double counting
return torch.sum(torch.eq(torch.round(sig(y_hat)), y))/len(y)/2
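# A minimal sanity check (illustrative toy tensors, safe to delete) of the /2 above:
# labels are stored as two-element pairs [t, 1-t] (see read_data below), so
# torch.eq matches each sentence twice - once per element - and the raw count
# must be halved to get per-sentence accuracy.
_demo_y_hat = torch.tensor([[4.0, -4.0]])  # logits strongly favouring class 0
_demo_y = torch.tensor([[1.0, 0.0]])       # gold label for class 0
assert accuracy(_demo_y_hat, _demo_y) == 1.0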
eval_metrics = {"acc": accuracy}
spacy_tokeniser = SpacyTokeniser()
if(USE_SPANISH_DATA) or (USE_USP_DATA):
spanish_tokeniser=spacy.load("es_core_news_sm")
spacy_tokeniser.tokeniser = spanish_tokeniser
#for english tokenizer
if(USE_MRPC_DATA) or USE_FOOD_IT_DATA:
english_tokenizer = spacy.load("en_core_web_sm")
spacy_tokeniser.tokeniser =english_tokenizer
import os
"""go through all thecircuits in training data,
and pick the one which has highest type value
note that they are not using the literal length of the circuit, but
the number attached to next to aldea_2...todo : find what exactly that does"""
def get_max_word_param_length(input_circuits):
lengths=[]
for d in input_circuits:
for symb in d.free_symbols:
x = symb.name.split('_', 1)[1]
y = x.split('__')[0]
lengths.append(int(y))
return lengths
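# A minimal sketch (hypothetical symbol name, not taken from the real data) of the
# parsing above: a SpiderAnsatz symbol like "aldea_2__s" splits into the word
# ("aldea"), the per-word parameter index ("2"), and the type ("s").
_demo_rest = "aldea_2__s".split('_', 1)[1]  # "2__s"
assert _demo_rest.split('__')[0] == "2"     # the parameter index that gets max'ed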
def get_vocab_emb_dict(vocab):
#i.e. given a word from the training and dev vocab, get the corresponding
# embedding using fasttext.
#all of this is stored as key-value pairs in embed_dict, where the word is
#the key and the embedding is the value
#todo: confirm if this is how Khatri does it too.
embed_dict={}
for wrd in vocab:
"""#spider ansatz alone writes the tokens in its vocabulary with a single underscore first and then a double underscore
#so we need to parse accordingly
#todo: some words are already in dictionary- i think this is because of the same
# words having multiple versions- mostly likely we shouldn't split the _1_ thing- i am thinking
#that denotes the nth TYPE of LIKES kinda adverbs."""
cleaned_wrd_just_plain_text,cleaned_wrd_with_type=clean_wrd_for_spider_ansatz_coming_from_vocab(wrd)
if cleaned_wrd_with_type in embed_dict :
"""#this shouldn't happen. a) does it happen for english data b) for
fucks sake find out what is _0- is this standard protocol, i.e if a word occurs twice
we ignore it? confirm with original khatri code."""
print(f"error. the word {cleaned_wrd_with_type} was already in dict")
else:
embed_dict[cleaned_wrd_with_type]= embedding_model[cleaned_wrd_just_plain_text]
return embed_dict
"""
Spider anstaz writes the symbols different than other ansatz.
So creating a separate function itself which will be called only
for spider ansatz
todo: either raise a pull request with LAMBEQ guys or find out
if they are doing this deliebrately and its my understanding which is lacking.
spider ansatz does: aldea_0__s or así_0__s
while other ansatz:aldea_s_0 #so we can use cleaned_wrd= symb.name.rsplit('_', 1)[0]
also todo: confirm if its a spider ansatz thing only. or are there any other ansatz that does that
Khatri's code uses IQP, Sim14 and Sim15, so we know that doesn't
update @oct29th2024.
- Found that in khatri code he retains the __s i.e aldea_0__s becomes
aldea__s. So going to put that back in
- And also confirmed. Spider ansatz is messing up the order
In khatri's code using sim 15 he uses the format aldea__s_2
i.e the s comes first and then 2
todo: clean up accents/utf-8 eventually?
"""
def clean_wrd_for_spider_ansatz(wrd):
split_word= wrd.lower().split('_')
cleaned_wrd=split_word[0].replace("(","").replace(")","").replace('\\','').replace(",","")
# update @oct29th2024: found that in Khatri's code he retains the __s, i.e. aldea_0__s becomes
# aldea__s. So going to put that back in
wrd_with__s= cleaned_wrd+"__" +split_word[3]
return cleaned_wrd, wrd_with__s
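# A minimal sketch (hypothetical input) of the two return values above:
# "aldea_0__s".split('_') yields ['aldea', '0', '', 's'], hence split_word[3]
# is the type "s" that gets re-attached with a double underscore.
assert clean_wrd_for_spider_ansatz("aldea_0__s") == ("aldea", "aldea__s")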
#in the vocab the word is already stored in the form aldea__s. We need to extract the plain text from it for finding the embedding
def clean_wrd_for_spider_ansatz_coming_from_vocab(wrd):
split_word= wrd.lower().split('_')
cleaned_wrd=split_word[0].replace("(","").replace(")","").replace('\\','').replace(",","")
# update @oct29th2024: found that in Khatri's code he retains the __s, i.e. aldea_0__s becomes
# aldea__s. So going to put that back in
wrd_with__s= cleaned_wrd+"__" +split_word[2]
return cleaned_wrd, wrd_with__s
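# The same sketch for the vocab form (hypothetical input): the vocab stores
# "aldea__s", and split('_') on it yields ['aldea', '', 's'], hence split_word[2].
assert clean_wrd_for_spider_ansatz_coming_from_vocab("aldea__s") == ("aldea", "aldea__s")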
"""given a set of circuits (e.g. train_circuit) extract the word from the
symbol and create a dictionary full of it
Todo: This works only for SpiderAnsatz. So modify code for other ansatz."""
def create_vocab_from_circuits(circuits):
vocab=set()
if(ansatz_to_use==SpiderAnsatz):
for d in circuits:
for symb in d.free_symbols:
if "aldea" in symb.name:
print("found aldea")
cleaned_wrd_just_plain_text,cleaned_wrd_with_type = clean_wrd_for_spider_ansatz(symb.name)
vocab.add(cleaned_wrd_with_type)
return vocab
"""#set the the initial phases of the gates.
Also note that in this function is where we are walking into OOV land.
i.e we check if there are any words that are found only in test/val set
and not in train set.
mithuns comment @26thsep2024
"""
def generate_initial_parameterisation(train_circuits, val_circuits, embedding_model, qnlp_model):
""" extract the words from the circuits- i.e the training data
# Note that in this vocab, the same word can have multiple types, which each occur separately
# todo: what did he mean by same word having multiple types.
# is this the likes vs never example mentione din 1958 lambek paper?
# update@sep29th2024. This is how the free_symbols look like.
# distorting_0_s- right now we don't know what 0 or s stands for. My
# guess after reading only 1958 paper is that s is sentence, which is part of the
# 2 fundamental units lambek specifices in 1958 paper- n and s"""
# train_vocab = {symb.name.rsplit('_', 1)[0] for d in train_circuits for symb in d.free_symbols}
train_vocab=create_vocab_from_circuits(train_circuits)
val_vocab=create_vocab_from_circuits(val_circuits)
#todo: print the total number of words in train and test, and note it down
#answer: for the Spanish Henderson data there are 463 words in training and 89 in testing;
# out of the 89 words in test, 33 are not present in training, so they are OOV
print(len(val_vocab.union(train_vocab)), len(train_vocab), len(val_vocab))
print(f"OOV word count: i.e out of {len(val_vocab)} words in the testing vocab there are {len(val_vocab - train_vocab)} words that are not found in training. So they are OOV")
oov_words=val_vocab - train_vocab
print(f"list of OOV words are {oov_words}")
#todo: find the meaning of symbol count- what is the difference between symbol count and OOV or train_vocab
"""update @29thOct2024- Symbol is the term used for the word + number of qbits or dimension
e.g aldea_0__s means, that symbol is for the word aldea for for its 1st qbit/dimension
similarly aldea_1__s means it is the symbol for the 2nd qbit etc.
So ideally the count of OOV symbols must be more than that of oov words.
that is because most of the words will have more than 1 symbols.
"""
oov_symbols={symb.name for d in val_circuits for symb in d.free_symbols} - {symb.name for d in train_circuits for symb in d.free_symbols}
n_oov_symbs = len(oov_symbols)
print(f'OOV symbol count: {n_oov_symbs} / {len({symb.name for d in val_circuits for symb in d.free_symbols})}')
print(f"the symbols that are in symbol count but not in word count are:{oov_symbols-oov_words}")
#######note that everything to do with OOV ends here. So far it was just FYI, there is OOV in this dataset's test partition
max_word_param_length=0
if(ansatz_to_use==SpiderAnsatz):
max_word_param_length_train = max(get_max_word_param_length(train_circuits))
max_word_param_length_test = max(get_max_word_param_length(val_circuits))
max_word_param_length = max(max_word_param_length_train, max_word_param_length_test) + 1
assert max_word_param_length!=0
print(f'Max params/word: {max_word_param_length}')
""" # next , for each word in train and test vocab , we need to get its embedding from fasttext
# mithuns comment @26thsep2024:
# note that there is some confusion between the input data Khatri used from MRPC
# as oposed to the spanish one = rather how spider reader is storing it.
# In MRPC and bobcat parser, they store it in one format(i think its two underscore
# while spider parser stores it with one underscore or two dashes or something
its definitely a bug in their code. However, we bear the brunt since spider ansatz
is the only one which didnt give errors for spanish data. So eventually this needs to be
replaced/fixed/single format must be stored for all ansatz- evne see if you can
create a pull request for this
"""
if(ansatz_to_use==SpiderAnsatz):
# train_vocab_embeddings={}
""" #for each word in train and test vocab get its embedding from fasttext
#note that even though the symbols per se have _0_, in the train_vocab_embedding
# dictionary it is stored in the
#format of {wrd: embedding}- i.e only the word aldea out of aldea_0_ is separated out and used.
# mithuns comment @26thsep2024: note that this is a hack, and ideally such data format
# based difference shouldnt occur.
# TODO: run khastri code on MRPC and confirm who is screwing up.
# is it spider ansatz which is messing up the data format or is it us?
update@october29th2024: so in khatri code when he is giving the word
to the embedding model he is using plain text word i.e "aldea"
but when he is storing it in the dict train_vocab_embeddings, the key
goes back to "aldea__s" while the value is the vector/embedding you got from
fasttext
# """
train_vocab_embeddings = get_vocab_emb_dict(train_vocab)
val_vocab_embeddings = get_vocab_emb_dict(val_vocab)
else:
#for words created from other ansatze, just write it as _0_ so we can reuse the parsing from the original Khatri code. But recording here specifically for this instance
train_vocab_embeddings = {wrd: embedding_model[wrd.split('__')[0]] for wrd in train_vocab}
val_vocab_embeddings = {wrd: embedding_model[wrd.split('__')[0]] for wrd in val_vocab}
#to store all the initial weights
initial_param_vector = []
#todo: find what qnlp_model.symbols is - rather, how is it different from train_vocab?
#ans: it is every symbol in the given list of circuits, e.g. únicamente_0__s
for sym in qnlp_model.symbols:
#@sep2nd2024- not sure what idx is supposed to do; am going to give it the number associated with the word
if(ansatz_to_use==SpiderAnsatz):
cleaned_wrd_just_plain_text,cleaned_wrd_with_type = clean_wrd_for_spider_ansatz(sym.name)
rest = sym.name.split('_', 1)[1]
idx = rest.split('__')[0]
"""#@sep2nd2024/ end of day: getting key error for lots of words - e.g. aldea..but why are words
# in qnlpmodel.symbols not getting the fasttext emb on the fly? why are we separating train_embeddings earlier?
#what is the meaning of symbols in qnlp.model
#todo a) read the lambeq documentation on symbols
# b) read the 2010 discocat and CQM paper onwards up, chronologically
#no point turning knobs without deeply understanding what symbols do
#todo:compare the format ith mrpc data, and see if he is storing the initial param vector- and symbols with _0_ or not?
# update@29th 0ct 2024. a) done
# b) done. Refer definition of symbol in comments near line 223
# b) yes he is appending just the first element of the embedding as an entry to
# initial params."""
if cleaned_wrd_with_type in train_vocab_embeddings:
if model_to_use == PytorchModel:
# initial_param_vector.append(train_vocab_embeddings[cleaned_wrd_with_type][int(idx),1-int(idx)])
val1= train_vocab_embeddings[cleaned_wrd_with_type][int(idx)]
val2= train_vocab_embeddings[cleaned_wrd_with_type][int(idx)+1]
tup= torch.tensor ([val1,val2], requires_grad=True) #initializing with two consecutive values of the embedding (at idx and idx+1)
initial_param_vector.append(tup)
else:
initial_param_vector.append(train_vocab_embeddings[cleaned_wrd_with_type][int(idx)])
else:
'''
#todo: lots of words are getting hit with OOV - confirm why they are not in the fasttext emb
# my guess is it's all the unicode characters.
# In theory fasttext is meant to produce zero
# OOV...since it builds up from 1-gram, 2-gram etc.
#update: this might be caused because I am
# removing the _0_ thing from the actual name, without
# realizing what it is doing.
found that the word verdad was OOV/not in fasttext emb
found that the word vió was OOV/not in fasttext emb
found that the word yo was OOV/not in fasttext emb
found that the word yyyyyy was OOV/not in fasttext emb
'''
print(f"found that this word {cleaned_wrd_with_type} was OOV/not in fasttext emb")
"""
Set the intialization of QNLP model's weight as that of the embeddings of each word
Am not completely convinced about what he is doing here.
FOr example in NN world, embedding is separate than weights of neurons.
You might initialize the weights of neurons with random shit like Xavier glorot
but almost nver initialize it with embedding itself.
todo: qnlp_model.weights-is a list
of size 463 while the np.array is creating a matrix of shape (463,) -which
is then again being converted to nn.ParameterList-which is a list.
We dont even know how/why we are going to numpy land and then converting it back to nn.parameter
rather; there was no np.array initially.- gave some error. So i went to khatri's
code - and exactly copy pasted np.array()- the error went away, so i didnt complain.
But all this is being caused because we are working on someone else's code without
realizing what the code does.
"""
"""#update @16th oct 2024.
this below code was what the error: the shape[2] not equal to shape[1]
this meant,
a) the LHS= qnlp_model.weights is a list of tensors i.e [Tensor([0.9,0.1],req_grad="true"]
but the RHS is giving a simple list i.e initial_param_vectorDont=[1]
In short the error is saying the first value is meant to be a tensor over a tuple o
of two elements, instead you are giving me a plain as 1 float value.
b) I still dont know why he is initializing the weights of QNLP model
(which looks like the weights of two classes,..while we are providing
just the vector of embeddings- what is the relationship between
a vector of embeddings and qnlp.weights? find out from KHatri's initial code )
@octr17th2024
commenting the code out until i find what its doing
."""
# assert len(qnlp_model.symbols) == len(qnlp_model.weights)== len(train_vocab_embeddings) +len(val_vocab_embeddings)
assert len( qnlp_model.weights) == len(initial_param_vector)
qnlp_model.weights = nn.ParameterList(initial_param_vector)
return train_vocab_embeddings, val_vocab_embeddings, max_word_param_length
def trained_params_from_model(trained_qnlp_model, train_embeddings, max_word_param_length):
"""
Args:
trained_qnlp_model- the trained_qnlp_model
train_vocab_embeddings- the initial embeddings for words in the vocab got from fasttext
max_word_param_length- what is the maximum size of a wor
Returns:
a map between each word and its latest final weights
"""
trained_param_map = { symbol: param for symbol, param in zip(trained_qnlp_model.symbols, trained_qnlp_model.weights)}
trained_parameterisation_map = {wrd: np.zeros(max_word_param_length) for wrd in train_embeddings}
cleaned_wrd_with_type=""
for symbol, train_val in trained_param_map.items():
if(ansatz_to_use==SpiderAnsatz):
cleaned_wrd_just_plain_text,cleaned_wrd_with_type = clean_wrd_for_spider_ansatz(symbol.name)
rest = symbol.name.split('_', 1)[1]
idx = rest.split('__')[0]
else:
# bug fix: assign to cleaned_wrd_with_type (it was assigned to an unused name wrd, which tripped the assert below)
cleaned_wrd_with_type, idx = symbol.name.rsplit('_', 1)
assert cleaned_wrd_with_type != ""
if cleaned_wrd_with_type in trained_parameterisation_map:
trained_parameterisation_map[cleaned_wrd_with_type][int(idx)] = train_val
return trained_parameterisation_map
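# A minimal sketch (toy values, safe to delete) of the returned map's shape: every
# vocab word maps to a zero-initialised vector, with position idx overwritten by the
# trained weight of the matching symbol (e.g. index 0 for "aldea_0__s").
_demo_tpm = {"aldea__s": np.zeros(2)}
_demo_tpm["aldea__s"][0] = 0.42
assert _demo_tpm["aldea__s"][0] == 0.42 and _demo_tpm["aldea__s"][1] == 0.0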
def generate_OOV_parameterising_model(trained_qnlp_model, train_vocab_embeddings, max_word_param_length):
"""
in the previous func `generate_initial_parameterisation` we took model 1 i.e the QNLP model
and initialized its weights with the embeddings of the words
here we will take the model 1, and create another NN i.e model 3, which will learn the mapping between
train_vocab_embeddings and weights of trained_qnlp_model
Args:
trained_qnlp_model- the trained_qnlp_model
train_vocab_embeddings- the initial embeddings for words in the vocab got from fasttext
max_word_param_length- what is the maximum size of a word
Returns:
Weights of a NN model which now has learnt
for each word in fasttext as its original embedding mapped to the weights in the trained QNLP model
"""
"""explanation of dict_training_symbols_vs_qnlp_trained_weights:
dict_training_symbols_vs_qnlp_trained_weights is a dictionary that map symbols in the trained QNLP model to
its weights at the end of QNLP training i.e the training that happened to model 1 i.e the QNLP model
#todo: print and confirm if symbol means word
#update@sep 11th 2024 : symbol is not word, it is word+ that idx number-
# which i am suspecting is the number of Types a same word can have
# for example if you read 1958 Lambek paper youc an see that adverb Likes can have two different TYPE representations.
# but yes, todo: confirm this by comparing it with the original Khatri MRPC code.
# todo: he is doing the same process above inside the function generate_initial_parameterisation
around line 222. Find out why doing it so many times? are they same? so why not just pass around?- update@oct29th2024. the _2 in symbol denotes the qbit number or the dimension of the tensor. refer line 222 comments starting with update @29thOct2024
# """
dict_training_symbols_vs_qnlp_trained_weights = {symbol: param for symbol, param in
zip(trained_qnlp_model.symbols, trained_qnlp_model.weights)}
"""
train_vocab_embeddings are the initial embeddings
for words in the vocab we got from fasttext-
now for each such word create an array of zeroes called trained_param_vectors
- this array is where the weights of the model 3, NN model which maps between embedding and learned weights of
QNLP model will be added.
i.e for now for each such word in training vocabulary, create a vector filled with zeroes to represent
trained parameters
Update@oct29th2024 - the code below now uses np.zeros(max_word_param_length+1);
however, when using the pytorch trainer and pytorch model, it needs weights in a tuple form
(todo: find out why) - but the max param length was 1, since we were using spider ansatz,
so for now this effectively hardcodes the np.zeros length to 2 - will have to change this later as and when we move to another ansatz
"""
dict_wrd_in_training_vs_weights = {wrd: np.zeros(max_word_param_length+1) for wrd in train_vocab_embeddings}
'''for each such word in the training vocabulary,
fill the empty array/array of zeroes created above
with its weights from the QNLP trained weights.
#todo 1) I am still not sure why he is doing this in such a roundabout way
# 2) print and confirm; I think they are repeating the same
# weight value for every entry of the array in trained_param_vectors.
#
# Note that the for loop below is done purely to extract the word out of the symbol.
# the dictionary intuition remains the same for both dictionaries, i.e. dict_training_symbols_vs_qnlp_trained_weights and
# dict_wrd_in_training_vs_weights: the key is either word/symbol while the value is the same for both, namely the corresponding
# weight of that word in the trained QNLP model '''
cleaned_wrd_with_type=""
for symbol, trained_weights in dict_training_symbols_vs_qnlp_trained_weights.items():
if(ansatz_to_use==SpiderAnsatz):
#symbol and word are different. e.g. aldea_0. From this extract the word
cleaned_wrd_just_plain_text,cleaned_wrd_with_type = clean_wrd_for_spider_ansatz(symbol.name)
rest = symbol.name.split('_', 1)[1]
idx = rest.split('__')[0]
else:
cleaned_wrd_with_type, idx = symbol.name.rsplit('_', 1)
assert cleaned_wrd_with_type != ""
#if that word is in the train vocab (from the embedding side) - 99% should be there.
#todo: confirm if any words are outside
#stopping here at oct 16th 2024, 2pm - getting a new error, some key-value mistake.
# i think this has something to do with the word-splitting thing
#todo: in Khatri's original code he is not storing aldea as the plain word,
#but as aldea_0_s - make sure we revert back to this later.
if cleaned_wrd_with_type in dict_wrd_in_training_vs_weights:
dict_wrd_in_training_vs_weights[cleaned_wrd_with_type][int(idx)] = trained_weights[int(idx)]
else:
print(f"inside OOV_generation-found that this word {cleaned_wrd_with_type} was not in trained_param_vectors")
wrds_in_order = list(train_vocab_embeddings.keys())
"""#For each word in a ordered list of training vocabulary words, create 2 arrays.
One for embeddings (NN_train_X) and the other for trained weights (NN_train_Y)
Note, the goal of model 3 is to learn the mapping between these two things
update@oct29th2024. currently we are passing only one value as label. but that is only because
aldea_0 the max value is 0 (param length 1). and that happened only because we are using
dumb spider ansatz. Eventually when we use any other ansatz, icnluding tensor ansatz
every word should have more than 1 values. So NN_train_Y will be a list of 2 tuple arrays. Be ready
for if and when this bombs """
NN_train_X = np.array([train_vocab_embeddings[wrd] for wrd in wrds_in_order])
NN_train_Y = np.array([dict_wrd_in_training_vs_weights[wrd] for wrd in wrds_in_order])
"""#this is model 3. i.e create a simple Keras NN model, which will learn the above mapping.
todo: use a better model a) FFNN using pytorch b) something a little bit more complicated than a simple FFNN"""
OOV_NN_model = keras.Sequential([
layers.Dense(int((max_word_param_length + MAXPARAMS) / 2), activation='tanh'),
layers.Dense(max_word_param_length, activation='tanh'),
])
#standard keras stuff: initialize and say which loss function and optimizer you will be using
OOV_NN_model.compile(loss='mean_absolute_error', optimizer=keras.optimizers.Adam(0.001))
callback = tf.keras.callbacks.EarlyStopping(
monitor='val_loss',
min_delta=0.01,
patience=10,
verbose=1,
mode='auto',
baseline=None,
restore_best_weights=True,
start_from_epoch=0
)
# Embedding dim!
"""#todo find why maxparams are hardcoded as 300 (is it the dimension of the fasttext embedding?)
#ans: yes. The first layer needs to have the dimension of NN_train_X, which in turn has the dimension of the
Fasttext embedding"""
OOV_NN_model.build(input_shape=(None, MAXPARAMS))
#train that model 3
hist = OOV_NN_model.fit(NN_train_X, NN_train_Y, validation_split=0.2, verbose=1, epochs=100,callbacks=[callback])
print(hist.history.keys())
print(f'OOV NN model final epoch loss: {(hist.history["loss"][-1], hist.history["val_loss"][-1])}')
plt.plot(hist.history['loss'], label='loss')
plt.plot(hist.history['val_loss'], label='val_loss')
plt.xlabel('Epoch')
plt.ylabel('Error')
plt.legend()
plt.show()
return OOV_NN_model,dict_wrd_in_training_vs_weights
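# A minimal sketch (hypothetical word "aldea", commented out since OOV_NN_model is
# local to the function above) of how model 3 is used downstream in evaluate_val_set:
# feed a fasttext embedding in, get a predicted QNLP parameter vector out.
# oov_params = OOV_NN_model.predict(np.array([embedding_model["aldea"]]), verbose=0)[0]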
def evaluate_val_set(pred_model, val_circuits, val_labels, trained_weights, val_vocab_embeddings, max_word_param_length, OOV_strategy='random', OOV_model=None):
"""
So this is where we do the final testing (even if it's over dev).
Here they are sending:
a) pred_model: the newly created prediction model - which is fundamentally the same as the original QNLP model.
b) val_circuits, val_labels - what it says
c) trained_weights: the trained weights from model 1, the QNLP model (my guess is he is going to assign these weights to the
newly created model. Though why he is doing it in such a roundabout way when he could have directly
used qnlp_model, i.e. model 1, is beyond me) - update@oct30th2024: this is all zeroes. find and fix why
d) val_vocab_embeddings: take every word in the test/val set, give it to fasttext, and get its embedding
e) max_word_param_length: refer to the function get_max_word_param_length
f) OOV_strategy: pick one hardcoded option from [zeros, embed, model, random] - basically this is where you
decide what baseline model you want to use for your model 3 - rather, whether you want to use any model at all,
or use baseline methods like a random number generator, filling with zeroes, etc.
"""
pred_parameter_map = {}
#Use the words from train wherever possible, else use DNN prediction
for wrd, embedding in val_vocab_embeddings.items():
if OOV_strategy == 'model':
"""
for each word in the test/dev vocabulary, give the word's embedding as input to model 3 (the NN model, also called DNN here),
which will now ideally return the corresponding weight of the trained QNLP model - because model 3 has learned
the mapping between these two things. Refer to the generate_OOV_* function above.
Note that all other strategies below are simple hard-coded baselines.
"""
pred_parameter_map[wrd] = trained_weights.get(wrd, OOV_model.predict(np.array([embedding]), verbose=0)[0])
elif OOV_strategy == 'embed':
pred_parameter_map[wrd] = trained_weights.get(wrd, embedding)
elif OOV_strategy == 'zeros':
pred_parameter_map[wrd] = trained_weights.get(wrd, np.zeros(max_word_param_length))
else:
pred_parameter_map[wrd] = trained_weights.get(wrd, 2 * np.random.rand(max_word_param_length)-1)
#convert the dictionary pred_parameter_map into a list pred_weight_vector
pred_weight_vector = []
for sym in pred_model.symbols:
if(ansatz_to_use==SpiderAnsatz):
cleaned_wrd_just_plain_text,cleaned_wrd_with_type = clean_wrd_for_spider_ansatz(sym.name)
rest = sym.name.split('_', 1)[1]
idx = rest.split('__')[0]
if cleaned_wrd_with_type in pred_parameter_map:
if model_to_use == PytorchModel:
val1= np.float32(pred_parameter_map[cleaned_wrd_with_type][int(idx)])
"""# todo: there are some cleaned_wrd_with_type in pred_parameter_map which is empty. i.e size of tuple =1
#figure that out,. commenting this line until then and forcing val2 to be zero
# val2= pred_parameter_map[cleaned_wrd_with_type][int(idx)+1]"""
if len(pred_parameter_map[cleaned_wrd_with_type])>1:
val2= np.float32(pred_parameter_map[cleaned_wrd_with_type][int(idx)+1])
else:
val2 = np.float32(0.0)
tup= torch.tensor ([val1,val2], requires_grad=True) #a 2-element tensor: the values at idx and idx+1 (or 0.0 when the second is missing)
pred_weight_vector.append(tup)
"""#so he is assigning the weights he picked from model 3's outout (the DNN one)
to that of model 4 - i.e the prediction model. I think this is the answer to the question
of why is he using two QNLP models (model1 and model 4)- that is because if we could direclty use
the trained weights of the original QNLP model model 1- we wouldnt have had to go through this
circumambulated way of finding the mapping between fast text and qnlp weights.
The weights here are the new learnings created by model 3- the DNN/NN model specifically trained on
model(a,b) where a is the weights from model1 and b is the embeddings from model2 -the fasttextmodel
todo: a) why is the pred_model.weights zero
b) why is it not a parameter list, like that of model 1?
"""
assert len(pred_model.symbols) == len(pred_weight_vector)
assert type(pred_model.weights) == type( nn.ParameterList(pred_weight_vector))
# assert len(pred_model.weights) == len(pred_weight_vector)
pred_model.weights = nn.ParameterList(pred_weight_vector)
#use the model now to create predictions on the test set.
preds = pred_model.get_diagram_output(val_circuits)
loss_pyTorch =torch.nn.BCEWithLogitsLoss()
l= loss_pyTorch(preds, torch.tensor(val_labels))
a=accuracy(preds, torch.tensor(val_labels))
return l, a
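# A minimal sketch (toy dict, safe to delete) of the fallback pattern used in
# evaluate_val_set: dict.get returns the trained weights for in-vocabulary words
# and the chosen OOV fallback (model prediction / embedding / zeros / random) otherwise.
_demo_trained = {"aldea__s": np.array([0.3, 0.7])}
assert (_demo_trained.get("aldea__s", np.zeros(2)) == np.array([0.3, 0.7])).all()
assert (_demo_trained.get("oov__s", np.zeros(2)) == np.zeros(2)).all()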
def read_data(filename):
labels, sentences = [], []
with open(filename) as f:
for line in f:
# todo: find why this is float - I think unlike classical NLP they are not taking labels
#but are taking the logits in QNLP - which can translate to weights of words...IMHO, but do confirm
t = float(line[0])
"""#todo find why pytorch model needs labels in [a, 1-a] format.
answer/update: in all examples of lambeq they use this format, classical or quantum
Todo: uncomment the plain 0 or 1 code if and when this gives issue- which usually shows up as size mismatch in
.fit()
# if model_to_use == PytorchModel:
# else:
# labels.append(int(t))
"""
labels.append([t, 1-t])
sentences.append(line[1:].strip())
return labels, sentences
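# A minimal sketch (toy line, not from the real data files) of the format read_data
# expects: the first character is the label, the rest of the line is the sentence,
# and labels come back as [t, 1-t] pairs.
_demo_line = "1 el baile es bonito"
_demo_t = float(_demo_line[0])
assert [_demo_t, 1 - _demo_t] == [1.0, 0.0]
assert _demo_line[1:].strip() == "el baile es bonito"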
#back to the main thread after all functions are defined.
#read the base data, i.e. plain text sentences.
train_labels, train_data = read_data(os.path.join(DATA_BASE_FOLDER,TRAIN))
val_labels, val_data = read_data(os.path.join(DATA_BASE_FOLDER,DEV))
test_labels, test_data = read_data(os.path.join(DATA_BASE_FOLDER,TEST))
# todo: not sure what I am doing here. need to figure this out as and when we get to testing
# if TESTING:
# train_labels, train_data = train_labels[:2], train_data[:2]
# val_labels, val_data = val_labels[:2], val_data[:2]
# test_labels, test_data = test_labels[:2], test_data[:2]
# EPOCHS = 1
"""
# not using bobcat parser - note: it wasn't compatible with spider ansatz
history: we are using spiders reader for Spanish/Uspantek, because the pytorch trainer goes well
#with it. Note that this was done in Sep 2024 just to get the
# code off the ground. However, other than this being a good baseline,
spider reader should soon be discarded and switched to bobcat parser + some
quantum trainers asap. """
"""spanish_diagrams is a dummy function I had created once when I had to test
both MRPC and uspantekan/spanish data at the same time. it is very useful in debugging
especially for converting aldea_0 kinda format issues. But do remove this
once not needed. Mithun@26th sep 2024"""
def convert_to_diagrams(list_sents,labels):
list_target = []
labels_target = []
sent_count_longer_than_32=0
for sent, label in tqdm(zip(list_sents, labels),desc="reading sent"):
# tokenized = spacy_spanish_tokeniser.tokenise_sentence(sent)
# diag =parser.sentence2diagram(tokenized, tokenised= True)
# diag.draw()
# list_target.append(diag)
# #this is
# if(USE_MRPC_DATA):
# sent = sent.split('\t')[2]
tokenized = spacy_tokeniser.tokenise_sentence(sent)
"""if the length of sentences is more than 32, ignore it
doing this to avoid this error
(ValueError: maximum supported dimension for an ndarray is 32, found 33)
Todo: find if this is a very spanish tokenizer only issue or like pytorchmodel only issue"""
if( USE_SPANISH_DATA or USE_USP_DATA):
if len(tokenized)> 32:
print(f"no of tokens in this sentence is {len(tokenized)}")
sent_count_longer_than_32+=1
continue
spiders_diagram = parser_to_use.sentence2diagram(sentence=sent)
list_target.append(spiders_diagram)
labels_target.append(label)
print(f"sent_count_longer_than_32={sent_count_longer_than_32}")
print("no. of items processed= ", len(list_target))
return list_target, labels_target
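# A minimal sketch (hypothetical sentence, commented out to avoid re-running the
# tokenizer/parser at this point) of the tokenise-then-parse flow above:
# tokens = spacy_tokeniser.tokenise_sentence("el baile es bonito")
# diagram = parser_to_use.sentence2diagram(sentence="el baile es bonito")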
"""#convert the plain text input to ZX diagrams
# #todo: find who does the adding the underscore 0 part. is it ansatz or sentence2diagram?.
# Ans:value of _0 the 0 part, is extracted in spanish_diagrams. But the attaching it to
# the nounc part aldea_0 is done by ansatz.
# Note that this is a confusion arising on sep 29th 2024: because we don't know what is the meaning of
# the _0 in aldea. Rather, i am yet to read the 2010 discocat paper. That should explain it
# Until then taking a guess"""
train_diagrams, train_labels_v2 = convert_to_diagrams(train_data,train_labels)
val_diagrams, val_labels_v2 = convert_to_diagrams(val_data,val_labels)
test_diagrams, test_labels_v2 = convert_to_diagrams(test_data,test_labels)
"""#assignign teh labels back to s`ame old lable
# doing because didnt want same variable going into th function and returning it.
# python lets you get away with it, but i dont trust it"""
train_labels = train_labels_v2
val_labels = val_labels_v2
test_labels = test_labels_v2
"""
these d1.cod == d2.cod lines are now orphan code, but they were there in Khatri's original
code (https://colab.research.google.com/drive/13W_oktxSFMAB6m5Rfvy8vidxuQDrCWwW#scrollTo=0be9c058)
Mithun@27thsep2024 - I have a bad feeling I might have removed all this a year ago, when it was giving an "error".
Clearly my mental state at that time was so messed up that all I was trying to do was somehow get it to work,
even if it meant removing bug-filled code...weird/sad but true.
update@5thnov2024 - the d1.cod == d2.cod check is a very specific thing for datasets which have pairs of
inputs, e.g. MRPC or NLI. for plain single-sentence classification it shouldn't matter"""
# from collections import Counter
# # We omit any case where the 2 phrases are not parsed to the same type
# joint_diagrams_train = [d1 @ d2.r if d1.cod == d2.cod else None for (d1, d2) in zip(train_diags1, train_diags2)]
# joint_diagrams_test = [d1 @ d2.r if d1.cod == d2.cod else None for (d1, d2) in zip(test_diags1, test_diags2)]
# train_diags_raw = [d for d in joint_diagrams_train if d is not None]
# train_y = np.array([y for d,y in zip(joint_diagrams_train, filt_train_y) if d is not None])
# test_diags_raw = [d for d in joint_diagrams_test if d is not None]
# test_y = np.array([y for d,y in zip(joint_diagrams_test, filt_test_y) if d is not None])
# print("FINAL DATASET SIZE:")
# print("-----------------------------------")
# print(f"Training: {len(train_diags_raw)} {Counter([tuple(elem) for elem in train_y])}")
# print(f"Testing : {len(test_diags_raw)} {Counter([tuple(elem) for elem in test_y])}")
train_X = []
val_X = []
"""# Note: removing cups and normalizing is more useful in bobcat parser, not in spiders
#but leaving it here since eventually we want everything to go through bobcat
# refer: https://cqcl.github.io/lambeq-docs/tutorials/trainer-quantum.html"""
remove_cups = RemoveCupsRewriter()
for d in tqdm(train_diagrams):
train_X.append(remove_cups(d).normal_form())
for d in tqdm(val_diagrams):
val_X.append(remove_cups(d).normal_form())
train_diagrams = train_X
val_diagrams = val_X
# this is used only when there is a pair of sentences
# from discopy.quantum.gates import CX, Rx, H, Bra, Id
# equality_comparator = (CX >> (H @ Rx(0.5)) >> (Bra(0) @ Id(1)))
# equality_comparator.draw()
"""
print and assert statements for debugging
"""
assert len(train_diagrams)== len(train_labels_v2)
print(f"count of train, test, val elements respectively are: ")
print({len(train_diagrams)}, {len(test_diagrams)}, {len(val_diagrams)})
assert len(train_diagrams)== len(train_labels)
assert len(val_diagrams)== len(val_labels)
assert len(test_diagrams)== len(test_labels)
def run_experiment(nlayers=1, seed=SEED):
"""mithuns comment @26thsep2024typically spider ansatz only goes with spider reader.
like i mentioned earlier, spider was used to just get the code off the ground
1) we need to definitely test with atleast bobcat parser
2) Noun should have a higher dimension than sentence? how?
- go back and confirm the original 1958 paper by lambek. also how
is the code in LAMBEQ deciding the dimensions or even what data types to use?
answer might be in 2010 discocat paper"""
ansatz = ansatz_to_use({AtomicType.NOUN: Dim(4),
AtomicType.SENTENCE: Dim(2)
# AtomicType.PREPOSITIONAL_PHRASE: Dim(2),
})
"""
todo: his original code for the ansatz is below. Todo: find out why we switched to the above.
I think it had something to do with spider ansatz.
todo: write the whole history of playing with this code - why spider ansatz etc. - in one single
word document, in chronological order - for your own sanity.
N = AtomicType.NOUN
S = AtomicType.SENTENCE
P = AtomicType.PREPOSITIONAL_PHRASE
print(f'RUNNING WITH {nlayers} layers')
ansatz = Sim15Ansatz({N: 1, S: 1, P:1}, n_layers=nlayers, n_single_qubit_params=3)
Also, the two lines below are more to do with comparing two things, like NLI/MRPC, and might not be that
relevant in, say, classification:
train_circs = [ansatz(d) >> equality_comparator for d in train_X]
test_circs = [ansatz(d) >> equality_comparator for d in test_X]
"""
#use the ansatz to create circuits from diagrams
train_circuits = [ansatz(diagram) for diagram in train_diagrams]
val_circuits = [ansatz(diagram) for diagram in val_diagrams]
test_circuits = [ansatz(diagram) for diagram in test_diagrams]
#mithun's comment @26thsep2024: pytorch model was the only one going well
# with spider reader and spider ansatz
#also todo: should this not contain both val and train circuits as inputs?
# qnlp_model = PytorchModel.from_diagrams(train_circuits)
from lambeq import TketModel, NumpyModel, QuantumTrainer, SPSAOptimizer, Dataset
# qnlp_model = NumpyModel.from_diagrams(train_circuits)
qnlp_model = model_to_use.from_diagrams(train_circuits)
print("length of each circuit in train is:")
print([len(x) for x in train_circuits])
train_dataset = Dataset(
train_circuits,
train_labels,
batch_size=BATCH_SIZE)
val_dataset = Dataset(val_circuits, val_labels, shuffle=False)
print(len(train_labels), len(train_circuits))
#print and assert statements for debugging
print(len(train_circuits), len(val_circuits), len(test_circuits))
assert len(train_circuits)== len(train_labels)
assert len(val_circuits)== len(val_labels)
assert len(test_circuits)== len(test_labels)
"""
#this is all wandb stuff - I was trying to automate a sweep across parameters.
Turned out to hurt more than help. Will think about this as and when we get to the experiment-running stage.
# comment out if it becomes a pain. MLFlow is better than wandb imho
sweep_config = {
'method': 'random'
}
metric = {
'name': 'loss',
'goal': 'minimize'
}
sweep_config['metric'] = metric
parameters_dict = {
'LEARNING_RATE': {
'values': [0.3, 0.03, 0.003,0.0003]
},
}
sweep_config['parameters'] = parameters_dict
parameters_dict.update({
'epochs': {
'value': 1}
})
import pprint
pprint.pprint(sweep_config)
# sweep_id = wandb.sweep(sweep_config, project="uspantekan_spider_tuning")
# wandb_logger = WandbLogger()"""
trainer = trainer_to_use(
model=qnlp_model,
loss_function=torch.nn.BCEWithLogitsLoss(),
optimizer=torch.optim.AdamW,
learning_rate=LEARNING_RATE,
use_tensorboard=True,
epochs=EPOCHS,
evaluate_functions=eval_metrics,
evaluate_on_train=True,
verbose='text',
seed=SEED)
# trainer = QuantumTrainer(
# qnlp_model,
# loss_function=torch.nn.BCEWithLogitsLoss(),
# epochs=EPOCHS,
# optimizer=SPSAOptimizer,
# optim_hyperparams={'a': 0.05, 'c': 0.06, 'A':0.01*EPOCHS},
# evaluate_functions=eval_metrics,
# evaluate_on_train=True,
# verbose = 'text',
# seed=seed
# )
#get the embeddings etc. to be used in models 2 through 4. Note
# that one very interesting thing that happens as far as model 1 is concerned is that,
# inside the function generate_initial_parameterisation(), the QNLP
# model's weights (i.e. the angles of the gates)
# get initialized with the initial fasttext embeddings of each word in training
train_embeddings, val_embeddings, max_w_param_length = generate_initial_parameterisation(
train_circuits, val_circuits, embedding_model, qnlp_model)
#run ONLY the QNLP model, i.e. let it train on train_dataset and test on val_dataset
#todo: somewhere you use the term val and somewhere else you use the term test.
# Fix it/use only one everywhere - bottom line: make val/dev explicitly different from test
trainer.fit(train_dataset, log_interval=1)
"""#for experiments on october 14th 2024. i.e
just use 1 off the shelf model and spread spectrum/parameter search
for out of hte box for usp"""
"""
Uncomment this code eventually. Here he is using the model trained above on the training circuits
themselves, just to get the training accuracy.
todo: do we want to bring in dev here now, and keep the testing below as the TEST partition?
else what is the point in doing prediction on training data - obviously it will be ~100%.
on the bright side, maybe it is a sanity check to ensure our model is getting trained at least"""
"""error@ 16th 2024-
below code is egiving the error, i.e when trying to evaluate on val dataset
File "/Users/mithun/miniconda3/envs/qnlp/lib/python3.12/site-packages/lambeq/training/pytorch_model.py", line 142, in get_diagram_output
raise KeyError(
KeyError: 'Unknown symbol: únicamente_0__s
Note this is only
a) after commenting the qnlp_model.weights ka problem
inside the function generate_initial_parameterisation()
which means, we kind of ignore/commented out that problem,
and here is the next one.
b) when we are trying to evaluate on val dataset
What this error means is that: únicamente_0__s is not found
to get the diagram output for. Mostlikely it is looking
for embedding of únicamente_0__s inside train_embeddings.
"the exact line 143 can be found in this part of the LAMBEQ code:https://github.com/CQCL/lambeq/blob/8109d952d707880b8588e0d04f24f0b5a94c3d59/lambeq/training/pytorch_model.py#L140