NaN output and loss value #56

Open
afmsaif opened this issue Mar 16, 2023 · 0 comments

afmsaif commented Mar 16, 2023

I am using the following training function with the LibriSpeech dataset. Every time, the model output becomes NaN during training, so the loss is NaN as well. What could be the possible issue? (A minimal NaN check I have been trying is sketched after the code.)

class IterMeter(object):
    """keeps track of total iterations"""

    def __init__(self):
        self.val = 0

    def step(self):
        self.val += 1

    def get(self):
        return self.val

def train(model, device, train_loader, criterion, optimizer, scheduler, epoch):
    model.train()

    train_loss = 0

    data_len = len(train_loader.dataset)
    for batch_idx, _data in enumerate(train_loader):
        spectrograms, labels, input_lengths, label_lengths = _data

        spectrograms = torch.squeeze(spectrograms, dim=1)
        spectrograms = spectrograms.transpose(1, 2)

        labels = torch.LongTensor(labels.long())
        input_lengths = torch.LongTensor(input_lengths)
        label_lengths = torch.LongTensor(label_lengths)

        input_lengths = input_lengths.to(device)
        label_lengths = label_lengths.to(device)
        spectrograms, labels = spectrograms.to(device), labels.to(device)
        print(spectrograms.type())

        optimizer.zero_grad()

        output, output_lengths = model(spectrograms, input_lengths)  # (batch, time, n_class)
        output = output.transpose(0, 1)  # (time, batch, n_class)
        loss = criterion(output, labels, output_lengths, label_lengths)

        train_loss += loss.item() / len(train_loader)

        loss.backward()
        optimizer.step()
        scheduler.step()

        if batch_idx % 100 == 0 or batch_idx == data_len:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(spectrograms), data_len,
                100. * batch_idx / len(train_loader), loss.item()))

    return train_loss

def test(model, device, test_loader, criterion, epoch, batch_size=20):
    print('\nevaluating...')
    model.eval()
    test_loss = 0
    test_cer, test_wer = [], []
    n_classes = 29

    if epoch % 5 == 0:
        with torch.no_grad():
            for i, _data in enumerate(test_loader):
                spectrograms, labels, input_lengths, label_lengths = _data
                spectrograms = torch.squeeze(spectrograms)
                spectrograms = spectrograms.transpose(1, 2)

                labels = labels.long()

                input_lengths = torch.LongTensor(input_lengths)
                label_lengths = torch.LongTensor(label_lengths)

                spectrograms, labels = spectrograms.to(device), labels.to(device)

                output, output_lengths = model(spectrograms, input_lengths)  # (batch, time, n_class)
                soft_max = torch.nn.functional.softmax(output, dim=2)
                output = output.transpose(0, 1)  # (time, batch, n_class)
                loss = criterion(output, labels, output_lengths, label_lengths)
                test_loss += loss.item() / len(test_loader)

                decoder = CTCBeamDecoder(
                    [''] * (n_classes - 1) + [' '],
                    model_path=None,
                    alpha=0,
                    beta=0,
                    cutoff_top_n=40,
                    cutoff_prob=1.0,
                    beam_width=1000,
                    num_processes=4,
                    blank_id=28,
                    log_probs_input=False
                )
                beam_results, beam_scores, timesteps, out_lens = decoder.decode(soft_max, output_lengths)
                b = []
                for i in range(batch_size):
                    b.append(beam_results[i][0][:out_lens[i][0]])
                decoded_preds, decoded_targets = numtoword(b, out_lens, labels, label_lengths)

                for j in range(len(decoded_preds)):
                    test_cer.append(cer(decoded_targets[j], decoded_preds[j]))
                    test_wer.append(wer(decoded_targets[j], decoded_preds[j]))

        avg_cer = sum(test_cer) / len(test_cer)
        avg_wer = sum(test_wer) / len(test_wer)

        print('Test set: Average loss: {:.4f}, Average CER: {:4f} Average WER: {:.4f}\n'.format(test_loss, avg_cer, avg_wer))

        return test_loss, avg_cer, avg_wer
    else:
        with torch.no_grad():
            for i, _data in enumerate(test_loader):
                spectrograms, labels, input_lengths, label_lengths = _data
                spectrograms = torch.squeeze(spectrograms)
                spectrograms = spectrograms.transpose(1, 2)

                labels = labels.long()

                input_lengths = torch.LongTensor(input_lengths)
                label_lengths = torch.LongTensor(label_lengths)
                input_lengths = input_lengths.to(device)
                label_lengths = label_lengths.to(device)

                spectrograms, labels = spectrograms.to(device), labels.to(device)

                output, output_lengths = model(spectrograms, input_lengths)  # (batch, time, n_class)
                soft_max = torch.nn.functional.softmax(output, dim=2)
                output = output.transpose(0, 1)  # (time, batch, n_class)
                loss = criterion(output, labels, output_lengths, label_lengths)
                test_loss += loss.item() / len(test_loader)

        print('Test set: Average loss: {:.4f}\n'.format(test_loss))
        return test_loss, 0, 0

def main(learning_rate=5e-4, batch_size=20, epochs=10,
         train_url="train-clean-100", test_url="test-clean"):

    hparams = {
        "n_class": 29,
        "n_feats": 80,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "epochs": epochs
    }

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(7)
    device = torch.device("cuda" if use_cuda else "cpu")

    if not os.path.isdir("./data"):
        os.makedirs("./data")

    train_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=train_url, download=True)
    test_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=test_url, download=True)

    kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {}
    train_loader = data.DataLoader(dataset=train_dataset,
                                   batch_size=hparams['batch_size'],
                                   shuffle=True,
                                   collate_fn=lambda x: data_processing(x, 'train'),
                                   **kwargs)
    test_loader = data.DataLoader(dataset=test_dataset,
                                  batch_size=hparams['batch_size'],
                                  shuffle=False,
                                  collate_fn=lambda x: data_processing(x, 'valid'),
                                  **kwargs)

    model = Conformer(num_classes=hparams['n_class'],
                      input_dim=hparams['n_feats'],
                      encoder_dim=512,
                      num_encoder_layers=1)

    model = nn.DataParallel(model)
    model.to(device)

    print('Num Model Parameters', sum([param.nelement() for param in model.parameters()]))

    optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
    criterion = nn.CTCLoss().to(device)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=hparams['learning_rate'],
                                              steps_per_epoch=int(len(train_loader)),
                                              epochs=hparams['epochs'],
                                              anneal_strategy='linear')
    train_loss = []
    test_loss = []
    cer = []
    wer = []
    for epoch in range(1, epochs + 1):
        tra_loss = train(model, device, train_loader, criterion, optimizer, scheduler, epoch)
        tes_loss, c, w = test(model, device, test_loader, criterion, epoch)
        train_loss.append(tra_loss)
        test_loss.append(tes_loss)
        cer.append(c)
        wer.append(w)
    return train_loss, test_loss, cer, wer
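
To narrow down where the NaN first shows up, this is the kind of check I have been running around the loop in train(). It is only a minimal sketch: report_nan is a helper I added myself, while torch.autograd.set_detect_anomaly and the zero_infinity flag of nn.CTCLoss are standard PyTorch options, not anything from the Conformer repo.

import torch

# Report the first operation that produces a NaN during backward().
torch.autograd.set_detect_anomaly(True)

def report_nan(name, tensor):
    """Print a warning if the given tensor contains any NaN values."""
    if torch.isnan(tensor).any():
        print(f'NaN detected in {name}')

# Inside the batch loop in train(), e.g.:
#   report_nan('spectrograms', spectrograms)
#   report_nan('model output', output)
#   report_nan('loss', loss)

# CTCLoss returns inf (and then NaN gradients) when a label sequence is longer
# than the corresponding downsampled input length; zero_infinity=True zeroes
# those losses instead of letting them poison the weights.
criterion = torch.nn.CTCLoss(zero_infinity=True)

With anomaly detection enabled, the traceback should point at the layer where the NaN first appears, which I am hoping will show whether it comes from the Conformer encoder or from the CTC loss itself.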