As part of my learning process I wanted to write a transformer model that copies the input sequence to the output sequence. I thought it would be relatively straightforward; however, the results are less than ideal - the loss is higher than I expected, resulting in inaccurate copies of the numbers in the sequence.
I would love to get some pointers on where I can improve (so far I have tried a lot of hyperparameter tuning, and nothing substantially changed the outcome). Is there something wrong with my architecture? Are there algorithmic bugs I overlooked?
Sample run
Epoch 0 Loss 1.9481123001017469
Epoch 1 Loss 1.4541001472067325
Epoch 2 Loss 1.2569004525529577
Epoch 3 Loss 1.158278153297749
Epoch 4 Loss 1.1283172952367904
Test Case 0
Expected: [2, 3, 4, 2, 5, 9, 6, 7]
Actual: [2, 6, 4, 9, 3, 2, 7, 5]
Test Case 1
Expected: [8, 2, 5, 6, 7, 9, 4, 8]
Actual: [8, 2, 9, 4, 6, 5, 8, 7]
Test Case 2
Expected: [8, 5, 6, 3, 4, 5, 6, 8]
Actual: [8, 6, 5, 3, 4, 6, 8, 5]
Test Case 3
Expected: [5, 5, 7, 5, 2, 2, 9, 2]
Actual: [2, 5, 2, 5, 9, 7, 2]
Test Case 4
Expected: [4, 5, 9, 5, 3, 5, 7, 8]
Actual: [5, 9, 5, 3, 8, 4, 7]
My Implementation:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import math
from torch.utils.data import Dataset, DataLoader
# Set the random seed for reproducibility
torch.manual_seed(0)
# Define the device to run the model on
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Model hyperparameters
input_size = 10 # Size of the input vocabulary (number of unique tokens)
num_layers = 6 # Number of transformer layers
d_model = 512 # The number of expected features in the encoder/decoder inputs
nhead = 8 # The number of heads in the multiheadattention model
dim_feedforward = 2048 # Dimension of the feedforward network model in nn.TransformerEncoder
dropout = 0.01 # Dropout rate
max_seq_length = 10 # Maximum sequence length
# Training hyperparameters
batch_size = 64
num_samples = 3000
epochs = 5
lr = 0.0001 # Learning rate
SOS_token = np.array([0])
EOS_token = np.array([1])
class TransformerModel(nn.Module):
    def __init__(self, input_size, d_model, nhead, num_layers, dim_feedforward, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.tgt_mask = None
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.embedding = nn.Embedding(input_size, d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_layers, num_decoder_layers=num_layers,
                                          dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)
        self.linear = nn.Linear(d_model, input_size)

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def forward(self, src, tgt):
        tgt_mask = self._generate_square_subsequent_mask(tgt.size(1)).to(device)
        src = self.embedding(src) * math.sqrt(d_model)
        tgt = self.embedding(tgt) * math.sqrt(d_model)
        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)
        output = self.transformer(src, tgt, tgt_mask=tgt_mask)
        output = self.linear(output)
        return output
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=1024):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
class TrainingDataSet(Dataset):
    def __init__(self, num_tokens, seq_length, size):
        self.num_tokens = num_tokens
        self.seq_length = seq_length
        self.size = size

    def __len__(self):
        return self.size

    def __getitem__(self, index):
        sequence = np.concatenate(
            (SOS_token, torch.randint(2, 10, (self.seq_length,), dtype=torch.long), EOS_token))
        return sequence, sequence  # input and target are the same in the copy task
# Create the dataset and dataloader
dataset = TrainingDataSet(input_size,
                          max_seq_length - 2,  # -2 due to SOS and EOS tokens
                          num_samples)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Initialize the model
model = TransformerModel(input_size, d_model, nhead, num_layers, dim_feedforward, dropout).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
# Training loop
model.train()
for epoch in range(epochs):
    total_loss = 0.
    for src, tgt in dataloader:
        src = src.to(device)
        tgt = tgt.to(device)
        tgt_input = tgt[:, :-1]
        tgt_expected = tgt[:, 1:]
        optimizer.zero_grad()
        output = model(src, tgt_input)
        # loss = criterion(output.view(-1, input_size), tgt_expected.view(-1))
        # loss = criterion(output.reshape(-1, input_size), tgt_expected.reshape(-1))
        # permute (N, L, C) -> (N, C, L) where N is batch, L is seq len, C is class/vocab_size
        output = output.permute(0, 2, 1)
        # shape of tgt_expected is (N, L) where N is batch, L is seq len
        loss = criterion(output, tgt_expected)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch} Loss {total_loss / len(dataloader)}')
# Save the model
torch.save(model.state_dict(), 'transformer_copy_task.pth')
def predict(model, input_sequence):
    """
    Method from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    model.eval()
    y_input = torch.tensor([SOS_token], dtype=torch.long, device=device)
    for _ in range(max_seq_length + 2):
        pred = model(input_sequence, y_input)
        next_item = pred.topk(1)[1].view(-1)[-1].item()  # num with highest probability
        next_item = torch.tensor([[next_item]], device=device)
        # Concatenate previous input with predicted best word
        y_input = torch.cat((y_input, next_item), dim=1)
        # Stop if model predicts end of sentence
        if next_item.view(-1).item() == EOS_token:
            break
    return y_input.view(-1).tolist()
# Test the model
model.eval()
# Here we test some examples to observe how the model predicts
examples = [
    torch.tensor([[0, 2, 3, 4, 2, 5, 9, 6, 7, 1]], dtype=torch.long, device=device),
    torch.tensor([[0, 8, 2, 5, 6, 7, 9, 4, 8, 1]], dtype=torch.long, device=device),
    torch.tensor([[0, 8, 5, 6, 3, 4, 5, 6, 8, 1]], dtype=torch.long, device=device),
    torch.tensor([[0, 5, 5, 7, 5, 2, 2, 9, 2, 1]], dtype=torch.long, device=device),
    torch.tensor([[0, 4, 5, 9, 5, 3, 5, 7, 8, 1]], dtype=torch.long, device=device),
]
for idx, example in enumerate(examples):
    result = predict(model, example)
    print(f"Test Case {idx}")
    print(f"Expected: {example.view(-1).tolist()[1:-1]}")
    print(f"Actual: {result[1:-1]}")
    print()
It turns out an encoder-decoder model is not well suited to this type of task; an encoder-only model should work.
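For what it's worth, here is a minimal sketch of what I mean by an encoder-only copy model. The class name CopyEncoder, the learned positional embeddings, and the hyperparameters are my own illustrative choices (not taken from the code above); the point is only that each position can be classified directly from the unmasked encoder output, so no autoregressive decoding loop is needed.

# Hypothetical sketch of an encoder-only copy model; names and
# hyperparameters are illustrative, not tuned.
import math
import torch
import torch.nn as nn

class CopyEncoder(nn.Module):
    def __init__(self, vocab_size=10, d_model=128, nhead=8, num_layers=3,
                 dim_feedforward=512, dropout=0.1, max_len=32):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Learned positional embeddings keep the example short; sinusoidal
        # encodings as in the original code would work as well.
        self.pos_embedding = nn.Embedding(max_len, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward,
            dropout=dropout, batch_first=True)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(d_model, vocab_size)
        self.scale = math.sqrt(d_model)

    def forward(self, src):
        # src: (batch, seq_len) token ids -> logits: (batch, seq_len, vocab_size)
        positions = torch.arange(src.size(1), device=src.device).unsqueeze(0)
        x = self.embedding(src) * self.scale + self.pos_embedding(positions)
        x = self.encoder(x)
        return self.classifier(x)

# Usage: every position is predicted in one forward pass, no SOS/EOS loop.
model = CopyEncoder()
src = torch.randint(2, 10, (4, 8))                           # 4 sequences of length 8
logits = model(src)                                          # (4, 8, 10)
loss = nn.CrossEntropyLoss()(logits.permute(0, 2, 1), src)   # target == input

Since every output token is predicted in parallel from the full input, the task reduces to learning a per-position identity mapping, which seems to be much easier to fit than the autoregressive formulation above.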