As part of my learning process I wanted to write a transformer model to copy input sequence to output sequence. I thought it would be relatively straight forward, however the results are less than ideal - loss is higher than I've expected resulting in inaccurate copies of numbers in the sequence.
I would love to get some pointers on where I can improve (so far I tried a lot of hyper param tuning, nothing substantially changed the outcome). Is there something wrong with my architecture? Is there some algorithmic bugs I overlooked?
Sample run
Epoch 0 Loss 1.9481123001017469
Epoch 1 Loss 1.4541001472067325
Epoch 2 Loss 1.2569004525529577
Epoch 3 Loss 1.158278153297749
Epoch 4 Loss 1.1283172952367904
Test Case 0
Expected: [2, 3, 4, 2, 5, 9, 6, 7]
Actual: [2, 6, 4, 9, 3, 2, 7, 5]
Test Case 1
Expected: [8, 2, 5, 6, 7, 9, 4, 8]
Actual: [8, 2, 9, 4, 6, 5, 8, 7]
Test Case 2
Expected: [8, 5, 6, 3, 4, 5, 6, 8]
Actual: [8, 6, 5, 3, 4, 6, 8, 5]
Test Case 3
Expected: [5, 5, 7, 5, 2, 2, 9, 2]
Actual: [2, 5, 2, 5, 9, 7, 2]
Test Case 4
Expected: [4, 5, 9, 5, 3, 5, 7, 8]
Actual: [5, 9, 5, 3, 8, 4, 7]
My Implementation:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import math
from torch.utils.data import Dataset, DataLoader
# Set the random seed for reproducibility
torch.manual_seed(0)
# Define the device to run the model on
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Model hyperparameters
input_size = 10  # Size of the input vocabulary (number of unique tokens)
num_layers = 6  # Number of transformer layers
d_model = 512  # The number of expected features in the encoder/decoder inputs
nhead = 8  # The number of heads in the multiheadattention model
dim_feedforward = 2048  # Dimension of the feedforward network model in nn.TransformerEncoder
dropout = 0.01  # Dropout rate
max_seq_length = 10  # Maximum sequence length
# Training hyperparameters
batch_size = 64
num_samples = 3000
epochs = 5
lr = 0.0001  # Learning rate
SOS_token = np.array([0])
EOS_token = np.array([1])
class TransformerModel(nn.Module):
    def __init__(self, input_size, d_model, nhead, num_layers, dim_feedforward, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.tgt_mask = None
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.embedding = nn.Embedding(input_size, d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_layers, num_decoder_layers=num_layers,
                                          dim_feedforward=dim_feedforward, dropout=dropout, batch_first=True)
        self.linear = nn.Linear(d_model, input_size)
    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
    def forward(self, src, tgt):
        tgt_mask = self._generate_square_subsequent_mask(tgt.size(1)).to(device)
        src = self.embedding(src) * math.sqrt(d_model)
        tgt = self.embedding(tgt) * math.sqrt(d_model)
        src = self.pos_encoder(src)
        tgt = self.pos_encoder(tgt)
        output = self.transformer(src, tgt, tgt_mask=tgt_mask)
        output = self.linear(output)
        return output
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=1024):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)
    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
class TrainingDataSet(Dataset):
    def __init__(self, num_tokens, seq_length, size):
        self.num_tokens = num_tokens
        self.seq_length = seq_length
        self.size = size
    def __len__(self):
        return self.size
    def __getitem__(self, index):
        sequence = np.concatenate(
            (SOS_token, torch.randint(2, 10, (self.seq_length,), dtype=torch.long), EOS_token))
        return sequence, sequence  # input and target are the same in the copy task
# Create the dataset and dataloader
dataset = TrainingDataSet(input_size,
                          max_seq_length - 2,  # -2 due to SOS end EOS tokens
                          num_samples)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Initialize the model
model = TransformerModel(input_size, d_model, nhead, num_layers, dim_feedforward, dropout).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
# Training loop
model.train()
for epoch in range(epochs):
    total_loss = 0.
    for src, tgt in dataloader:
        src = src.to(device)
        tgt = tgt.to(device)
        tgt_input = tgt[:, :-1]
        tgt_expected = tgt[:, 1:]
        optimizer.zero_grad()
        output = model(src, tgt_input)
        # loss = criterion(output.view(-1, input_size), tgt_expected.view(-1))
        # loss = criterion(output.reshape(-1, input_size), tgt_expected.reshape(-1))
        # permute (N, L, C) -> (N, C, L) where N is batch, L is seq len, C is class/vocab_size
        output = output.permute(0, 2, 1)
        # shape of tgt_expected is (N, L) where N is batch, L is seq len
        loss = criterion(output, tgt_expected)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch} Loss {total_loss / len(dataloader)}')
# Save the model
torch.save(model.state_dict(), 'transformer_copy_task.pth')
def predict(model, input_sequence):
    """
    Method from "A detailed guide to Pytorch's nn.Transformer() module.", by
    Daniel Melchor: https://medium.com/@danielmelchor/a-detailed-guide-to-pytorchs-nn-transformer-module-c80afbc9ffb1
    """
    model.eval()
    y_input = torch.tensor([SOS_token], dtype=torch.long, device=device)
    for _ in range(max_seq_length+2):
        pred = model(input_sequence, y_input)
        next_item = pred.topk(1)[1].view(-1)[-1].item()  # num with highest probability
        next_item = torch.tensor([[next_item]], device=device)
        # Concatenate previous input with predicted best word
        y_input = torch.cat((y_input, next_item), dim=1)
        # Stop if model predicts end of sentence
        if next_item.view(-1).item() == EOS_token:
            break
    return y_input.view(-1).tolist()
# Test the model
model.eval()
# Here we test some examples to observe how the model predicts
examples = [
    torch.tensor([[0, 2, 3, 4, 2, 5, 9, 6, 7, 1]], dtype=torch.long, device=device),
    torch.tensor([[0, 8, 2, 5, 6, 7, 9, 4, 8, 1]], dtype=torch.long, device=device),
    torch.tensor([[0, 8, 5, 6, 3, 4, 5, 6, 8, 1]], dtype=torch.long, device=device),
    torch.tensor([[0, 5, 5, 7, 5, 2, 2, 9, 2, 1]], dtype=torch.long, device=device),
    torch.tensor([[0, 4, 5, 9, 5, 3, 5, 7, 8, 1]], dtype=torch.long, device=device),
]
for idx, example in enumerate(examples):
    result = predict(model, example)
    print(f"Test Case {idx}")
    print(f"Expected: {example.view(-1).tolist()[1:-1]}")
    print(f"Actual: {result[1:-1]}")
    print()
 
                        
Turns out encoder-decoder model is not suitable for this type of task. Using encoder-only model should work.