I would like to train a GPT-2 model from scratch on a very specific non-English corpus with Hugging Face. I managed to "train" a tokenizer, and I managed to create a simplified version of GPT-2 with a smaller embedding dimension and fewer layers, since it is just a prototype and I only have a Colab notebook with one GPU.
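The model itself is just a scaled-down GPT2Config, roughly along these lines (the exact sizes below are illustrative placeholders, not my real values):

from transformers import GPT2Config, GPT2LMHeadModel

# Scaled-down GPT-2 prototype; the numbers below are placeholders
config = GPT2Config(
    vocab_size=len(tokenizer),  # tokenizer trained on my corpus
    n_positions=64,             # context length matching the chunk size
    n_embd=256,                 # smaller embedding dimension
    n_layer=4,                  # fewer layers than the original GPT-2
    n_head=4,
)
model = GPT2LMHeadModel(config)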
There are 174 training documents. When I tokenize them and split them into chunks of 64 tokens, the training sample size goes up from 174 (documents) to roughly 5 000 (chunks), which is the expected behavior.
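The chunking step I mean is roughly the following (a sketch; the dataset path and the "text" field name are placeholders for my actual ones):

from datasets import load_from_disk

CONTEXT_LENGTH = 64

def tokenize_and_chunk(batch):
    # Tokenize whole documents, join them with EOS, then cut the
    # concatenated ids into fixed-length blocks of CONTEXT_LENGTH tokens
    tokenized = tokenizer(batch["text"], truncation=False)
    all_ids = []
    for ids in tokenized["input_ids"]:
        all_ids.extend(ids + [tokenizer.eos_token_id])
    chunks = [all_ids[i : i + CONTEXT_LENGTH]
              for i in range(0, len(all_ids), CONTEXT_LENGTH)]
    # Drop the trailing chunk if it is shorter than CONTEXT_LENGTH
    return {"input_ids": [c for c in chunks if len(c) == CONTEXT_LENGTH]}

raw_train = load_from_disk("my_dataset")["train"]  # placeholder path
chunked = raw_train.map(tokenize_and_chunk, batched=True,
                        remove_columns=raw_train.column_names)
print(len(chunked))  # roughly 5 000 chunks from the 174 documents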
Now I want to replicate this approach with a PyTorch IterableDataset and DataLoader:
import torch
from torch.utils.data import IterableDataset

class ConstantLengthDataset(IterableDataset):
    def __init__(self, tokenizer, dataset, field_name="text", seq_length=CONTEXT_LENGTH,
                 num_of_sequences=174, chars_per_token=characters_per_token):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.input_characters = seq_length * chars_per_token * num_of_sequences
        self.field_name = field_name

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.input_characters:
                    m = f"Buffer full: {buffer_len}>={self.input_characters:.0f}"
                    # print(m)
                    break
                try:
                    buffer.append(next(iterator)[self.field_name])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    iterator = iter(self.dataset)
            all_token_ids = []
            tokenized_inputs = self.tokenizer(buffer, truncation=False)
            for j, tokenized_input in enumerate(tokenized_inputs["input_ids"]):
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    yield torch.tensor(input_ids)
and the data loaders:
from datasets import load_from_disk
from torch.utils.data import DataLoader

def create_dataloaders(dataset_name):
    train_data = load_from_disk(dataset_name)["train"]
    train_data = train_data.shuffle(seed=args.seed)
    valid_data = load_from_disk(dataset_name)["test"]
    train_dataset = ConstantLengthDataset(tokenizer, train_data,
                                          seq_length=args.seq_length)
    valid_dataset = ConstantLengthDataset(tokenizer, valid_data,
                                          seq_length=args.seq_length)
    train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size)
    eval_dataloader = DataLoader(valid_dataset, batch_size=args.valid_batch_size)
    return train_dataloader, eval_dataloader
Since it is an IterableDataset, len() does not work. When I iterate through the data:
for i, data in enumerate(train_dataloader):
    if i % 10000 == 0:
        print(f"{i}")
instead of getting roughly 5 000 samples, the iteration just keeps going, past 9 million. I could not wait for it to finish.
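For comparison, counting the tokens directly over the raw documents gives the number of chunks I expect (a sketch, assuming the training split is loaded as train_data and the tokenizer adds no special tokens on its own):

# Rough sanity check of how many 64-token chunks the corpus should yield
total_tokens = 0
for example in train_data:
    total_tokens += len(tokenizer(example["text"])["input_ids"]) + 1  # +1 for the EOS separator
print(total_tokens // CONTEXT_LENGTH)  # around 5 000 for my corpus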
What is going on here?