I am working on a text generation project. I downloaded the WikiBooks dataset from Kaggle:
https://www.kaggle.com/datasets/dhruvildave/wikibooks-dataset
When I try to create a dataset to tokenize the texts, my kernel crashes because it runs out of memory:
from torch.utils.data import Dataset
from transformers import BertTokenizer
import pandas as pd

class WikiDataset(Dataset):
    def __init__(self, query, conn, tokenizer, max_len):
        # read_sql_query materializes the full query result in memory
        self.df = pd.read_sql_query(query, conn)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Return a tokenized sequence; assumes the query selects a single text column
        text = self.df.iloc[index, 0]
        tokens = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        # squeeze(0) removes the batch dimension added by return_tensors="pt"
        return tokens["input_ids"].squeeze(0), tokens["attention_mask"].squeeze(0)

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
train_ds = WikiDataset(query, conn, tokenizer, 128)
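I suspect the crash happens inside pd.read_sql_query itself, before any tokenization runs, because it materializes the whole query result as a DataFrame. A rough way to gauge the size without loading everything would be something like this (the en table name is only an assumption about how the SQLite file is laid out, adjust it to the actual schema):

import sqlite3
import pandas as pd

conn = sqlite3.connect("wikibooks.sqlite")  # path to the Kaggle SQLite file

# Row count without materializing the rows
n_rows = conn.execute("SELECT COUNT(*) FROM en").fetchone()[0]

# Memory footprint of a small sample, to extrapolate from
sample = pd.read_sql_query("SELECT * FROM en LIMIT 100", conn)
per_row = sample.memory_usage(deep=True).sum() / len(sample)
print(n_rows, "rows, roughly", n_rows * per_row / 1e9, "GB if loaded at once")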
My question: how do I tokenize the texts without the kernel crashing?
I tried different max_length values, but I assume the main problem is the size of the dataset being loaded up front rather than the tokenization itself.
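One thing I am considering, sketched below but not tested end to end, is fetching one row at a time inside __getitem__ instead of loading the whole table up front. The en table and body_text column names are assumptions about the SQLite schema, and since it keeps a live sqlite3 connection it probably only works with num_workers=0 in the DataLoader:

from torch.utils.data import Dataset

class LazyWikiDataset(Dataset):
    """Fetches one row per __getitem__ instead of loading the whole table."""

    def __init__(self, conn, tokenizer, max_len, table="en", text_col="body_text"):
        self.conn = conn              # open sqlite3 connection
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.table = table            # assumed table name
        self.text_col = text_col      # assumed text column name
        # Only the row count is loaded up front
        self.n_rows = conn.execute(f"SELECT COUNT(*) FROM {table}").fetchone()[0]

    def __len__(self):
        return self.n_rows

    def __getitem__(self, index):
        # Page to a single row; OFFSET is slow on huge tables but memory-safe
        row = self.conn.execute(
            f"SELECT {self.text_col} FROM {self.table} LIMIT 1 OFFSET ?", (index,)
        ).fetchone()
        tokens = self.tokenizer.encode_plus(
            row[0],
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        return tokens["input_ids"].squeeze(0), tokens["attention_mask"].squeeze(0)

train_ds = LazyWikiDataset(conn, tokenizer, 128)

Would that be a reasonable approach, or is there a more standard way to handle a dataset that does not fit in memory?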