I'm trying to create an encoder-decoder LSTM model for text summarization in TensorFlow, but I don't think my code is working properly. Can anyone help me fix it?
Importing libraries
import io
import json
import numpy as np
import pandas as pd
import random
import re
import tensorflow as tf
import unicodedata
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datasets import load_dataset
from transformers import AutoTokenizer
Importing and processing data (samsum = https://huggingface.co/datasets/samsum)
def cnn_dailymail_dataset():
    dataset = load_dataset("samsum")
    df_train = pd.DataFrame(dataset['train'])
    df_test = pd.DataFrame(dataset['test'])
    number_regex = r'\d+'
    whitespace_regex = r'\s+'
    punctuation_regex = r'[^\w\s]'
    # Train data (regex=True is required on newer pandas, where str.replace defaults to literal matching)
    df_train['clear_article'] = df_train['dialogue'].str.lower().str.replace(number_regex, '', regex=True).str.replace(whitespace_regex, ' ', regex=True).str.replace(punctuation_regex, '', regex=True)
    df_train['clear_highlights'] = df_train['summary'].str.lower().str.replace(number_regex, '', regex=True).str.replace(whitespace_regex, ' ', regex=True).str.replace(punctuation_regex, '', regex=True)
    # Test data
    df_test['clear_article'] = df_test['dialogue'].str.lower().str.replace(number_regex, '', regex=True).str.replace(whitespace_regex, ' ', regex=True).str.replace(punctuation_regex, '', regex=True)
    df_test['clear_highlights'] = df_test['summary'].str.lower().str.replace(number_regex, '', regex=True).str.replace(whitespace_regex, ' ', regex=True).str.replace(punctuation_regex, '', regex=True)
    # Train data
    X_train = df_train['clear_article'].apply(str).values
    y_train = df_train['clear_highlights'].apply(str).values
    # Test data
    X_test = df_test['clear_article'].apply(str).values
    y_test = df_test['clear_highlights'].apply(str).values
    return X_train, y_train, X_test, y_test
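Just to confirm what the function returns, a throwaway check like this can help (illustrative only; the exact row counts depend on the dataset version):
# Illustrative check: four arrays of cleaned strings should come back.
X_train, y_train, X_test, y_test = cnn_dailymail_dataset()
print(len(X_train), len(X_test))   # roughly 14.7k train and ~800 test dialogues
print(X_train[0][:200])            # a peek at one cleaned dialogue
print(y_train[0])                  # and its cleaned summary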
Add <sos> and <eos> tokens
train_preprocessed_input, train_preprocessed_target, _, _ = cnn_dailymail_dataset()
def tag_target_sentences(sentences):
    tagged_sentences = map(lambda s: ' '.join(['<sos>', s, '<eos>']), sentences)
    return list(tagged_sentences)
train_tagged_preprocessed_target = tag_target_sentences(train_preprocessed_target)
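As a quick illustration (the sample sentence here is made up), tagging just wraps each summary with the start and end markers:
# Hypothetical example: every summary gets wrapped in <sos> ... <eos>.
print(tag_target_sentences(['amanda baked cookies and will bring some tomorrow']))
# -> ['<sos> amanda baked cookies and will bring some tomorrow <eos>']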
Setting up the tokenizers
source_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')
source_tokenizer.fit_on_texts(train_preprocessed_input)
source_tokenizer.get_config()
source_vocab_size = len(source_tokenizer.word_index) + 1
print(source_vocab_size)
# Tokenizer for the target summaries.
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')
target_tokenizer.fit_on_texts(train_tagged_preprocessed_target)
target_tokenizer.get_config()
target_vocab_size = len(target_tokenizer.word_index) + 1
print(target_vocab_size)
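One thing worth double-checking is that the custom filters string really keeps < and >, so that <unk>, <sos> and <eos> get their own vocabulary indices; a minimal sanity check, assuming the tokenizers fitted above:
# Each special token should map to an integer index (not None).
for token in ['<unk>', '<sos>', '<eos>']:
    print(token, target_tokenizer.word_index.get(token))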
Teacher forcing and padding sequences
train_encoder_inputs = source_tokenizer.texts_to_sequences(train_preprocessed_input)
print(train_encoder_inputs[:3])
print(source_tokenizer.sequences_to_texts(train_encoder_inputs[:3]))
def generate_decoder_inputs_targets(sentences, tokenizer):
    seqs = tokenizer.texts_to_sequences(sentences)
    decoder_inputs = [s[:-1] for s in seqs]   # Drop the last token (<eos>) from each sentence.
    decoder_targets = [s[1:] for s in seqs]   # Drop the first token (<sos>) from each sentence.
    return decoder_inputs, decoder_targets
train_decoder_inputs, train_decoder_targets = generate_decoder_inputs_targets(train_tagged_preprocessed_target, target_tokenizer)
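The decoder inputs and targets are the same tagged sequence shifted by one token, which is the teacher-forcing setup; a small check on the first example (assuming the variables above):
# The input version starts with <sos>, the target version ends with <eos>.
print(target_tokenizer.sequences_to_texts([train_decoder_inputs[0]]))
print(target_tokenizer.sequences_to_texts([train_decoder_targets[0]]))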
max_encoding_len = 100
max_decoding_len = 60
padded_train_encoder_inputs = pad_sequences(train_encoder_inputs, max_encoding_len, padding='post', truncating='post')
padded_train_decoder_inputs = pad_sequences(train_decoder_inputs, max_decoding_len, padding='post', truncating='post')
padded_train_decoder_targets = pad_sequences(train_decoder_targets, max_decoding_len, padding='post', truncating='post')
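After padding, everything should be a dense 2-D array; a quick shape check (assuming the arrays above):
# Expected: (num_train_examples, 100) for the encoder and (num_train_examples, 60) for the decoder.
print(padded_train_encoder_inputs.shape)
print(padded_train_decoder_inputs.shape)
print(padded_train_decoder_targets.shape)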
Model Architecture
embedding_dim = 120
hidden_dim = 100
default_dropout=0.2
batch_size = 12
epochs = 12
encoder_inputs = layers.Input(shape=[None], name='encoder_inputs')
encoder_embeddings = layers.Embedding(source_vocab_size,
                                      embedding_dim,
                                      mask_zero=True,
                                      name='encoder_embeddings')
encoder_embedding_output = encoder_embeddings(encoder_inputs)
encoder_lstm = layers.LSTM(hidden_dim,
                           return_state=True,
                           dropout=default_dropout,
                           name='encoder_lstm')
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding_output)
encoder_states = (state_h, state_c)
decoder_inputs = layers.Input(shape=[None], name='decoder_inputs')
decoder_embeddings = layers.Embedding(target_vocab_size,
                                      embedding_dim,
                                      mask_zero=True,
                                      name='decoder_embeddings')
decoder_embedding_output = decoder_embeddings(decoder_inputs)
decoder_lstm = layers.LSTM(hidden_dim,
                           return_sequences=True,
                           return_state=True,
                           dropout=default_dropout,
                           name='decoder_lstm')
decoder_outputs, _, _ = decoder_lstm(decoder_embedding_output, initial_state=encoder_states)
decoder_dense = layers.Dense(target_vocab_size, activation='softmax', name='decoder_dense')
y_proba = decoder_dense(decoder_outputs)
model = tf.keras.Model([encoder_inputs, decoder_inputs], y_proba, name='cnn_dailymail_no_attention')
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics='sparse_categorical_accuracy')
model.summary()
Training the model
history = model.fit([padded_train_encoder_inputs, padded_train_decoder_inputs], padded_train_decoder_targets,
                    batch_size=batch_size,
                    epochs=epochs)
model.save('cnn_dailymail_no_attention')
Save the tokenizers
source_tokenizer_json = source_tokenizer.to_json()
with io.open('cnn_source_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(source_tokenizer_json, ensure_ascii=False))
target_tokenizer_json = target_tokenizer.to_json()
with io.open('cnn_target_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(target_tokenizer_json, ensure_ascii=False))
Evaluating the model
def process_dataset(preprocessed_input, preprocessed_output):
    # Tag target sentences with <sos> and <eos> tokens.
    tagged_preprocessed_output = tag_target_sentences(preprocessed_output)
    # Vectorize encoder source sentences.
    encoder_inputs = source_tokenizer.texts_to_sequences(preprocessed_input)
    # Vectorize and create decoder input and target sentences.
    decoder_inputs, decoder_targets = generate_decoder_inputs_targets(tagged_preprocessed_output,
                                                                      target_tokenizer)
    # Pad all collections.
    padded_encoder_inputs = pad_sequences(encoder_inputs, max_encoding_len, padding='post', truncating='post')
    padded_decoder_inputs = pad_sequences(decoder_inputs, max_decoding_len, padding='post', truncating='post')
    padded_decoder_targets = pad_sequences(decoder_targets, max_decoding_len, padding='post', truncating='post')
    return padded_encoder_inputs, padded_decoder_inputs, padded_decoder_targets
Loading the saved tokenizers and the model
with open('cnn_source_tokenizer.json') as f:
    data = json.load(f)
source_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)
with open('cnn_target_tokenizer.json') as f:
    data = json.load(f)
target_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)
# Load the model.
model = tf.keras.models.load_model('cnn_dailymail_no_attention')
Model Evaluation
_, _, X_test, y_test = cnn_dailymail_dataset()
padded_test_encoder_inputs, padded_test_decoder_inputs, padded_test_decoder_targets = process_dataset(X_test, y_test)
model.evaluate([padded_test_encoder_inputs, padded_test_decoder_inputs], padded_test_decoder_targets)
[3.7422165870666504, 0.19346268475055695]
Making summaries (predicting summaries)
Defining the encoder
# These are the layers of our trained model.
[layer.name for layer in model.layers]
encoder_inputs = model.get_layer('encoder_inputs').input
encoder_embedding_layer = model.get_layer('encoder_embeddings')
encoder_embeddings = encoder_embedding_layer(encoder_inputs)
encoder_lstm = model.get_layer('encoder_lstm')
_, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embeddings)
encoder_states = [encoder_state_h, encoder_state_c]
encoder_model_no_attention = tf.keras.Model(encoder_inputs, encoder_states)
Defining the decoder
decoder_inputs = model.get_layer('decoder_inputs').input
decoder_embedding_layer = model.get_layer('decoder_embeddings')
decoder_embeddings = decoder_embedding_layer(decoder_inputs)
decoder_input_state_h = tf.keras.Input(shape=(hidden_dim,), name='decoder_input_state_h')
decoder_input_state_c = tf.keras.Input(shape=(hidden_dim,), name='decoder_input_state_c')
decoder_input_states = [decoder_input_state_h, decoder_input_state_c]
decoder_lstm = model.get_layer('decoder_lstm')
decoder_sequence_outputs, decoder_output_state_h, decoder_output_state_c = decoder_lstm(
    decoder_embeddings, initial_state=decoder_input_states
)
decoder_output_states = [decoder_output_state_h, decoder_output_state_c]
decoder_dense = model.get_layer('decoder_dense')
y_proba = decoder_dense(decoder_sequence_outputs)
decoder_model_no_attention = tf.keras.Model(
    [decoder_inputs] + decoder_input_states,
    [y_proba] + decoder_output_states
)
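A one-step smoke test of the inference decoder (dummy token and zero states, assuming the models and variables above) should give a probability tensor over the target vocabulary plus the two new states:
# Feed a single <sos> token with zero initial states; shapes should be
# (1, 1, target_vocab_size), (1, hidden_dim) and (1, hidden_dim).
dummy_token = np.array([[target_tokenizer.word_index['<sos>']]])
dummy_states = [np.zeros((1, hidden_dim)), np.zeros((1, hidden_dim))]
probs, h, c = decoder_model_no_attention.predict([dummy_token] + dummy_states)
print(probs.shape, h.shape, c.shape)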
Making the prediction function
def translate_without_attention(sentence: str,
                                source_tokenizer, encoder,
                                target_tokenizer, decoder,
                                max_translated_len=30):
    # Vectorize the source sentence and run it through the encoder.
    input_seq = source_tokenizer.texts_to_sequences([sentence])
    # Get the tokenized sentence to see if there are any unknown tokens.
    tokenized_sentence = source_tokenizer.sequences_to_texts(input_seq)
    states = encoder.predict(input_seq)
    current_word = '<sos>'
    decoded_sentence = []
    while len(decoded_sentence) < max_translated_len:
        # Set the next input word for the decoder.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = target_tokenizer.word_index[current_word]
        # Determine the next word.
        target_y_proba, h, c = decoder.predict([target_seq] + states)
        target_token_index = np.argmax(target_y_proba[0, -1, :])
        current_word = target_tokenizer.index_word[target_token_index]
        if current_word == '<eos>':
            break
        decoded_sentence.append(current_word)
        states = [h, c]
    return tokenized_sentence[0], ' '.join(decoded_sentence)
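For a single dialogue the function can be called directly, e.g. on the first test example (hypothetical usage, assuming the inference models above):
# Greedy decoding for one example; returns the tokenized source and the generated summary.
source_text, predicted_summary = translate_without_attention(
    X_test[0], source_tokenizer, encoder_model_no_attention,
    target_tokenizer, decoder_model_no_attention)
print(predicted_summary)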
Test Data
X_test_sentences = X_test[:30]
y_test_sentences = y_test[:30]
Final function
def translate_sentences(X_test_sentences, y_test_sentences, translation_func, source_tokenizer, encoder,
                        target_tokenizer, decoder):
    translations = {'Original_Text': [], 'Summary': [], 'Predicted_Summary': []}
    for s in range(len(X_test_sentences)):
        source, target = X_test_sentences[s], y_test_sentences[s]
        tokenized_sentence, translated = translation_func(source, source_tokenizer, encoder,
                                                          target_tokenizer, decoder)
        translations['Original_Text'].append(tokenized_sentence)
        translations['Summary'].append(target)
        translations['Predicted_Summary'].append(translated)
    return translations
translations_no_attention = pd.DataFrame(
    translate_sentences(X_test_sentences, y_test_sentences, translate_without_attention,
                        source_tokenizer, encoder_model_no_attention,
                        target_tokenizer, decoder_model_no_attention))
Saving the summaries in a CSV file
translations_no_attention.to_csv('CNN_output.csv', index=False)
It actually generates some text. Here is an example:
- Actual Summary = membership gives the icc jurisdiction over alleged crimes committed in palestinian territories since last june israel and the united states opposed the move which could open the door to war crimes investigations against israelis
- Predicted Summary = new arrests have been held against the egyptian government in the capital of the united states the group has been arrested in connection with the attacks of the muslim brotherhood
But the problem is that the accuracy is very low and training takes a long time: even on an RTX 3090 Ti it took almost 27 minutes for 14.7k rows.
Epoch 1/12 1228/1228 [==============================] - 134s 100ms/step - loss: 2.4015 - sparse_categorical_accuracy: 0.0923
Epoch 2/12 1228/1228 [==============================] - 122s 99ms/step - loss: 2.0641 - sparse_categorical_accuracy: 0.1511
Epoch 3/12 1228/1228 [==============================] - 121s 99ms/step - loss: 1.9166 - sparse_categorical_accuracy: 0.1764
Epoch 4/12 1228/1228 [==============================] - 122s 99ms/step - loss: 1.8108 - sparse_categorical_accuracy: 0.1963
Epoch 5/12 1228/1228 [==============================] - 122s 100ms/step - loss: 1.7240 - sparse_categorical_accuracy: 0.2123
Epoch 6/12 1228/1228 [==============================] - 121s 99ms/step - loss: 1.6501 - sparse_categorical_accuracy: 0.2272
Epoch 7/12 1228/1228 [==============================] - 118s 96ms/step - loss: 1.5835 - sparse_categorical_accuracy: 0.2411
Epoch 8/12 1228/1228 [==============================] - 135s 110ms/step - loss: 1.5246 - sparse_categorical_accuracy: 0.2532
Epoch 9/12 1228/1228 [==============================] - 141s 115ms/step - loss: 1.4711 - sparse_categorical_accuracy: 0.2660
Epoch 10/12 1228/1228 [==============================] - 140s 114ms/step - loss: 1.4213 - sparse_categorical_accuracy: 0.2782
Epoch 11/12 1228/1228 [==============================] - 135s 110ms/step - loss: 1.3758 - sparse_categorical_accuracy: 0.2892
Epoch 12/12 1228/1228 [==============================] - 139s 113ms/step - loss: 1.3331 - sparse_categorical_accuracy: 0.3010