Creating an encoder-decoder LSTM model for text summarization in TensorFlow


I'm trying to create an encoder-decoder LSTM model for text summarization in TensorFlow, but I don't think my code is working properly. Can anyone fix it?

Importing libraries

import io
import json
import numpy as np
import pandas as pd
import random
import re
import tensorflow as tf
import unicodedata
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from datasets import load_dataset
from transformers import AutoTokenizer

Importing and processing the data (SAMSum: https://huggingface.co/datasets/samsum)

def cnn_dailymail_dataset():
    # Note: the function name is left over from a CNN/DailyMail experiment;
    # it actually loads the SAMSum dialogue/summary pairs.
    dataset = load_dataset("samsum")
    df_train = pd.DataFrame(dataset['train'])
    df_test = pd.DataFrame(dataset['test'])

    number_regex = r'\d+'
    whitespace_regex = r'\s+'
    punctuation_regex = r'[^\w\s]'

    def clean(series):
        # regex=True is needed so the patterns are treated as regular
        # expressions (recent pandas versions default to literal replacement).
        return (series.str.lower()
                      .str.replace(number_regex, '', regex=True)
                      .str.replace(punctuation_regex, '', regex=True)
                      .str.replace(whitespace_regex, ' ', regex=True)
                      .str.strip())

    # Train data
    df_train['clear_article'] = clean(df_train['dialogue'])
    df_train['clear_highlights'] = clean(df_train['summary'])

    # Test data
    df_test['clear_article'] = clean(df_test['dialogue'])
    df_test['clear_highlights'] = clean(df_test['summary'])

    # Train data
    X_train = df_train['clear_article'].apply(str).values
    y_train = df_train['clear_highlights'].apply(str).values

    # Test data
    X_test = df_test['clear_article'].apply(str).values
    y_test = df_test['clear_highlights'].apply(str).values

    return X_train, y_train, X_test, y_test

Add <sos> and <eos> tokens

train_preprocessed_input, train_preprocessed_target, _, _ = cnn_dailymail_dataset()

def tag_target_sentences(sentences):
  # Wrap every target summary with explicit start/end markers for the decoder.
  tagged_sentences = map(lambda s: ' '.join(['<sos>', s, '<eos>']), sentences)
  return list(tagged_sentences)

train_tagged_preprocessed_target = tag_target_sentences(train_preprocessed_target)
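A quick peek (my addition, not in the original script) confirms the markers were attached:

# Should print the first cleaned training summary wrapped in <sos> ... <eos>.
print(train_tagged_preprocessed_target[0])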

Setting up the tokenizers

# The custom filter string keeps '<' and '>', so the <sos>, <eos> and <unk>
# markers survive tokenization as single tokens.
source_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')
source_tokenizer.fit_on_texts(train_preprocessed_input)
source_tokenizer.get_config()
source_vocab_size = len(source_tokenizer.word_index) + 1
print(source_vocab_size)

# Tokenizer for the target summaries.
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>', filters='"#$%&()*+-/:;=@[\\]^_`{|}~\t\n')
target_tokenizer.fit_on_texts(train_tagged_preprocessed_target)
target_tokenizer.get_config()
target_vocab_size = len(target_tokenizer.word_index) + 1
print(target_vocab_size)
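As an optional sanity check (this snippet is my addition, not part of the original script), both markers should map to real indices in the target vocabulary:

# Neither lookup should return None if the filter string preserved '<' and '>'.
print(target_tokenizer.word_index.get('<sos>'), target_tokenizer.word_index.get('<eos>'))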

Teacher forcing and padding sequences

train_encoder_inputs = source_tokenizer.texts_to_sequences(train_preprocessed_input)
print(train_encoder_inputs[:3])
print(source_tokenizer.sequences_to_texts(train_encoder_inputs[:3]))

def generate_decoder_inputs_targets(sentences, tokenizer):
  seqs = tokenizer.texts_to_sequences(sentences)
  decoder_inputs = [s[:-1] for s in seqs] # Drop the last token in the sentence.
  decoder_targets = [s[1:] for s in seqs] # Drop the first token in the sentence.

  return decoder_inputs, decoder_targets

train_decoder_inputs, train_decoder_targets = generate_decoder_inputs_targets(train_tagged_preprocessed_target, target_tokenizer)
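To make the teacher-forcing shift concrete, here is a small illustration of my own (the sentence is made up, and any word missing from the fitted vocabulary would come back as <unk>): the decoder input drops the trailing <eos>, the target drops the leading <sos>, so at every step the model is trained to predict the next token from the previous ground-truth token.

# Hypothetical tagged sentence, for illustration only.
example = ['<sos> amanda baked cookies and will bring jerry some tomorrow <eos>']
ex_inputs, ex_targets = generate_decoder_inputs_targets(example, target_tokenizer)
print(target_tokenizer.sequences_to_texts(ex_inputs))   # same sentence without the trailing <eos>
print(target_tokenizer.sequences_to_texts(ex_targets))  # same sentence without the leading <sos>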

max_encoding_len = 100
max_decoding_len = 60
padded_train_encoder_inputs = pad_sequences(train_encoder_inputs, max_encoding_len, padding='post', truncating='post')
padded_train_decoder_inputs = pad_sequences(train_decoder_inputs, max_decoding_len, padding='post', truncating='post')
padded_train_decoder_targets = pad_sequences(train_decoder_targets, max_decoding_len, padding='post', truncating='post')

Model Architecture

embedding_dim = 120
hidden_dim = 100
default_dropout=0.2
batch_size = 12
epochs = 12

encoder_inputs = layers.Input(shape=[None], name='encoder_inputs')

encoder_embeddings = layers.Embedding(source_vocab_size, 
                                      embedding_dim,
                                      mask_zero=True,
                                      name='encoder_embeddings')

encoder_embedding_output = encoder_embeddings(encoder_inputs)
encoder_lstm = layers.LSTM(hidden_dim, 
                           return_state=True, 
                           dropout=default_dropout, 
                           name='encoder_lstm')


# Only the encoder's final hidden and cell states are kept; they initialize the decoder.
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding_output)
encoder_states = [state_h, state_c]

decoder_inputs = layers.Input(shape=[None], name='decoder_inputs')


decoder_embeddings = layers.Embedding(target_vocab_size, 
                                      embedding_dim, 
                                      mask_zero=True,
                                      name='decoder_embeddings')


decoder_embedding_output = decoder_embeddings(decoder_inputs)

decoder_lstm = layers.LSTM(hidden_dim,
                           return_sequences=True,
                           return_state=True,
                           dropout=default_dropout,
                           name='decoder_lstm')

# The decoder reads the (shifted) target sequence, conditioned on the encoder's final states.
decoder_outputs, _, _ = decoder_lstm(decoder_embedding_output, initial_state=encoder_states)

decoder_dense = layers.Dense(target_vocab_size, activation='softmax', name='decoder_dense')

y_proba = decoder_dense(decoder_outputs)
model = tf.keras.Model([encoder_inputs, decoder_inputs], y_proba, name='cnn_dailymail_no_attention')
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
model.summary()

Training the model

history = model.fit([padded_train_encoder_inputs, padded_train_decoder_inputs], padded_train_decoder_targets,
                     batch_size=batch_size,
                     epochs=epochs)
model.save('cnn_dailymail_no_attention')

Save the tokenizers

source_tokenizer_json = source_tokenizer.to_json()
with io.open('cnn_source_tokenizer.json', 'w', encoding='utf-8') as f:
  f.write(json.dumps(source_tokenizer_json, ensure_ascii=False))

target_tokenizer_json = target_tokenizer.to_json()
with io.open('cnn_target_tokenizer.json', 'w', encoding='utf-8') as f:
  f.write(json.dumps(target_tokenizer_json, ensure_ascii=False))

Evaluating The Model

def process_dataset(preprocessed_input, preprocessed_output):
  # Tag target sentences with <sos> and <eos> tokens.
  tagged_preprocessed_output = tag_target_sentences(preprocessed_output)
  # Vectorize encoder source sentences.
  encoder_inputs = source_tokenizer.texts_to_sequences(preprocessed_input)
  # Vectorize and create decoder input and target sentences.
  decoder_inputs, decoder_targets = generate_decoder_inputs_targets(tagged_preprocessed_output, 
                                                                    target_tokenizer)
  # Pad all collections.
  padded_encoder_inputs = pad_sequences(encoder_inputs, max_encoding_len, padding='post', truncating='post')
  padded_decoder_inputs = pad_sequences(decoder_inputs, max_decoding_len, padding='post', truncating='post')
  padded_decoder_targets = pad_sequences(decoder_targets, max_decoding_len, padding='post', truncating='post')
  return padded_encoder_inputs, padded_decoder_inputs, padded_decoder_targets

Loading the saved tokenizers and the model

with open('cnn_source_tokenizer.json') as f:
    data = json.load(f)
    source_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)

with open('cnn_target_tokenizer.json') as f:
    data = json.load(f)
    target_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)

# Load the model.
model = tf.keras.models.load_model('cnn_dailymail_no_attention')

Model Evaluation

_, _, X_test, y_test = cnn_dailymail_dataset()
padded_test_encoder_inputs, padded_test_decoder_inputs, padded_test_decoder_targets = process_dataset(X_test, y_test)
model.evaluate([padded_test_encoder_inputs, padded_test_decoder_inputs], padded_test_decoder_targets)

[3.7422165870666504, 0.19346268475055695]

Generating summaries (prediction)

Defining the encoder

# These are the layers of our trained model.
[layer.name for layer in model.layers]
encoder_inputs = model.get_layer('encoder_inputs').input
encoder_embedding_layer = model.get_layer('encoder_embeddings')
encoder_embeddings = encoder_embedding_layer(encoder_inputs)
encoder_lstm = model.get_layer('encoder_lstm')
_, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embeddings)
encoder_states = [encoder_state_h, encoder_state_c]
encoder_model_no_attention = tf.keras.Model(encoder_inputs, encoder_states)

Defining the decoder

decoder_inputs = model.get_layer('decoder_inputs').input
decoder_embedding_layer = model.get_layer('decoder_embeddings')
decoder_embeddings = decoder_embedding_layer(decoder_inputs)
decoder_input_state_h = tf.keras.Input(shape=(hidden_dim,), name='decoder_input_state_h')
decoder_input_state_c = tf.keras.Input(shape=(hidden_dim,), name='decoder_input_state_c')
decoder_input_states = [decoder_input_state_h, decoder_input_state_c]
decoder_lstm = model.get_layer('decoder_lstm')
decoder_sequence_outputs, decoder_output_state_h, decoder_output_state_c = decoder_lstm(
    decoder_embeddings, initial_state=decoder_input_states
)
decoder_output_states = [decoder_output_state_h, decoder_output_state_c]
decoder_dense = model.get_layer('decoder_dense')
y_proba = decoder_dense(decoder_sequence_outputs)
decoder_model_no_attention = tf.keras.Model(
    [decoder_inputs] + decoder_input_states, 
    [y_proba] + decoder_output_states
)

Making the prediction function

def translate_without_attention(sentence: str, 
                                source_tokenizer, encoder,
                                target_tokenizer, decoder,
                                max_translated_len = 30):

  # Vectorize the source sentence and run it through the encoder.    
  input_seq = source_tokenizer.texts_to_sequences([sentence])

  # Get the tokenized sentence to see if there are any unknown tokens.
  tokenized_sentence = source_tokenizer.sequences_to_texts(input_seq)

  states = encoder.predict(input_seq)  

  current_word = '<sos>'
  decoded_sentence = []

  while len(decoded_sentence) < max_translated_len:
    
    # Set the next input word for the decoder.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = target_tokenizer.word_index[current_word]
    
    # Determine the next word.
    target_y_proba, h, c = decoder.predict([target_seq] + states)
    target_token_index = np.argmax(target_y_proba[0, -1, :])
    current_word = target_tokenizer.index_word[target_token_index]

    if (current_word == '<eos>'):
      break

    decoded_sentence.append(current_word)
    states = [h, c]
  
  return tokenized_sentence[0], ' '.join(decoded_sentence)
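A quick single-example check of the inference loop (my addition, using the test split loaded earlier) could look like this:

# Greedy decoding of one test dialogue with the inference encoder/decoder.
tokenized, summary = translate_without_attention(X_test[0],
                                                 source_tokenizer, encoder_model_no_attention,
                                                 target_tokenizer, decoder_model_no_attention)
print(summary)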

Test Data

X_test_sentences = X_test[:30]
y_test_sentences = y_test[:30]

Final function

def translate_sentences(X_test_sentences, y_test_sentences, translation_func, source_tokenizer, encoder,
                        target_tokenizer, decoder):
  translations = {'Original_Text': [], 'Summary': [], 'Predicted_Summary': []}

  for s in range(len(X_test_sentences)):  # iterate over all provided sentences, not a hard-coded 30
    source, target = X_test_sentences[s], y_test_sentences[s]
    tokenized_sentence, translated = translation_func(source, source_tokenizer, encoder,
                                                      target_tokenizer, decoder)

    translations['Original_Text'].append(tokenized_sentence)
    translations['Summary'].append(target)
    translations['Predicted_Summary'].append(translated)
  
  return translations
translations_no_attention = pd.DataFrame(
    translate_sentences(X_test_sentences, y_test_sentences, translate_without_attention,
                        source_tokenizer, encoder_model_no_attention,
                        target_tokenizer, decoder_model_no_attention))

Saving the summaries in a CSV file

translations_no_attention.to_csv('CNN_output.csv', index=False)
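Token-level sparse_categorical_accuracy over padded sequences is only a rough proxy for summary quality, so it may also help to score the generated summaries with ROUGE. A minimal sketch of my own, assuming the third-party rouge-score package (pip install rouge-score) is available:

from rouge_score import rouge_scorer

# Average ROUGE-1 and ROUGE-L F1 over the reference/predicted summary pairs.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores = [scorer.score(ref, hyp)
          for ref, hyp in zip(translations_no_attention['Summary'],
                              translations_no_attention['Predicted_Summary'])]
print('ROUGE-1 F1:', np.mean([s['rouge1'].fmeasure for s in scores]))
print('ROUGE-L F1:', np.mean([s['rougeL'].fmeasure for s in scores]))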

It does generate some text.

Example:

Actual Summary: membership gives the icc jurisdiction over alleged crimes committed in palestinian territories since last june israel and the united states opposed the move which could open the door to war crimes investigations against israelis

Predicted Summary: new arrests have been held against the egyptian government in the capital of the united states the group has been arrested in connection with the attacks of the muslim brotherhood

But the problem is that the accuracy is very low, and training takes a long time: even on an RTX 3090 Ti it took almost 27 minutes for 14.7k rows.

Epoch 1/12
1228/1228 [==============================] - 134s 100ms/step - loss: 2.4015 - sparse_categorical_accuracy: 0.0923
Epoch 2/12
1228/1228 [==============================] - 122s 99ms/step - loss: 2.0641 - sparse_categorical_accuracy: 0.1511
Epoch 3/12
1228/1228 [==============================] - 121s 99ms/step - loss: 1.9166 - sparse_categorical_accuracy: 0.1764
Epoch 4/12
1228/1228 [==============================] - 122s 99ms/step - loss: 1.8108 - sparse_categorical_accuracy: 0.1963
Epoch 5/12
1228/1228 [==============================] - 122s 100ms/step - loss: 1.7240 - sparse_categorical_accuracy: 0.2123
Epoch 6/12
1228/1228 [==============================] - 121s 99ms/step - loss: 1.6501 - sparse_categorical_accuracy: 0.2272
Epoch 7/12
1228/1228 [==============================] - 118s 96ms/step - loss: 1.5835 - sparse_categorical_accuracy: 0.2411
Epoch 8/12
1228/1228 [==============================] - 135s 110ms/step - loss: 1.5246 - sparse_categorical_accuracy: 0.2532
Epoch 9/12
1228/1228 [==============================] - 141s 115ms/step - loss: 1.4711 - sparse_categorical_accuracy: 0.2660
Epoch 10/12
1228/1228 [==============================] - 140s 114ms/step - loss: 1.4213 - sparse_categorical_accuracy: 0.2782
Epoch 11/12
1228/1228 [==============================] - 135s 110ms/step - loss: 1.3758 - sparse_categorical_accuracy: 0.2892
Epoch 12/12
1228/1228 [==============================] - 139s 113ms/step - loss: 1.3331 - sparse_categorical_accuracy: 0.3010

What steps should I take to resolve these issues, enhance the model’s accuracy, and optimize it further?
