Model loaded in Flask app raises NameError for packages


I'm using a classification model (an NLP scikit-learn pipeline) in my Flask app (app.py). The model saves and loads fine with dill. However, when it is actually called for a prediction, the app raises a NameError, which leads me to believe the deserialized model still can't see the packages that were imported when it was built.

Here is the traceback:

[2024-02-14 21:53:24,188] ERROR in app: Exception on /go [GET]
Traceback (most recent call last):
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/flask/app.py", line 2529, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/flask/app.py", line 1825, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/flask/app.py", line 1823, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/flask/app.py", line 1799, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tristenwallace/projects/udacity/data_science/disaster_response_message_classifier/webapp/app.py", line 32, in response
    clf_labels = model.predict([query])[0]
                 ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/sklearn/model_selection/_search.py", line 519, in predict
    return self.best_estimator_.predict(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/sklearn/pipeline.py", line 507, in predict
    Xt = transform.transform(Xt)
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/sklearn/feature_extraction/text.py", line 1428, in transform
    _, X = self._count_vocab(raw_documents, fixed_vocab=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/sklearn/feature_extraction/text.py", line 1270, in _count_vocab
    for feature in analyze(doc):
                   ^^^^^^^^^^^^
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/sklearn/feature_extraction/text.py", line 112, in _analyze
    doc = tokenizer(doc)
          ^^^^^^^^^^^^^^
  File "/home/tristenwallace/projects/udacity/data_science/disaster_response_message_classifier/src/train_classifier.py", line 76, in tokenize
    tokens = wordpunct_tokenize(text.lower().strip())
             ^^^^^^^^^^^^^^^^^^
NameError: name 'wordpunct_tokenize' is not defined

This is the Flask app:

from flask import Flask
from flask import render_template, request
import dill
from sqlalchemy import create_engine
import pandas as pd
from nltk.tokenize import wordpunct_tokenize


app = Flask(__name__)


# load data
engine = create_engine('sqlite:///../data/DisasterResponse.db')
df = pd.read_sql_table('MessageCategories', con=engine)

# load model
with open('../models/message_classifier.pkl', 'rb') as f:
    model = dill.load(f)

categories = list(df.columns[2:])

@app.route("/")
@app.route('/home')
def home():
    return render_template("pages/home.html")

@app.route("/go")
def response():
    # save user input in query
    query = request.args.get('query', '') 
    
    # use model to predict classification for query
    clf_labels = model.predict([query])[0]
    clf_results = dict(zip(categories, clf_labels))
    
    return render_template("pages/go.html",
                            query=query,
                            clf_results=clf_results)

@app.route("/about")
def about():
    return render_template("pages/about.html")

And this is the training script (train_classifier.py) that builds and saves the model:

# Imports
import sys
import pandas as pd
from sqlalchemy import create_engine
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold as mlsKFold
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import dill


nltk.download('averaged_perceptron_tagger')
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

##############################################################################

def load_data(db_filepath):
    ''' Load database from DisasterResponse SQL database

    OUTPUT
        X (array): array containing messages 
        Y (array): array containing binary category values
        categories (array): array containing category names
    '''
    
    engine = create_engine('sqlite:///' + db_filepath)
    table = 'MessageCategories'
    df = pd.read_sql_table(table, engine)
    
    # convert values of 2 to 1 in first category
    df.loc[df['related'] == 2,'related'] = 1

    # Remove child_alone (all zeros) and the genre column, which is not a category
    df.drop(['child_alone', 'genre'], axis=1, inplace=True)

    X = df.message
    Y = df.drop('message', axis=1)
    categories = Y.columns.values 
    return X.values, Y.values, categories

##############################################################################

def tokenize(text):
    ''' Tokenizes input text
    
    INPUT
    text (str): text data as str
    
    OUTPUT
    lemma (list): tokenized text
    '''

    # Replace all urls with a urlplaceholder string
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    
    # Extract all the urls from the provided text 
    detected_urls = re.findall(url_regex, text)
    
    # Replace url with a url placeholder string
    for detected_url in detected_urls:
        text = text.replace(detected_url, 'url')
    
    # split text strings into tokens
    tokens = wordpunct_tokenize(text.lower().strip())
    
    # Remove stopwords
    rm = set(stopwords.words("english"))
    tokens = list(set(tokens) - rm)

    # Lemmatization w/ POS
    lemma = []
    lmtzr = WordNetLemmatizer()

    dict_pos_map = {
        'NN': wn.NOUN,  # all nouns begin with NN
        'VB': wn.VERB,  # all verbs begin with VB
        'JJ': wn.ADJ,   # all adjectives begin with JJ
        'RB': wn.ADV,   # all adverbs begin with RB
    }

    for token, tag in pos_tag(tokens):
        if tag[0] in dict_pos_map:
            token = lmtzr.lemmatize(token, pos=dict_pos_map[tag[0]])
        else:
            token = lmtzr.lemmatize(token)
        lemma.append(token)

    return lemma

##############################################################################

def split_data(X, Y):
    ''' Split data into train and test arrays

    INPUT
        X (array): array containing messages 
        Y (array): array containing binary category values

    OUTPUT
        train and test arrays 
    '''
    
    mskf = mlsKFold(n_splits=2, shuffle=True, random_state=42)

    for train_index, test_index in mskf.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
    
    return X_train, X_test, Y_train, Y_test

##############################################################################

def build_model():
    ''' Return classification model
    '''

    clf = MultiOutputClassifier(LinearSVC())

    pipeline = Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer()),
            ('clf', clf)
        ])

    params = {
        'clf__estimator__C': [1000.0, 2000.0],
        'clf__estimator__max_iter': [10000],
        'clf__estimator__random_state': [42],
        'clf__estimator__dual': ['auto']
    }

    # Cross validation using grid search
    cv = GridSearchCV(
        pipeline, 
        params, 
        cv=4,
        scoring='f1_weighted',
        verbose=1
    )
    
    return cv

##############################################################################

def evaluate_model(model, X_test, Y_test, categories):
    '''Evaluates the model's performance in predicting message categories
    
    INPUT
        model (Classification): stored classification model
        X_test (array): Independent Variables
        Y_test (array): Dependent Variables
        categories (array): message category labels
    
    OUTPUT
        Prints a classification report
    '''
    
    Y_preds = model.predict(X_test)

    # Display Results
    print('best_score: {}'.format(model.best_score_))
    print('best_params: {}'.format(model.best_params_))

    print(
            "Test set classification report: {}".format(
                classification_report(Y_test, Y_preds, target_names=categories)
            )
        )

##############################################################################

def save_model(model):
    ''' Save trained classification model to pickle file
    '''
    
    with open('models/message_classifier.pkl', "wb") as f:
        dill.dump(model, f)

##############################################################################        

def main():
    if len(sys.argv) == 2:
        
        db_filepath = sys.argv[1]
        
        print('Loading data...\n    DATABASE: {}'.format(db_filepath))
        X, Y, categories = load_data(db_filepath)

        print('Splitting data...\n')
        X_train, X_test, Y_train, Y_test = split_data(X, Y)

        print('Building model...\n')
        model = build_model()

        print('Training model...\n')
        model.fit(X_train, Y_train)

        print("Evaluating model...\n")
        evaluate_model(model, X_test, Y_test, categories)

        print('Saving model...\n    MODEL: models/message_classifier.pkl')
        save_model(model)

        print('Trained model saved!')

    else:
        print(
            'Please provide the filepath of the disaster messages database '
            'as the first argument. \n\n'
            'Example: python train_classifier.py data/DisasterResponse.db'
            )

if __name__ == '__main__':
    main()

This is my first Flask app, so I'm struggling to troubleshoot this error. My guess is that it has something to do with how the model is serialized and how the packages it depends on are made available inside the Flask app.

Before switching to dill, I was getting an AttributeError about a missing attribute when saving with pickle and loading in the app (the model is saved in main). dill seemed to solve the problem, but now I see that it didn't, because I still have namespace issues.
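
For reference, the earlier pickle version looked roughly like this (a sketch from memory rather than the exact code; paths match the current project layout):

import pickle

# old save, at the end of train_classifier.py
with open('models/message_classifier.pkl', 'wb') as f:
    pickle.dump(model, f)

# old load, in app.py -- this is where the attribute error showed up before I switched to dill
with open('../models/message_classifier.pkl', 'rb') as f:
    model = pickle.load(f)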

Since then, I've tried importing the packages in app.py and even copying the relevant functions straight into the Flask app, with no luck.
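
Concretely, that last attempt looked roughly like this at the top of app.py (a sketch; the body of tokenize is copied verbatim from train_classifier.py above):

from nltk.tokenize import wordpunct_tokenize  # also re-imported everything else tokenize() needs

def tokenize(text):
    # same body as in train_classifier.py, pasted here hoping the loaded model would pick it up
    tokens = wordpunct_tokenize(text.lower().strip())
    ...

Even with this in place, the NameError above is unchanged.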

What I'm trying to understand is how functions like tokenize are resolved inside the deserialized model, since importing the packages in app.py still results in the NameError.
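
To illustrate what I mean, this is the kind of inspection I'd expect to reveal which namespace the tokenizer resolves its names in (a sketch; the attribute path assumes the GridSearchCV/Pipeline structure from build_model above):

vect = model.best_estimator_.named_steps['vect']
print(vect.tokenizer)                       # presumably the tokenize function from train_classifier
print(vect.tokenizer.__module__)            # which module dill thinks it belongs to
print('wordpunct_tokenize' in vect.tokenizer.__globals__)  # False here would explain the NameError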
