I'm using a classification model (an NLP pipeline) in my Flask app (app.py). The model saves and loads fine with dill, but when predict is actually called, the app raises a NameError, which leads me to believe the deserialized model still can't access the packages that were imported when it was trained.
Here is the traceback:
[2024-02-14 21:53:24,188] ERROR in app: Exception on /go [GET]
Traceback (most recent call last):
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/flask/app.py", line 2529, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/flask/app.py", line 1825, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/flask/app.py", line 1823, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/flask/app.py", line 1799, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tristenwallace/projects/udacity/data_science/disaster_response_message_classifier/webapp/app.py", line 32, in response
    clf_labels = model.predict([query])[0]
                 ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/sklearn/model_selection/_search.py", line 519, in predict
    return self.best_estimator_.predict(X)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/sklearn/pipeline.py", line 507, in predict
    Xt = transform.transform(Xt)
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/sklearn/feature_extraction/text.py", line 1428, in transform
    _, X = self._count_vocab(raw_documents, fixed_vocab=True)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/sklearn/feature_extraction/text.py", line 1270, in _count_vocab
    for feature in analyze(doc):
                   ^^^^^^^^^^^^
  File "/home/tristenwallace/anaconda3/envs/udacity/lib/python3.11/site-packages/sklearn/feature_extraction/text.py", line 112, in _analyze
    doc = tokenizer(doc)
          ^^^^^^^^^^^^^^
  File "/home/tristenwallace/projects/udacity/data_science/disaster_response_message_classifier/src/train_classifier.py", line 76, in tokenize
    tokens = wordpunct_tokenize(text.lower().strip())
             ^^^^^^^^^^^^^^^^^^
NameError: name 'wordpunct_tokenize' is not defined
This is the Flask app:
from flask import Flask
from flask import render_template, request
import dill
from sqlalchemy import create_engine
import pandas as pd
from nltk.tokenize import wordpunct_tokenize

app = Flask(__name__)

# load data
engine = create_engine('sqlite:///../data/DisasterResponse.db')
df = pd.read_sql_table('MessageCategories', con=engine)

# load model
model = dill.load(open('../models/message_classifier.pkl', 'rb'))
categories = list(df.columns[2:])


@app.route("/")
@app.route('/home')
def home():
    return render_template("pages/home.html")


@app.route("/go")
def response():
    # save user input in query
    query = request.args.get('query', '')

    # use model to predict classification for query
    clf_labels = model.predict([query])[0]
    clf_results = dict(zip(categories, clf_labels))

    return render_template("pages/go.html",
                           query=query,
                           clf_results=clf_results)


@app.route("/about")
def about():
    return render_template("pages/about.html")
And this is the training script that builds and saves the model (src/train_classifier.py):
# Imports
import sys
import pandas as pd
from sqlalchemy import create_engine
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold as mlsKFold
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import dill
nltk.download('averaged_perceptron_tagger')
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
##############################################################################
def load_data(db_filepath):
    ''' Load data from the DisasterResponse SQL database

    OUTPUT
        X (array): array containing messages
        Y (array): array containing binary category values
        categories (list): list containing category names
    '''
    engine = create_engine('sqlite:///' + db_filepath)
    table = 'MessageCategories'
    df = pd.read_sql_table(table, engine)

    # convert values of 2 to 1 in the first category
    df.loc[df['related'] == 2, 'related'] = 1

    # remove child_alone (it contains only zeros) and the genre column
    df.drop(['child_alone', 'genre'], axis=1, inplace=True)

    X = df.message
    Y = df.drop('message', axis=1)
    categories = Y.columns.values

    return X.values, Y.values, categories
##############################################################################
def tokenize(text):
    ''' Tokenizes input text

    INPUT
        text (str): text data as str
    OUTPUT
        lemma (list): tokenized text
    '''
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # Extract all the urls from the provided text
    detected_urls = re.findall(url_regex, text)

    # Replace each url with a url placeholder string
    for detected_url in detected_urls:
        text = text.replace(detected_url, 'url')

    # split text strings into tokens
    tokens = wordpunct_tokenize(text.lower().strip())

    # Remove stopwords
    rm = set(stopwords.words("english"))
    tokens = list(set(tokens) - rm)

    # Lemmatization w/ POS
    lemma = []
    lmtzr = WordNetLemmatizer()
    dict_pos_map = {
        'NN': wn.NOUN,  # all noun tags begin with NN
        'VB': wn.VERB,  # all verb tags begin with VB
        'JJ': wn.ADJ,   # all adjective tags begin with JJ
        'RB': wn.ADV    # all adverb tags begin with RB
    }
    for token, tag in pos_tag(tokens):
        # compare the first two characters of the POS tag against the map
        if tag[:2] in dict_pos_map:
            token = lmtzr.lemmatize(token, pos=dict_pos_map[tag[:2]])
        else:
            token = lmtzr.lemmatize(token)
        lemma.append(token)

    return lemma
##############################################################################
def split_data(X, Y):
    ''' Split data into train and test arrays

    INPUT
        X (array): array containing messages
        Y (array): array containing binary category values
    OUTPUT
        train and test arrays
    '''
    mskf = mlsKFold(n_splits=2, shuffle=True, random_state=42)
    for train_index, test_index in mskf.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

    return X_train, X_test, Y_train, Y_test
##############################################################################
def build_model():
    ''' Return classification model
    '''
    clf = MultiOutputClassifier(LinearSVC())

    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', clf)
    ])

    params = {
        'clf__estimator__C': [1000.0, 2000.0],
        'clf__estimator__max_iter': [10000],
        'clf__estimator__random_state': [42],
        'clf__estimator__dual': ['auto']
    }

    # Cross validation using grid search
    cv = GridSearchCV(
        pipeline,
        params,
        cv=4,
        scoring='f1_weighted',
        verbose=1
    )

    return cv
##############################################################################
def evaluate_model(model, X_test, Y_test, categories):
    ''' Evaluates the model's performance in predicting message categories

    INPUT
        model (Classification): stored classification model
        X_test (array): independent variables
        Y_test (array): dependent variables
        categories (list): message category labels
    OUTPUT
        Prints a classification report
    '''
    Y_preds = model.predict(X_test)

    # Display results
    print('best_score: {}'.format(model.best_score_))
    print('best_params: {}'.format(model.best_params_))
    print(
        "Test set classification report: {}".format(
            classification_report(Y_test, Y_preds, target_names=categories)
        )
    )
##############################################################################
def save_model(model):
    ''' Save trained classification model to pickle file
    '''
    with open('models/message_classifier.pkl', "wb") as f:
        dill.dump(model, f)
##############################################################################
def main():
    if len(sys.argv) == 2:
        db_filepath = sys.argv[1]

        print('Loading data...\n    DATABASE: {}'.format(db_filepath))
        X, Y, categories = load_data(db_filepath)

        print('Splitting data...\n')
        X_train, X_test, Y_train, Y_test = split_data(X, Y)

        print('Building model...\n')
        model = build_model()

        print('Training model...\n')
        model.fit(X_train, Y_train)

        print('Evaluating model...\n')
        evaluate_model(model, X_test, Y_test, categories)

        print('Saving model...\n    MODEL: models/message_classifier.pkl')
        save_model(model)

        print('Trained model saved!')
    else:
        print(
            'Please provide the filepath of the disaster messages database '
            'as the first argument. \n\n'
            'Example: python train_classifier.py data/DisasterResponse.db'
        )


if __name__ == '__main__':
    main()
This is my first Flask app, so I'm struggling to troubleshoot this error. My guess is that it has something to do with how the model is serialized and how the packages it depends on are made available inside the Flask app.
Before using dill, I was getting an AttributeError ("can't get attribute") when saving with pickle and loading in the app (the model is saved from main()). dill seemed to solve the problem, but now I see that's not the case, because I still have namespace issues.
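For reference, the earlier version of save_model used plain pickle, roughly like this (simplified, same paths as above):

# Earlier attempt with plain pickle instead of dill (simplified):
import pickle

with open('models/message_classifier.pkl', 'wb') as f:
    pickle.dump(model, f)

# Loading this in app.py failed with something like:
#   AttributeError: Can't get attribute 'tokenize' on <module '__main__' ...>
# As I understand it, pickle stores functions by reference, and tokenize
# lived in the training script's __main__ module, which the app doesn't have.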
Since then, I've tried importing the packages into the Flask app, and even copying the tokenize function itself into app.py, with no luck.
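For example, this is a trimmed-down sketch of what I added to app.py before loading the model:

# Attempt: make the names tokenize needs available before loading the model
from nltk.tokenize import wordpunct_tokenize

# ...and even redefining the function locally (simplified version of the real one):
def tokenize(text):
    return wordpunct_tokenize(text.lower().strip())

model = dill.load(open('../models/message_classifier.pkl', 'rb'))
# model.predict([query]) still raises the same NameError as above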
I'm trying to understand how the deserialized model looks up the names used inside tokenize, since importing the packages in app.py still results in the NameError.
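The closest I've gotten to a diagnostic is inspecting the tokenizer function that dill restores (the 'vect' step name comes from build_model above; I'm assuming the loaded object behaves like a normal fitted GridSearchCV):

# Inspect the tokenizer attached to the deserialized pipeline
tok = model.best_estimator_.named_steps['vect'].tokenizer
print(tok)                                       # <function tokenize at 0x...>
print(tok.__module__)                            # which module dill thinks it belongs to
print('wordpunct_tokenize' in tok.__globals__)   # if False, that would explain the NameError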