Same question as 'Matplotlib causing problems generating map'. I am trying to build my own comprehensible pipeline of code components, each of which should lead me to the predicted columns and a correlation close to 1 — i.e. trained data and a fitted model. However, I am frustrated that matplotlib will not generate the correlation matrices for me, because I cannot locate the column in the data, even though I know it is there. I checked both the header and the data.

View the code below. I have imported all the relevant modules (I think).

# --- Load data ----------------------------------------------------------------
import pandas as pd
import numpy as np

# Training and test sets; cp1252 covers the Danish characters in the headers.
# NOTE(review): "..._new.csv" / "..._test.csv" look like placeholder paths — confirm.
train = pd.read_csv("..._new.csv", encoding='cp1252', error_bad_lines=False)
test = pd.read_csv("..._test.csv", encoding='cp1252', error_bad_lines=False)
submission = pd.read_csv("C:\\Users\\jcst\\Desktop\\Private\\Python data\\train16.csv")

print(submission.head(20))

# Feature matrix: every column except the row identifier and the target.
X = train.drop(columns=["person_id", "DTUDur"])
# Target variable.
y = train["DTUDur"]

print("\nSIZE OF DATATABLE")
print(X.head(10))

print("\nCOLUMNS IN DATA")
print(X.columns, X.shape)

# -------------------------------------------

# Actual data processing

print("\nCHECK MISSING DATA")

def check_missing_data(X):
    """Summarise missing values per column of *X*.

    Returns a DataFrame indexed by column name with two columns:
    'Total'   - absolute number of NaN values,
    'Percent' - share of NaN values in percent,
    both sorted in descending order of missingness.

    BUG FIX: as pasted, the function body had lost its indentation, which is a
    SyntaxError in Python; the indentation is restored here.
    """
    total = X.isnull().sum().sort_values(ascending=False)
    # isnull().count() counts every row (NaN or not), so this is the NaN share.
    percent = ((X.isnull().sum() / X.isnull().count()) * 100).sort_values(ascending=False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

# Show the most-missing columns of the training features and of the test set.
# BUG FIX: the first call was a bare expression whose value was silently
# discarded (it only displays something in a notebook, not in a script) —
# print it so the output actually appears.
print(check_missing_data(X).head())

print(check_missing_data(test).head(25))

# -------------------------------------------

# Unique values in the dataset

print("\nUNIQUE VALUES")

# Number of distinct values per column, ascending.
# BUG FIX: as pasted, this statement carried stray leading whitespace, which is
# a top-level IndentationError; it must start in column 0.
df_tmp = pd.DataFrame(X.nunique().sort_values(), columns=['num_unique_values']).reset_index().rename(columns={'index': 'Column_name'})

print(df_tmp.head(10))

print("\nCOLUMNS WITH UNIQUE VALUES")

# Columns with unique values

def col_name_with_n_unique_value(X, n):
    """Return the names of columns in *X* that have exactly *n* unique values.

    Also prints how many such columns were found.

    BUG FIXES: as pasted, the body was fused onto one unindented line (a
    SyntaxError), and it filtered on ``num_unique_values == 1`` regardless of
    *n* — the parameter was ignored.  It now filters on ``== n``.
    """
    df1 = pd.DataFrame(X.nunique().sort_values(), columns=['num_unique_values']).reset_index()
    col_name = list(df1[df1.num_unique_values == n]['index'])
    print('number of columns with only', n, 'unique values are: ', len(col_name))
    return col_name

# Columns with a single unique value carry no information; collect them as
# candidates to drop from the feature matrix.
col_to_drop=col_name_with_n_unique_value(X,1)

# -------------------------------------------

# correlation of the matrix

# correlation of the matrix

import matplotlib.pyplot as plt
import seaborn as sns

# Heat map of pairwise feature correlations.  vmax=.8 keeps the colour scale
# from being dominated by the trivial 1.0 values on the diagonal.
correlation_matrix = X.corr()
figure, axes = plt.subplots(figsize=(20, 9))
sns.heatmap(correlation_matrix, vmax=.8, annot=True)

# -------------------------------------------

# most correlated features

# Features whose absolute correlation with the target DTUDur exceeds 0.1.
corrmat = train.corr()
top_corr_features = corrmat.index[abs(corrmat["DTUDur"]) > 0.1]
plt.rcParams.update({'font.size': 5})
plt.figure(figsize=(6, 6))
g = sns.heatmap(train[top_corr_features].corr(), annot=True, cmap="RdYlGn")

# Features strongly correlated (> 0.7 absolute) with DTUDurYr.
corrmat = train.corr()
top_corr_features = corrmat.index[abs(corrmat["DTUDurYr"]) > 0.7]
plt.figure(figsize=(10, 10))
# BUG FIX: the original line was `sns.barplot(train., train.DTUDurYr)` — a
# SyntaxError with the x-column left blank.  Plot the first strongly
# correlated feature against the target; adjust if a different column was
# intended — TODO confirm which column the author meant.
g = sns.barplot(train[top_corr_features[0]], train.DTUDurYr)

corrmat = train.corr()
top_corr_features = corrmat.index[abs(corrmat["DTUDurYr"]) > 0.7]
plt.figure(figsize=(10, 10))
g = sns.barplot(train.DTUDurYr, train.GennemsnitBeståetAndetSemesterECTS)

plt.rcParams.update({'font.size': 5})
plt.show()

# -------------------------------------------

# divide the data set into categorial and non categorial features and apply models to get the insight of the data

print("\nDEFINING CATEGORICAL AND NUMERICAL FEATURES")

categorical_features = X.select_dtypes(include=['object']).columns

print(categorical_features)

numerical_features = X.select_dtypes(exclude = ["object"]).columns

print(numerical_features)

print("\nDIVIDE THE DATA SET INTO CATEGORIAL AND NON CATEGORIAL FEATURES AND APPLY MODELS TO GET THE INSIGHT OF THE DATA")

print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))

print("\nFILLING THE MISSING VALUE OF TEST WITH THEIR MEAN VALUE, FOR BETTER ACCURACY")

test = test.select_dtypes(exclude=[np.object])
test.info()
test = test.fillna(test.mean(), inplace=True)

-------------------------------------------

print("\nAPPLYING MODEL RANDOM FOREST REGRESSOR")

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

# pull data into target (y) and predictors (X)
predictor_cols = ['F18 ECTS på kurser med beståede talkarakter']

# -------------------------------------------

# Create training predictors data
train_X = X[predictor_cols]
my_model = RandomForestRegressor()
my_model.fit(train_X, y)
my_model.score(train_X, y)

print(predictor_cols)
print(my_model.score(train_X, y))

test = pd.read_csv("M:\\20190214_Datasæt_new_test.csv")

# -------------------------------------------

print("\nPRINT PREDICTED FACTORS")

test_X = test[predictor_cols]

#  model to make predictions

predicted_factor = my_model.predict(test_X)

#  at the predicted prices to ensure something sensible.

print(predicted_factor)

np.savetxt("M:\\train22.csv", predicted_factor, delimiter=';')

Traceback (most recent call last):
  File "C:/Users/jcst/PycharmProjects/Frafaldsanalyse/DefiningCatAndNumFeatures_4_new.py", line 142, in <module>
    my_model.fit(train_X, y)
  File "C:\Users\jcst\PycharmProjects\Frafaldsanalyse\venv\lib\site-packages\sklearn\ensemble\forest.py", line 250, in fit
    X = check_array(X, accept_sparse="csc", dtype=DTYPE)
  File "C:\Users\jcst\PycharmProjects\Frafaldsanalyse\venv\lib\site-packages\sklearn\utils\validation.py", line 573, in check_array
    allow_nan=force_all_finite == 'allow-nan')
  File "C:\Users\jcst\PycharmProjects\Frafaldsanalyse\venv\lib\site-packages\sklearn\utils\validation.py", line 56, in _assert_all_finite
    raise ValueError(msg_err.format(type_err, X.dtype))
ValueError: Input contains NaN, infinity or a value too large for dtype('float32')

0 Answers