cs-training.csv is like:
+----+------------------+--------------------------------------+-----+--------------------------------------+-------------+---------------+---------------------------------+-------------------------+------------------------------+--------------------------------------+--------------------+
| | SeriousDlqin2yrs | RevolvingUtilizationOfUnsecuredLines | age | NumberOfTime30-59DaysPastDueNotWorse | DebtRatio | MonthlyIncome | NumberOfOpenCreditLinesAndLoans | NumberOfTimes90DaysLate | NumberRealEstateLoansOrLines | NumberOfTime60-89DaysPastDueNotWorse | NumberOfDependents |
+----+------------------+--------------------------------------+-----+--------------------------------------+-------------+---------------+---------------------------------+-------------------------+------------------------------+--------------------------------------+--------------------+
| 1 | 1 | 0.766126609 | 45 | 2 | 0.802982129 | 9120 | 13 | 0 | 6 | 0 | 2 |
| 2 | 0 | 0.957151019 | 40 | 0 | 0.121876201 | 2600 | 4 | 0 | 0 | 0 | 1 |
| 3 | 0 | 0.65818014 | 38 | 1 | 0.085113375 | 3042 | 2 | 1 | 0 | 0 | 0 |
| 4 | 0 | 0.233809776 | 30 | 0 | 0.036049682 | 3300 | 5 | 0 | 0 | 0 | 0 |
| 5 | 0 | 0.9072394 | 49 | 1 | 0.024925695 | 63588 | 7 | 0 | 1 | 0 | 0 |
| 6 | 0 | 0.213178682 | 74 | 0 | 0.375606969 | 3500 | 3 | 0 | 1 | 0 | 1 |
| 7 | 0 | 0.305682465 | 57 | 0 | 5710 | NA | 8 | 0 | 3 | 0 | 0 |
| 8 | 0 | 0.754463648 | 39 | 0 | 0.209940017 | 3500 | 8 | 0 | 0 | 0 | 0 |
| 9 | 0 | 0.116950644 | 27 | 0 | 46 | NA | 2 | 0 | 0 | 0 | NA |
| 10 | 0 | 0.189169052 | 57 | 0 | 0.606290901 | 23684 | 9 | 0 | 4 | 0 | 2 |
| 11 | 0 | 0.644225962 | 30 | 0 | 0.30947621 | 2500 | 5 | 0 | 0 | 0 | 0 |
| 12 | 0 | 0.01879812 | 51 | 0 | 0.53152876 | 6501 | 7 | 0 | 2 | 0 | 2 |
| 13 | 0 | 0.010351857 | 46 | 0 | 0.298354075 | 12454 | 13 | 0 | 2 | 0 | 2 |
| 14 | 1 | 0.964672555 | 40 | 3 | 0.382964747 | 13700 | 9 | 3 | 1 | 1 | 2 |
| 15 | 0 | 0.019656581 | 76 | 0 | 477 | 0 | 6 | 0 | 1 | 0 | 0 |
| 16 | 0 | 0.548458062 | 64 | 0 | 0.209891754 | 11362 | 7 | 0 | 1 | 0 | 2 |
| 17 | 0 | 0.061086118 | 78 | 0 | 2058 | NA | 10 | 0 | 2 | 0 | 0 |
| 18 | 0 | 0.166284079 | 53 | 0 | 0.18827406 | 8800 | 7 | 0 | 0 | 0 | 0 |
| 19 | 0 | 0.221812771 | 43 | 0 | 0.527887839 | 3280 | 7 | 0 | 1 | 0 | 2 |
| 20 | 0 | 0.602794411 | 25 | 0 | 0.065868263 | 333 | 2 | 0 | 0 | 0 | 0 |
| 21 | 0 | 0.200923382 | 43 | 0 | 0.430046338 | 12300 | 10 | 0 | 2 | 0 | 0 |
+----+------------------+--------------------------------------+-----+--------------------------------------+-------------+---------------+---------------------------------+-------------------------+------------------------------+--------------------------------------+--------------------+
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
# using RF to predict and fill null
def set_missing(df):
process_df = df.ix[:,[5,0,1,2,3,4,6,7,8,9]]
known = process_df[process_df.MonthlyIncome.notnull()].as_matrix()
unknown = process_df[process_df.MonthlyIncome.isnull()].as_matrix()
X = known[:, 1:]
y = known[:, 0]
rfr = RandomForestRegressor(random_state=0, n_estimators=200,max_depth=3,n_jobs=-1)
rfr.fit(X,y)
predicted = rfr.predict(unknown[:, 1:]).round(0)
print(predicted)
# fill null,and this line goes wrong
df.loc[(df.MonthlyIncome.isnull()), 'MonthlyIncome'] = predicted
return df
if __name__ == '__main__':
data = pd.read_csv('cs-training.csv')
data.describe().to_csv('DataDescribe.csv')
data=set_missing(data)
data=data.dropna()
data = data.drop_duplicates()
data.to_csv('MissingData.csv',index=False)
data.describe().to_csv('MissingDataDescribe.csv')
I haved checked pages about "ValueError: Input contains NaN, infinity or a value too large for dtype('float32')",however mine seems like a different case.May someone know why and how to fix kindly help.thank!
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) in () ----> 1 data=set_missing(data)
in set_missing(df) 13 rfr.fit(X,y) 14 ---> 15 predicted = rfr.predict(unknown[:, 1:]).round(0) 16 print(predicted) 17
D:\Program Files (x86)\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in predict(self, X) 683 """ 684 # Check data --> 685 X = self._validate_X_predict(X) 686 687 # Assign chunk of trees to jobs
D:\Program Files (x86)\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in _validate_X_predict(self, X) 353 "call
fit
before exploiting the model.") 354 --> 355 return self.estimators_[0]._validate_X_predict(X, check_input=True) 356 357 @propertyD:\Program Files (x86)\Anaconda3\lib\site-packages\sklearn\tree\tree.py in _validate_X_predict(self, X, check_input) 363 364 if check_input: --> 365 X = check_array(X, dtype=DTYPE, accept_sparse="csr") 366 if issparse(X) and (X.indices.dtype != np.intc or 367 X.indptr.dtype != np.intc):
D:\Program Files (x86)\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator) 405 % (array.ndim, estimator_name)) 406 if force_all_finite: --> 407 _assert_all_finite(array) 408 409 shape_repr = _shape_repr(array.shape)
D:\Program Files (x86)\Anaconda3\lib\site-packages\sklearn\utils\validation.py in _assert_all_finite(X) 56 and not np.isfinite(X).all()): 57 raise ValueError("Input contains NaN, infinity" ---> 58 " or a value too large for %r." % X.dtype) 59 60
ValueError: Input contains NaN, infinity or a value too large for dtype('float32').