import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Read the loan records from the CSV file into a DataFrame.
loans = pd.read_csv('loan_data.csv')
# Print each column's name, non-null count and dtype.
loans.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9578 entries, 0 to 9577 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 credit.policy 9578 non-null int64 1 purpose 9578 non-null object 2 int.rate 9578 non-null float64 3 installment 9578 non-null float64 4 log.annual.inc 9578 non-null float64 5 dti 9578 non-null float64 6 fico 9578 non-null int64 7 days.with.cr.line 9578 non-null float64 8 revol.bal 9578 non-null int64 9 revol.util 9578 non-null float64 10 inq.last.6mths 9578 non-null int64 11 delinq.2yrs 9578 non-null int64 12 pub.rec 9578 non-null int64 13 not.fully.paid 9578 non-null int64 dtypes: float64(6), int64(7), object(1) memory usage: 1.0+ MB
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
loans.describe()
credit.policy | int.rate | installment | log.annual.inc | dti | fico | days.with.cr.line | revol.bal | revol.util | inq.last.6mths | delinq.2yrs | pub.rec | not.fully.paid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9.578000e+03 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 | 9578.000000 |
mean | 0.804970 | 0.122640 | 319.089413 | 10.932117 | 12.606679 | 710.846314 | 4560.767197 | 1.691396e+04 | 46.799236 | 1.577469 | 0.163708 | 0.062122 | 0.160054 |
std | 0.396245 | 0.026847 | 207.071301 | 0.614813 | 6.883970 | 37.970537 | 2496.930377 | 3.375619e+04 | 29.014417 | 2.200245 | 0.546215 | 0.262126 | 0.366676 |
min | 0.000000 | 0.060000 | 15.670000 | 7.547502 | 0.000000 | 612.000000 | 178.958333 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 1.000000 | 0.103900 | 163.770000 | 10.558414 | 7.212500 | 682.000000 | 2820.000000 | 3.187000e+03 | 22.600000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 1.000000 | 0.122100 | 268.950000 | 10.928884 | 12.665000 | 707.000000 | 4139.958333 | 8.596000e+03 | 46.300000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 1.000000 | 0.140700 | 432.762500 | 11.291293 | 17.950000 | 737.000000 | 5730.000000 | 1.824950e+04 | 70.900000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 |
max | 1.000000 | 0.216400 | 940.140000 | 14.528354 | 29.960000 | 827.000000 | 17639.958330 | 1.207359e+06 | 119.000000 | 33.000000 | 13.000000 | 5.000000 | 1.000000 |
# Preview the first rows of the dataset.
loans.head()
credit.policy | purpose | int.rate | installment | log.annual.inc | dti | fico | days.with.cr.line | revol.bal | revol.util | inq.last.6mths | delinq.2yrs | pub.rec | not.fully.paid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | debt_consolidation | 0.1189 | 829.10 | 11.350407 | 19.48 | 737 | 5639.958333 | 28854 | 52.1 | 0 | 0 | 0 | 0 |
1 | 1 | credit_card | 0.1071 | 228.22 | 11.082143 | 14.29 | 707 | 2760.000000 | 33623 | 76.7 | 0 | 0 | 0 | 0 |
2 | 1 | debt_consolidation | 0.1357 | 366.86 | 10.373491 | 11.63 | 682 | 4710.000000 | 3511 | 25.6 | 1 | 0 | 0 | 0 |
3 | 1 | debt_consolidation | 0.1008 | 162.34 | 11.350407 | 8.10 | 712 | 2699.958333 | 33667 | 73.2 | 1 | 0 | 0 | 0 |
4 | 1 | credit_card | 0.1426 | 102.92 | 11.299732 | 14.97 | 667 | 4066.000000 | 4740 | 39.5 | 0 | 1 | 0 | 0 |
# Overlay the FICO distributions of the two credit.policy groups on one figure.
plt.figure(figsize=(10, 6))
for policy_value, bar_color, legend_text in (
    (1, 'blue', 'Credit.Policy=1'),
    (0, 'red', 'Credit.Policy=0'),
):
    fico_scores = loans.loc[loans['credit.policy'] == policy_value, 'fico']
    # alpha < 1 keeps both histograms visible where they overlap.
    fico_scores.hist(bins=30, alpha=0.5, color=bar_color,
                     label=legend_text, edgecolor='white')
plt.legend()
plt.xlabel('FICO')
Text(0.5, 0, 'FICO')
# Overlay the FICO distributions of the two not.fully.paid groups on one figure.
plt.figure(figsize=(10, 6))
for paid_flag, bar_color, legend_text in (
    (1, 'blue', 'not.fully.paid=1'),
    (0, 'red', 'not.fully.paid=0'),
):
    fico_scores = loans.loc[loans['not.fully.paid'] == paid_flag, 'fico']
    # alpha < 1 keeps both histograms visible where they overlap.
    fico_scores.hist(bins=30, alpha=0.5, color=bar_color,
                     label=legend_text, edgecolor='white')
plt.legend()
plt.xlabel('FICO')
Text(0.5, 0, 'FICO')
# Count of loans per purpose, split by the not.fully.paid flag.
plt.figure(figsize=(11, 7))
sns.countplot(
    data=loans,
    x='purpose',
    hue='not.fully.paid',
    palette='Set1',
)
<Axes: xlabel='purpose', ylabel='count'>
# Joint distribution of FICO score against interest rate.
sns.jointplot(
    data=loans,
    x='fico',
    y='int.rate',
    color='purple',
)
C:\Users\ACER\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. with pd.option_context('mode.use_inf_as_na', True): C:\Users\ACER\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead. with pd.option_context('mode.use_inf_as_na', True):
<seaborn.axisgrid.JointGrid at 0x17e2775b0d0>
# Regression of interest rate on FICO score, coloured by credit.policy and
# faceted into one column per not.fully.paid value.
#
# lmplot is a figure-level function: it builds its own FacetGrid and figure,
# so a preceding plt.figure(...) call only leaves behind an empty, unused
# figure. Use lmplot's own `height` / `aspect` arguments to resize the facets.
sns.lmplot(data=loans, x='fico', y='int.rate', hue='credit.policy',
           col='not.fully.paid', palette='Set1')
C:\Users\ACER\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
<seaborn.axisgrid.FacetGrid at 0x17e27362690>
<Figure size 1100x700 with 0 Axes>
# Re-check the column dtypes; 'purpose' is the only object (non-numeric) column.
loans.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9578 entries, 0 to 9577 Data columns (total 14 columns): credit.policy 9578 non-null int64 purpose 9578 non-null object int.rate 9578 non-null float64 installment 9578 non-null float64 log.annual.inc 9578 non-null float64 dti 9578 non-null float64 fico 9578 non-null int64 days.with.cr.line 9578 non-null float64 revol.bal 9578 non-null int64 revol.util 9578 non-null float64 inq.last.6mths 9578 non-null int64 delinq.2yrs 9578 non-null int64 pub.rec 9578 non-null int64 not.fully.paid 9578 non-null int64 dtypes: float64(6), int64(7), object(1) memory usage: 1.0+ MB
# Columns to one-hot encode.
cat_feats = ['purpose']
# drop_first=True drops one dummy level per encoded column, so the remaining
# indicator columns are not perfectly collinear.
final_data = pd.get_dummies(
    data=loans,
    columns=cat_feats,
    drop_first=True,
)
final_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9578 entries, 0 to 9577 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 credit.policy 9578 non-null int64 1 int.rate 9578 non-null float64 2 installment 9578 non-null float64 3 log.annual.inc 9578 non-null float64 4 dti 9578 non-null float64 5 fico 9578 non-null int64 6 days.with.cr.line 9578 non-null float64 7 revol.bal 9578 non-null int64 8 revol.util 9578 non-null float64 9 inq.last.6mths 9578 non-null int64 10 delinq.2yrs 9578 non-null int64 11 pub.rec 9578 non-null int64 12 not.fully.paid 9578 non-null int64 13 purpose_credit_card 9578 non-null bool 14 purpose_debt_consolidation 9578 non-null bool 15 purpose_educational 9578 non-null bool 16 purpose_home_improvement 9578 non-null bool 17 purpose_major_purchase 9578 non-null bool 18 purpose_small_business 9578 non-null bool dtypes: bool(6), float64(6), int64(7) memory usage: 1.0 MB
from sklearn.model_selection import train_test_split

# Target is the not.fully.paid flag; every other column is a feature.
y = final_data['not.fully.paid']
X = final_data.drop(columns='not.fully.paid')

# Hold out 30% of the rows for evaluation, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=101
)
from sklearn.tree import DecisionTreeClassifier

# Seed the estimator so the fitted tree and the metrics computed from it can
# be reproduced from run to run; the train/test split already uses
# random_state=101, so the same seed is reused here.
dtree = DecisionTreeClassifier(random_state=101)
dtree.fit(X_train, y_train)
DecisionTreeClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier()
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the held-out set and report per-class precision/recall/F1.
predictions = dtree.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))
precision recall f1-score support 0 0.85 0.82 0.84 2431 1 0.19 0.23 0.20 443 accuracy 0.73 2874 macro avg 0.52 0.52 0.52 2874 weighted avg 0.75 0.73 0.74 2874
# Confusion matrix for the current `predictions` (rows: true class, columns: predicted).
print(confusion_matrix(y_true=y_test, y_pred=predictions))
[[1997 434] [ 343 100]]
from sklearn.ensemble import RandomForestClassifier

# 600-tree forest. Seed it so the bootstrap samples and feature subsets, and
# therefore the reported metrics, are reproducible; the train/test split
# already uses random_state=101, so the same seed is reused here.
rfc = RandomForestClassifier(n_estimators=600, random_state=101)
rfc.fit(X_train, y_train)
RandomForestClassifier(n_estimators=600)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(n_estimators=600)
from sklearn.metrics import classification_report, confusion_matrix

# Overwrites `predictions` with the random-forest output, then reports
# per-class precision/recall/F1 on the held-out set.
predictions = rfc.predict(X_test)
print(classification_report(y_true=y_test, y_pred=predictions))
precision recall f1-score support 0 0.85 1.00 0.92 2431 1 0.54 0.02 0.03 443 accuracy 0.85 2874 macro avg 0.69 0.51 0.47 2874 weighted avg 0.80 0.85 0.78 2874
# Confusion matrix for the current `predictions` (rows: true class, columns: predicted).
print(confusion_matrix(y_true=y_test, y_pred=predictions))
[[2425 6] [ 436 7]]