Sunday, October 15, 2017

DG 08 Fitting

dg13-08A-Fitting
In [1]:
import pandas as pd
import numpy as np
import pylab as pl
In [2]:
train = pd.read_csv("cs-training-v1.csv")
test = pd.read_csv("credit-data-testset.csv")
test.head()
Out[2]:
serious_dlqin2yrs revolving_utilization_of_unsecured_lines age number_of_time30-59_days_past_due_not_worse debt_ratio monthly_income number_of_open_credit_lines_and_loans number_of_times90_days_late number_real_estate_loans_or_lines number_of_time60-89_days_past_due_not_worse number_of_dependents monthly_income_imputed
0 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0 6017.0
1 1 0.964673 40 3 0.382965 13700.0 9 3 1 1 2.0 2850.0
2 0 0.061086 78 0 2058.000000 2500.0 10 0 2 0 0.0 2500.0
3 0 0.075427 32 0 0.085512 7916.0 6 0 0 0 0.0 4145.0
4 0 0.046560 58 0 0.241622 2416.0 9 0 1 0 0.0 2850.0
In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
In [8]:
# the test CSV uses snake_case column names while the training CSV uses CamelCase,
# so two parallel feature lists are kept in the same order
featuresTest = ['revolving_utilization_of_unsecured_lines', 'debt_ratio',
            'monthly_income', 'age', 'number_of_times90_days_late']

featuresTrain = ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio',
            'MonthlyIncome', 'age', 'NumberOfTimes90DaysLate']
In [6]:
clf = KNeighborsClassifier(n_neighbors=13, warn_on_equidistant=False)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-6-191e958d8c98> in <module>()
----> 1 clf = KNeighborsClassifier(n_neighbors=13, warn_on_equidistant=False)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neighbors\classification.py in __init__(self, n_neighbors, weights, algorithm, leaf_size, p, metric, metric_params, n_jobs, **kwargs)
    124                           algorithm=algorithm,
    125                           leaf_size=leaf_size, metric=metric, p=p,
--> 126                           metric_params=metric_params, n_jobs=n_jobs, **kwargs)
    127         self.weights = _check_weights(weights)
    128 

TypeError: _init_params() got an unexpected keyword argument 'warn_on_equidistant'
In [9]:
# warn_on_equidistant is no longer accepted by this version of scikit-learn, so drop it
clf = KNeighborsClassifier(n_neighbors=13)
In [10]:
clf.fit(train[featuresTrain], train.SeriousDlqin2yrs)
Out[10]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=13, p=2,
           weights='uniform')
In [11]:
#classes (returns an array)
clf.predict(test[featuresTest])
Out[11]:
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
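
The predictions are almost all zeros. A quick way to see how many of each class the model predicts (a minimal check using the classifier and test frame above):
In [ ]:
# count how many test rows are predicted as 0 vs. 1
pd.Series(clf.predict(test[featuresTest])).value_counts()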
In [10]:
# remove all data with missing values
#test = test[np.isfinite(test['MonthlyIncome'])]
#test = test[np.isfinite(test['NumberOfDependents'])]
In [27]:
# remove all data with missing values
#test1 = test[np.isfinite(test['MonthlyIncome'])]
#test2 = test1[np.isfinite(test1['NumberOfDependents'])]
#test2.to_csv("cs-test-v1.csv", index=False)
In [28]:
#testC = pd.read_csv("cs-test-v1.csv")
In [29]:
#classes (returns an array)
#clf.predict(testC[features])
In [12]:
print(clf.predict(test[featuresTest]))
[0 0 0 ..., 0 0 0]
In [13]:
#probabilities (returns a numpy array)
clf.predict_proba(test[featuresTest])
Out[13]:
array([[ 0.92307692,  0.07692308],
       [ 0.84615385,  0.15384615],
       [ 0.61538462,  0.38461538],
       ..., 
       [ 0.92307692,  0.07692308],
       [ 0.92307692,  0.07692308],
       [ 1.        ,  0.        ]])
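
With 13 uniform-weighted neighbors, each probability is just the fraction of positive neighbors, so every value above is a multiple of 1/13 (0.07692308 is 1/13, 0.38461538 is 5/13, and so on). A minimal sanity check, assuming the fitted classifier above:
In [ ]:
# every predicted probability should be k/13 for some integer k between 0 and 13
p1 = clf.predict_proba(test[featuresTest])[:, 1]
np.unique(np.round(p1 * 13)).astype(int)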
In [14]:
%matplotlib inline 
In [15]:
probs = clf.predict_proba(test[featuresTest])
prob_true = probs[::,1]
pl.hist(prob_true)
Out[15]:
(array([  1.67870000e+04,   1.13980000e+04,   5.64000000e+03,
          2.28200000e+03,   1.04400000e+03,   2.86000000e+02,
          9.40000000e+01,   0.00000000e+00,   4.50000000e+01,
          9.00000000e+00]),
 array([ 0.        ,  0.07692308,  0.15384615,  0.23076923,  0.30769231,
         0.38461538,  0.46153846,  0.53846154,  0.61538462,  0.69230769,
         0.76923077]),
 <a list of 10 Patch objects>)
In [16]:
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
In [17]:
preds = clf.predict_proba(test[featuresTest])
preds
Out[17]:
array([[ 0.92307692,  0.07692308],
       [ 0.84615385,  0.15384615],
       [ 0.61538462,  0.38461538],
       ..., 
       [ 0.92307692,  0.07692308],
       [ 0.92307692,  0.07692308],
       [ 1.        ,  0.        ]])
In [27]:
#print(test['serious_dlqin2yrs'])
In [20]:
confusion_matrix(test['serious_dlqin2yrs'], clf.predict(test[featuresTest]))
Out[20]:
array([[35042,    27],
       [ 2472,    44]])
In [21]:
print (classification_report(test['serious_dlqin2yrs'], clf.predict(test[featuresTest]), labels=[0, 1]))
             precision    recall  f1-score   support

          0       0.93      1.00      0.97     35069
          1       0.62      0.02      0.03      2516

avg / total       0.91      0.93      0.90     37585
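
Recall for class 1 is only 0.02: the model almost never flags a delinquency. One way to trade precision for recall is to lower the decision threshold on the predicted probabilities instead of using the default 0.5. The 0.2 cutoff below is an arbitrary illustration, not a tuned value:
In [ ]:
# flag a row as delinquent whenever its predicted probability exceeds an illustrative cutoff
threshold = 0.2
preds_lower = (clf.predict_proba(test[featuresTest])[:, 1] >= threshold).astype(int)
print(confusion_matrix(test['serious_dlqin2yrs'], preds_lower))
print(classification_report(test['serious_dlqin2yrs'], preds_lower, labels=[0, 1]))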

In [23]:
pd.crosstab(test['serious_dlqin2yrs'], clf.predict(test[featuresTest]), rownames=["Actual"], colnames=["Predicted"])
Out[23]:
Predicted      0   1
Actual
0          35042  27
1           2472  44
In [24]:
def plot_roc(name, probs):
    fpr, tpr, thresholds = roc_curve(test['serious_dlqin2yrs'], probs)
    roc_auc = auc(fpr, tpr)
    pl.clf()
    pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    pl.plot([0, 1], [0, 1], 'k--')
    pl.xlim([0.0, 1.05])
    pl.ylim([0.0, 1.05])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title(name)
    pl.legend(loc="lower right")
    pl.show()
In [26]:
plot_roc("Perfect Classifier", test['serious_dlqin2yrs'])
plot_roc("Guessing", np.random.uniform(0, 1, len(test['serious_dlqin2yrs'])))

#[::,1] selects the 2nd column of the numpy array
plot_roc("KNN", preds[::,1])
In [28]:
clf = RandomForestClassifier()
clf.fit(train[featuresTrain], train.SeriousDlqin2yrs)
Out[28]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
In [29]:
probs = clf.predict_proba(test[featuresTest])[::,1]
plot_roc("RandomForest", probs)
In [30]:
train.head()
Out[30]:
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
0 1 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0
1 2 0 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0
2 3 0 0.658180 38 1 0.085113 3042.0 2 1 0 0 0.0
3 4 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0
4 5 0 0.907239 49 1 0.024926 63588.0 7 0 1 0 0.0
In [31]:
featuresTest2 = ['revolving_utilization_of_unsecured_lines', 'debt_ratio',
            'number_of_times90_days_late', 'number_real_estate_loans_or_lines']

featuresTrain2 = ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio',
            'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines']
In [32]:
clf = GradientBoostingClassifier()
clf.fit(train[featuresTrain2], train.SeriousDlqin2yrs)
Out[32]:
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)
In [33]:
probs = clf.predict_proba(test[featuresTest2])[::,1]
plot_roc("Your Classifier", probs)
In [34]:
# odds of not defaulting; 340 points corresponds to even odds and every doubling adds 40 points
odds = (1 - probs) / probs
score = np.log(odds)*(40/np.log(2)) + 340
pl.hist(score)
Out[34]:
(array([  3.00000000e+00,   6.10000000e+01,   3.30000000e+02,
          7.60000000e+02,   9.49000000e+02,   1.47400000e+03,
          5.11600000e+03,   5.18200000e+03,   8.60600000e+03,
          1.51040000e+04]),
 array([ 213.59444104,  252.24191446,  290.88938789,  329.53686131,
         368.18433474,  406.83180816,  445.47928159,  484.12675502,
         522.77422844,  561.42170187,  600.06917529]),
 <a list of 10 Patch objects>)
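
The scaling above maps the model's output onto a credit-score-like scale: odds here are the odds of not defaulting, a probability of default of 0.5 (even odds) lands at 340 points, and every doubling of the odds adds 40 points. A tiny worked check (to_score is just a helper for this sketch, not part of the pipeline):
In [ ]:
def to_score(p):
    # convert a probability of default into the 40-points-to-double-the-odds scale used above
    odds = (1 - p) / p
    return np.log(odds) * (40 / np.log(2)) + 340

print(to_score(0.5))   # odds = 1 -> 340
print(to_score(1/3))   # odds = 2 -> 380
print(to_score(0.2))   # odds = 4 -> 420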
In [ ]: