Sunday, October 15, 2017

DG 08 Fitting

dg13-08A-Fitting
In [1]:
import pandas as pd
import numpy as np
import pylab as pl
In [2]:
train = pd.read_csv("cs-training-v1.csv")
test = pd.read_csv("credit-data-testset.csv")
test.head()
Out[2]:
serious_dlqin2yrs revolving_utilization_of_unsecured_lines age number_of_time30-59_days_past_due_not_worse debt_ratio monthly_income number_of_open_credit_lines_and_loans number_of_times90_days_late number_real_estate_loans_or_lines number_of_time60-89_days_past_due_not_worse number_of_dependents monthly_income_imputed
0 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0 6017.0
1 1 0.964673 40 3 0.382965 13700.0 9 3 1 1 2.0 2850.0
2 0 0.061086 78 0 2058.000000 2500.0 10 0 2 0 0.0 2500.0
3 0 0.075427 32 0 0.085512 7916.0 6 0 0 0 0.0 4145.0
4 0 0.046560 58 0 0.241622 2416.0 9 0 1 0 0.0 2850.0
In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
In [8]:
# the test CSV uses snake_case column names while the training CSV uses CamelCase,
# so two parallel feature lists are kept in the same order
featuresTest = ['revolving_utilization_of_unsecured_lines', 'debt_ratio',
            'monthly_income', 'age', 'number_of_times90_days_late']

featuresTrain = ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio',
            'MonthlyIncome', 'age', 'NumberOfTimes90DaysLate']
In [6]:
clf = KNeighborsClassifier(n_neighbors=13, warn_on_equidistant=False)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-6-191e958d8c98> in <module>()
----> 1 clf = KNeighborsClassifier(n_neighbors=13, warn_on_equidistant=False)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neighbors\classification.py in __init__(self, n_neighbors, weights, algorithm, leaf_size, p, metric, metric_params, n_jobs, **kwargs)
    124                           algorithm=algorithm,
    125                           leaf_size=leaf_size, metric=metric, p=p,
--> 126                           metric_params=metric_params, n_jobs=n_jobs, **kwargs)
    127         self.weights = _check_weights(weights)
    128 

TypeError: _init_params() got an unexpected keyword argument 'warn_on_equidistant'
In [9]:
# warn_on_equidistant is no longer accepted by this version of scikit-learn, so drop it
clf = KNeighborsClassifier(n_neighbors=13)
In [10]:
clf.fit(train[featuresTrain], train.SeriousDlqin2yrs)
Out[10]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=13, p=2,
           weights='uniform')
In [11]:
#classes (returns an array)
clf.predict(test[featuresTest])
Out[11]:
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
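
The predictions are almost all zeros. A quick way to see how many of each class the model predicts (a minimal check using the classifier and test frame above):
In [ ]:
# count how many test rows are predicted as 0 vs. 1
pd.Series(clf.predict(test[featuresTest])).value_counts()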
In [10]:
# remove all data with missing values
#test = test[np.isfinite(test['MonthlyIncome'])]
#test = test[np.isfinite(test['NumberOfDependents'])]
In [27]:
# remove all data with missing values
#test1 = test[np.isfinite(test['MonthlyIncome'])]
#test2 = test1[np.isfinite(test1['NumberOfDependents'])]
#test2.to_csv("cs-test-v1.csv", index=False)
In [28]:
#testC = pd.read_csv("cs-test-v1.csv")
In [29]:
#classes (returns an array)
#clf.predict(testC[features])
In [12]:
print(clf.predict(test[featuresTest]))
[0 0 0 ..., 0 0 0]
In [13]:
#probabilities (returns a numpy array)
clf.predict_proba(test[featuresTest])
Out[13]:
array([[ 0.92307692,  0.07692308],
       [ 0.84615385,  0.15384615],
       [ 0.61538462,  0.38461538],
       ..., 
       [ 0.92307692,  0.07692308],
       [ 0.92307692,  0.07692308],
       [ 1.        ,  0.        ]])
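
With 13 uniform-weighted neighbors, each probability is just the fraction of positive neighbors, so every value above is a multiple of 1/13 (0.07692308 is 1/13, 0.38461538 is 5/13, and so on). A minimal sanity check, assuming the fitted classifier above:
In [ ]:
# every predicted probability should be k/13 for some integer k between 0 and 13
p1 = clf.predict_proba(test[featuresTest])[:, 1]
np.unique(np.round(p1 * 13)).astype(int)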
In [14]:
%matplotlib inline 
In [15]:
probs = clf.predict_proba(test[featuresTest])
prob_true = probs[::,1]
pl.hist(prob_true)
Out[15]:
(array([  1.67870000e+04,   1.13980000e+04,   5.64000000e+03,
          2.28200000e+03,   1.04400000e+03,   2.86000000e+02,
          9.40000000e+01,   0.00000000e+00,   4.50000000e+01,
          9.00000000e+00]),
 array([ 0.        ,  0.07692308,  0.15384615,  0.23076923,  0.30769231,
         0.38461538,  0.46153846,  0.53846154,  0.61538462,  0.69230769,
         0.76923077]),
 <a list of 10 Patch objects>)
In [16]:
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
In [17]:
preds = clf.predict_proba(test[featuresTest])
preds
Out[17]:
array([[ 0.92307692,  0.07692308],
       [ 0.84615385,  0.15384615],
       [ 0.61538462,  0.38461538],
       ..., 
       [ 0.92307692,  0.07692308],
       [ 0.92307692,  0.07692308],
       [ 1.        ,  0.        ]])
In [27]:
#print(test['serious_dlqin2yrs'])
In [20]:
confusion_matrix(test['serious_dlqin2yrs'], clf.predict(test[featuresTest]))
Out[20]:
array([[35042,    27],
       [ 2472,    44]])
In [21]:
print (classification_report(test['serious_dlqin2yrs'], clf.predict(test[featuresTest]), labels=[0, 1]))
             precision    recall  f1-score   support

          0       0.93      1.00      0.97     35069
          1       0.62      0.02      0.03      2516

avg / total       0.91      0.93      0.90     37585
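
Recall for class 1 is only 0.02: the model almost never flags a delinquency. One way to trade precision for recall is to lower the decision threshold on the predicted probabilities instead of using the default 0.5. The 0.2 cutoff below is an arbitrary illustration, not a tuned value:
In [ ]:
# flag a row as delinquent whenever its predicted probability exceeds an illustrative cutoff
threshold = 0.2
preds_lower = (clf.predict_proba(test[featuresTest])[:, 1] >= threshold).astype(int)
print(confusion_matrix(test['serious_dlqin2yrs'], preds_lower))
print(classification_report(test['serious_dlqin2yrs'], preds_lower, labels=[0, 1]))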

In [23]:
pd.crosstab(test['serious_dlqin2yrs'], clf.predict(test[featuresTest]), rownames=["Actual"], colnames=["Predicted"])
Out[23]:
Predicted      0   1
Actual
0          35042  27
1           2472  44
In [24]:
def plot_roc(name, probs):
    fpr, tpr, thresholds = roc_curve(test['serious_dlqin2yrs'], probs)
    roc_auc = auc(fpr, tpr)
    pl.clf()
    pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    pl.plot([0, 1], [0, 1], 'k--')
    pl.xlim([0.0, 1.05])
    pl.ylim([0.0, 1.05])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title(name)
    pl.legend(loc="lower right")
    pl.show()
In [26]:
plot_roc("Perfect Classifier", test['serious_dlqin2yrs'])
plot_roc("Guessing", np.random.uniform(0, 1, len(test['serious_dlqin2yrs'])))

#[::,1] selects the 2nd column of the numpy array
plot_roc("KNN", preds[::,1])
In [28]:
clf = RandomForestClassifier()
clf.fit(train[featuresTrain], train.SeriousDlqin2yrs)
Out[28]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
In [29]:
probs = clf.predict_proba(test[featuresTest])[::,1]
plot_roc("RandomForest", probs)
In [30]:
train.head()
Out[30]:
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
0 1 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0
1 2 0 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0
2 3 0 0.658180 38 1 0.085113 3042.0 2 1 0 0 0.0
3 4 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0
4 5 0 0.907239 49 1 0.024926 63588.0 7 0 1 0 0.0
In [31]:
featuresTest2 = ['revolving_utilization_of_unsecured_lines', 'debt_ratio',
            'number_of_times90_days_late', 'number_real_estate_loans_or_lines']

featuresTrain2 = ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio',
            'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines']
In [32]:
clf = GradientBoostingClassifier()
clf.fit(train[featuresTrain2], train.SeriousDlqin2yrs)
Out[32]:
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)
In [33]:
probs = clf.predict_proba(test[featuresTest2])[::,1]
plot_roc("Your Classifier", probs)
In [34]:
# odds of not defaulting; 340 points corresponds to even odds and every doubling adds 40 points
odds = (1 - probs) / probs
score = np.log(odds)*(40/np.log(2)) + 340
pl.hist(score)
Out[34]:
(array([  3.00000000e+00,   6.10000000e+01,   3.30000000e+02,
          7.60000000e+02,   9.49000000e+02,   1.47400000e+03,
          5.11600000e+03,   5.18200000e+03,   8.60600000e+03,
          1.51040000e+04]),
 array([ 213.59444104,  252.24191446,  290.88938789,  329.53686131,
         368.18433474,  406.83180816,  445.47928159,  484.12675502,
         522.77422844,  561.42170187,  600.06917529]),
 <a list of 10 Patch objects>)
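
The scaling above maps the model's output onto a credit-score-like scale: odds here are the odds of not defaulting, a probability of default of 0.5 (even odds) lands at 340 points, and every doubling of the odds adds 40 points. A tiny worked check (to_score is just a helper for this sketch, not part of the pipeline):
In [ ]:
def to_score(p):
    # convert a probability of default into the 40-points-to-double-the-odds scale used above
    odds = (1 - p) / p
    return np.log(odds) * (40 / np.log(2)) + 340

print(to_score(0.5))   # odds = 1 -> 340
print(to_score(1/3))   # odds = 2 -> 380
print(to_score(0.2))   # odds = 4 -> 420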
In [ ]: