In [1]:
import pandas as pd
import numpy as np
import pylab as pl
In [2]:
# Read the training and evaluation CSV files from the working directory
# (training file first), then preview the first rows of the evaluation frame.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ("cs-training-v1.csv", "credit-data-testset.csv")
)
test.head()
Out[2]:
In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
In [8]:
# The two files name their columns differently (snake_case in the test file,
# CamelCase in the training file), so each gets its own list. Both lists are
# written in the same order, one entry per line.
featuresTest = [
    'revolving_utilization_of_unsecured_lines',
    'debt_ratio',
    'monthly_income',
    'age',
    'number_of_times90_days_late',
]
featuresTrain = [
    'RevolvingUtilizationOfUnsecuredLines',
    'DebtRatio',
    'MonthlyIncome',
    'age',
    'NumberOfTimes90DaysLate',
]
In [6]:
# `warn_on_equidistant` is no longer accepted by scikit-learn's
# KNeighborsClassifier (passing it raises TypeError), so only the
# neighbour count is supplied.
clf = KNeighborsClassifier(n_neighbors=13)
In [9]:
# k-nearest-neighbours classifier configured to consult 13 neighbours.
clf = KNeighborsClassifier(
    n_neighbors=13,
)
In [10]:
# Fit on the selected training columns; the target is the SeriousDlqin2yrs column.
clf.fit(
    X=train[featuresTrain],
    y=train["SeriousDlqin2yrs"],
)
Out[10]:
In [11]:
# Predicted class label for every test row (returned as an array).
clf.predict(X=test[featuresTest])
Out[11]:
In [10]:
# (disabled) drop test rows whose MonthlyIncome or NumberOfDependents is missing
#test = test[np.isfinite(test['MonthlyIncome'])]
#test = test[np.isfinite(test['NumberOfDependents'])]
In [27]:
# (disabled) drop rows with a missing MonthlyIncome or NumberOfDependents and save the result to cs-test-v1.csv
#test1 = test[np.isfinite(test['MonthlyIncome'])]
#test2 = test1[np.isfinite(test1['NumberOfDependents'])]
#test2.to_csv("cs-test-v1.csv", index=False)
In [28]:
#testC = pd.read_csv("cs-test-v1.csv")
In [29]:
#classes (returns an array)
#clf.predict(testC[features])
Out[29]:
In [12]:
# Plain-text dump of the predicted class labels for the test rows.
print(
    clf.predict(X=test[featuresTest])
)
In [13]:
# Per-class probabilities for every test row (returned as a numpy array,
# one column per class).
clf.predict_proba(X=test[featuresTest])
Out[13]:
In [14]:
%matplotlib inline
In [15]:
# Histogram of the second probability column returned by the classifier.
probs = clf.predict_proba(X=test[featuresTest])
prob_true = probs[:, 1]  # every row, column index 1
pl.hist(prob_true)
Out[15]:
In [16]:
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
In [17]:
# Keep the classifier's probability matrix under `preds` for the ROC plots
# further down, and display it.
preds = clf.predict_proba(X=test[featuresTest])
preds
Out[17]:
In [27]:
#print(test['serious_dlqin2yrs'])
In [20]:
# Confusion matrix of actual test labels against the classifier's predictions.
confusion_matrix(
    y_true=test['serious_dlqin2yrs'],
    y_pred=clf.predict(test[featuresTest]),
)
Out[20]:
In [21]:
# Text report of the per-class metrics, restricted to labels 0 and 1.
print(
    classification_report(
        y_true=test['serious_dlqin2yrs'],
        y_pred=clf.predict(test[featuresTest]),
        labels=[0, 1],
    )
)
In [23]:
# Same comparison as a labelled table: rows are actual labels, columns are
# predicted labels.
pd.crosstab(
    index=test['serious_dlqin2yrs'],
    columns=clf.predict(test[featuresTest]),
    rownames=["Actual"],
    colnames=["Predicted"],
)
Out[23]:
In [24]:
def plot_roc(name, probs, y_true=None):
    """Draw and show a ROC curve for one set of scores.

    Parameters
    ----------
    name : str
        Text used as the plot title.
    probs : array-like
        Score for each sample, passed to ``roc_curve`` together with the labels.
    y_true : array-like, optional
        True labels. When omitted, ``test['serious_dlqin2yrs']`` is read from
        the notebook namespace at call time, which is what this function
        always used before the parameter existed.

    Returns
    -------
    None
        The figure is rendered with ``pl.show()``.
    """
    if y_true is None:
        y_true = test['serious_dlqin2yrs']

    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(y_true, probs)
    roc_auc = auc(fpr, tpr)

    pl.clf()  # clear the current figure before drawing this curve
    pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    pl.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as a reference line
    pl.xlim([0.0, 1.05])
    pl.ylim([0.0, 1.05])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title(name)
    pl.legend(loc="lower right")
    pl.show()
In [26]:
# Upper reference: scoring with the true labels themselves.
plot_roc("Perfect Classifier", test['serious_dlqin2yrs'])
# Lower reference: uniform random scores. A seeded generator is used so the
# curve is identical on every Restart & Run All.
plot_roc("Guessing", np.random.RandomState(0).uniform(0, 1, len(test['serious_dlqin2yrs'])))
# [:, 1] selects the 2nd column of the numpy array
plot_roc("KNN", preds[:, 1])
In [28]:
# Random forest on the same feature columns. A fixed random_state makes the
# fitted model, and every plot derived from it, repeatable across kernel restarts.
clf = RandomForestClassifier(random_state=42)
clf.fit(train[featuresTrain], train.SeriousDlqin2yrs)
Out[28]:
In [29]:
# Score the test rows with the current classifier, keep the second probability
# column, and plot its ROC curve.
probs = clf.predict_proba(X=test[featuresTest])[:, 1]
plot_roc(name="RandomForest", probs=probs)
In [30]:
# Preview the first five training rows to check the available column names.
train.head(n=5)
Out[30]:
In [31]:
# Second feature set: four columns, again listed once with the test file's
# snake_case names and once with the training file's CamelCase names, in the
# same order.
featuresTest2 = [
    'revolving_utilization_of_unsecured_lines',
    'debt_ratio',
    'number_of_times90_days_late',
    'number_real_estate_loans_or_lines',
]
featuresTrain2 = [
    'RevolvingUtilizationOfUnsecuredLines',
    'DebtRatio',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
]
In [32]:
# Gradient boosting on the second feature set. random_state is pinned so the
# fitted model is repeatable across kernel restarts.
clf = GradientBoostingClassifier(random_state=42)
clf.fit(train[featuresTrain2], train.SeriousDlqin2yrs)
Out[32]:
In [33]:
# Score the test rows on the second feature set, keep the second probability
# column, and plot its ROC curve.
probs = clf.predict_proba(X=test[featuresTest2])[:, 1]
plot_roc(name="Your Classifier", probs=probs)
In [34]:
# Turn the predicted probabilities into a points score and plot its distribution.
#
# odds  = (1 - p) / p
# score = log2(odds) * 40 + 340
# so every doubling of the odds adds 40 points, and odds of 1:1 score 340.
#
# A probability of exactly 0 or 1 would give a division by zero or log(0),
# and the resulting inf values make the histogram call fail. Probabilities are
# therefore clipped to the open interval (0, 1); in-range values are unchanged.
eps = np.finfo(float).eps
safe_probs = np.clip(probs, eps, 1 - eps)
odds = (1 - safe_probs) / safe_probs
score = np.log(odds) * (40 / np.log(2)) + 340
pl.hist(score)
Out[34]:
In [ ]: