In [1]:
import pandas as pd
import numpy as np
import pylab as pl
In [2]:
# Load the raw training data; "cs-training.csv" must sit next to this notebook.
df = pd.read_csv("cs-training.csv")
In [3]:
# Peek at the first rows to inspect the columns and value ranges.
df.head()
Out[3]:
In [8]:
In [4]:
from sklearn.ensemble import RandomForestClassifier
# Predictor column names used to train the classifier, one per line for
# readability (the target column 'SeriousDlqin2yrs' is deliberately absent).
features = np.array([
    'RevolvingUtilizationOfUnsecuredLines',
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'DebtRatio',
    'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
])
In [5]:
# Train a baseline random forest on the raw frame.
# FIX: the original passed compute_importances=True, a constructor argument
# that was removed from scikit-learn (importances are always available via
# clf.feature_importances_ after fit); passing it raises TypeError on any
# modern version, so it is dropped here.
clf = RandomForestClassifier()
clf.fit(df[features], df['SeriousDlqin2yrs'])
In [6]:
# Retrain without the unsupported keyword argument.
# NOTE(review): MonthlyIncome / NumberOfDependents still contain missing
# values at this point (the next cell filters them with np.isfinite), so
# this fit presumably fails on NaNs — confirm against the original run.
clf = RandomForestClassifier()
clf.fit(df[features], df['SeriousDlqin2yrs'])
In [ ]:
# remove all data with missing values
# remove all data with missing values
# BUG FIX: the second filter originally built its boolean mask from the full
# frame (df['NumberOfDependents']) while indexing the already-filtered df1;
# the mask length no longer matches df1, so it must be computed from df1.
df1 = df[np.isfinite(df['MonthlyIncome'])]
df2 = df1[np.isfinite(df1['NumberOfDependents'])]
# Persist the cleaned frame for the rest of the analysis.
df2.to_csv("cs-training-v1.csv", index=False)
In [9]:
# create new dataframe with clean values
dfC = pd.read_csv("cs-training-v1.csv")
In [10]:
# Fit the random forest on the cleaned data (rows with missing values removed).
clf = RandomForestClassifier()
clf.fit(dfC[features], dfC['SeriousDlqin2yrs'])
Out[10]:
In [11]:
# from the calculated importances, order them from most to least important
# and make a barplot so we can visualize what is/isn't important
importances = clf.feature_importances_
sorted_idx = np.argsort(importances)
In [12]:
# Horizontal bar chart of feature importances, least important at the
# bottom, most important at the top.
y_pos = np.arange(len(features)) + 0.5
pl.barh(y_pos, importances[sorted_idx], align='center')
pl.yticks(y_pos, features[sorted_idx])
pl.title("Variable Importance")
pl.xlabel("Relative Importance")
pl.show()
In [13]:
# Bin MonthlyIncome into 15 equal-width intervals and count rows per bin;
# extreme incomes make most bins nearly empty, hence the verdict below.
dfC['income_bins'] = pd.cut(dfC.MonthlyIncome, bins=15)
pd.value_counts(dfC['income_bins'])
# not very helpful
Out[13]:
In [14]:
# Re-inspect the frame after adding the income_bins column.
dfC.head()
Out[14]:
In [15]:
def cap_values(x, cap):
    """Clamp a single value to an upper bound.

    Returns ``cap`` when ``x`` exceeds it, otherwise ``x`` unchanged.
    NaN fails every comparison, so ``min`` keeps its first argument and a
    NaN ``x`` passes through untouched.
    """
    return min(x, cap)
# Cap monthly income at 15,000 to tame the long right tail.
# IDIOM: Series.clip is the vectorized equivalent of apply(cap_values) and
# treats NaN the same way (NaN stays NaN).
dfC.MonthlyIncome = dfC.MonthlyIncome.clip(upper=15000)
In [16]:
# Verify the cap took effect: max should now be 15000.
dfC.MonthlyIncome.describe()
Out[16]:
In [17]:
# Re-bin the capped income; labels=False yields integer bin codes (0..14),
# which are far easier to group and plot on than Interval labels.
dfC['income_bins'] = pd.cut(dfC.MonthlyIncome, bins=15, labels=False)
pd.value_counts(dfC.income_bins)
Out[17]:
In [18]:
# Mean delinquency rate (SeriousDlqin2yrs is 0/1) per income bin.
dfC[["income_bins", "SeriousDlqin2yrs"]].groupby("income_bins").mean()
Out[18]:
In [20]:
# Render matplotlib figures inline (conventionally this sits in the first cell).
%matplotlib inline
In [21]:
# Plot delinquency rate against income bin.
cols = ["income_bins", "SeriousDlqin2yrs"]
dfC[cols].groupby("income_bins").mean().plot()
Out[21]:
In [26]:
# Hand-picked age edges: a wide bucket up to 20, 5-year buckets through
# age 80, and one wide bucket for 80-120.
mybins = [0] + list(range(20, 85, 5)) + [120]
dfC['age_bucket'] = pd.cut(dfC.age, bins=mybins)
dfC['age_bucket'].value_counts()
Out[26]:
In [28]:
# Mean delinquency rate per age bucket.
dfC[["age_bucket", "SeriousDlqin2yrs"]].groupby("age_bucket").mean()
Out[28]:
In [30]:
# Same per-age-bucket rates, as a line plot.
dfC[["age_bucket", "SeriousDlqin2yrs"]].groupby("age_bucket").mean().plot()
Out[30]:
In [31]:
# Quintile edges of DebtRatio (20th through 100th percentiles).
# NOTE(review): there is no lower edge here, so when these are fed to
# pd.cut, values at or below the 20th percentile fall outside every
# interval and come back as NaN — confirm that is intended.
bins2 = [dfC.DebtRatio.quantile(q) for q in (0.2, 0.4, 0.6, 0.8, 1.0)]
In [33]:
# Cut DebtRatio on the quintile edges computed above.
# NOTE(review): bins2 starts at the 20th percentile, so roughly the lowest
# 20% of values fall outside every interval and become NaN — verify before
# relying on debt_ratio_binned downstream.
debt_ratio_binned = pd.cut(dfC.DebtRatio, bins=bins2)
debt_ratio_binned  # not the cell's last expression, so this displays nothing
print (pd.value_counts(debt_ratio_binned))
In [34]:
from sklearn.preprocessing import StandardScaler
# Standardize MonthlyIncome to zero mean / unit variance.
# FIX: modern scikit-learn requires 2D input to fit_transform (a 1D Series
# raises "Expected 2D array"), so select a one-column frame and flatten the
# resulting (n, 1) array back to 1D before assigning it as a column.
dfC['monthly_income_scaled'] = StandardScaler().fit_transform(
    dfC[['MonthlyIncome']]).ravel()
In [35]:
# Sanity-check the standardized column: mean should round to 0.
print(dfC.monthly_income_scaled.describe())
# FIX: bare `print` is a Python 2 relic — in Python 3 it is a no-op
# expression, so call it to emit the intended blank line.
print()
print("Mean at 0?", round(dfC.monthly_income_scaled.mean(), 10) == 0)
# Histogram of the scaled values.
pl.hist(dfC.monthly_income_scaled)
Out[35]:
In [39]:
# redo the process once again
features = np.array(['RevolvingUtilizationOfUnsecuredLines',
'age', 'NumberOfTime30-59DaysPastDueNotWorse',
'DebtRatio', 'MonthlyIncome','NumberOfOpenCreditLinesAndLoans',
'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines',
'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents',
'income_bins', 'age_bucket', 'monthly_income_scaled'])
clf = RandomForestClassifier()
clf.fit(dfC[features], dfC['SeriousDlqin2yrs'])
In [40]:
# redo the process once again
# problem with age_bucket -- remove it!!
# Same feature set as before plus the two engineered NUMERIC columns
# (income_bins, monthly_income_scaled); the categorical age_bucket is out.
features = np.array(['RevolvingUtilizationOfUnsecuredLines',
'age', 'NumberOfTime30-59DaysPastDueNotWorse',
'DebtRatio', 'MonthlyIncome','NumberOfOpenCreditLinesAndLoans',
'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines',
'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents',
'income_bins', 'monthly_income_scaled'])
clf = RandomForestClassifier()
clf.fit(dfC[features], dfC['SeriousDlqin2yrs'])
Out[40]:
In [41]:
importances = clf.feature_importances_
sorted_idx = np.argsort(importances)
padding = np.arange(len(features)) + 0.5
pl.barh(padding, importances[sorted_idx], align='center')
pl.yticks(padding, features[sorted_idx])
pl.xlabel("Relative Importance")
pl.title("Variable Importance")
pl.show()
In [42]:
# Feature names ordered most -> least important (reverse of ascending argsort).
best_features = features[sorted_idx][::-1]
best_features
Out[42]:
In [ ]:
No comments:
Post a Comment