Sunday, October 15, 2017

DG 07 Feature Selection

dg13-07-Feature-Selection
In [1]:
import pandas as pd
import numpy as np
import pylab as pl
In [2]:
df = pd.read_csv("cs-training.csv")
In [3]:
df.head()
Out[3]:
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
0 1 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0
1 2 0 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0
2 3 0 0.658180 38 1 0.085113 3042.0 2 1 0 0 0.0
3 4 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0
4 5 0 0.907239 49 1 0.024926 63588.0 7 0 1 0 0.0
In [4]:
from sklearn.ensemble import RandomForestClassifier

features = np.array(['RevolvingUtilizationOfUnsecuredLines',
                     'age', 'NumberOfTime30-59DaysPastDueNotWorse',
                     'DebtRatio', 'MonthlyIncome','NumberOfOpenCreditLinesAndLoans', 
                     'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines',
                     'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents'])
In [5]:
clf = RandomForestClassifier(compute_importances=True)
clf.fit(df[features], df['SeriousDlqin2yrs'])
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-5-87c9a3dc81e9> in <module>()
----> 1 clf = RandomForestClassifier(compute_importances=True)
      2 clf.fit(df[features], df['SeriousDlqin2yrs'])

TypeError: __init__() got an unexpected keyword argument 'compute_importances'
In [6]:
# compute_importances was removed from newer scikit-learn; importances are
# always computed now, so just drop the argument
clf = RandomForestClassifier()
clf.fit(df[features], df['SeriousDlqin2yrs'])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-6-8b18593e35e1> in <module>()
      1 clf = RandomForestClassifier()
----> 2 clf.fit(df[features], df['SeriousDlqin2yrs'])

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in fit(self, X, y, sample_weight)
    245         """
    246         # Validate or convert input data
--> 247         X = check_array(X, accept_sparse="csc", dtype=DTYPE)
    248         y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
    249         if issparse(X):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    405                              % (array.ndim, estimator_name))
    406         if force_all_finite:
--> 407             _assert_all_finite(array)
    408 
    409     shape_repr = _shape_repr(array.shape)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in _assert_all_finite(X)
     56             and not np.isfinite(X).all()):
     57         raise ValueError("Input contains NaN, infinity"
---> 58                          " or a value too large for %r." % X.dtype)
     59 
     60 

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').
In [ ]:
# remove all rows with missing values in MonthlyIncome or NumberOfDependents
df1 = df[np.isfinite(df['MonthlyIncome'])]
df2 = df1[np.isfinite(df1['NumberOfDependents'])]  # build the mask on df1 so its index matches
df2.to_csv("cs-training-v1.csv", index=False)
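The same cleanup can be done in one step with dropna; a minimal sketch, equivalent to the isfinite filtering above:

# sketch: drop rows missing MonthlyIncome or NumberOfDependents in one call
df2 = df.dropna(subset=['MonthlyIncome', 'NumberOfDependents'])
df2.to_csv("cs-training-v1.csv", index=False)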
In [9]:
# create new dataframe with clean values
dfC = pd.read_csv("cs-training-v1.csv")
In [10]:
clf = RandomForestClassifier()
clf.fit(dfC[features], dfC['SeriousDlqin2yrs'])
Out[10]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
In [11]:
# from the calculated importances, order them from most to least important
# and make a barplot so we can visualize what is/isn't important
importances = clf.feature_importances_
sorted_idx = np.argsort(importances)
In [12]:
padding = np.arange(len(features)) + 0.5
pl.barh(padding, importances[sorted_idx], align='center')
pl.yticks(padding, features[sorted_idx])
pl.xlabel("Relative Importance")
pl.title("Variable Importance")
pl.show()
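Since the bar chart itself is not reproduced in this post, here is a small optional sketch that prints the same importances as a ranked list:

# sketch: print the importances from most to least important
for name, score in sorted(zip(features, importances), key=lambda t: -t[1]):
    print("{:40s} {:.4f}".format(name, score))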
In [13]:
dfC['income_bins'] = pd.cut(dfC.MonthlyIncome, bins=15)
pd.value_counts(dfC['income_bins'])
# not very helpful
Out[13]:
(-3008.75, 200583.333]        120240
(200583.333, 401166.667]          14
(601750, 802333.333]               5
(401166.667, 601750]               5
(2808166.667, 3008750]             1
(1604666.667, 1805250]             1
(1404083.333, 1604666.667]         1
(1002916.667, 1203500]             1
(802333.333, 1002916.667]          1
(2607583.333, 2808166.667]         0
(2407000, 2607583.333]             0
(2206416.667, 2407000]             0
(2005833.333, 2206416.667]         0
(1805250, 2005833.333]             0
(1203500, 1404083.333]             0
Name: income_bins, dtype: int64
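Equal-width bins are dominated by the skew in MonthlyIncome, so nearly every row lands in the first bucket. A quantile-based alternative (a sketch; duplicates='drop' assumes pandas 0.20 or newer) gives roughly equal counts per bucket:

# sketch: quantile bins handle the skew better than equal-width bins
income_qbins = pd.qcut(dfC.MonthlyIncome, q=15, duplicates='drop')
print(pd.value_counts(income_qbins))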
In [14]:
dfC.head()
Out[14]:
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents income_bins
0 1 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0 (-3008.75, 200583.333]
1 2 0 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0 (-3008.75, 200583.333]
2 3 0 0.658180 38 1 0.085113 3042.0 2 1 0 0 0.0 (-3008.75, 200583.333]
3 4 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0 (-3008.75, 200583.333]
4 5 0 0.907239 49 1 0.024926 63588.0 7 0 1 0 0.0 (-3008.75, 200583.333]
In [15]:
def cap_values(x, cap):
    if x > cap:
        return cap
    else:
        return x
    
dfC.MonthlyIncome = dfC.MonthlyIncome.apply(lambda x: cap_values(x, 15000))
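As an aside, pandas can do the same capping without a helper function; a one-line sketch using Series.clip:

# sketch: clip(upper=...) caps every value above 15000, same effect as cap_values
dfC.MonthlyIncome = dfC.MonthlyIncome.clip(upper=15000)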
In [16]:
dfC.MonthlyIncome.describe()
Out[16]:
count    120269.000000
mean       6135.401999
std        3693.385241
min           0.000000
25%        3400.000000
50%        5400.000000
75%        8249.000000
max       15000.000000
Name: MonthlyIncome, dtype: float64
In [17]:
dfC['income_bins'] = pd.cut(dfC.MonthlyIncome, bins=15, labels=False)
pd.value_counts(dfC.income_bins)
Out[17]:
4     15461
3     15314
5     13215
2     13081
6     10720
7      9024
1      7208
8      7131
14     6696
9      6001
0      4795
10     4562
11     3015
12     2460
13     1586
Name: income_bins, dtype: int64
In [18]:
dfC[["income_bins", "SeriousDlqin2yrs"]].groupby("income_bins").mean()
Out[18]:
SeriousDlqin2yrs
income_bins
0 0.051512
1 0.106410
2 0.096552
3 0.088089
4 0.076774
5 0.069467
6 0.061940
7 0.056405
8 0.050624
9 0.049992
10 0.039895
11 0.040133
12 0.041463
13 0.040984
14 0.048088
In [20]:
%matplotlib inline 
In [21]:
cols = ["income_bins", "SeriousDlqin2yrs"]
dfC[cols].groupby("income_bins").mean().plot()
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x2009e8ad550>
In [26]:
mybins = [0,20,25,30,35,40,45,50,55,60,65,70,75,80,120]
dfC['age_bucket'] = pd.cut(dfC.age, bins=mybins)
pd.value_counts(dfC['age_bucket'])
Out[26]:
(45, 50]     15736
(50, 55]     14576
(40, 45]     13833
(55, 60]     13326
(60, 65]     12440
(35, 40]     11690
(30, 35]      9341
(65, 70]      8035
(25, 30]      6629
(70, 75]      5450
(75, 80]      3681
(80, 120]     3108
(20, 25]      2423
(0, 20]          0
Name: age_bucket, dtype: int64
In [28]:
dfC[["age_bucket", "SeriousDlqin2yrs"]].groupby("age_bucket").mean()
Out[28]:
SeriousDlqin2yrs
age_bucket
(0, 20] NaN
(20, 25] 0.100702
(25, 30] 0.114949
(30, 35] 0.102880
(35, 40] 0.091018
(40, 45] 0.084653
(45, 50] 0.081596
(50, 55] 0.071968
(55, 60] 0.055380
(60, 65] 0.042283
(65, 70] 0.029620
(70, 75] 0.029725
(75, 80] 0.022548
(80, 120] 0.024131
In [30]:
dfC[["age_bucket", "SeriousDlqin2yrs"]].groupby("age_bucket").mean().plot()
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x2009e8eccf8>
In [31]:
bins2 = []

for q in [0.2, 0.4, 0.6, 0.8, 1.0]:
    bins2.append(dfC.DebtRatio.quantile(q))
In [33]:
debt_ratio_binned = pd.cut(dfC.DebtRatio, bins=bins2)
print(pd.value_counts(debt_ratio_binned))
(0.543, 61106.5]    24054
(0.36, 0.543]       24054
(0.108, 0.237]      24054
(0.237, 0.36]       24053
Name: DebtRatio, dtype: int64
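Because bins2 starts at the 20th-percentile value, pd.cut leaves the lowest roughly 20% of rows unbinned (the four bins above hold about 96k of the 120k rows). A sketch using pd.qcut, which builds the quantile edges itself and includes the minimum:

# sketch: qcut covers all rows with five roughly equal-sized quantile bins
debt_ratio_qbinned = pd.qcut(dfC.DebtRatio, q=5)
print(pd.value_counts(debt_ratio_qbinned))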
In [34]:
from sklearn.preprocessing import StandardScaler

dfC['monthly_income_scaled'] = StandardScaler().fit_transform(dfC.MonthlyIncome)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:586: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:649: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
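The deprecation warning says 1-D input will stop being accepted in scikit-learn 0.19; passing a one-column (2-D) frame avoids it. A warning-free sketch:

# sketch: give the scaler 2-D input, then flatten the (n, 1) result for the column
scaled = StandardScaler().fit_transform(dfC[['MonthlyIncome']])
dfC['monthly_income_scaled'] = scaled.ravel()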
In [35]:
print(dfC.monthly_income_scaled.describe())
print()
print("Mean at 0?", round(dfC.monthly_income_scaled.mean(), 10) == 0)

pl.hist(dfC.monthly_income_scaled)
count    1.202690e+05
mean    -1.062443e-16
std      1.000004e+00
min     -1.661193e+00
25%     -7.406250e-01
50%     -1.991141e-01
75%      5.722682e-01
max      2.400138e+00
Name: monthly_income_scaled, dtype: float64
Mean at 0? True
Out[35]:
(array([  7055.,  18029.,  22751.,  21239.,  15442.,  11433.,   8674.,
          4904.,   3275.,   7467.]),
 array([-1.66119349, -1.25506032, -0.84892716, -0.44279399, -0.03666083,
         0.36947234,  0.7756055 ,  1.18173867,  1.58787183,  1.99400499,
         2.40013816]),
 <a list of 10 Patch objects>)
In [39]:
# redo the process once again
features = np.array(['RevolvingUtilizationOfUnsecuredLines',
                     'age', 'NumberOfTime30-59DaysPastDueNotWorse',
                     'DebtRatio', 'MonthlyIncome','NumberOfOpenCreditLinesAndLoans', 
                     'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines',
                     'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents',
                     'income_bins', 'age_bucket', 'monthly_income_scaled'])

clf = RandomForestClassifier()
clf.fit(dfC[features], dfC['SeriousDlqin2yrs'])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-39-d9b4e87303fc> in <module>()
      8 
      9 clf = RandomForestClassifier()
---> 10 clf.fit(dfC[features], dfC['SeriousDlqin2yrs'])

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in fit(self, X, y, sample_weight)
    245         """
    246         # Validate or convert input data
--> 247         X = check_array(X, accept_sparse="csc", dtype=DTYPE)
    248         y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
    249         if issparse(X):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    380                                       force_all_finite)
    381     else:
--> 382         array = np.array(array, dtype=dtype, order=order, copy=copy)
    383 
    384         if ensure_2d:

ValueError: could not convert string to float: '(60, 65]'
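Dropping age_bucket is the simplest fix (done in the next cell); an alternative sketch, if the buckets should stay in the model, is to encode the intervals as integer codes:

# sketch: labels=False makes pd.cut return integer bin indices the forest can use
dfC['age_bucket_code'] = pd.cut(dfC.age, bins=mybins, labels=False)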
In [40]:
# redo the process once again
# age_bucket holds interval values that can't be converted to float -- drop it
features = np.array(['RevolvingUtilizationOfUnsecuredLines',
                     'age', 'NumberOfTime30-59DaysPastDueNotWorse',
                     'DebtRatio', 'MonthlyIncome','NumberOfOpenCreditLinesAndLoans', 
                     'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines',
                     'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents',
                     'income_bins',  'monthly_income_scaled'])

clf = RandomForestClassifier()
clf.fit(dfC[features], dfC['SeriousDlqin2yrs'])
Out[40]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
In [41]:
importances = clf.feature_importances_
sorted_idx = np.argsort(importances)

padding = np.arange(len(features)) + 0.5
pl.barh(padding, importances[sorted_idx], align='center')
pl.yticks(padding, features[sorted_idx])
pl.xlabel("Relative Importance")
pl.title("Variable Importance")
pl.show()
In [42]:
best_features = features[sorted_idx][::-1]
best_features
Out[42]:
array(['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio',
       'monthly_income_scaled', 'MonthlyIncome', 'age',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberOfTime30-59DaysPastDueNotWorse',
       'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents',
       'income_bins', 'NumberRealEstateLoansOrLines'], 
      dtype='<U36')
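With the ranking in hand, one could keep only the strongest predictors and refit; a sketch with an arbitrary top-5 cutoff:

# sketch: refit on the top five ranked features only (the cutoff is arbitrary)
top5 = best_features[:5]
clf_top = RandomForestClassifier()
clf_top.fit(dfC[top5], dfC['SeriousDlqin2yrs'])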