Sunday, October 15, 2017

DG 07 Feature Selection

dg13-07-Feature-Selection
In [1]:
import pandas as pd
import numpy as np
import pylab as pl
In [2]:
df = pd.read_csv("cs-training.csv")
In [3]:
df.head()
Out[3]:
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
0 1 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0
1 2 0 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0
2 3 0 0.658180 38 1 0.085113 3042.0 2 1 0 0 0.0
3 4 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0
4 5 0 0.907239 49 1 0.024926 63588.0 7 0 1 0 0.0
In [4]:
from sklearn.ensemble import RandomForestClassifier

features = np.array(['RevolvingUtilizationOfUnsecuredLines',
                     'age', 'NumberOfTime30-59DaysPastDueNotWorse',
                     'DebtRatio', 'MonthlyIncome','NumberOfOpenCreditLinesAndLoans', 
                     'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines',
                     'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents'])
In [5]:
clf = RandomForestClassifier(compute_importances=True)
clf.fit(df[features], df['SeriousDlqin2yrs'])
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-5-87c9a3dc81e9> in <module>()
----> 1 clf = RandomForestClassifier(compute_importances=True)
      2 clf.fit(df[features], df['SeriousDlqin2yrs'])

TypeError: __init__() got an unexpected keyword argument 'compute_importances'
In [6]:
# compute_importances was removed from newer scikit-learn; importances are
# always computed now, so just drop the argument
clf = RandomForestClassifier()
clf.fit(df[features], df['SeriousDlqin2yrs'])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-6-8b18593e35e1> in <module>()
      1 clf = RandomForestClassifier()
----> 2 clf.fit(df[features], df['SeriousDlqin2yrs'])

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in fit(self, X, y, sample_weight)
    245         """
    246         # Validate or convert input data
--> 247         X = check_array(X, accept_sparse="csc", dtype=DTYPE)
    248         y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
    249         if issparse(X):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    405                              % (array.ndim, estimator_name))
    406         if force_all_finite:
--> 407             _assert_all_finite(array)
    408 
    409     shape_repr = _shape_repr(array.shape)

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in _assert_all_finite(X)
     56             and not np.isfinite(X).all()):
     57         raise ValueError("Input contains NaN, infinity"
---> 58                          " or a value too large for %r." % X.dtype)
     59 
     60 

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').
In [ ]:
# remove all rows with missing values in MonthlyIncome or NumberOfDependents
df1 = df[np.isfinite(df['MonthlyIncome'])]
df2 = df1[np.isfinite(df1['NumberOfDependents'])]  # build the mask on df1 so its index matches
df2.to_csv("cs-training-v1.csv", index=False)
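The same cleanup can be done in one step with dropna; a minimal sketch, equivalent to the isfinite filtering above:

# sketch: drop rows missing MonthlyIncome or NumberOfDependents in one call
df2 = df.dropna(subset=['MonthlyIncome', 'NumberOfDependents'])
df2.to_csv("cs-training-v1.csv", index=False)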
In [9]:
# create new dataframe with clean values
dfC = pd.read_csv("cs-training-v1.csv")
In [10]:
clf = RandomForestClassifier()
clf.fit(dfC[features], dfC['SeriousDlqin2yrs'])
Out[10]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
In [11]:
# from the calculated importances, order them from most to least important
# and make a barplot so we can visualize what is/isn't important
importances = clf.feature_importances_
sorted_idx = np.argsort(importances)
In [12]:
padding = np.arange(len(features)) + 0.5
pl.barh(padding, importances[sorted_idx], align='center')
pl.yticks(padding, features[sorted_idx])
pl.xlabel("Relative Importance")
pl.title("Variable Importance")
pl.show()
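Since the bar chart itself is not reproduced in this post, here is a small optional sketch that prints the same importances as a ranked list:

# sketch: print the importances from most to least important
for name, score in sorted(zip(features, importances), key=lambda t: -t[1]):
    print("{:40s} {:.4f}".format(name, score))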
In [13]:
dfC['income_bins'] = pd.cut(dfC.MonthlyIncome, bins=15)
pd.value_counts(dfC['income_bins'])
# not very helpful
Out[13]:
(-3008.75, 200583.333]        120240
(200583.333, 401166.667]          14
(601750, 802333.333]               5
(401166.667, 601750]               5
(2808166.667, 3008750]             1
(1604666.667, 1805250]             1
(1404083.333, 1604666.667]         1
(1002916.667, 1203500]             1
(802333.333, 1002916.667]          1
(2607583.333, 2808166.667]         0
(2407000, 2607583.333]             0
(2206416.667, 2407000]             0
(2005833.333, 2206416.667]         0
(1805250, 2005833.333]             0
(1203500, 1404083.333]             0
Name: income_bins, dtype: int64
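Equal-width bins are dominated by the skew in MonthlyIncome, so nearly every row lands in the first bucket. A quantile-based alternative (a sketch; duplicates='drop' assumes pandas 0.20 or newer) gives roughly equal counts per bucket:

# sketch: quantile bins handle the skew better than equal-width bins
income_qbins = pd.qcut(dfC.MonthlyIncome, q=15, duplicates='drop')
print(pd.value_counts(income_qbins))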
In [14]:
dfC.head()
Out[14]:
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents income_bins
0 1 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0 (-3008.75, 200583.333]
1 2 0 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0 (-3008.75, 200583.333]
2 3 0 0.658180 38 1 0.085113 3042.0 2 1 0 0 0.0 (-3008.75, 200583.333]
3 4 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0 (-3008.75, 200583.333]
4 5 0 0.907239 49 1 0.024926 63588.0 7 0 1 0 0.0 (-3008.75, 200583.333]
In [15]:
def cap_values(x, cap):
    if x > cap:
        return cap
    else:
        return x
    
dfC.MonthlyIncome = dfC.MonthlyIncome.apply(lambda x: cap_values(x, 15000))
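As an aside, pandas can do the same capping without a helper function; a one-line sketch using Series.clip:

# sketch: clip(upper=...) caps every value above 15000, same effect as cap_values
dfC.MonthlyIncome = dfC.MonthlyIncome.clip(upper=15000)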
In [16]:
dfC.MonthlyIncome.describe()
Out[16]:
count    120269.000000
mean       6135.401999
std        3693.385241
min           0.000000
25%        3400.000000
50%        5400.000000
75%        8249.000000
max       15000.000000
Name: MonthlyIncome, dtype: float64
In [17]:
dfC['income_bins'] = pd.cut(dfC.MonthlyIncome, bins=15, labels=False)
pd.value_counts(dfC.income_bins)
Out[17]:
4     15461
3     15314
5     13215
2     13081
6     10720
7      9024
1      7208
8      7131
14     6696
9      6001
0      4795
10     4562
11     3015
12     2460
13     1586
Name: income_bins, dtype: int64
In [18]:
dfC[["income_bins", "SeriousDlqin2yrs"]].groupby("income_bins").mean()
Out[18]:
SeriousDlqin2yrs
income_bins
0 0.051512
1 0.106410
2 0.096552
3 0.088089
4 0.076774
5 0.069467
6 0.061940
7 0.056405
8 0.050624
9 0.049992
10 0.039895
11 0.040133
12 0.041463
13 0.040984
14 0.048088
In [20]:
%matplotlib inline 
In [21]:
cols = ["income_bins", "SeriousDlqin2yrs"]
dfC[cols].groupby("income_bins").mean().plot()
Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x2009e8ad550>
In [26]:
mybins = [0,20,25,30,35,40,45,50,55,60,65,70,75,80,120]
dfC['age_bucket'] = pd.cut(dfC.age, bins=mybins)
pd.value_counts(dfC['age_bucket'])
Out[26]:
(45, 50]     15736
(50, 55]     14576
(40, 45]     13833
(55, 60]     13326
(60, 65]     12440
(35, 40]     11690
(30, 35]      9341
(65, 70]      8035
(25, 30]      6629
(70, 75]      5450
(75, 80]      3681
(80, 120]     3108
(20, 25]      2423
(0, 20]          0
Name: age_bucket, dtype: int64
In [28]:
dfC[["age_bucket", "SeriousDlqin2yrs"]].groupby("age_bucket").mean()
Out[28]:
SeriousDlqin2yrs
age_bucket
(0, 20] NaN
(20, 25] 0.100702
(25, 30] 0.114949
(30, 35] 0.102880
(35, 40] 0.091018
(40, 45] 0.084653
(45, 50] 0.081596
(50, 55] 0.071968
(55, 60] 0.055380
(60, 65] 0.042283
(65, 70] 0.029620
(70, 75] 0.029725
(75, 80] 0.022548
(80, 120] 0.024131
In [30]:
dfC[["age_bucket", "SeriousDlqin2yrs"]].groupby("age_bucket").mean().plot()
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x2009e8eccf8>
In [31]:
bins2 = []

for q in [0.2, 0.4, 0.6, 0.8, 1.0]:
    bins2.append(dfC.DebtRatio.quantile(q))
In [33]:
debt_ratio_binned = pd.cut(dfC.DebtRatio, bins=bins2)
print(pd.value_counts(debt_ratio_binned))
(0.543, 61106.5]    24054
(0.36, 0.543]       24054
(0.108, 0.237]      24054
(0.237, 0.36]       24053
Name: DebtRatio, dtype: int64
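Because bins2 starts at the 20th-percentile value, pd.cut leaves the lowest roughly 20% of rows unbinned (the four bins above hold about 96k of the 120k rows). A sketch using pd.qcut, which builds the quantile edges itself and includes the minimum:

# sketch: qcut covers all rows with five roughly equal-sized quantile bins
debt_ratio_qbinned = pd.qcut(dfC.DebtRatio, q=5)
print(pd.value_counts(debt_ratio_qbinned))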
In [34]:
from sklearn.preprocessing import StandardScaler

dfC['monthly_income_scaled'] = StandardScaler().fit_transform(dfC.MonthlyIncome)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:586: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py:649: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  warnings.warn(DEPRECATION_MSG_1D, DeprecationWarning)
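The deprecation warning says 1-D input will stop being accepted in scikit-learn 0.19; passing a one-column (2-D) frame avoids it. A warning-free sketch:

# sketch: give the scaler 2-D input, then flatten the (n, 1) result for the column
scaled = StandardScaler().fit_transform(dfC[['MonthlyIncome']])
dfC['monthly_income_scaled'] = scaled.ravel()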
In [35]:
print(dfC.monthly_income_scaled.describe())
print()
print("Mean at 0?", round(dfC.monthly_income_scaled.mean(), 10) == 0)

pl.hist(dfC.monthly_income_scaled)
count    1.202690e+05
mean    -1.062443e-16
std      1.000004e+00
min     -1.661193e+00
25%     -7.406250e-01
50%     -1.991141e-01
75%      5.722682e-01
max      2.400138e+00
Name: monthly_income_scaled, dtype: float64
Mean at 0? True
Out[35]:
(array([  7055.,  18029.,  22751.,  21239.,  15442.,  11433.,   8674.,
          4904.,   3275.,   7467.]),
 array([-1.66119349, -1.25506032, -0.84892716, -0.44279399, -0.03666083,
         0.36947234,  0.7756055 ,  1.18173867,  1.58787183,  1.99400499,
         2.40013816]),
 <a list of 10 Patch objects>)
In [39]:
# redo the process once again
features = np.array(['RevolvingUtilizationOfUnsecuredLines',
                     'age', 'NumberOfTime30-59DaysPastDueNotWorse',
                     'DebtRatio', 'MonthlyIncome','NumberOfOpenCreditLinesAndLoans', 
                     'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines',
                     'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents',
                     'income_bins', 'age_bucket', 'monthly_income_scaled'])

clf = RandomForestClassifier()
clf.fit(dfC[features], dfC['SeriousDlqin2yrs'])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-39-d9b4e87303fc> in <module>()
      8 
      9 clf = RandomForestClassifier()
---> 10 clf.fit(dfC[features], dfC['SeriousDlqin2yrs'])

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py in fit(self, X, y, sample_weight)
    245         """
    246         # Validate or convert input data
--> 247         X = check_array(X, accept_sparse="csc", dtype=DTYPE)
    248         y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
    249         if issparse(X):

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    380                                       force_all_finite)
    381     else:
--> 382         array = np.array(array, dtype=dtype, order=order, copy=copy)
    383 
    384         if ensure_2d:

ValueError: could not convert string to float: '(60, 65]'
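Dropping age_bucket is the simplest fix (done in the next cell); an alternative sketch, if the buckets should stay in the model, is to encode the intervals as integer codes:

# sketch: labels=False makes pd.cut return integer bin indices the forest can use
dfC['age_bucket_code'] = pd.cut(dfC.age, bins=mybins, labels=False)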
In [40]:
# redo the process once again
# age_bucket holds interval values that can't be converted to float -- drop it
features = np.array(['RevolvingUtilizationOfUnsecuredLines',
                     'age', 'NumberOfTime30-59DaysPastDueNotWorse',
                     'DebtRatio', 'MonthlyIncome','NumberOfOpenCreditLinesAndLoans', 
                     'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines',
                     'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents',
                     'income_bins',  'monthly_income_scaled'])

clf = RandomForestClassifier()
clf.fit(dfC[features], dfC['SeriousDlqin2yrs'])
Out[40]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
In [41]:
importances = clf.feature_importances_
sorted_idx = np.argsort(importances)

padding = np.arange(len(features)) + 0.5
pl.barh(padding, importances[sorted_idx], align='center')
pl.yticks(padding, features[sorted_idx])
pl.xlabel("Relative Importance")
pl.title("Variable Importance")
pl.show()
In [42]:
best_features = features[sorted_idx][::-1]
best_features
Out[42]:
array(['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio',
       'monthly_income_scaled', 'MonthlyIncome', 'age',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberOfTime30-59DaysPastDueNotWorse',
       'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents',
       'income_bins', 'NumberRealEstateLoansOrLines'], 
      dtype='<U36')
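With the ranking in hand, one could keep only the strongest predictors and refit; a sketch with an arbitrary top-5 cutoff:

# sketch: refit on the top five ranked features only (the cutoff is arbitrary)
top5 = best_features[:5]
clf_top = RandomForestClassifier()
clf_top.fit(dfC[top5], dfC['SeriousDlqin2yrs'])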