Sunday, October 15, 2017

DG 05 Imputing Data

dg13-05-ImputingData
In [1]:
import matplotlib
%matplotlib inline  
In [2]:
import pandas as pd
import numpy as np
import pylab as pl
In [3]:
df = pd.read_csv("cs-training.csv")
In [4]:
is_test = np.random.uniform(0, 1, len(df)) > 0.75
train = df[is_test==False]
test = df[is_test==True]
In [5]:
len(train), len(test)
Out[5]:
(112254, 37746)
In [7]:
df.head()
Out[7]:
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
0 1 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0
1 2 0 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0
2 3 0 0.658180 38 1 0.085113 3042.0 2 1 0 0 0.0
3 4 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0
4 5 0 0.907239 49 1 0.024926 63588.0 7 0 1 0 0.0
In [8]:
from sklearn.neighbors import KNeighborsRegressor

income_imputer = KNeighborsRegressor(n_neighbors=1)

#split our data into 2 groups; data containing nulls and data 
# not containing nulls we'll train on the latter and make
# 'predictions' on the null data to impute monthly_income
train_w_monthly_income = train[train.MonthlyIncome.isnull()==False]
train_w_null_monthly_income = train[train.MonthlyIncome.isnull()==True]
In [9]:
train_w_monthly_income.corr()
Out[9]:
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
Unnamed: 0 1.000000 0.002208 -0.001783 0.004961 -0.001572 -0.001740 0.003208 0.002954 -0.002260 -0.001140 -0.001816 -0.000564
SeriousDlqin2yrs 0.002208 1.000000 -0.002176 -0.102413 0.118400 -0.002992 -0.022379 -0.025339 0.107595 -0.004633 0.088683 0.049048
RevolvingUtilizationOfUnsecuredLines -0.001783 -0.002176 1.000000 -0.003730 -0.001396 0.000358 0.005093 -0.011287 -0.000790 0.002972 -0.001048 0.000683
age 0.004961 -0.102413 -0.003730 1.000000 -0.049533 -0.000707 0.046543 0.185591 -0.047804 0.064734 -0.043649 -0.205451
NumberOfTime30-59DaysPastDueNotWorse -0.001572 0.118400 -0.001396 -0.049533 1.000000 -0.001784 -0.012267 -0.042336 0.974372 -0.023618 0.979387 0.003590
DebtRatio -0.001740 -0.002992 0.000358 -0.000707 -0.001784 1.000000 -0.032842 0.008168 -0.002447 0.018133 -0.001662 0.009495
MonthlyIncome 0.003208 -0.022379 0.005093 0.046543 -0.012267 -0.032842 1.000000 0.111687 -0.015292 0.154098 -0.013404 0.072802
NumberOfOpenCreditLinesAndLoans 0.002954 -0.025339 -0.011287 0.185591 -0.042336 0.008168 0.111687 1.000000 -0.071497 0.427374 -0.060510 0.038853
NumberOfTimes90DaysLate -0.002260 0.107595 -0.000790 -0.047804 0.974372 -0.002447 -0.015292 -0.071497 1.000000 -0.040088 0.988663 -0.003841
NumberRealEstateLoansOrLines -0.001140 -0.004633 0.002972 0.064734 -0.023618 0.018133 0.154098 0.427374 -0.040088 1.000000 -0.033641 0.117116
NumberOfTime60-89DaysPastDueNotWorse -0.001816 0.088683 -0.001048 -0.043649 0.979387 -0.001662 -0.013404 -0.060510 0.988663 -0.033641 1.000000 -0.005185
NumberOfDependents -0.000564 0.049048 0.000683 -0.205451 0.003590 0.009495 0.072802 0.038853 -0.003841 0.117116 -0.005185 1.000000
In [10]:
train_w_monthly_income.corr().ix[:,5]
Out[10]:
Unnamed: 0                             -0.001740
SeriousDlqin2yrs                       -0.002992
RevolvingUtilizationOfUnsecuredLines    0.000358
age                                    -0.000707
NumberOfTime30-59DaysPastDueNotWorse   -0.001784
DebtRatio                               1.000000
MonthlyIncome                          -0.032842
NumberOfOpenCreditLinesAndLoans         0.008168
NumberOfTimes90DaysLate                -0.002447
NumberRealEstateLoansOrLines            0.018133
NumberOfTime60-89DaysPastDueNotWorse   -0.001662
NumberOfDependents                      0.009495
Name: DebtRatio, dtype: float64
In [13]:
cols = ['NumberRealEstateLoansOrLines', 'NumberOfOpenCreditLinesAndLoans']
income_imputer.fit(train_w_monthly_income[cols], train_w_monthly_income.MonthlyIncome)
Out[13]:
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=1, p=2,
          weights='uniform')
In [14]:
new_values = income_imputer.predict(train_w_null_monthly_income[cols])
In [15]:
train_w_null_monthly_income['monthly_income'] = new_values
new_values
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
Out[15]:
array([ 9933.,  5000.,  1400., ...,  1739.,  3500.,  8633.])
In [16]:
#combine the data back together
train = train_w_monthly_income.append(train_w_null_monthly_income)
len(train)
Out[16]:
112254
In [17]:
test['monthly_income_imputed'] = income_imputer.predict(test[cols])
test.head()
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
Out[17]:
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents monthly_income_imputed
8 9 0 0.116951 27 0 46.000000 NaN 2 0 0 0 NaN 1600.0
13 14 1 0.964673 40 3 0.382965 13700.0 9 3 1 1 2.0 3800.0
14 15 0 0.019657 76 0 477.000000 0.0 6 0 1 0 0.0 3750.0
20 21 0 0.200923 43 0 0.430046 12300.0 10 0 2 0 0.0 5000.0
24 25 0 0.046560 58 0 0.241622 2416.0 9 0 1 0 0.0 3800.0
In [19]:
test['monthly_income'] = np.where(test.MonthlyIncome.isnull(), test.monthly_income_imputed,
                                  test.MonthlyIncome)
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
In [20]:
print (pd.value_counts(train.monthly_income.isnull()))
print (pd.value_counts(test.monthly_income.isnull()))
True     89961
False    22293
Name: monthly_income, dtype: int64
False    37746
Name: monthly_income, dtype: int64
In [21]:
train.to_csv("out-credit-data-trainingset.csv", index=False)
test.to_csv("out-credit-data-testset.csv", index=False)
In [ ]:
 

No comments:

Post a Comment