http://nbviewer.jupyter.org/github/yhat/DataGotham2013/blob/master/notebooks/5%20-%20Imputing%20Data.ipynb

import matplotlib
%matplotlib inline

import pandas as pd
import numpy as np
import pylab as pl

df = pd.read_csv("cs-training.csv")

is_test = np.random.uniform(0, 1, len(df)) > 0.75
train = df[is_test==False]
test = df[is_test==True]

len(train), len(test)

(112254, 37746)

df.head()

from sklearn.neighbors import KNeighborsRegressor

income_imputer = KNeighborsRegressor(n_neighbors=1)

#split our data into 2 groups; data containing nulls and data 
# not containing nulls we'll train on the latter and make
# 'predictions' on the null data to impute monthly_income
train_w_monthly_income = train[train.MonthlyIncome.isnull()==False]
train_w_null_monthly_income = train[train.MonthlyIncome.isnull()==True]

train_w_monthly_income.corr()

train_w_monthly_income.corr().ix[:,5]

Unnamed: 0                             -0.001740
SeriousDlqin2yrs                       -0.002992
RevolvingUtilizationOfUnsecuredLines    0.000358
age                                    -0.000707
NumberOfTime30-59DaysPastDueNotWorse   -0.001784
DebtRatio                               1.000000
MonthlyIncome                          -0.032842
NumberOfOpenCreditLinesAndLoans         0.008168
NumberOfTimes90DaysLate                -0.002447
NumberRealEstateLoansOrLines            0.018133
NumberOfTime60-89DaysPastDueNotWorse   -0.001662
NumberOfDependents                      0.009495
Name: DebtRatio, dtype: float64

cols = ['NumberRealEstateLoansOrLines', 'NumberOfOpenCreditLinesAndLoans']
income_imputer.fit(train_w_monthly_income[cols], train_w_monthly_income.MonthlyIncome)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=1, p=2,
          weights='uniform')

new_values = income_imputer.predict(train_w_null_monthly_income[cols])

train_w_null_monthly_income['monthly_income'] = new_values
new_values

C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

array([ 9933.,  5000.,  1400., ...,  1739.,  3500.,  8633.])

#combine the data back together
train = train_w_monthly_income.append(train_w_null_monthly_income)
len(train)

112254

test['monthly_income_imputed'] = income_imputer.predict(test[cols])
test.head()

C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

test['monthly_income'] = np.where(test.MonthlyIncome.isnull(), test.monthly_income_imputed,
                                  test.MonthlyIncome)

C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app

print (pd.value_counts(train.monthly_income.isnull()))
print (pd.value_counts(test.monthly_income.isnull()))

True     89961
False    22293
Name: monthly_income, dtype: int64
False    37746
Name: monthly_income, dtype: int64

train.to_csv("out-credit-data-trainingset.csv", index=False)
test.to_csv("out-credit-data-testset.csv", index=False)

	Unnamed: 0	SeriousDlqin2yrs	RevolvingUtilizationOfUnsecuredLines	age	NumberOfTime30-59DaysPastDueNotWorse	DebtRatio	MonthlyIncome	NumberOfOpenCreditLinesAndLoans	NumberOfTimes90DaysLate	NumberRealEstateLoansOrLines	NumberOfTime60-89DaysPastDueNotWorse	NumberOfDependents
Unnamed: 0	1.000000	0.002208	-0.001783	0.004961	-0.001572	-0.001740	0.003208	0.002954	-0.002260	-0.001140	-0.001816	-0.000564
SeriousDlqin2yrs	0.002208	1.000000	-0.002176	-0.102413	0.118400	-0.002992	-0.022379	-0.025339	0.107595	-0.004633	0.088683	0.049048
RevolvingUtilizationOfUnsecuredLines	-0.001783	-0.002176	1.000000	-0.003730	-0.001396	0.000358	0.005093	-0.011287	-0.000790	0.002972	-0.001048	0.000683
age	0.004961	-0.102413	-0.003730	1.000000	-0.049533	-0.000707	0.046543	0.185591	-0.047804	0.064734	-0.043649	-0.205451
NumberOfTime30-59DaysPastDueNotWorse	-0.001572	0.118400	-0.001396	-0.049533	1.000000	-0.001784	-0.012267	-0.042336	0.974372	-0.023618	0.979387	0.003590
DebtRatio	-0.001740	-0.002992	0.000358	-0.000707	-0.001784	1.000000	-0.032842	0.008168	-0.002447	0.018133	-0.001662	0.009495
MonthlyIncome	0.003208	-0.022379	0.005093	0.046543	-0.012267	-0.032842	1.000000	0.111687	-0.015292	0.154098	-0.013404	0.072802
NumberOfOpenCreditLinesAndLoans	0.002954	-0.025339	-0.011287	0.185591	-0.042336	0.008168	0.111687	1.000000	-0.071497	0.427374	-0.060510	0.038853
NumberOfTimes90DaysLate	-0.002260	0.107595	-0.000790	-0.047804	0.974372	-0.002447	-0.015292	-0.071497	1.000000	-0.040088	0.988663	-0.003841
NumberRealEstateLoansOrLines	-0.001140	-0.004633	0.002972	0.064734	-0.023618	0.018133	0.154098	0.427374	-0.040088	1.000000	-0.033641	0.117116
NumberOfTime60-89DaysPastDueNotWorse	-0.001816	0.088683	-0.001048	-0.043649	0.979387	-0.001662	-0.013404	-0.060510	0.988663	-0.033641	1.000000	-0.005185
NumberOfDependents	-0.000564	0.049048	0.000683	-0.205451	0.003590	0.009495	0.072802	0.038853	-0.003841	0.117116	-0.005185	1.000000

	Unnamed: 0	SeriousDlqin2yrs	RevolvingUtilizationOfUnsecuredLines	age	NumberOfTime30-59DaysPastDueNotWorse	DebtRatio	MonthlyIncome	NumberOfOpenCreditLinesAndLoans	NumberOfTimes90DaysLate	NumberRealEstateLoansOrLines	NumberOfTime60-89DaysPastDueNotWorse	NumberOfDependents	monthly_income_imputed
8	9	0	0.116951	27	0	46.000000	NaN	2	0	0	0	NaN	1600.0
13	14	1	0.964673	40	3	0.382965	13700.0	9	3	1	1	2.0	3800.0
14	15	0	0.019657	76	0	477.000000	0.0	6	0	1	0	0.0	3750.0
20	21	0	0.200923	43	0	0.430046	12300.0	10	0	2	0	0.0	5000.0
24	25	0	0.046560	58	0	0.241622	2416.0	9	0	1	0	0.0	3800.0

Python @ Praxis

Sunday, October 15, 2017

DG 05 Imputing Data

No comments:

Post a Comment

	Unnamed: 0	SeriousDlqin2yrs	RevolvingUtilizationOfUnsecuredLines	age	NumberOfTime30-59DaysPastDueNotWorse	DebtRatio	MonthlyIncome	NumberOfOpenCreditLinesAndLoans	NumberOfTimes90DaysLate	NumberRealEstateLoansOrLines	NumberOfDependents
0	1	1	0.766127	45	2	0.802982	9120.0	13	0	6	2.0
1	2	0	0.957151	40	0	0.121876	2600.0	4	0	0	1.0
2	3	0	0.658180	38	1	0.085113	3042.0	2	1	0	0.0
3	4	0	0.233810	30	0	0.036050	3300.0	5	0	0	0.0
4	5	0	0.907239	49	1	0.024926	63588.0	7	0	1	0.0