In [1]:
import matplotlib
%matplotlib inline
In [2]:
import pandas as pd
import numpy as np
import pylab as pl
In [3]:
# Load the raw training data; the file is expected next to the notebook.
training_csv = "cs-training.csv"
df = pd.read_csv(training_csv)
In [4]:
# Hold out roughly 25% of the rows as a test set. A dedicated, seeded
# generator keeps the split identical on every Restart & Run All.
split_rng = np.random.RandomState(42)
is_test = split_rng.uniform(0, 1, len(df)) > 0.75
# .copy() gives each split its own frame, so adding columns to `train` or
# `test` later does not raise pandas' SettingWithCopyWarning.
train = df[~is_test].copy()
test = df[is_test].copy()
In [5]:
# Row counts of the (train, test) split.
(train.shape[0], test.shape[0])
Out[5]:
In [7]:
# Preview the first five rows of the raw data.
df.head(n=5)
Out[7]:
In [8]:
from sklearn.neighbors import KNeighborsRegressor
# A 1-nearest-neighbour regressor: each imputed income is copied from the
# single closest training row.
income_imputer = KNeighborsRegressor(n_neighbors=1)
# Split the training data into two groups: rows where MonthlyIncome is
# present and rows where it is null. The imputer is trained on the former
# and then used to 'predict' (impute) MonthlyIncome for the latter.
has_income = train.MonthlyIncome.notnull()
# .copy() makes both groups independent frames, so columns can be added to
# them later without pandas' SettingWithCopyWarning.
train_w_monthly_income = train[has_income].copy()
train_w_null_monthly_income = train[~has_income].copy()
In [9]:
# Pairwise Pearson correlations between the columns of the rows that have
# a reported income.
train_w_monthly_income.corr(method="pearson")
Out[9]:
In [10]:
# Show a single column of the correlation matrix, selected by position.
# `.ix` was removed in pandas 1.0; `.iloc` is the explicit positional
# indexer and selects the same column (position 5).
train_w_monthly_income.corr().iloc[:, 5]
Out[10]:
In [13]:
# Feature columns the imputer uses to find the nearest neighbour.
cols = [
    'NumberRealEstateLoansOrLines',
    'NumberOfOpenCreditLinesAndLoans',
]
# Train on the rows whose MonthlyIncome is known; the fitted estimator is
# the cell's displayed value.
income_imputer.fit(
    train_w_monthly_income[cols],
    train_w_monthly_income['MonthlyIncome'],
)
Out[13]:
In [14]:
# Impute an income for every training row whose MonthlyIncome is null.
new_values = income_imputer.predict(train_w_null_monthly_income.loc[:, cols])
In [15]:
# Attach the imputed incomes as a `monthly_income` column. `.assign` builds
# a new frame instead of writing into a filtered slice, which avoids
# pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
train_w_null_monthly_income = train_w_null_monthly_income.assign(
    monthly_income=new_values
)
new_values
Out[15]:
In [16]:
#combine the data back together
# Only the null-income rows carry a `monthly_income` column at this point.
# Stacking the two groups as-is would leave `monthly_income` as NaN for
# every row with a reported income, so copy the observed MonthlyIncome
# across for those rows first.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train = pd.concat([
    train_w_monthly_income.assign(
        monthly_income=train_w_monthly_income.MonthlyIncome
    ),
    train_w_null_monthly_income,
])
len(train)
Out[16]:
In [17]:
# Predict an income for every test row using the imputer fitted on the
# training data. `.assign` returns a new frame rather than writing into a
# slice of `df`, which avoids pandas' SettingWithCopyWarning.
test = test.assign(monthly_income_imputed=income_imputer.predict(test[cols]))
test.head()
Out[17]:
In [19]:
# Keep the observed MonthlyIncome where it exists and fall back to the
# imputed value only for rows where it is null.
test['monthly_income'] = test['MonthlyIncome'].fillna(test['monthly_income_imputed'])
In [20]:
# Count null vs. non-null entries of monthly_income in each split.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
print (train.monthly_income.isnull().value_counts())
print (test.monthly_income.isnull().value_counts())
In [21]:
# Write both splits to disk without the index column.
for frame, out_path in (
    (train, "out-credit-data-trainingset.csv"),
    (test, "out-credit-data-testset.csv"),
):
    frame.to_csv(out_path, index=False)
In [ ]:
No comments:
Post a Comment