Sunday, October 15, 2017

DG 03 Data Import

dg13-03-Importing_Data
In [19]:
import matplotlib
%matplotlib inline  
In [1]:
import pandas as pd
import pylab as pl
import numpy as np
import re
In [2]:
df = pd.read_csv("cs-training.csv")
In [4]:
df.head()
Out[4]:
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
0 1 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0
1 2 0 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0
2 3 0 0.658180 38 1 0.085113 3042.0 2 1 0 0 0.0
3 4 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0
4 5 0 0.907239 49 1 0.024926 63588.0 7 0 1 0 0.0
In [5]:
type(df)
Out[5]:
pandas.core.frame.DataFrame
In [7]:
print (df.SeriousDlqin2yrs.head())
type(df.SeriousDlqin2yrs)
0    1
1    0
2    0
3    0
4    0
Name: SeriousDlqin2yrs, dtype: int64
Out[7]:
pandas.core.series.Series
In [8]:
df.dtypes
Out[8]:
Unnamed: 0                                int64
SeriousDlqin2yrs                          int64
RevolvingUtilizationOfUnsecuredLines    float64
age                                       int64
NumberOfTime30-59DaysPastDueNotWorse      int64
DebtRatio                               float64
MonthlyIncome                           float64
NumberOfOpenCreditLinesAndLoans           int64
NumberOfTimes90DaysLate                   int64
NumberRealEstateLoansOrLines              int64
NumberOfTime60-89DaysPastDueNotWorse      int64
NumberOfDependents                      float64
dtype: object
In [9]:
df['DebtRatio']
df.DebtRatio
Out[9]:
0            0.802982
1            0.121876
2            0.085113
3            0.036050
4            0.024926
5            0.375607
6         5710.000000
7            0.209940
8           46.000000
9            0.606291
10           0.309476
11           0.531529
12           0.298354
13           0.382965
14         477.000000
15           0.209892
16        2058.000000
17           0.188274
18           0.527888
19           0.065868
20           0.430046
21           0.475841
22           0.241104
23           0.085512
24           0.241622
25           1.595253
26           0.097672
27           0.042383
28           0.011761
29           0.436103
             ...     
149970       0.253855
149971       0.013997
149972       0.008638
149973       0.494819
149974       0.603479
149975    2716.000000
149976      60.000000
149977     349.000000
149978       0.259496
149979       0.057235
149980       0.254976
149981       0.121752
149982       0.250272
149983       0.000800
149984      25.000000
149985       0.324962
149986       0.080384
149987       0.055692
149988       0.347924
149989       0.001408
149990       0.609779
149991       0.477658
149992    4132.000000
149993       0.000000
149994       0.404293
149995       0.225131
149996       0.716562
149997    3870.000000
149998       0.000000
149999       0.249908
Name: DebtRatio, dtype: float64
In [10]:
df.RevolvingUtilizationOfUnsecuredLines.tail()
Out[10]:
149995    0.040674
149996    0.299745
149997    0.246044
149998    0.000000
149999    0.850283
Name: RevolvingUtilizationOfUnsecuredLines, dtype: float64
In [11]:
df.describe()
Out[11]:
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
count 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 1.202690e+05 150000.000000 150000.000000 150000.000000 150000.000000 146076.000000
mean 75000.500000 0.066840 6.048438 52.295207 0.421033 353.005076 6.670221e+03 8.452760 0.265973 1.018240 0.240387 0.757222
std 43301.414527 0.249746 249.755371 14.771866 4.192781 2037.818523 1.438467e+04 5.145951 4.169304 1.129771 4.155179 1.115086
min 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000e+00 0.000000 0.000000 0.000000 0.000000 0.000000
25% 37500.750000 0.000000 0.029867 41.000000 0.000000 0.175074 3.400000e+03 5.000000 0.000000 0.000000 0.000000 0.000000
50% 75000.500000 0.000000 0.154181 52.000000 0.000000 0.366508 5.400000e+03 8.000000 0.000000 1.000000 0.000000 0.000000
75% 112500.250000 0.000000 0.559046 63.000000 0.000000 0.868254 8.249000e+03 11.000000 0.000000 2.000000 0.000000 1.000000
max 150000.000000 1.000000 50708.000000 109.000000 98.000000 329664.000000 3.008750e+06 58.000000 98.000000 54.000000 98.000000 20.000000
In [13]:
df.age.describe()
Out[13]:
count    150000.000000
mean         52.295207
std          14.771866
min           0.000000
25%          41.000000
50%          52.000000
75%          63.000000
max         109.000000
Name: age, dtype: float64
In [15]:
df.NumberOfDependents.unique()
Out[15]:
array([  2.,   1.,   0.,  nan,   3.,   4.,   5.,   6.,   8.,   7.,  20.,
        10.,   9.,  13.])
In [16]:
df.NumberOfDependents.nunique()
Out[16]:
13
In [17]:
pd.value_counts(df.NumberOfDependents)
df.NumberOfDependents.value_counts()
Out[17]:
0.0     86902
1.0     26316
2.0     19522
3.0      9483
4.0      2862
5.0       746
6.0       158
7.0        51
8.0        24
9.0         5
10.0        5
13.0        1
20.0        1
Name: NumberOfDependents, dtype: int64
In [20]:
pd.value_counts(df.NumberOfDependents).plot(kind='bar')
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x1e960082748>
In [21]:
pd.crosstab(df.NumberOfTimes90DaysLate, df.SeriousDlqin2yrs)
Out[21]:
SeriousDlqin2yrs 0 1
NumberOfTimes90DaysLate
0 135108 6554
1 3478 1765
2 779 776
3 282 385
4 96 195
5 48 83
6 32 48
7 7 31
8 6 15
9 5 14
10 3 5
11 2 3
12 1 1
13 2 2
14 1 1
15 2 0
17 0 1
96 1 4
98 121 143
In [22]:
def camel_to_snake(column_name):
    """
    converts a string that is camelCase into snake_case
    Example:
        print camel_to_snake("javaLovesCamelCase")
        > java_loves_camel_case
    See Also:
        http://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-camel-case
    """
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', column_name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
In [23]:
camel_to_snake("javaLovesCamelCase")
Out[23]:
'java_loves_camel_case'
In [24]:
df.columns = [camel_to_snake(col) for col in df.columns]
df.columns.tolist()
Out[24]:
['unnamed: 0',
 'serious_dlqin2yrs',
 'revolving_utilization_of_unsecured_lines',
 'age',
 'number_of_time30-59_days_past_due_not_worse',
 'debt_ratio',
 'monthly_income',
 'number_of_open_credit_lines_and_loans',
 'number_of_times90_days_late',
 'number_real_estate_loans_or_lines',
 'number_of_time60-89_days_past_due_not_worse',
 'number_of_dependents']
In [25]:
df['monthly_income'].head()
df.monthly_income.head()
Out[25]:
0     9120.0
1     2600.0
2     3042.0
3     3300.0
4    63588.0
Name: monthly_income, dtype: float64
In [26]:
df[['monthly_income', 'serious_dlqin2yrs']].head()
Out[26]:
monthly_income serious_dlqin2yrs
0 9120.0 1
1 2600.0 0
2 3042.0 0
3 3300.0 0
4 63588.0 0
In [27]:
columns_i_want = ['monthly_income', 'serious_dlqin2yrs']
df[columns_i_want].head()
Out[27]:
monthly_income serious_dlqin2yrs
0 9120.0 1
1 2600.0 0
2 3042.0 0
3 3300.0 0
4 63588.0 0
In [28]:
df['one'] = 1
df.one.head()
Out[28]:
0    1
1    1
2    1
3    1
4    1
Name: one, dtype: int64
In [29]:
del df['one']
In [30]:
df.monthly_income > 5000
Out[30]:
0          True
1         False
2         False
3         False
4          True
5         False
6         False
7         False
8         False
9          True
10        False
11         True
12         True
13         True
14        False
15         True
16        False
17         True
18        False
19        False
20         True
21        False
22        False
23         True
24        False
25        False
26         True
27        False
28        False
29         True
          ...  
149970     True
149971    False
149972     True
149973     True
149974    False
149975    False
149976    False
149977    False
149978    False
149979     True
149980     True
149981     True
149982    False
149983    False
149984    False
149985    False
149986    False
149987    False
149988     True
149989     True
149990    False
149991     True
149992    False
149993    False
149994    False
149995    False
149996     True
149997    False
149998     True
149999     True
Name: monthly_income, dtype: bool
In [32]:
gt_5k = df[df.monthly_income > 5000]
print (len(gt_5k),"people with monthly_income > 5000")
df[df.monthly_income > 5000].head()
64410 people with monthly_income > 5000
Out[32]:
unnamed: 0 serious_dlqin2yrs revolving_utilization_of_unsecured_lines age number_of_time30-59_days_past_due_not_worse debt_ratio monthly_income number_of_open_credit_lines_and_loans number_of_times90_days_late number_real_estate_loans_or_lines number_of_time60-89_days_past_due_not_worse number_of_dependents
0 1 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0
4 5 0 0.907239 49 1 0.024926 63588.0 7 0 1 0 0.0
9 10 0 0.189169 57 0 0.606291 23684.0 9 0 4 0 2.0
11 12 0 0.018798 51 0 0.531529 6501.0 7 0 2 0 2.0
12 13 0 0.010352 46 0 0.298354 12454.0 13 0 2 0 2.0
In [33]:
df.ix[40:45, 0:2]
Out[33]:
unnamed: 0 serious_dlqin2yrs
40 41 0
41 42 0
42 43 0
43 44 0
44 45 0
45 46 0
In [34]:
mask = (df.monthly_income > 5000) & (df.serious_dlqin2yrs==1)
df[mask].head()
Out[34]:
unnamed: 0 serious_dlqin2yrs revolving_utilization_of_unsecured_lines age number_of_time30-59_days_past_due_not_worse debt_ratio monthly_income number_of_open_credit_lines_and_loans number_of_times90_days_late number_real_estate_loans_or_lines number_of_time60-89_days_past_due_not_worse number_of_dependents
0 1 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0
13 14 1 0.964673 40 3 0.382965 13700.0 9 3 1 1 2.0
74 75 1 0.133063 49 1 0.182882 10257.0 9 0 2 0 3.0
162 163 1 1.046279 47 1 1.104301 5416.0 6 0 2 0 1.0
184 185 1 0.085338 51 0 0.477504 6200.0 11 0 1 0 0.0
In [35]:
mask = (df.age >= 35) & (df.serious_dlqin2yrs==0) & (df.number_of_open_credit_lines_and_loans < 10)
len(df[mask])==76151
Out[35]:
True
In [36]:
mask = (df.monthly_income == df.monthly_income.quantile(0.90)) & (df.serious_dlqin2yrs==0)
len(df[mask])
Out[36]:
147
In [38]:
?pd.melt
In [39]:
# By not specifying id_vars, we're going to melt EVERYTHING
df_lng = pd.melt(df)
# now our data is a series of (key, value) rows. 
#think of when you've done this in Excel so that you can
#create a pivot table 
df_lng.head()
Out[39]:
variable value
0 unnamed: 0 1.0
1 unnamed: 0 2.0
2 unnamed: 0 3.0
3 unnamed: 0 4.0
4 unnamed: 0 5.0
In [40]:
null_variables = df_lng.value.isnull()
null_variables.sum()
Out[40]:
33655
In [41]:
# crosstab creates a frequency table between 2 variables
# it's going to automatically enumerate the possibilities between
# the two Series and show you a count of occurrences 
#in each possible bucket
pd.crosstab(df_lng.variable, null_variables)
Out[41]:
value False True
variable
age 150000 0
debt_ratio 150000 0
monthly_income 120269 29731
number_of_dependents 146076 3924
number_of_open_credit_lines_and_loans 150000 0
number_of_time30-59_days_past_due_not_worse 150000 0
number_of_time60-89_days_past_due_not_worse 150000 0
number_of_times90_days_late 150000 0
number_real_estate_loans_or_lines 150000 0
revolving_utilization_of_unsecured_lines 150000 0
serious_dlqin2yrs 150000 0
unnamed: 0 150000 0
In [42]:
# let's abstract that code into a function so we can easily 
# recalculate it
def print_null_freq(df):
    """
    for a given DataFrame, calculates how many values for 
    each variable is null and prints the resulting table to stdout
    """
    df_lng = pd.melt(df)
    null_variables = df_lng.value.isnull()
    return pd.crosstab(df_lng.variable, null_variables)
print_null_freq(df)
Out[42]:
value False True
variable
age 150000 0
debt_ratio 150000 0
monthly_income 120269 29731
number_of_dependents 146076 3924
number_of_open_credit_lines_and_loans 150000 0
number_of_time30-59_days_past_due_not_worse 150000 0
number_of_time60-89_days_past_due_not_worse 150000 0
number_of_times90_days_late 150000 0
number_real_estate_loans_or_lines 150000 0
revolving_utilization_of_unsecured_lines 150000 0
serious_dlqin2yrs 150000 0
unnamed: 0 150000 0
In [46]:
melted = pd.melt(..., id_vars=[...], value_vars=[...])

print (len(melted)==300000)
print (melted.variable.unique()==np.array(['age', 'debt_ratio']))
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-46-9401d860345d> in <module>()
----> 1 melted = pd.melt(..., id_vars=[...], value_vars=[...])
      2 
      3 print (len(melted)==300000)
      4 print (melted.variable.unique()==np.array(['age', 'debt_ratio']))

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\reshape.py in melt(frame, id_vars, value_vars, var_name, value_name, col_level)
    766         if not isinstance(value_vars, (tuple, list, np.ndarray)):
    767             value_vars = [value_vars]
--> 768         frame = frame.ix[:, id_vars + value_vars]
    769     else:
    770         frame = frame.copy()

AttributeError: 'ellipsis' object has no attribute 'ix'
In [47]:
s = pd.Series([1, 2, None, 4])
s
Out[47]:
0    1.0
1    2.0
2    NaN
3    4.0
dtype: float64
In [48]:
s.fillna(3)
Out[48]:
0    1.0
1    2.0
2    3.0
3    4.0
dtype: float64
In [49]:
s.ffill()
Out[49]:
0    1.0
1    2.0
2    2.0
3    4.0
dtype: float64
In [50]:
s.fillna(s.mean())
Out[50]:
0    1.000000
1    2.000000
2    2.333333
3    4.000000
dtype: float64
In [51]:
df.number_of_dependents = df.number_of_dependents.fillna(0)
# proof that the number_of_dependents no longer contains nulls
print_null_freq(df)
Out[51]:
value False True
variable
age 150000 0
debt_ratio 150000 0
monthly_income 120269 29731
number_of_dependents 150000 0
number_of_open_credit_lines_and_loans 150000 0
number_of_time30-59_days_past_due_not_worse 150000 0
number_of_time60-89_days_past_due_not_worse 150000 0
number_of_times90_days_late 150000 0
number_real_estate_loans_or_lines 150000 0
revolving_utilization_of_unsecured_lines 150000 0
serious_dlqin2yrs 150000 0
unnamed: 0 150000 0
In [52]:
df.monthly_income.describe()
Out[52]:
count    1.202690e+05
mean     6.670221e+03
std      1.438467e+04
min      0.000000e+00
25%      3.400000e+03
50%      5.400000e+03
75%      8.249000e+03
max      3.008750e+06
Name: monthly_income, dtype: float64
In [ ]:
 

No comments:

Post a Comment