http://nbviewer.jupyter.org/github/yhat/DataGotham2013/blob/master/notebooks/3%20-%20Importing%20Data.ipynb

import matplotlib
%matplotlib inline

import pandas as pd
import pylab as pl
import numpy as np
import re

df = pd.read_csv("cs-training.csv")

df.head()

type(df)

pandas.core.frame.DataFrame

print (df.SeriousDlqin2yrs.head())
type(df.SeriousDlqin2yrs)

0    1
1    0
2    0
3    0
4    0
Name: SeriousDlqin2yrs, dtype: int64

pandas.core.series.Series

df.dtypes

Unnamed: 0                                int64
SeriousDlqin2yrs                          int64
RevolvingUtilizationOfUnsecuredLines    float64
age                                       int64
NumberOfTime30-59DaysPastDueNotWorse      int64
DebtRatio                               float64
MonthlyIncome                           float64
NumberOfOpenCreditLinesAndLoans           int64
NumberOfTimes90DaysLate                   int64
NumberRealEstateLoansOrLines              int64
NumberOfTime60-89DaysPastDueNotWorse      int64
NumberOfDependents                      float64
dtype: object

df['DebtRatio']
df.DebtRatio

0            0.802982
1            0.121876
2            0.085113
3            0.036050
4            0.024926
5            0.375607
6         5710.000000
7            0.209940
8           46.000000
9            0.606291
10           0.309476
11           0.531529
12           0.298354
13           0.382965
14         477.000000
15           0.209892
16        2058.000000
17           0.188274
18           0.527888
19           0.065868
20           0.430046
21           0.475841
22           0.241104
23           0.085512
24           0.241622
25           1.595253
26           0.097672
27           0.042383
28           0.011761
29           0.436103
             ...     
149970       0.253855
149971       0.013997
149972       0.008638
149973       0.494819
149974       0.603479
149975    2716.000000
149976      60.000000
149977     349.000000
149978       0.259496
149979       0.057235
149980       0.254976
149981       0.121752
149982       0.250272
149983       0.000800
149984      25.000000
149985       0.324962
149986       0.080384
149987       0.055692
149988       0.347924
149989       0.001408
149990       0.609779
149991       0.477658
149992    4132.000000
149993       0.000000
149994       0.404293
149995       0.225131
149996       0.716562
149997    3870.000000
149998       0.000000
149999       0.249908
Name: DebtRatio, dtype: float64

df.RevolvingUtilizationOfUnsecuredLines.tail()

149995    0.040674
149996    0.299745
149997    0.246044
149998    0.000000
149999    0.850283
Name: RevolvingUtilizationOfUnsecuredLines, dtype: float64

df.describe()

df.age.describe()

count    150000.000000
mean         52.295207
std          14.771866
min           0.000000
25%          41.000000
50%          52.000000
75%          63.000000
max         109.000000
Name: age, dtype: float64

df.NumberOfDependents.unique()

array([  2.,   1.,   0.,  nan,   3.,   4.,   5.,   6.,   8.,   7.,  20.,
        10.,   9.,  13.])

df.NumberOfDependents.nunique()

13

pd.value_counts(df.NumberOfDependents)
df.NumberOfDependents.value_counts()

0.0     86902
1.0     26316
2.0     19522
3.0      9483
4.0      2862
5.0       746
6.0       158
7.0        51
8.0        24
9.0         5
10.0        5
13.0        1
20.0        1
Name: NumberOfDependents, dtype: int64

pd.value_counts(df.NumberOfDependents).plot(kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x1e960082748>

pd.crosstab(df.NumberOfTimes90DaysLate, df.SeriousDlqin2yrs)

def camel_to_snake(column_name):
    """
    converts a string that is camelCase into snake_case
    Example:
        print camel_to_snake("javaLovesCamelCase")
        > java_loves_camel_case
    See Also:
        http://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-camel-case
    """
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', column_name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

camel_to_snake("javaLovesCamelCase")

'java_loves_camel_case'

df.columns = [camel_to_snake(col) for col in df.columns]
df.columns.tolist()

['unnamed: 0',
 'serious_dlqin2yrs',
 'revolving_utilization_of_unsecured_lines',
 'age',
 'number_of_time30-59_days_past_due_not_worse',
 'debt_ratio',
 'monthly_income',
 'number_of_open_credit_lines_and_loans',
 'number_of_times90_days_late',
 'number_real_estate_loans_or_lines',
 'number_of_time60-89_days_past_due_not_worse',
 'number_of_dependents']

df['monthly_income'].head()
df.monthly_income.head()

0     9120.0
1     2600.0
2     3042.0
3     3300.0
4    63588.0
Name: monthly_income, dtype: float64

df[['monthly_income', 'serious_dlqin2yrs']].head()

columns_i_want = ['monthly_income', 'serious_dlqin2yrs']
df[columns_i_want].head()

df['one'] = 1
df.one.head()

0    1
1    1
2    1
3    1
4    1
Name: one, dtype: int64

del df['one']

df.monthly_income > 5000

0          True
1         False
2         False
3         False
4          True
5         False
6         False
7         False
8         False
9          True
10        False
11         True
12         True
13         True
14        False
15         True
16        False
17         True
18        False
19        False
20         True
21        False
22        False
23         True
24        False
25        False
26         True
27        False
28        False
29         True
          ...  
149970     True
149971    False
149972     True
149973     True
149974    False
149975    False
149976    False
149977    False
149978    False
149979     True
149980     True
149981     True
149982    False
149983    False
149984    False
149985    False
149986    False
149987    False
149988     True
149989     True
149990    False
149991     True
149992    False
149993    False
149994    False
149995    False
149996     True
149997    False
149998     True
149999     True
Name: monthly_income, dtype: bool

gt_5k = df[df.monthly_income > 5000]
print (len(gt_5k),"people with monthly_income > 5000")
df[df.monthly_income > 5000].head()

64410 people with monthly_income > 5000

df.ix[40:45, 0:2]

mask = (df.monthly_income > 5000) & (df.serious_dlqin2yrs==1)
df[mask].head()

mask = (df.age >= 35) & (df.serious_dlqin2yrs==0) & (df.number_of_open_credit_lines_and_loans < 10)
len(df[mask])==76151

True

mask = (df.monthly_income == df.monthly_income.quantile(0.90)) & (df.serious_dlqin2yrs==0)
len(df[mask])

147

?pd.melt

# By not specifying id_vars, we're going to melt EVERYTHING
df_lng = pd.melt(df)
# now our data is a series of (key, value) rows. 
#think of when you've done this in Excel so that you can
#create a pivot table 
df_lng.head()

null_variables = df_lng.value.isnull()
null_variables.sum()

33655

# crosstab creates a frequency table between 2 variables
# it's going to automatically enumerate the possibilities between
# the two Series and show you a count of occurrences 
#in each possible bucket
pd.crosstab(df_lng.variable, null_variables)

# let's abstract that code into a function so we can easily 
# recalculate it
def print_null_freq(df):
    """
    for a given DataFrame, calculates how many values for 
    each variable is null and prints the resulting table to stdout
    """
    df_lng = pd.melt(df)
    null_variables = df_lng.value.isnull()
    return pd.crosstab(df_lng.variable, null_variables)
print_null_freq(df)

melted = pd.melt(..., id_vars=[...], value_vars=[...])

print (len(melted)==300000)
print (melted.variable.unique()==np.array(['age', 'debt_ratio']))

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-46-9401d860345d> in <module>()
----> 1 melted = pd.melt(..., id_vars=[...], value_vars=[...])
      2 
      3 print (len(melted)==300000)
      4 print (melted.variable.unique()==np.array(['age', 'debt_ratio']))

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\reshape.py in melt(frame, id_vars, value_vars, var_name, value_name, col_level)
    766         if not isinstance(value_vars, (tuple, list, np.ndarray)):
    767             value_vars = [value_vars]
--> 768         frame = frame.ix[:, id_vars + value_vars]
    769     else:
    770         frame = frame.copy()

AttributeError: 'ellipsis' object has no attribute 'ix'

s = pd.Series([1, 2, None, 4])
s

0    1.0
1    2.0
2    NaN
3    4.0
dtype: float64

s.fillna(3)

0    1.0
1    2.0
2    3.0
3    4.0
dtype: float64

s.ffill()

0    1.0
1    2.0
2    2.0
3    4.0
dtype: float64

s.fillna(s.mean())

0    1.000000
1    2.000000
2    2.333333
3    4.000000
dtype: float64

df.number_of_dependents = df.number_of_dependents.fillna(0)
# proof that the number_of_dependents no longer contains nulls
print_null_freq(df)

df.monthly_income.describe()

count    1.202690e+05
mean     6.670221e+03
std      1.438467e+04
min      0.000000e+00
25%      3.400000e+03
50%      5.400000e+03
75%      8.249000e+03
max      3.008750e+06
Name: monthly_income, dtype: float64

Python @ Praxis

Sunday, October 15, 2017

DG 03 Data Import

No comments:

Post a Comment

	Unnamed: 0	SeriousDlqin2yrs	RevolvingUtilizationOfUnsecuredLines	age	NumberOfTime30-59DaysPastDueNotWorse	DebtRatio	MonthlyIncome	NumberOfOpenCreditLinesAndLoans	NumberOfTimes90DaysLate	NumberRealEstateLoansOrLines	NumberOfDependents
0	1	1	0.766127	45	2	0.802982	9120.0	13	0	6	2.0
1	2	0	0.957151	40	0	0.121876	2600.0	4	0	0	1.0
2	3	0	0.658180	38	1	0.085113	3042.0	2	1	0	0.0
3	4	0	0.233810	30	0	0.036050	3300.0	5	0	0	0.0
4	5	0	0.907239	49	1	0.024926	63588.0	7	0	1	0.0

	Unnamed: 0	SeriousDlqin2yrs	RevolvingUtilizationOfUnsecuredLines	age	NumberOfTime30-59DaysPastDueNotWorse	DebtRatio	MonthlyIncome	NumberOfOpenCreditLinesAndLoans	NumberOfTimes90DaysLate	NumberRealEstateLoansOrLines	NumberOfTime60-89DaysPastDueNotWorse	NumberOfDependents
count	150000.000000	150000.000000	150000.000000	150000.000000	150000.000000	150000.000000	1.202690e+05	150000.000000	150000.000000	150000.000000	150000.000000	146076.000000
mean	75000.500000	0.066840	6.048438	52.295207	0.421033	353.005076	6.670221e+03	8.452760	0.265973	1.018240	0.240387	0.757222
std	43301.414527	0.249746	249.755371	14.771866	4.192781	2037.818523	1.438467e+04	5.145951	4.169304	1.129771	4.155179	1.115086
min	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000e+00	0.000000	0.000000	0.000000	0.000000	0.000000
25%	37500.750000	0.000000	0.029867	41.000000	0.000000	0.175074	3.400000e+03	5.000000	0.000000	0.000000	0.000000	0.000000
50%	75000.500000	0.000000	0.154181	52.000000	0.000000	0.366508	5.400000e+03	8.000000	0.000000	1.000000	0.000000	0.000000
75%	112500.250000	0.000000	0.559046	63.000000	0.000000	0.868254	8.249000e+03	11.000000	0.000000	2.000000	0.000000	1.000000
max	150000.000000	1.000000	50708.000000	109.000000	98.000000	329664.000000	3.008750e+06	58.000000	98.000000	54.000000	98.000000	20.000000

SeriousDlqin2yrs	0	1
NumberOfTimes90DaysLate
0	135108	6554
1	3478	1765
2	779	776
3	282	385
4	96	195
5	48	83
6	32	48
7	7	31
8	6	15
9	5	14
10	3	5
11	2	3
12	1	1
13	2	2
14	1	1
15	2	0
17	0	1
96	1	4
98	121	143

	variable	value
0	unnamed: 0	1.0
1	unnamed: 0	2.0
2	unnamed: 0	3.0
3	unnamed: 0	4.0
4	unnamed: 0	5.0

value	False	True
variable
age	150000	0
debt_ratio	150000	0
monthly_income	120269	29731
number_of_dependents	146076	3924
number_of_open_credit_lines_and_loans	150000	0
number_of_time30-59_days_past_due_not_worse	150000	0
number_of_time60-89_days_past_due_not_worse	150000	0
number_of_times90_days_late	150000	0
number_real_estate_loans_or_lines	150000	0
revolving_utilization_of_unsecured_lines	150000	0
serious_dlqin2yrs	150000	0
unnamed: 0	150000	0