In [19]:
import matplotlib
%matplotlib inline
In [1]:
import pandas as pd
import pylab as pl
import numpy as np
import re
In [2]:
# Load the training data from cs-training.csv (path is relative to the
# notebook's working directory) into a DataFrame.
df = pd.read_csv("cs-training.csv")
In [4]:
# Preview the first rows of the frame.
df.head()
Out[4]:
In [5]:
# Show the Python type of the object read_csv returned.
type(df)
Out[5]:
In [7]:
# A column can be reached as an attribute of the frame. Print its first rows,
# then display the column object's type (the cell's last expression).
print (df.SeriousDlqin2yrs.head())
type(df.SeriousDlqin2yrs)
Out[7]:
In [8]:
# Data type of every column.
df.dtypes
Out[8]:
In [9]:
# Two equivalent ways to select one column: bracket access and attribute access.
# Only the last expression of a cell is displayed, so the first line's result
# is computed and discarded.
df['DebtRatio']
df.DebtRatio
Out[9]:
In [10]:
# Last rows of a single column.
df.RevolvingUtilizationOfUnsecuredLines.tail()
Out[10]:
In [11]:
# Summary statistics for the frame's columns.
df.describe()
Out[11]:
In [13]:
# The same summary statistics, restricted to the age column.
df.age.describe()
Out[13]:
In [15]:
# Distinct values found in the column (a missing value, if any, appears as NaN).
df.NumberOfDependents.unique()
Out[15]:
In [16]:
# Number of distinct values; nunique() leaves NaN out of the count by default.
df.NumberOfDependents.nunique()
Out[16]:
In [17]:
# How often each distinct value occurs, most frequent first.
# Use the Series method: the module-level pd.value_counts() helper is
# deprecated in recent pandas, and it returns the same result.
df.NumberOfDependents.value_counts()
Out[17]:
In [20]:
# Bar chart of the value frequencies. Uses the Series.value_counts() method
# because the module-level pd.value_counts() helper is deprecated.
df.NumberOfDependents.value_counts().plot(kind='bar')
Out[20]:
In [21]:
# Frequency table: rows are the values of NumberOfTimes90DaysLate, columns are
# the values of SeriousDlqin2yrs, cells are row counts for each combination.
pd.crosstab(df.NumberOfTimes90DaysLate, df.SeriousDlqin2yrs)
Out[21]:
In [22]:
def camel_to_snake(column_name):
    """
    Convert a camelCase (or PascalCase) string to snake_case.

    The conversion runs two regex passes and then lower-cases the result:

    1. Put "_" in front of every capitalised word (an upper-case letter
       followed by one or more lower-case letters) that has a character
       before it.
    2. Put "_" between a lower-case letter or digit and an upper-case
       letter that directly follows it (catches boundaries that pass 1
       skipped because its matches cannot overlap).

    Parameters:
        column_name (str): the name to convert.

    Returns:
        str: the lower-cased, underscore-separated name.

    Example:
        >>> camel_to_snake("javaLovesCamelCase")
        'java_loves_camel_case'

    See Also:
        http://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-camel-case
    """
    s1 = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', column_name)
    return re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
In [23]:
# Try the helper on a sample camelCase string.
camel_to_snake("javaLovesCamelCase")
Out[23]:
In [24]:
# Rename every column to snake_case in place, then list the new names.
df.columns = list(map(camel_to_snake, df.columns))
list(df.columns)
Out[24]:
In [25]:
# After the rename, the snake_case name works with both bracket and attribute
# access. Only the second line's result is displayed.
df['monthly_income'].head()
df.monthly_income.head()
Out[25]:
In [26]:
# Passing a list of names inside the brackets selects several columns at once.
df[['monthly_income', 'serious_dlqin2yrs']].head()
Out[26]:
In [27]:
# Same selection as a list literal, with the list of names held in a variable.
columns_i_want = ['monthly_income', 'serious_dlqin2yrs']
df[columns_i_want].head()
Out[27]:
In [28]:
# Assigning a scalar to a new column name creates the column, with the scalar
# repeated on every row.
df['one'] = 1
df.one.head()
Out[28]:
In [29]:
# Remove the demo column again so the frame is back to its original columns.
del df['one']
In [30]:
# Comparing a column with a scalar gives a boolean Series, one value per row.
df.monthly_income > 5000
Out[30]:
In [32]:
# Keep only the rows whose monthly_income exceeds 5000, report how many there
# are, and preview them (reusing the filtered frame instead of filtering twice).
gt_5k = df[df['monthly_income'] > 5000]
print (len(gt_5k),"people with monthly_income > 5000")
gt_5k.head()
Out[32]:
In [33]:
# Rows labelled 40 through 45 (label slices include the end point) and the
# first two columns. The old `.ix` indexer was removed from pandas, so use
# `.loc` with explicit column labels taken by position.
df.loc[40:45, df.columns[0:2]]
Out[33]:
In [34]:
# Combine two row conditions element-wise with `&`; each comparison needs its
# own parentheses because `&` binds tighter than `>` and `==`.
mask = (df['monthly_income'] > 5000) & (df['serious_dlqin2yrs'] == 1)
df.loc[mask].head()
Out[34]:
In [35]:
# Three conditions combined into one row mask, then a check that the number of
# matching rows is the expected 76151.
mask = (
    (df['age'] >= 35)
    & (df['serious_dlqin2yrs'] == 0)
    & (df['number_of_open_credit_lines_and_loans'] < 10)
)
int(mask.sum()) == 76151
Out[35]:
In [36]:
# Count rows whose monthly_income equals the column's 0.90 quantile and whose
# serious_dlqin2yrs is 0.
# NOTE(review): this is an exact equality test against a computed quantile; if
# the quantile is interpolated between two observations it may match no row.
# Confirm that exact equality is what is intended here.
mask = (df.monthly_income == df.monthly_income.quantile(0.90)) & (df.serious_dlqin2yrs==0)
len(df[mask])
Out[36]:
In [38]:
# IPython help lookup (not plain Python): shows the documentation for pd.melt.
?pd.melt
In [39]:
# With no id_vars given, melt unpivots EVERY column of the frame.
df_lng = pd.melt(df)
# The data is now a long series of (variable, value) rows, one per original
# cell. It is the same reshaping you would do in Excel before building a
# pivot table.
df_lng.head()
Out[39]:
In [40]:
# Boolean Series marking which melted values are missing; summing it counts
# the missing cells across the whole frame.
null_variables = df_lng['value'].isna()
null_variables.sum()
Out[40]:
In [41]:
# crosstab builds a frequency table between two variables: it enumerates every
# combination of values found in the two Series and counts the rows that fall
# into each bucket. Here: one row per original column name, one column per
# is-null flag.
pd.crosstab(df_lng.variable, null_variables)
Out[41]:
In [42]:
# Wrap the melt + crosstab steps in a function so the null tally can be
# recalculated easily after the data changes.
def print_null_freq(df):
    """
    Tabulate, for each column of a DataFrame, how many values are null.

    Despite its name, this function does not print: it returns the table so
    that a notebook cell can display it as its last expression.

    Parameters:
        df (pandas.DataFrame): the frame to inspect.

    Returns:
        pandas.DataFrame: a crosstab with one row per column name of `df`
        (index named "variable") and one column per is-null flag observed
        in the data (False for present, True for missing), holding counts.
    """
    # Unpivot to long form: one (variable, value) row per original cell.
    df_lng = pd.melt(df)
    null_variables = df_lng["value"].isnull()
    return pd.crosstab(df_lng["variable"], null_variables)
# Display the per-column null counts for the current frame.
print_null_freq(df)
Out[42]:
In [46]:
# Exercise: melt only the `age` and `debt_ratio` columns.
# The original cell left `...` placeholders, which raise when executed; this is
# one possible solution that satisfies the two checks below (the choice of
# id_vars does not affect either check).
melted = pd.melt(df, id_vars=['serious_dlqin2yrs'], value_vars=['age', 'debt_ratio'])
print (len(melted)==300000)
# array_equal gives a single True/False and tolerates a length mismatch,
# unlike an elementwise == comparison.
print (np.array_equal(melted.variable.unique(), np.array(['age', 'debt_ratio'])))
In [47]:
# A small Series with one missing entry (the None) to demonstrate filling.
s = pd.Series([1, 2, None, 4])
s
Out[47]:
In [48]:
# Replace the missing entry with a constant. Returns a new Series; `s` itself
# is not modified.
s.fillna(3)
Out[48]:
In [49]:
# Forward fill: the missing entry takes the last valid value before it.
s.ffill()
Out[49]:
In [50]:
# Replace the missing entry with the mean of the Series (mean() skips missing
# values by default).
s.fillna(s.mean())
Out[50]:
In [51]:
# Treat a missing dependents count as zero. Assign with bracket notation:
# attribute-style assignment only works when the column already exists and
# otherwise silently sets a plain Python attribute instead of a column.
df['number_of_dependents'] = df['number_of_dependents'].fillna(0)
# Re-run the null tally to confirm number_of_dependents no longer has nulls.
print_null_freq(df)
Out[51]:
In [52]:
# Summary statistics for monthly_income.
df.monthly_income.describe()
Out[52]:
In [ ]:
No comments:
Post a Comment