In [1]:
import pandas as pd
import numpy as np
import pylab as pl
In [2]:
df = pd.read_csv("cs-training.csv")
df.head()
Out[2]:
| Unnamed: 0 | SeriousDlqin2yrs | RevolvingUtilizationOfUnsecuredLines | age | NumberOfTime30-59DaysPastDueNotWorse | DebtRatio | MonthlyIncome | NumberOfOpenCreditLinesAndLoans | NumberOfTimes90DaysLate | NumberRealEstateLoansOrLines | NumberOfTime60-89DaysPastDueNotWorse | NumberOfDependents | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0.766127 | 45 | 2 | 0.802982 | 9120.0 | 13 | 0 | 6 | 0 | 2.0 |
| 1 | 2 | 0 | 0.957151 | 40 | 0 | 0.121876 | 2600.0 | 4 | 0 | 0 | 0 | 1.0 |
| 2 | 3 | 0 | 0.658180 | 38 | 1 | 0.085113 | 3042.0 | 2 | 1 | 0 | 0 | 0.0 |
| 3 | 4 | 0 | 0.233810 | 30 | 0 | 0.036050 | 3300.0 | 5 | 0 | 0 | 0 | 0.0 |
| 4 | 5 | 0 | 0.907239 | 49 | 1 | 0.024926 | 63588.0 | 7 | 0 | 1 | 0 | 0.0 |
In [4]:
df.MonthlyIncome.apply(np.log)
Out[4]:
0 9.118225
1 7.863267
2 8.020270
3 8.101678
4 11.060180
5 8.160518
6 NaN
7 8.160518
8 NaN
9 10.072555
10 7.824046
11 8.779711
12 9.429797
13 9.525151
14 -inf
15 9.338030
16 NaN
17 9.082507
18 8.095599
19 5.808142
20 9.417355
21 8.006368
22 7.824046
23 8.976641
24 7.789869
25 8.450198
26 9.027979
27 7.824046
28 8.131531
29 8.612503
...
149970 9.648595
149971 8.517193
149972 8.845777
149973 8.612503
149974 8.517193
149975 NaN
149976 NaN
149977 NaN
149978 7.824046
149979 9.071078
149980 8.617039
149981 8.831858
149982 7.922986
149983 8.517193
149984 NaN
149985 7.575585
149986 8.517193
149987 8.086103
149988 8.924656
149989 9.130539
149990 8.374477
149991 9.241451
149992 NaN
149993 6.709304
149994 8.131531
149995 7.649693
149996 8.627661
149997 NaN
149998 8.651025
149999 9.006754
Name: MonthlyIncome, dtype: float64
In [5]:
add_10 = lambda x: x + 10
plus = lambda x, y: x + y
print (add_10(9))
print (plus(10, 20))
19 30
In [6]:
df.MonthlyIncome.apply(lambda x: np.log(x + 1))
Out[6]:
0 9.118335
1 7.863651
2 8.020599
3 8.101981
4 11.060196
5 8.160804
6 NaN
7 8.160804
8 NaN
9 10.072597
10 7.824446
11 8.779865
12 9.429877
13 9.525224
14 0.000000
15 9.338118
16 NaN
17 9.082621
18 8.095904
19 5.811141
20 9.417436
21 8.006701
22 7.824446
23 8.976768
24 7.790282
25 8.450412
26 9.028099
27 7.824446
28 8.131825
29 8.612685
...
149970 9.648660
149971 8.517393
149972 8.845921
149973 8.612685
149974 8.517393
149975 NaN
149976 NaN
149977 NaN
149978 7.824446
149979 9.071193
149980 8.617220
149981 8.832004
149982 7.923348
149983 8.517393
149984 NaN
149985 7.576097
149986 8.517393
149987 8.086410
149988 8.924789
149989 9.130648
149990 8.374708
149991 9.241548
149992 NaN
149993 6.710523
149994 8.131825
149995 7.650169
149996 8.627840
149997 NaN
149998 8.651199
149999 9.006877
Name: MonthlyIncome, dtype: float64
In [7]:
def inverse(x):
return 1 / (x + 1)
df.MonthlyIncome.apply(inverse)
Out[7]:
0 0.000110
1 0.000384
2 0.000329
3 0.000303
4 0.000016
5 0.000286
6 NaN
7 0.000286
8 NaN
9 0.000042
10 0.000400
11 0.000154
12 0.000080
13 0.000073
14 1.000000
15 0.000088
16 NaN
17 0.000114
18 0.000305
19 0.002994
20 0.000081
21 0.000333
22 0.000400
23 0.000126
24 0.000414
25 0.000214
26 0.000120
27 0.000400
28 0.000294
29 0.000182
...
149970 0.000065
149971 0.000200
149972 0.000144
149973 0.000182
149974 0.000200
149975 NaN
149976 NaN
149977 NaN
149978 0.000400
149979 0.000115
149980 0.000181
149981 0.000146
149982 0.000362
149983 0.000200
149984 NaN
149985 0.000513
149986 0.000200
149987 0.000308
149988 0.000133
149989 0.000108
149990 0.000231
149991 0.000097
149992 NaN
149993 0.001218
149994 0.000294
149995 0.000476
149996 0.000179
149997 NaN
149998 0.000175
149999 0.000123
Name: MonthlyIncome, dtype: float64
In [8]:
def cap_value(x, cap):
"""
x - a value
cap - threshold value for x; if x > cap, then x is set to cap
Examples:
cap_value(1000, 10)
10
cap_value(10, 100)
10
"""
# your code here
return None
In [10]:
print (cap_value(1000, 10)==10)
print (cap_value(10, 100)==10)
print (df.DebtRatio.apply(lambda x: cap_value(x, 5.0)).mean())#should be close to 1.28
False False nan
In [11]:
subset = df[['SeriousDlqin2yrs', 'age', 'MonthlyIncome']]
subset.groupby("SeriousDlqin2yrs")
Out[11]:
<pandas.core.groupby.DataFrameGroupBy object at 0x000001B214C90B38>
In [12]:
subset.groupby("SeriousDlqin2yrs").mean()
Out[12]:
| age | MonthlyIncome | |
|---|---|---|
| SeriousDlqin2yrs | ||
| 0 | 52.751375 | 6747.837774 |
| 1 | 45.926591 | 5630.826493 |
In [14]:
for name, group in subset.groupby("SeriousDlqin2yrs"):
print ("splitting by: ", name)
print (group.mean())
print ("*"*80)
splitting by: 0 SeriousDlqin2yrs 0.000000 age 52.751375 MonthlyIncome 6747.837774 dtype: float64 ******************************************************************************** splitting by: 1 SeriousDlqin2yrs 1.000000 age 45.926591 MonthlyIncome 5630.826493 dtype: float64 ********************************************************************************
In [15]:
subset.groupby("SeriousDlqin2yrs").agg([np.min, np.mean, np.median, np.max])
Out[15]:
| age | MonthlyIncome | |||||||
|---|---|---|---|---|---|---|---|---|
| amin | mean | median | amax | amin | mean | median | amax | |
| SeriousDlqin2yrs | ||||||||
| 0 | 0 | 52.751375 | 52 | 109 | 0.0 | 6747.837774 | 5466.0 | 3008750.0 |
| 1 | 21 | 45.926591 | 45 | 101 | 0.0 | 5630.826493 | 4500.0 | 250000.0 |
In [17]:
def age_x_income(frame):
x = (frame.age * frame.MonthlyIncome)
return np.mean(x)
subset.groupby("SeriousDlqin2yrs").apply(age_x_income)
Out[17]:
SeriousDlqin2yrs 0 356068.428882 1 267892.225679 dtype: float64
In [18]:
pop = pd.read_csv("uspop.csv")
pop
Out[18]:
| age | est_pop | |
|---|---|---|
| 0 | 10 | 20055.346939 |
| 1 | 11 | 20073.020408 |
| 2 | 12 | 20090.693878 |
| 3 | 13 | 20108.367347 |
| 4 | 14 | 20139.081633 |
| 5 | 15 | 20912.081633 |
| 6 | 16 | 20925.122449 |
| 7 | 17 | 20938.163265 |
| 8 | 18 | 20951.204082 |
| 9 | 19 | 20961.326531 |
| 10 | 20 | 21519.163265 |
| 11 | 21 | 21516.244898 |
| 12 | 22 | 21513.326531 |
| 13 | 23 | 21510.408163 |
| 14 | 24 | 21483.408163 |
| 15 | 25 | 21333.836735 |
| 16 | 26 | 21309.755102 |
| 17 | 27 | 21285.673469 |
| 18 | 28 | 21261.591837 |
| 19 | 29 | 21237.510204 |
| 20 | 29 | 20182.673469 |
| 21 | 30 | 20163.346939 |
| 22 | 31 | 20144.020408 |
| 23 | 32 | 20124.693878 |
| 24 | 33 | 20105.367347 |
| 25 | 34 | 20113.224490 |
| 26 | 35 | 19309.367347 |
| 27 | 36 | 19336.551020 |
| 28 | 37 | 19363.734694 |
| 29 | 38 | 19390.918367 |
| ... | ... | ... |
| 48 | 57 | 19380.612245 |
| 49 | 58 | 19337.265306 |
| 50 | 59 | 19293.918367 |
| 51 | 59 | 17322.448980 |
| 52 | 60 | 17214.897959 |
| 53 | 61 | 17107.346939 |
| 54 | 62 | 16999.795918 |
| 55 | 63 | 16892.244898 |
| 56 | 64 | 16725.387755 |
| 57 | 65 | 12041.387755 |
| 58 | 66 | 11982.081633 |
| 59 | 67 | 11922.775510 |
| 60 | 68 | 11863.469388 |
| 61 | 69 | 11759.959184 |
| 62 | 70 | 9165.591837 |
| 63 | 71 | 9121.387755 |
| 64 | 72 | 9077.183673 |
| 65 | 73 | 9032.979592 |
| 66 | 74 | 8960.836735 |
| 67 | 75 | 7032.122449 |
| 68 | 76 | 7004.183673 |
| 69 | 77 | 6976.244898 |
| 70 | 78 | 6948.306122 |
| 71 | 79 | 6904.816327 |
| 72 | 80 | 5687.897959 |
| 73 | 81 | 5672.346939 |
| 74 | 82 | 5656.795918 |
| 75 | 83 | 5641.244898 |
| 76 | 84 | 5625.693878 |
| 77 | 85 | 4957.000000 |
78 rows × 2 columns
In [20]:
cols = ['age', 'MonthlyIncome', 'SeriousDlqin2yrs']
result = pd.merge(df[cols] , pop, how='left', on='age')
result
Out[20]:
| age | MonthlyIncome | SeriousDlqin2yrs | est_pop | |
|---|---|---|---|---|
| 0 | 45 | 9120.0 | 1 | 21988.020408 |
| 1 | 40 | 2600.0 | 0 | 20644.224490 |
| 2 | 38 | 3042.0 | 0 | 19390.918367 |
| 3 | 30 | 3300.0 | 0 | 20163.346939 |
| 4 | 49 | 63588.0 | 0 | 21936.857143 |
| 5 | 74 | 3500.0 | 0 | 8960.836735 |
| 6 | 57 | NaN | 0 | 19380.612245 |
| 7 | 39 | 3500.0 | 0 | 19446.714286 |
| 8 | 27 | NaN | 0 | 21285.673469 |
| 9 | 57 | 23684.0 | 0 | 19380.612245 |
| 10 | 30 | 2500.0 | 0 | 20163.346939 |
| 11 | 51 | 6501.0 | 0 | 21817.387755 |
| 12 | 46 | 12454.0 | 0 | 21987.530612 |
| 13 | 40 | 13700.0 | 1 | 20644.224490 |
| 14 | 76 | 0.0 | 0 | 7004.183673 |
| 15 | 64 | 11362.0 | 0 | 16725.387755 |
| 16 | 78 | NaN | 0 | 6948.306122 |
| 17 | 53 | 8800.0 | 0 | 21718.979592 |
| 18 | 43 | 3280.0 | 0 | 20730.061224 |
| 19 | 25 | 333.0 | 0 | 21333.836735 |
| 20 | 43 | 12300.0 | 0 | 20730.061224 |
| 21 | 38 | 3000.0 | 1 | 19390.918367 |
| 22 | 39 | 2500.0 | 0 | 19446.714286 |
| 23 | 32 | 7916.0 | 0 | 20124.693878 |
| 24 | 58 | 2416.0 | 0 | 19337.265306 |
| 25 | 50 | 4676.0 | 1 | 21866.591837 |
| 26 | 58 | 8333.0 | 0 | 19337.265306 |
| 27 | 69 | 2500.0 | 0 | 11759.959184 |
| 28 | 24 | 3400.0 | 0 | 21483.408163 |
| 29 | 58 | 5500.0 | 0 | 19337.265306 |
| ... | ... | ... | ... | ... |
| 154952 | 42 | 6945.0 | 0 | 20701.448980 |
| 154953 | 44 | 5500.0 | 0 | 20758.183673 |
| 154954 | 61 | 5000.0 | 0 | 17107.346939 |
| 154955 | 58 | NaN | 0 | 19337.265306 |
| 154956 | 76 | NaN | 0 | 7004.183673 |
| 154957 | 29 | NaN | 0 | 21237.510204 |
| 154958 | 29 | NaN | 0 | 20182.673469 |
| 154959 | 52 | 2500.0 | 0 | 21768.183673 |
| 154960 | 55 | 8700.0 | 1 | 19467.306122 |
| 154961 | 64 | 5525.0 | 0 | 16725.387755 |
| 154962 | 43 | 6849.0 | 0 | 20730.061224 |
| 154963 | 37 | 2760.0 | 0 | 19363.734694 |
| 154964 | 82 | 5000.0 | 0 | 5656.795918 |
| 154965 | 84 | NaN | 0 | 5625.693878 |
| 154966 | 26 | 1950.0 | 0 | 21309.755102 |
| 154967 | 49 | 5000.0 | 0 | 21936.857143 |
| 154968 | 28 | 3249.0 | 0 | 21261.591837 |
| 154969 | 31 | 7515.0 | 0 | 20144.020408 |
| 154970 | 62 | 9233.0 | 0 | 16999.795918 |
| 154971 | 46 | 4335.0 | 0 | 21987.530612 |
| 154972 | 59 | 10316.0 | 0 | 19293.918367 |
| 154973 | 59 | 10316.0 | 0 | 17322.448980 |
| 154974 | 50 | NaN | 0 | 21866.591837 |
| 154975 | 22 | 820.0 | 0 | 21513.326531 |
| 154976 | 50 | 3400.0 | 0 | 21866.591837 |
| 154977 | 74 | 2100.0 | 0 | 8960.836735 |
| 154978 | 44 | 5584.0 | 0 | 20758.183673 |
| 154979 | 58 | NaN | 0 | 19337.265306 |
| 154980 | 30 | 5716.0 | 0 | 20163.346939 |
| 154981 | 64 | 8158.0 | 0 | 16725.387755 |
154982 rows × 4 columns
In [21]:
len(result) > len(df)
Out[21]:
True
In [22]:
pd.value_counts(pop.age).head()
Out[22]:
29 2 59 2 36 1 30 1 31 1 Name: age, dtype: int64
In [23]:
pop = pop[pop.age.duplicated()==False]
In [25]:
cols = ['age', 'MonthlyIncome', 'SeriousDlqin2yrs']
joined = pd.merge(df[cols] , pop, how='left', on='age')
joined
Out[25]:
| age | MonthlyIncome | SeriousDlqin2yrs | est_pop | |
|---|---|---|---|---|
| 0 | 45 | 9120.0 | 1 | 21988.020408 |
| 1 | 40 | 2600.0 | 0 | 20644.224490 |
| 2 | 38 | 3042.0 | 0 | 19390.918367 |
| 3 | 30 | 3300.0 | 0 | 20163.346939 |
| 4 | 49 | 63588.0 | 0 | 21936.857143 |
| 5 | 74 | 3500.0 | 0 | 8960.836735 |
| 6 | 57 | NaN | 0 | 19380.612245 |
| 7 | 39 | 3500.0 | 0 | 19446.714286 |
| 8 | 27 | NaN | 0 | 21285.673469 |
| 9 | 57 | 23684.0 | 0 | 19380.612245 |
| 10 | 30 | 2500.0 | 0 | 20163.346939 |
| 11 | 51 | 6501.0 | 0 | 21817.387755 |
| 12 | 46 | 12454.0 | 0 | 21987.530612 |
| 13 | 40 | 13700.0 | 1 | 20644.224490 |
| 14 | 76 | 0.0 | 0 | 7004.183673 |
| 15 | 64 | 11362.0 | 0 | 16725.387755 |
| 16 | 78 | NaN | 0 | 6948.306122 |
| 17 | 53 | 8800.0 | 0 | 21718.979592 |
| 18 | 43 | 3280.0 | 0 | 20730.061224 |
| 19 | 25 | 333.0 | 0 | 21333.836735 |
| 20 | 43 | 12300.0 | 0 | 20730.061224 |
| 21 | 38 | 3000.0 | 1 | 19390.918367 |
| 22 | 39 | 2500.0 | 0 | 19446.714286 |
| 23 | 32 | 7916.0 | 0 | 20124.693878 |
| 24 | 58 | 2416.0 | 0 | 19337.265306 |
| 25 | 50 | 4676.0 | 1 | 21866.591837 |
| 26 | 58 | 8333.0 | 0 | 19337.265306 |
| 27 | 69 | 2500.0 | 0 | 11759.959184 |
| 28 | 24 | 3400.0 | 0 | 21483.408163 |
| 29 | 58 | 5500.0 | 0 | 19337.265306 |
| ... | ... | ... | ... | ... |
| 149970 | 58 | 15500.0 | 0 | 19337.265306 |
| 149971 | 83 | 5000.0 | 0 | 5641.244898 |
| 149972 | 42 | 6945.0 | 0 | 20701.448980 |
| 149973 | 44 | 5500.0 | 0 | 20758.183673 |
| 149974 | 61 | 5000.0 | 0 | 17107.346939 |
| 149975 | 58 | NaN | 0 | 19337.265306 |
| 149976 | 76 | NaN | 0 | 7004.183673 |
| 149977 | 29 | NaN | 0 | 21237.510204 |
| 149978 | 52 | 2500.0 | 0 | 21768.183673 |
| 149979 | 55 | 8700.0 | 1 | 19467.306122 |
| 149980 | 64 | 5525.0 | 0 | 16725.387755 |
| 149981 | 43 | 6849.0 | 0 | 20730.061224 |
| 149982 | 37 | 2760.0 | 0 | 19363.734694 |
| 149983 | 82 | 5000.0 | 0 | 5656.795918 |
| 149984 | 84 | NaN | 0 | 5625.693878 |
| 149985 | 26 | 1950.0 | 0 | 21309.755102 |
| 149986 | 49 | 5000.0 | 0 | 21936.857143 |
| 149987 | 28 | 3249.0 | 0 | 21261.591837 |
| 149988 | 31 | 7515.0 | 0 | 20144.020408 |
| 149989 | 62 | 9233.0 | 0 | 16999.795918 |
| 149990 | 46 | 4335.0 | 0 | 21987.530612 |
| 149991 | 59 | 10316.0 | 0 | 19293.918367 |
| 149992 | 50 | NaN | 0 | 21866.591837 |
| 149993 | 22 | 820.0 | 0 | 21513.326531 |
| 149994 | 50 | 3400.0 | 0 | 21866.591837 |
| 149995 | 74 | 2100.0 | 0 | 8960.836735 |
| 149996 | 44 | 5584.0 | 0 | 20758.183673 |
| 149997 | 58 | NaN | 0 | 19337.265306 |
| 149998 | 30 | 5716.0 | 0 | 20163.346939 |
| 149999 | 64 | 8158.0 | 0 | 16725.387755 |
150000 rows × 4 columns
In [26]:
joined.est_pop = joined.est_pop.fillna(4957.0)
joined.est_pop.describe()
Out[26]:
count 150000.000000 mean 17882.219144 std 4875.568984 min 4957.000000 25% 16892.244898 50% 19467.306122 75% 21510.408163 max 21988.020408 Name: est_pop, dtype: float64
In [27]:
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())
In [29]:
query = """
select
SeriousDlqin2yrs
, sum(1) as total
from
df
group by
SeriousDlqin2yrs;
"""
pysqldf(query)
Out[29]:
| SeriousDlqin2yrs | total | |
|---|---|---|
| 0 | 0 | 139974 |
| 1 | 1 | 10026 |
In [30]:
query = """
select
age
, avg(SeriousDlqin2yrs) as pct_delinquent
from
df
group by
age
order by
age;
"""
pysqldf(query)
Out[30]:
| age | pct_delinquent | |
|---|---|---|
| 0 | 0 | 0.000000 |
| 1 | 21 | 0.071038 |
| 2 | 22 | 0.082949 |
| 3 | 23 | 0.109204 |
| 4 | 24 | 0.120098 |
| 5 | 25 | 0.126967 |
| 6 | 26 | 0.123219 |
| 7 | 27 | 0.124066 |
| 8 | 28 | 0.131410 |
| 9 | 29 | 0.105170 |
| 10 | 30 | 0.107899 |
| 11 | 31 | 0.106477 |
| 12 | 32 | 0.113659 |
| 13 | 33 | 0.109870 |
| 14 | 34 | 0.097448 |
| 15 | 35 | 0.107302 |
| 16 | 36 | 0.099622 |
| 17 | 37 | 0.090044 |
| 18 | 38 | 0.089320 |
| 19 | 39 | 0.093740 |
| 20 | 40 | 0.085354 |
| 21 | 41 | 0.094170 |
| 22 | 42 | 0.093770 |
| 23 | 43 | 0.086035 |
| 24 | 44 | 0.073467 |
| 25 | 45 | 0.081097 |
| 26 | 46 | 0.087237 |
| 27 | 47 | 0.082280 |
| 28 | 48 | 0.075145 |
| 29 | 49 | 0.081574 |
| ... | ... | ... |
| 56 | 76 | 0.020287 |
| 57 | 77 | 0.016379 |
| 58 | 78 | 0.022770 |
| 59 | 79 | 0.022426 |
| 60 | 80 | 0.021689 |
| 61 | 81 | 0.011628 |
| 62 | 82 | 0.029366 |
| 63 | 83 | 0.019531 |
| 64 | 84 | 0.016667 |
| 65 | 85 | 0.018634 |
| 66 | 86 | 0.014742 |
| 67 | 87 | 0.022409 |
| 68 | 88 | 0.025559 |
| 69 | 89 | 0.032609 |
| 70 | 90 | 0.015152 |
| 71 | 91 | 0.032468 |
| 72 | 92 | 0.000000 |
| 73 | 93 | 0.011494 |
| 74 | 94 | 0.021277 |
| 75 | 95 | 0.022222 |
| 76 | 96 | 0.000000 |
| 77 | 97 | 0.000000 |
| 78 | 98 | 0.000000 |
| 79 | 99 | 0.222222 |
| 80 | 101 | 0.333333 |
| 81 | 102 | 0.000000 |
| 82 | 103 | 0.000000 |
| 83 | 105 | 0.000000 |
| 84 | 107 | 0.000000 |
| 85 | 109 | 0.000000 |
86 rows × 2 columns
In [ ]:
No comments:
Post a Comment