Sunday, October 15, 2017

DG 06 Aggregation and Grouping

dg13-06-Aggregation-Grouping
In [1]:
import pandas as pd
import numpy as np
import pylab as pl
In [2]:
df = pd.read_csv("cs-training.csv")
df.head()
Out[2]:
Unnamed: 0 SeriousDlqin2yrs RevolvingUtilizationOfUnsecuredLines age NumberOfTime30-59DaysPastDueNotWorse DebtRatio MonthlyIncome NumberOfOpenCreditLinesAndLoans NumberOfTimes90DaysLate NumberRealEstateLoansOrLines NumberOfTime60-89DaysPastDueNotWorse NumberOfDependents
0 1 1 0.766127 45 2 0.802982 9120.0 13 0 6 0 2.0
1 2 0 0.957151 40 0 0.121876 2600.0 4 0 0 0 1.0
2 3 0 0.658180 38 1 0.085113 3042.0 2 1 0 0 0.0
3 4 0 0.233810 30 0 0.036050 3300.0 5 0 0 0 0.0
4 5 0 0.907239 49 1 0.024926 63588.0 7 0 1 0 0.0
In [4]:
df.MonthlyIncome.apply(np.log)
Out[4]:
0          9.118225
1          7.863267
2          8.020270
3          8.101678
4         11.060180
5          8.160518
6               NaN
7          8.160518
8               NaN
9         10.072555
10         7.824046
11         8.779711
12         9.429797
13         9.525151
14             -inf
15         9.338030
16              NaN
17         9.082507
18         8.095599
19         5.808142
20         9.417355
21         8.006368
22         7.824046
23         8.976641
24         7.789869
25         8.450198
26         9.027979
27         7.824046
28         8.131531
29         8.612503
            ...    
149970     9.648595
149971     8.517193
149972     8.845777
149973     8.612503
149974     8.517193
149975          NaN
149976          NaN
149977          NaN
149978     7.824046
149979     9.071078
149980     8.617039
149981     8.831858
149982     7.922986
149983     8.517193
149984          NaN
149985     7.575585
149986     8.517193
149987     8.086103
149988     8.924656
149989     9.130539
149990     8.374477
149991     9.241451
149992          NaN
149993     6.709304
149994     8.131531
149995     7.649693
149996     8.627661
149997          NaN
149998     8.651025
149999     9.006754
Name: MonthlyIncome, dtype: float64
In [5]:
add_10 = lambda x: x + 10
plus = lambda x, y: x + y

print (add_10(9))
print (plus(10, 20))
19
30
In [6]:
df.MonthlyIncome.apply(lambda x: np.log(x + 1))
Out[6]:
0          9.118335
1          7.863651
2          8.020599
3          8.101981
4         11.060196
5          8.160804
6               NaN
7          8.160804
8               NaN
9         10.072597
10         7.824446
11         8.779865
12         9.429877
13         9.525224
14         0.000000
15         9.338118
16              NaN
17         9.082621
18         8.095904
19         5.811141
20         9.417436
21         8.006701
22         7.824446
23         8.976768
24         7.790282
25         8.450412
26         9.028099
27         7.824446
28         8.131825
29         8.612685
            ...    
149970     9.648660
149971     8.517393
149972     8.845921
149973     8.612685
149974     8.517393
149975          NaN
149976          NaN
149977          NaN
149978     7.824446
149979     9.071193
149980     8.617220
149981     8.832004
149982     7.923348
149983     8.517393
149984          NaN
149985     7.576097
149986     8.517393
149987     8.086410
149988     8.924789
149989     9.130648
149990     8.374708
149991     9.241548
149992          NaN
149993     6.710523
149994     8.131825
149995     7.650169
149996     8.627840
149997          NaN
149998     8.651199
149999     9.006877
Name: MonthlyIncome, dtype: float64
In [7]:
def inverse(x):
    return 1 / (x + 1)

df.MonthlyIncome.apply(inverse)
Out[7]:
0         0.000110
1         0.000384
2         0.000329
3         0.000303
4         0.000016
5         0.000286
6              NaN
7         0.000286
8              NaN
9         0.000042
10        0.000400
11        0.000154
12        0.000080
13        0.000073
14        1.000000
15        0.000088
16             NaN
17        0.000114
18        0.000305
19        0.002994
20        0.000081
21        0.000333
22        0.000400
23        0.000126
24        0.000414
25        0.000214
26        0.000120
27        0.000400
28        0.000294
29        0.000182
            ...   
149970    0.000065
149971    0.000200
149972    0.000144
149973    0.000182
149974    0.000200
149975         NaN
149976         NaN
149977         NaN
149978    0.000400
149979    0.000115
149980    0.000181
149981    0.000146
149982    0.000362
149983    0.000200
149984         NaN
149985    0.000513
149986    0.000200
149987    0.000308
149988    0.000133
149989    0.000108
149990    0.000231
149991    0.000097
149992         NaN
149993    0.001218
149994    0.000294
149995    0.000476
149996    0.000179
149997         NaN
149998    0.000175
149999    0.000123
Name: MonthlyIncome, dtype: float64
In [8]:
def cap_value(x, cap):
    """
    x - a value
    cap - threshold value for x; if x > cap, then x is set to cap
    Examples:
        cap_value(1000, 10)
        10
        cap_value(10, 100)
        10
        """
    # your code here
    return None
In [10]:
print (cap_value(1000, 10)==10)
print (cap_value(10, 100)==10)
print (df.DebtRatio.apply(lambda x: cap_value(x, 5.0)).mean())#should be close to 1.28
False
False
nan
In [11]:
subset = df[['SeriousDlqin2yrs', 'age', 'MonthlyIncome']]
subset.groupby("SeriousDlqin2yrs")
Out[11]:
<pandas.core.groupby.DataFrameGroupBy object at 0x000001B214C90B38>
In [12]:
subset.groupby("SeriousDlqin2yrs").mean()
Out[12]:
age MonthlyIncome
SeriousDlqin2yrs
0 52.751375 6747.837774
1 45.926591 5630.826493
In [14]:
for name, group in subset.groupby("SeriousDlqin2yrs"):
    print ("splitting by: ", name)
    print (group.mean())
    print ("*"*80)
splitting by:  0
SeriousDlqin2yrs       0.000000
age                   52.751375
MonthlyIncome       6747.837774
dtype: float64
********************************************************************************
splitting by:  1
SeriousDlqin2yrs       1.000000
age                   45.926591
MonthlyIncome       5630.826493
dtype: float64
********************************************************************************
In [15]:
subset.groupby("SeriousDlqin2yrs").agg([np.min, np.mean, np.median, np.max])
Out[15]:
age MonthlyIncome
amin mean median amax amin mean median amax
SeriousDlqin2yrs
0 0 52.751375 52 109 0.0 6747.837774 5466.0 3008750.0
1 21 45.926591 45 101 0.0 5630.826493 4500.0 250000.0
In [17]:
def age_x_income(frame):
    x = (frame.age * frame.MonthlyIncome)
    return np.mean(x)

subset.groupby("SeriousDlqin2yrs").apply(age_x_income)
Out[17]:
SeriousDlqin2yrs
0    356068.428882
1    267892.225679
dtype: float64
In [18]:
pop = pd.read_csv("uspop.csv")
pop
Out[18]:
age est_pop
0 10 20055.346939
1 11 20073.020408
2 12 20090.693878
3 13 20108.367347
4 14 20139.081633
5 15 20912.081633
6 16 20925.122449
7 17 20938.163265
8 18 20951.204082
9 19 20961.326531
10 20 21519.163265
11 21 21516.244898
12 22 21513.326531
13 23 21510.408163
14 24 21483.408163
15 25 21333.836735
16 26 21309.755102
17 27 21285.673469
18 28 21261.591837
19 29 21237.510204
20 29 20182.673469
21 30 20163.346939
22 31 20144.020408
23 32 20124.693878
24 33 20105.367347
25 34 20113.224490
26 35 19309.367347
27 36 19336.551020
28 37 19363.734694
29 38 19390.918367
... ... ...
48 57 19380.612245
49 58 19337.265306
50 59 19293.918367
51 59 17322.448980
52 60 17214.897959
53 61 17107.346939
54 62 16999.795918
55 63 16892.244898
56 64 16725.387755
57 65 12041.387755
58 66 11982.081633
59 67 11922.775510
60 68 11863.469388
61 69 11759.959184
62 70 9165.591837
63 71 9121.387755
64 72 9077.183673
65 73 9032.979592
66 74 8960.836735
67 75 7032.122449
68 76 7004.183673
69 77 6976.244898
70 78 6948.306122
71 79 6904.816327
72 80 5687.897959
73 81 5672.346939
74 82 5656.795918
75 83 5641.244898
76 84 5625.693878
77 85 4957.000000

78 rows × 2 columns

In [20]:
cols = ['age', 'MonthlyIncome', 'SeriousDlqin2yrs']
result = pd.merge(df[cols] , pop, how='left', on='age')
result
Out[20]:
age MonthlyIncome SeriousDlqin2yrs est_pop
0 45 9120.0 1 21988.020408
1 40 2600.0 0 20644.224490
2 38 3042.0 0 19390.918367
3 30 3300.0 0 20163.346939
4 49 63588.0 0 21936.857143
5 74 3500.0 0 8960.836735
6 57 NaN 0 19380.612245
7 39 3500.0 0 19446.714286
8 27 NaN 0 21285.673469
9 57 23684.0 0 19380.612245
10 30 2500.0 0 20163.346939
11 51 6501.0 0 21817.387755
12 46 12454.0 0 21987.530612
13 40 13700.0 1 20644.224490
14 76 0.0 0 7004.183673
15 64 11362.0 0 16725.387755
16 78 NaN 0 6948.306122
17 53 8800.0 0 21718.979592
18 43 3280.0 0 20730.061224
19 25 333.0 0 21333.836735
20 43 12300.0 0 20730.061224
21 38 3000.0 1 19390.918367
22 39 2500.0 0 19446.714286
23 32 7916.0 0 20124.693878
24 58 2416.0 0 19337.265306
25 50 4676.0 1 21866.591837
26 58 8333.0 0 19337.265306
27 69 2500.0 0 11759.959184
28 24 3400.0 0 21483.408163
29 58 5500.0 0 19337.265306
... ... ... ... ...
154952 42 6945.0 0 20701.448980
154953 44 5500.0 0 20758.183673
154954 61 5000.0 0 17107.346939
154955 58 NaN 0 19337.265306
154956 76 NaN 0 7004.183673
154957 29 NaN 0 21237.510204
154958 29 NaN 0 20182.673469
154959 52 2500.0 0 21768.183673
154960 55 8700.0 1 19467.306122
154961 64 5525.0 0 16725.387755
154962 43 6849.0 0 20730.061224
154963 37 2760.0 0 19363.734694
154964 82 5000.0 0 5656.795918
154965 84 NaN 0 5625.693878
154966 26 1950.0 0 21309.755102
154967 49 5000.0 0 21936.857143
154968 28 3249.0 0 21261.591837
154969 31 7515.0 0 20144.020408
154970 62 9233.0 0 16999.795918
154971 46 4335.0 0 21987.530612
154972 59 10316.0 0 19293.918367
154973 59 10316.0 0 17322.448980
154974 50 NaN 0 21866.591837
154975 22 820.0 0 21513.326531
154976 50 3400.0 0 21866.591837
154977 74 2100.0 0 8960.836735
154978 44 5584.0 0 20758.183673
154979 58 NaN 0 19337.265306
154980 30 5716.0 0 20163.346939
154981 64 8158.0 0 16725.387755

154982 rows × 4 columns

In [21]:
len(result) > len(df)
Out[21]:
True
In [22]:
pd.value_counts(pop.age).head()
Out[22]:
29    2
59    2
36    1
30    1
31    1
Name: age, dtype: int64
In [23]:
pop = pop[pop.age.duplicated()==False]
In [25]:
cols = ['age', 'MonthlyIncome', 'SeriousDlqin2yrs']
joined = pd.merge(df[cols] , pop, how='left', on='age')
joined
Out[25]:
age MonthlyIncome SeriousDlqin2yrs est_pop
0 45 9120.0 1 21988.020408
1 40 2600.0 0 20644.224490
2 38 3042.0 0 19390.918367
3 30 3300.0 0 20163.346939
4 49 63588.0 0 21936.857143
5 74 3500.0 0 8960.836735
6 57 NaN 0 19380.612245
7 39 3500.0 0 19446.714286
8 27 NaN 0 21285.673469
9 57 23684.0 0 19380.612245
10 30 2500.0 0 20163.346939
11 51 6501.0 0 21817.387755
12 46 12454.0 0 21987.530612
13 40 13700.0 1 20644.224490
14 76 0.0 0 7004.183673
15 64 11362.0 0 16725.387755
16 78 NaN 0 6948.306122
17 53 8800.0 0 21718.979592
18 43 3280.0 0 20730.061224
19 25 333.0 0 21333.836735
20 43 12300.0 0 20730.061224
21 38 3000.0 1 19390.918367
22 39 2500.0 0 19446.714286
23 32 7916.0 0 20124.693878
24 58 2416.0 0 19337.265306
25 50 4676.0 1 21866.591837
26 58 8333.0 0 19337.265306
27 69 2500.0 0 11759.959184
28 24 3400.0 0 21483.408163
29 58 5500.0 0 19337.265306
... ... ... ... ...
149970 58 15500.0 0 19337.265306
149971 83 5000.0 0 5641.244898
149972 42 6945.0 0 20701.448980
149973 44 5500.0 0 20758.183673
149974 61 5000.0 0 17107.346939
149975 58 NaN 0 19337.265306
149976 76 NaN 0 7004.183673
149977 29 NaN 0 21237.510204
149978 52 2500.0 0 21768.183673
149979 55 8700.0 1 19467.306122
149980 64 5525.0 0 16725.387755
149981 43 6849.0 0 20730.061224
149982 37 2760.0 0 19363.734694
149983 82 5000.0 0 5656.795918
149984 84 NaN 0 5625.693878
149985 26 1950.0 0 21309.755102
149986 49 5000.0 0 21936.857143
149987 28 3249.0 0 21261.591837
149988 31 7515.0 0 20144.020408
149989 62 9233.0 0 16999.795918
149990 46 4335.0 0 21987.530612
149991 59 10316.0 0 19293.918367
149992 50 NaN 0 21866.591837
149993 22 820.0 0 21513.326531
149994 50 3400.0 0 21866.591837
149995 74 2100.0 0 8960.836735
149996 44 5584.0 0 20758.183673
149997 58 NaN 0 19337.265306
149998 30 5716.0 0 20163.346939
149999 64 8158.0 0 16725.387755

150000 rows × 4 columns

In [26]:
joined.est_pop = joined.est_pop.fillna(4957.0)
joined.est_pop.describe()
Out[26]:
count    150000.000000
mean      17882.219144
std        4875.568984
min        4957.000000
25%       16892.244898
50%       19467.306122
75%       21510.408163
max       21988.020408
Name: est_pop, dtype: float64
In [27]:
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())
In [29]:
query = """
select
    SeriousDlqin2yrs
    , sum(1) as total
from
    df
group by
    SeriousDlqin2yrs;
"""
pysqldf(query)
Out[29]:
SeriousDlqin2yrs total
0 0 139974
1 1 10026
In [30]:
query = """
select
    age
    , avg(SeriousDlqin2yrs) as pct_delinquent
from
    df
group by
    age
order by
    age;
"""
pysqldf(query)
Out[30]:
age pct_delinquent
0 0 0.000000
1 21 0.071038
2 22 0.082949
3 23 0.109204
4 24 0.120098
5 25 0.126967
6 26 0.123219
7 27 0.124066
8 28 0.131410
9 29 0.105170
10 30 0.107899
11 31 0.106477
12 32 0.113659
13 33 0.109870
14 34 0.097448
15 35 0.107302
16 36 0.099622
17 37 0.090044
18 38 0.089320
19 39 0.093740
20 40 0.085354
21 41 0.094170
22 42 0.093770
23 43 0.086035
24 44 0.073467
25 45 0.081097
26 46 0.087237
27 47 0.082280
28 48 0.075145
29 49 0.081574
... ... ...
56 76 0.020287
57 77 0.016379
58 78 0.022770
59 79 0.022426
60 80 0.021689
61 81 0.011628
62 82 0.029366
63 83 0.019531
64 84 0.016667
65 85 0.018634
66 86 0.014742
67 87 0.022409
68 88 0.025559
69 89 0.032609
70 90 0.015152
71 91 0.032468
72 92 0.000000
73 93 0.011494
74 94 0.021277
75 95 0.022222
76 96 0.000000
77 97 0.000000
78 98 0.000000
79 99 0.222222
80 101 0.333333
81 102 0.000000
82 103 0.000000
83 105 0.000000
84 107 0.000000
85 109 0.000000

86 rows × 2 columns

In [ ]:
 

No comments:

Post a Comment