In [1]:
import pandas as pd
import numpy as np
import pylab as pl
In [2]:
df = pd.read_csv("cs-training.csv")
df.head()
Out[2]:
Unnamed: 0 | SeriousDlqin2yrs | RevolvingUtilizationOfUnsecuredLines | age | NumberOfTime30-59DaysPastDueNotWorse | DebtRatio | MonthlyIncome | NumberOfOpenCreditLinesAndLoans | NumberOfTimes90DaysLate | NumberRealEstateLoansOrLines | NumberOfTime60-89DaysPastDueNotWorse | NumberOfDependents | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 0.766127 | 45 | 2 | 0.802982 | 9120.0 | 13 | 0 | 6 | 0 | 2.0 |
1 | 2 | 0 | 0.957151 | 40 | 0 | 0.121876 | 2600.0 | 4 | 0 | 0 | 0 | 1.0 |
2 | 3 | 0 | 0.658180 | 38 | 1 | 0.085113 | 3042.0 | 2 | 1 | 0 | 0 | 0.0 |
3 | 4 | 0 | 0.233810 | 30 | 0 | 0.036050 | 3300.0 | 5 | 0 | 0 | 0 | 0.0 |
4 | 5 | 0 | 0.907239 | 49 | 1 | 0.024926 | 63588.0 | 7 | 0 | 1 | 0 | 0.0 |
In [4]:
df.MonthlyIncome.apply(np.log)
Out[4]:
0 9.118225 1 7.863267 2 8.020270 3 8.101678 4 11.060180 5 8.160518 6 NaN 7 8.160518 8 NaN 9 10.072555 10 7.824046 11 8.779711 12 9.429797 13 9.525151 14 -inf 15 9.338030 16 NaN 17 9.082507 18 8.095599 19 5.808142 20 9.417355 21 8.006368 22 7.824046 23 8.976641 24 7.789869 25 8.450198 26 9.027979 27 7.824046 28 8.131531 29 8.612503 ... 149970 9.648595 149971 8.517193 149972 8.845777 149973 8.612503 149974 8.517193 149975 NaN 149976 NaN 149977 NaN 149978 7.824046 149979 9.071078 149980 8.617039 149981 8.831858 149982 7.922986 149983 8.517193 149984 NaN 149985 7.575585 149986 8.517193 149987 8.086103 149988 8.924656 149989 9.130539 149990 8.374477 149991 9.241451 149992 NaN 149993 6.709304 149994 8.131531 149995 7.649693 149996 8.627661 149997 NaN 149998 8.651025 149999 9.006754 Name: MonthlyIncome, dtype: float64
In [5]:
add_10 = lambda x: x + 10
plus = lambda x, y: x + y
print (add_10(9))
print (plus(10, 20))
19 30
In [6]:
df.MonthlyIncome.apply(lambda x: np.log(x + 1))
Out[6]:
0 9.118335 1 7.863651 2 8.020599 3 8.101981 4 11.060196 5 8.160804 6 NaN 7 8.160804 8 NaN 9 10.072597 10 7.824446 11 8.779865 12 9.429877 13 9.525224 14 0.000000 15 9.338118 16 NaN 17 9.082621 18 8.095904 19 5.811141 20 9.417436 21 8.006701 22 7.824446 23 8.976768 24 7.790282 25 8.450412 26 9.028099 27 7.824446 28 8.131825 29 8.612685 ... 149970 9.648660 149971 8.517393 149972 8.845921 149973 8.612685 149974 8.517393 149975 NaN 149976 NaN 149977 NaN 149978 7.824446 149979 9.071193 149980 8.617220 149981 8.832004 149982 7.923348 149983 8.517393 149984 NaN 149985 7.576097 149986 8.517393 149987 8.086410 149988 8.924789 149989 9.130648 149990 8.374708 149991 9.241548 149992 NaN 149993 6.710523 149994 8.131825 149995 7.650169 149996 8.627840 149997 NaN 149998 8.651199 149999 9.006877 Name: MonthlyIncome, dtype: float64
In [7]:
def inverse(x):
return 1 / (x + 1)
df.MonthlyIncome.apply(inverse)
Out[7]:
0 0.000110 1 0.000384 2 0.000329 3 0.000303 4 0.000016 5 0.000286 6 NaN 7 0.000286 8 NaN 9 0.000042 10 0.000400 11 0.000154 12 0.000080 13 0.000073 14 1.000000 15 0.000088 16 NaN 17 0.000114 18 0.000305 19 0.002994 20 0.000081 21 0.000333 22 0.000400 23 0.000126 24 0.000414 25 0.000214 26 0.000120 27 0.000400 28 0.000294 29 0.000182 ... 149970 0.000065 149971 0.000200 149972 0.000144 149973 0.000182 149974 0.000200 149975 NaN 149976 NaN 149977 NaN 149978 0.000400 149979 0.000115 149980 0.000181 149981 0.000146 149982 0.000362 149983 0.000200 149984 NaN 149985 0.000513 149986 0.000200 149987 0.000308 149988 0.000133 149989 0.000108 149990 0.000231 149991 0.000097 149992 NaN 149993 0.001218 149994 0.000294 149995 0.000476 149996 0.000179 149997 NaN 149998 0.000175 149999 0.000123 Name: MonthlyIncome, dtype: float64
In [8]:
def cap_value(x, cap):
"""
x - a value
cap - threshold value for x; if x > cap, then x is set to cap
Examples:
cap_value(1000, 10)
10
cap_value(10, 100)
10
"""
# your code here
return None
In [10]:
print (cap_value(1000, 10)==10)
print (cap_value(10, 100)==10)
print (df.DebtRatio.apply(lambda x: cap_value(x, 5.0)).mean())#should be close to 1.28
False False nan
In [11]:
subset = df[['SeriousDlqin2yrs', 'age', 'MonthlyIncome']]
subset.groupby("SeriousDlqin2yrs")
Out[11]:
<pandas.core.groupby.DataFrameGroupBy object at 0x000001B214C90B38>
In [12]:
subset.groupby("SeriousDlqin2yrs").mean()
Out[12]:
age | MonthlyIncome | |
---|---|---|
SeriousDlqin2yrs | ||
0 | 52.751375 | 6747.837774 |
1 | 45.926591 | 5630.826493 |
In [14]:
for name, group in subset.groupby("SeriousDlqin2yrs"):
print ("splitting by: ", name)
print (group.mean())
print ("*"*80)
splitting by: 0 SeriousDlqin2yrs 0.000000 age 52.751375 MonthlyIncome 6747.837774 dtype: float64 ******************************************************************************** splitting by: 1 SeriousDlqin2yrs 1.000000 age 45.926591 MonthlyIncome 5630.826493 dtype: float64 ********************************************************************************
In [15]:
subset.groupby("SeriousDlqin2yrs").agg([np.min, np.mean, np.median, np.max])
Out[15]:
age | MonthlyIncome | |||||||
---|---|---|---|---|---|---|---|---|
amin | mean | median | amax | amin | mean | median | amax | |
SeriousDlqin2yrs | ||||||||
0 | 0 | 52.751375 | 52 | 109 | 0.0 | 6747.837774 | 5466.0 | 3008750.0 |
1 | 21 | 45.926591 | 45 | 101 | 0.0 | 5630.826493 | 4500.0 | 250000.0 |
In [17]:
def age_x_income(frame):
x = (frame.age * frame.MonthlyIncome)
return np.mean(x)
subset.groupby("SeriousDlqin2yrs").apply(age_x_income)
Out[17]:
SeriousDlqin2yrs 0 356068.428882 1 267892.225679 dtype: float64
In [18]:
pop = pd.read_csv("uspop.csv")
pop
Out[18]:
age | est_pop | |
---|---|---|
0 | 10 | 20055.346939 |
1 | 11 | 20073.020408 |
2 | 12 | 20090.693878 |
3 | 13 | 20108.367347 |
4 | 14 | 20139.081633 |
5 | 15 | 20912.081633 |
6 | 16 | 20925.122449 |
7 | 17 | 20938.163265 |
8 | 18 | 20951.204082 |
9 | 19 | 20961.326531 |
10 | 20 | 21519.163265 |
11 | 21 | 21516.244898 |
12 | 22 | 21513.326531 |
13 | 23 | 21510.408163 |
14 | 24 | 21483.408163 |
15 | 25 | 21333.836735 |
16 | 26 | 21309.755102 |
17 | 27 | 21285.673469 |
18 | 28 | 21261.591837 |
19 | 29 | 21237.510204 |
20 | 29 | 20182.673469 |
21 | 30 | 20163.346939 |
22 | 31 | 20144.020408 |
23 | 32 | 20124.693878 |
24 | 33 | 20105.367347 |
25 | 34 | 20113.224490 |
26 | 35 | 19309.367347 |
27 | 36 | 19336.551020 |
28 | 37 | 19363.734694 |
29 | 38 | 19390.918367 |
... | ... | ... |
48 | 57 | 19380.612245 |
49 | 58 | 19337.265306 |
50 | 59 | 19293.918367 |
51 | 59 | 17322.448980 |
52 | 60 | 17214.897959 |
53 | 61 | 17107.346939 |
54 | 62 | 16999.795918 |
55 | 63 | 16892.244898 |
56 | 64 | 16725.387755 |
57 | 65 | 12041.387755 |
58 | 66 | 11982.081633 |
59 | 67 | 11922.775510 |
60 | 68 | 11863.469388 |
61 | 69 | 11759.959184 |
62 | 70 | 9165.591837 |
63 | 71 | 9121.387755 |
64 | 72 | 9077.183673 |
65 | 73 | 9032.979592 |
66 | 74 | 8960.836735 |
67 | 75 | 7032.122449 |
68 | 76 | 7004.183673 |
69 | 77 | 6976.244898 |
70 | 78 | 6948.306122 |
71 | 79 | 6904.816327 |
72 | 80 | 5687.897959 |
73 | 81 | 5672.346939 |
74 | 82 | 5656.795918 |
75 | 83 | 5641.244898 |
76 | 84 | 5625.693878 |
77 | 85 | 4957.000000 |
78 rows × 2 columns
In [20]:
cols = ['age', 'MonthlyIncome', 'SeriousDlqin2yrs']
result = pd.merge(df[cols] , pop, how='left', on='age')
result
Out[20]:
age | MonthlyIncome | SeriousDlqin2yrs | est_pop | |
---|---|---|---|---|
0 | 45 | 9120.0 | 1 | 21988.020408 |
1 | 40 | 2600.0 | 0 | 20644.224490 |
2 | 38 | 3042.0 | 0 | 19390.918367 |
3 | 30 | 3300.0 | 0 | 20163.346939 |
4 | 49 | 63588.0 | 0 | 21936.857143 |
5 | 74 | 3500.0 | 0 | 8960.836735 |
6 | 57 | NaN | 0 | 19380.612245 |
7 | 39 | 3500.0 | 0 | 19446.714286 |
8 | 27 | NaN | 0 | 21285.673469 |
9 | 57 | 23684.0 | 0 | 19380.612245 |
10 | 30 | 2500.0 | 0 | 20163.346939 |
11 | 51 | 6501.0 | 0 | 21817.387755 |
12 | 46 | 12454.0 | 0 | 21987.530612 |
13 | 40 | 13700.0 | 1 | 20644.224490 |
14 | 76 | 0.0 | 0 | 7004.183673 |
15 | 64 | 11362.0 | 0 | 16725.387755 |
16 | 78 | NaN | 0 | 6948.306122 |
17 | 53 | 8800.0 | 0 | 21718.979592 |
18 | 43 | 3280.0 | 0 | 20730.061224 |
19 | 25 | 333.0 | 0 | 21333.836735 |
20 | 43 | 12300.0 | 0 | 20730.061224 |
21 | 38 | 3000.0 | 1 | 19390.918367 |
22 | 39 | 2500.0 | 0 | 19446.714286 |
23 | 32 | 7916.0 | 0 | 20124.693878 |
24 | 58 | 2416.0 | 0 | 19337.265306 |
25 | 50 | 4676.0 | 1 | 21866.591837 |
26 | 58 | 8333.0 | 0 | 19337.265306 |
27 | 69 | 2500.0 | 0 | 11759.959184 |
28 | 24 | 3400.0 | 0 | 21483.408163 |
29 | 58 | 5500.0 | 0 | 19337.265306 |
... | ... | ... | ... | ... |
154952 | 42 | 6945.0 | 0 | 20701.448980 |
154953 | 44 | 5500.0 | 0 | 20758.183673 |
154954 | 61 | 5000.0 | 0 | 17107.346939 |
154955 | 58 | NaN | 0 | 19337.265306 |
154956 | 76 | NaN | 0 | 7004.183673 |
154957 | 29 | NaN | 0 | 21237.510204 |
154958 | 29 | NaN | 0 | 20182.673469 |
154959 | 52 | 2500.0 | 0 | 21768.183673 |
154960 | 55 | 8700.0 | 1 | 19467.306122 |
154961 | 64 | 5525.0 | 0 | 16725.387755 |
154962 | 43 | 6849.0 | 0 | 20730.061224 |
154963 | 37 | 2760.0 | 0 | 19363.734694 |
154964 | 82 | 5000.0 | 0 | 5656.795918 |
154965 | 84 | NaN | 0 | 5625.693878 |
154966 | 26 | 1950.0 | 0 | 21309.755102 |
154967 | 49 | 5000.0 | 0 | 21936.857143 |
154968 | 28 | 3249.0 | 0 | 21261.591837 |
154969 | 31 | 7515.0 | 0 | 20144.020408 |
154970 | 62 | 9233.0 | 0 | 16999.795918 |
154971 | 46 | 4335.0 | 0 | 21987.530612 |
154972 | 59 | 10316.0 | 0 | 19293.918367 |
154973 | 59 | 10316.0 | 0 | 17322.448980 |
154974 | 50 | NaN | 0 | 21866.591837 |
154975 | 22 | 820.0 | 0 | 21513.326531 |
154976 | 50 | 3400.0 | 0 | 21866.591837 |
154977 | 74 | 2100.0 | 0 | 8960.836735 |
154978 | 44 | 5584.0 | 0 | 20758.183673 |
154979 | 58 | NaN | 0 | 19337.265306 |
154980 | 30 | 5716.0 | 0 | 20163.346939 |
154981 | 64 | 8158.0 | 0 | 16725.387755 |
154982 rows × 4 columns
In [21]:
len(result) > len(df)
Out[21]:
True
In [22]:
pd.value_counts(pop.age).head()
Out[22]:
29 2 59 2 36 1 30 1 31 1 Name: age, dtype: int64
In [23]:
pop = pop[pop.age.duplicated()==False]
In [25]:
cols = ['age', 'MonthlyIncome', 'SeriousDlqin2yrs']
joined = pd.merge(df[cols] , pop, how='left', on='age')
joined
Out[25]:
age | MonthlyIncome | SeriousDlqin2yrs | est_pop | |
---|---|---|---|---|
0 | 45 | 9120.0 | 1 | 21988.020408 |
1 | 40 | 2600.0 | 0 | 20644.224490 |
2 | 38 | 3042.0 | 0 | 19390.918367 |
3 | 30 | 3300.0 | 0 | 20163.346939 |
4 | 49 | 63588.0 | 0 | 21936.857143 |
5 | 74 | 3500.0 | 0 | 8960.836735 |
6 | 57 | NaN | 0 | 19380.612245 |
7 | 39 | 3500.0 | 0 | 19446.714286 |
8 | 27 | NaN | 0 | 21285.673469 |
9 | 57 | 23684.0 | 0 | 19380.612245 |
10 | 30 | 2500.0 | 0 | 20163.346939 |
11 | 51 | 6501.0 | 0 | 21817.387755 |
12 | 46 | 12454.0 | 0 | 21987.530612 |
13 | 40 | 13700.0 | 1 | 20644.224490 |
14 | 76 | 0.0 | 0 | 7004.183673 |
15 | 64 | 11362.0 | 0 | 16725.387755 |
16 | 78 | NaN | 0 | 6948.306122 |
17 | 53 | 8800.0 | 0 | 21718.979592 |
18 | 43 | 3280.0 | 0 | 20730.061224 |
19 | 25 | 333.0 | 0 | 21333.836735 |
20 | 43 | 12300.0 | 0 | 20730.061224 |
21 | 38 | 3000.0 | 1 | 19390.918367 |
22 | 39 | 2500.0 | 0 | 19446.714286 |
23 | 32 | 7916.0 | 0 | 20124.693878 |
24 | 58 | 2416.0 | 0 | 19337.265306 |
25 | 50 | 4676.0 | 1 | 21866.591837 |
26 | 58 | 8333.0 | 0 | 19337.265306 |
27 | 69 | 2500.0 | 0 | 11759.959184 |
28 | 24 | 3400.0 | 0 | 21483.408163 |
29 | 58 | 5500.0 | 0 | 19337.265306 |
... | ... | ... | ... | ... |
149970 | 58 | 15500.0 | 0 | 19337.265306 |
149971 | 83 | 5000.0 | 0 | 5641.244898 |
149972 | 42 | 6945.0 | 0 | 20701.448980 |
149973 | 44 | 5500.0 | 0 | 20758.183673 |
149974 | 61 | 5000.0 | 0 | 17107.346939 |
149975 | 58 | NaN | 0 | 19337.265306 |
149976 | 76 | NaN | 0 | 7004.183673 |
149977 | 29 | NaN | 0 | 21237.510204 |
149978 | 52 | 2500.0 | 0 | 21768.183673 |
149979 | 55 | 8700.0 | 1 | 19467.306122 |
149980 | 64 | 5525.0 | 0 | 16725.387755 |
149981 | 43 | 6849.0 | 0 | 20730.061224 |
149982 | 37 | 2760.0 | 0 | 19363.734694 |
149983 | 82 | 5000.0 | 0 | 5656.795918 |
149984 | 84 | NaN | 0 | 5625.693878 |
149985 | 26 | 1950.0 | 0 | 21309.755102 |
149986 | 49 | 5000.0 | 0 | 21936.857143 |
149987 | 28 | 3249.0 | 0 | 21261.591837 |
149988 | 31 | 7515.0 | 0 | 20144.020408 |
149989 | 62 | 9233.0 | 0 | 16999.795918 |
149990 | 46 | 4335.0 | 0 | 21987.530612 |
149991 | 59 | 10316.0 | 0 | 19293.918367 |
149992 | 50 | NaN | 0 | 21866.591837 |
149993 | 22 | 820.0 | 0 | 21513.326531 |
149994 | 50 | 3400.0 | 0 | 21866.591837 |
149995 | 74 | 2100.0 | 0 | 8960.836735 |
149996 | 44 | 5584.0 | 0 | 20758.183673 |
149997 | 58 | NaN | 0 | 19337.265306 |
149998 | 30 | 5716.0 | 0 | 20163.346939 |
149999 | 64 | 8158.0 | 0 | 16725.387755 |
150000 rows × 4 columns
In [26]:
joined.est_pop = joined.est_pop.fillna(4957.0)
joined.est_pop.describe()
Out[26]:
count 150000.000000 mean 17882.219144 std 4875.568984 min 4957.000000 25% 16892.244898 50% 19467.306122 75% 21510.408163 max 21988.020408 Name: est_pop, dtype: float64
In [27]:
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())
In [29]:
query = """
select
SeriousDlqin2yrs
, sum(1) as total
from
df
group by
SeriousDlqin2yrs;
"""
pysqldf(query)
Out[29]:
SeriousDlqin2yrs | total | |
---|---|---|
0 | 0 | 139974 |
1 | 1 | 10026 |
In [30]:
query = """
select
age
, avg(SeriousDlqin2yrs) as pct_delinquent
from
df
group by
age
order by
age;
"""
pysqldf(query)
Out[30]:
age | pct_delinquent | |
---|---|---|
0 | 0 | 0.000000 |
1 | 21 | 0.071038 |
2 | 22 | 0.082949 |
3 | 23 | 0.109204 |
4 | 24 | 0.120098 |
5 | 25 | 0.126967 |
6 | 26 | 0.123219 |
7 | 27 | 0.124066 |
8 | 28 | 0.131410 |
9 | 29 | 0.105170 |
10 | 30 | 0.107899 |
11 | 31 | 0.106477 |
12 | 32 | 0.113659 |
13 | 33 | 0.109870 |
14 | 34 | 0.097448 |
15 | 35 | 0.107302 |
16 | 36 | 0.099622 |
17 | 37 | 0.090044 |
18 | 38 | 0.089320 |
19 | 39 | 0.093740 |
20 | 40 | 0.085354 |
21 | 41 | 0.094170 |
22 | 42 | 0.093770 |
23 | 43 | 0.086035 |
24 | 44 | 0.073467 |
25 | 45 | 0.081097 |
26 | 46 | 0.087237 |
27 | 47 | 0.082280 |
28 | 48 | 0.075145 |
29 | 49 | 0.081574 |
... | ... | ... |
56 | 76 | 0.020287 |
57 | 77 | 0.016379 |
58 | 78 | 0.022770 |
59 | 79 | 0.022426 |
60 | 80 | 0.021689 |
61 | 81 | 0.011628 |
62 | 82 | 0.029366 |
63 | 83 | 0.019531 |
64 | 84 | 0.016667 |
65 | 85 | 0.018634 |
66 | 86 | 0.014742 |
67 | 87 | 0.022409 |
68 | 88 | 0.025559 |
69 | 89 | 0.032609 |
70 | 90 | 0.015152 |
71 | 91 | 0.032468 |
72 | 92 | 0.000000 |
73 | 93 | 0.011494 |
74 | 94 | 0.021277 |
75 | 95 | 0.022222 |
76 | 96 | 0.000000 |
77 | 97 | 0.000000 |
78 | 98 | 0.000000 |
79 | 99 | 0.222222 |
80 | 101 | 0.333333 |
81 | 102 | 0.000000 |
82 | 103 | 0.000000 |
83 | 105 | 0.000000 |
84 | 107 | 0.000000 |
85 | 109 | 0.000000 |
86 rows × 2 columns
In [ ]:
No comments:
Post a Comment