Saturday, October 14, 2017

SQL in Python with PandaSQL

pandasql
In [1]:
!pip install pandasql
Collecting pandasql
  Downloading pandasql-0.7.3.tar.gz
Requirement already satisfied: numpy in c:\programdata\anaconda3\lib\site-packages (from pandasql)
Requirement already satisfied: pandas in c:\programdata\anaconda3\lib\site-packages (from pandasql)
Requirement already satisfied: sqlalchemy in c:\programdata\anaconda3\lib\site-packages (from pandasql)
Requirement already satisfied: python-dateutil>=2 in c:\programdata\anaconda3\lib\site-packages (from pandas->pandasql)
Requirement already satisfied: pytz>=2011k in c:\programdata\anaconda3\lib\site-packages (from pandas->pandasql)
Requirement already satisfied: six>=1.5 in c:\programdata\anaconda3\lib\site-packages (from python-dateutil>=2->pandas->pandasql)
Building wheels for collected packages: pandasql
  Running setup.py bdist_wheel for pandasql: started
  Running setup.py bdist_wheel for pandasql: finished with status 'done'
  Stored in directory: C:\Users\Yantrajaal\AppData\Local\pip\Cache\wheels\a4\4b\e6\88a009d13e4b64f7292e802a93d19619cceb135ba599235430
Successfully built pandasql
Installing collected packages: pandasql
Successfully installed pandasql-0.7.3
In [2]:
#Checking out meat and birth data
from pandasql import sqldf
from pandasql import load_meat, load_births
In [4]:
meat = load_meat()
births = load_births()

#You can inspect the dataframes directly if you're using Rodeo
#These print statements are here just in case you want to check out your data in the editor, too
print (meat.head())
print (births.head())
        date   beef   veal    pork  lamb_and_mutton  broilers  other_chicken  \
0 1944-01-01  751.0   85.0  1280.0             89.0       NaN            NaN   
1 1944-02-01  713.0   77.0  1169.0             72.0       NaN            NaN   
2 1944-03-01  741.0   90.0  1128.0             75.0       NaN            NaN   
3 1944-04-01  650.0   89.0   978.0             66.0       NaN            NaN   
4 1944-05-01  681.0  106.0  1029.0             78.0       NaN            NaN   

   turkey  
0     NaN  
1     NaN  
2     NaN  
3     NaN  
4     NaN  
        date  births
0 1975-01-01  265775
1 1975-02-01  241045
2 1975-03-01  268849
3 1975-04-01  247455
4 1975-05-01  254545
In [5]:
meat.to_csv("data-meat.csv", index=False)
births.to_csv("data-births.csv", index=False)
In [7]:
import pandas as pd
import numpy as np
import pylab as pl
In [8]:
df_meat = pd.read_csv("data-meat.csv")
print(df_meat.head())
         date   beef   veal    pork  lamb_and_mutton  broilers  other_chicken  \
0  1944-01-01  751.0   85.0  1280.0             89.0       NaN            NaN   
1  1944-02-01  713.0   77.0  1169.0             72.0       NaN            NaN   
2  1944-03-01  741.0   90.0  1128.0             75.0       NaN            NaN   
3  1944-04-01  650.0   89.0   978.0             66.0       NaN            NaN   
4  1944-05-01  681.0  106.0  1029.0             78.0       NaN            NaN   

   turkey  
0     NaN  
1     NaN  
2     NaN  
3     NaN  
4     NaN  
In [9]:
df_births = pd.read_csv("data-births.csv")
print(df_births.head())
         date  births
0  1975-01-01  265775
1  1975-02-01  241045
2  1975-03-01  268849
3  1975-04-01  247455
4  1975-05-01  254545
In [10]:
import matplotlib.pyplot as plt
%matplotlib inline  
In [12]:
q = """
    SELECT
        *
    FROM
        df_meat
    LIMIT 10;"""

print (sqldf(q, locals()))
         date   beef   veal    pork  lamb_and_mutton broilers other_chicken  \
0  1944-01-01  751.0   85.0  1280.0             89.0     None          None   
1  1944-02-01  713.0   77.0  1169.0             72.0     None          None   
2  1944-03-01  741.0   90.0  1128.0             75.0     None          None   
3  1944-04-01  650.0   89.0   978.0             66.0     None          None   
4  1944-05-01  681.0  106.0  1029.0             78.0     None          None   
5  1944-06-01  658.0  125.0   962.0             79.0     None          None   
6  1944-07-01  662.0  142.0   796.0             82.0     None          None   
7  1944-08-01  787.0  175.0   748.0             87.0     None          None   
8  1944-09-01  774.0  182.0   678.0             91.0     None          None   
9  1944-10-01  834.0  215.0   777.0            100.0     None          None   

  turkey  
0   None  
1   None  
2   None  
3   None  
4   None  
5   None  
6   None  
7   None  
8   None  
9   None  
In [16]:
# births per year
q = """
    SELECT
        strftime("%Y", date) year
        , SUM(births) total_Births
    FROM df_births
    GROUP BY 1
    ORDER BY 1;
            """

print (sqldf(q, locals()))
    year  total_Births
0   1975       3136965
1   1976       6304156
2   1979       3333279
3   1982       3612258
4   1983       7333238
5   1986       7308074
6   1987       3760561
7   1988       3756547
8   1990       7718904
9   1991      11714356
10  1993       8194907
11  1995       4000240
12  1996       3952767
13  1997       3899589
14  1998       3891494
15  1999       3880894
16  2000       3941553
17  2001       8018231
18  2002       4025933
19  2003       4021726
20  2005       8210950
21  2007       4138349
22  2008       4265555
23  2011       4247694
24  2012       4130665
In [18]:
def pysqldf(q):
    return sqldf(q, globals())
In [19]:
# joining meats + births on date
q = """
    SELECT
        m.date
        , b.births
        , m.beef
    FROM
        df_meat m
    INNER JOIN
        df_births b
            on m.date = b.date
    ORDER BY
        m.date
    LIMIT 50;
    """

joined = pysqldf(q)
print (joined.head())
         date  births    beef
0  1975-01-01  265775  2106.0
1  1975-02-01  241045  1845.0
2  1975-03-01  268849  1891.0
3  1975-04-01  247455  1895.0
4  1975-05-01  254545  1849.0
In [21]:
q = """
    SELECT
        date
        , beef
        , veal
        , pork
        , lamb_and_mutton
    FROM
        df_meat
    WHERE
        lamb_and_mutton >= veal
    ORDER BY date DESC
    LIMIT 10;
    """

print (pysqldf(q))
         date    beef  veal    pork  lamb_and_mutton
0  2012-11-01  2206.6  10.1  2078.7             12.4
1  2012-10-01  2343.7  10.3  2210.4             14.2
2  2012-09-01  2016.0   8.8  1911.0             12.5
3  2012-08-01  2367.5  10.1  1997.9             14.2
4  2012-07-01  2200.8   9.5  1721.8             12.5
5  2012-06-01  2252.1   8.9  1750.4             12.4
6  2012-05-01  2232.0  10.4  1926.8             13.6
7  2012-04-01  1990.6   9.9  1841.7             12.9
8  2012-03-01  2159.8  10.0  1987.9             14.2
9  2012-02-01  2009.0   9.8  1882.9             12.3
In [22]:
#################################################
# SQL FUNCTIONS
# e.g. `RANDOM()`
#################################################
q = """SELECT
    *
    FROM
        meat
    ORDER BY RANDOM()
    LIMIT 10;"""
print (pysqldf(q))
                         date    beef   veal    pork  lamb_and_mutton  \
0  1949-10-01 00:00:00.000000   772.0  120.0   764.0             56.0   
1  2006-12-01 00:00:00.000000  2049.6   13.1  1796.4             15.4   
2  1949-12-01 00:00:00.000000   717.0   97.0  1041.0             53.0   
3  2007-11-01 00:00:00.000000  2228.8   10.1  2045.2             16.1   
4  1986-01-01 00:00:00.000000  2140.0   46.0  1266.0             31.0   
5  2004-12-01 00:00:00.000000  2042.0   14.9  1857.0             17.0   
6  2002-08-01 00:00:00.000000  2469.0   16.7  1637.0             16.6   
7  2011-02-01 00:00:00.000000  2020.4   10.5  1768.1             10.9   
8  1952-09-01 00:00:00.000000   841.0  111.0   720.0             59.0   
9  1986-08-01 00:00:00.000000  2076.0   41.0  1037.0             25.0   

   broilers  other_chicken  turkey  
0       NaN            NaN     NaN  
1    2737.8           36.9   429.1  
2       NaN            NaN     NaN  
3    3033.2           41.0   526.0  
4    1211.3            NaN   187.5  
5    2836.9           42.5   440.0  
6    2828.3           49.2   481.9  
7    2852.4           37.4   435.6  
8       NaN            NaN     NaN  
9    1181.0            NaN   299.4  
In [23]:
#################################################
# UNION ALL
#################################################
q = """
        SELECT
            date
            , 'beef' AS meat_type
            , beef AS value
        FROM df_meat
        UNION ALL
        SELECT
            date
            , 'veal' AS meat_type
            , veal AS value
        FROM df_meat

        UNION ALL

        SELECT
            date
            , 'pork' AS meat_type
            , pork AS value
        FROM df_meat
        UNION ALL
        SELECT
            date
            , 'lamb_and_mutton' AS meat_type
            , lamb_and_mutton AS value
        FROM df_meat
        ORDER BY 1
    """
print (pysqldf(q).head(20))
          date        meat_type   value
0   1944-01-01             beef   751.0
1   1944-01-01             veal    85.0
2   1944-01-01             pork  1280.0
3   1944-01-01  lamb_and_mutton    89.0
4   1944-02-01             beef   713.0
5   1944-02-01             veal    77.0
6   1944-02-01             pork  1169.0
7   1944-02-01  lamb_and_mutton    72.0
8   1944-03-01             beef   741.0
9   1944-03-01             veal    90.0
10  1944-03-01             pork  1128.0
11  1944-03-01  lamb_and_mutton    75.0
12  1944-04-01             beef   650.0
13  1944-04-01             veal    89.0
14  1944-04-01             pork   978.0
15  1944-04-01  lamb_and_mutton    66.0
16  1944-05-01             beef   681.0
17  1944-05-01             veal   106.0
18  1944-05-01             pork  1029.0
19  1944-05-01  lamb_and_mutton    78.0
In [25]:
#################################################
# subqueries
# fancy!
#################################################
q = """
    SELECT
        m1.date
        , m1.beef
    FROM
        df_meat m1
    WHERE m1.date IN
        (SELECT
            date
        FROM df_meat
        WHERE
            beef >= broilers
        ORDER BY date)
"""

more_beef_than_broilers = pysqldf(q)
print (more_beef_than_broilers.head(10))
         date    beef
0  1960-01-01  1196.0
1  1960-02-01  1089.0
2  1960-03-01  1201.0
3  1960-04-01  1066.0
4  1960-05-01  1202.0
5  1960-06-01  1247.0
6  1960-07-01  1166.0
7  1960-08-01  1307.0
8  1960-09-01  1298.0
9  1960-10-01  1263.0
In [26]:
import matplotlib.pyplot as plt
%matplotlib inline  
In [28]:
pysqldf = lambda q: sqldf(q, globals())

q  = """
SELECT
  m.date
  , m.beef
  , b.births
FROM
  df_meat m
LEFT JOIN
  df_births b
  ON m.date = b.date
WHERE
    m.date > '1974-12-31';
"""


df = pysqldf(q)
df.births = df.births.fillna(method='backfill')

fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(pd.rolling_mean(df['beef'], 12), color='b')
ax1.set_xlabel('months since 1975')
ax1.set_ylabel('cattle slaughtered', color='b')

ax2 = ax1.twinx()
ax2.plot(pd.rolling_mean(df['births'], 12), color='r')
ax2.set_ylabel('babies born', color='r')
plt.title("Beef Consumption and the Birth Rate")
plt.show()
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:23: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
 Series.rolling(window=12,center=False).mean()
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\__main__.py:28: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
 Series.rolling(window=12,center=False).mean()
In [33]:
df.head()
Out[33]:
date beef births
0 1975-01-01 2106.0 265775.0
1 1975-02-01 1845.0 241045.0
2 1975-03-01 1891.0 268849.0
3 1975-04-01 1895.0 247455.0
4 1975-05-01 1849.0 254545.0
In [36]:
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(df['beef'].rolling(window=12,center=False).mean(), color='b')
ax1.set_xlabel('months since 1975')
ax1.set_ylabel('cattle slaughtered', color='b')

ax2 = ax1.twinx()
ax2.plot(df['births'].rolling(window=12,center=False).mean(), color='r')
ax2.set_ylabel('babies born', color='r')
plt.title("Beef Consumption and the Birth Rate")
plt.show()
In [ ]:
 

No comments:

Post a Comment