Sunday, October 15, 2017

DG 04 SciKit

In [1]:
import matplotlib
%matplotlib inline  
In [2]:
import pandas as pd
import numpy as np
import pylab as pl
In [3]:
from sklearn.datasets import load_iris

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = iris.target
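
The integer targets map to names through iris.target_names, so the labels can be made readable if desired (a small optional sketch; the species_name column is an illustrative addition, not part of the original run):

# map the 0/1/2 targets to readable species names (illustrative extra column)
df['species_name'] = df['species'].map(dict(enumerate(iris.target_names)))
df.head()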
In [6]:
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
In [5]:
clfs = [
    ("svc", SVC()),
    ("KNN", KNeighborsClassifier()),
]
for name, clf in clfs:
    clf.fit(df[iris.feature_names], df.species)
    print(name, clf.predict(iris.data))
    print("*" * 80)
svc [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
********************************************************************************
KNN [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1
 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
********************************************************************************
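
Note that both classifiers above predict on the very rows they were trained on, so these labels look better than they would on unseen data. A minimal sketch of a fairer check with a held-out test set (train_test_split is standard scikit-learn; the 30% split and random_state=0 are arbitrary choices):

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df[iris.feature_names], df.species, test_size=0.3, random_state=0)
for name, clf in clfs:
    clf.fit(X_train, y_train)
    # score() reports mean accuracy on the held-out rows
    print(name, clf.score(X_test, y_test))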
In [7]:
clf = RandomForestClassifier()
clf.fit(df[iris.feature_names], df.species)
clf.predict(df[iris.feature_names])
pd.crosstab(df.species, clf.predict(df[iris.feature_names]))
Out[7]:
col_0     0   1   2
species
0        50   0   0
1         0  50   0
2         0   0  50
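
The perfect diagonal here is again a symptom of scoring on the training data; a random forest can memorize the iris set outright. Cross-validation gives a less optimistic estimate (a sketch; cv=5 is an arbitrary choice):

from sklearn.model_selection import cross_val_score

# 5-fold CV: fit on 4/5 of the data, score on the remaining 1/5, five times
scores = cross_val_score(RandomForestClassifier(), df[iris.feature_names],
                         df.species, cv=5)
print(scores.mean(), scores.std())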
In [8]:
from sklearn import tree

clf = tree.DecisionTreeClassifier(max_features="auto",
                                  min_samples_leaf=10)
clf.fit(df[iris.feature_names], df.species)
Out[8]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=10,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
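
Once fitted, the tree exposes feature_importances_, which pairs naturally with the column names (a short sketch):

# one importance score per feature, summing to 1.0
for name, importance in zip(iris.feature_names, clf.feature_importances_):
    print(name, round(importance, 3))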
In [10]:
with open("irisdotfile.dot", 'w') as f:
    tree.export_graphviz(clf, out_file=f)

Use http://www.webgraphviz.com/ to view the irisdotfile.dot file created in the previous step.
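
Alternatively, if the graphviz Python package and the Graphviz binaries are installed (an assumption; pip install graphviz), the tree renders inline without leaving the notebook:

import graphviz

# out_file=None makes export_graphviz return the dot source as a string
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=iris.feature_names)
graphviz.Source(dot_data)  # displays the rendered tree in the notebook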

In [12]:
from IPython.core.display import Image
Image(url="https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEgq0SHTZpK8BUtTniRJUARv8nK65U2ypmO29Dn_mg3qq2spfZdUWQjjMdK2saHuvxrDJo8DgX5ijr8bXf2NtEFU0dmNOqXYM_L7skaYaru7DFe9m5mY3e-CgQtwWgcCQjXUijJtDmqbwn0/s1600/drop_shadows_background.png",
      width=700)
Out[12]:
In [13]:
from sklearn.datasets import load_boston
boston = load_boston()
In [17]:
df = pd.DataFrame(boston.data)
df.head()
Out[17]:
        0     1     2    3      4      5     6       7    8      9    10      11    12
0  0.00632  18.0  2.31  0.0  0.538  6.575  65.2  4.0900  1.0  296.0  15.3  396.90  4.98
1  0.02731   0.0  7.07  0.0  0.469  6.421  78.9  4.9671  2.0  242.0  17.8  396.90  9.14
2  0.02729   0.0  7.07  0.0  0.469  7.185  61.1  4.9671  2.0  242.0  17.8  392.83  4.03
3  0.03237   0.0  2.18  0.0  0.458  6.998  45.8  6.0622  3.0  222.0  18.7  394.63  2.94
4  0.06905   0.0  2.18  0.0  0.458  7.147  54.2  6.0622  3.0  222.0  18.7  396.90  5.33
In [19]:
import re


def camel_to_snake(column_name):
    """
    converts a string that is camelCase into snake_case
    Example:
        print(camel_to_snake("javaLovesCamelCase"))
        > java_loves_camel_case
    See Also:
        http://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-camel-case
    """
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', column_name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()

df = pd.DataFrame(boston.data)
df.columns = [camel_to_snake(col) for col in boston.feature_names]
# add in prices
df['price'] = boston.target
print(len(df) == 506)  # the Boston housing data has 506 rows
df.head()
True
Out[19]:
      crim    zn  indus  chas    nox     rm   age     dis  rad    tax  ptratio       b  lstat  price
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0     15.3  396.90   4.98   24.0
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0     17.8  396.90   9.14   21.6
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0     17.8  392.83   4.03   34.7
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0     18.7  394.63   2.94   33.4
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0     18.7  396.90   5.33   36.2
In [20]:
from sklearn.linear_model import LinearRegression

features = ['age', 'lstat', 'tax']
lm = LinearRegression()
lm.fit(df[features], df.price)
Out[20]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
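
The fitted model's coefficients and intercept can be read off directly, and score() returns R² for the data passed in; note that this is the training data again, so the figure is optimistic (a short sketch):

print(lm.intercept_)
for feature, coef in zip(features, lm.coef_):
    print(feature, coef)
print(lm.score(df[features], df.price))  # R^2 on the training data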
In [21]:
# plot actual vs. predicted prices
pl.scatter(df.price, lm.predict(df[features]))
# add the line of perfect fit
straight_line = np.arange(0, 60)
pl.plot(straight_line, straight_line)
pl.title("Fitted Values")
Out[21]:
<matplotlib.text.Text at 0x2046466cf98>
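
A companion check is the size of the errors themselves; a minimal sketch computing RMSE and plotting the residuals (same in-sample caveat as above):

predictions = lm.predict(df[features])
rmse = np.sqrt(np.mean((df.price - predictions) ** 2))
print(rmse)

# residuals should scatter evenly around the zero line
pl.scatter(df.price, df.price - predictions)
pl.hlines(0, df.price.min(), df.price.max())
pl.title("Residuals")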