In [56]:
# Feature Importance with Extra Trees Classifier
# http://machinelearningmastery.com/feature-selection-machine-learning-python/
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier
# load data
#url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['30 stock m,' '30 stock b', '30 mom m', '30 mom b', '30 diff m', '30 diff b', '30 vol','30 stock pt','30 mom pt','30 diff pt','buy']
dataframe = read_csv('2017-09-01.csv', names=names)
array = dataframe.values
X = array[:,0:9]
Y = array[:,9]
# feature extraction
model = ExtraTreesClassifier()
model.fit(X, Y)
print(model.feature_importances_)

[ 0.12130768  0.11433021  0.08516347  0.12617479  0.09338614  0.10210343
  0.13806305  0.0962042   0.12326702]


In [71]:
# Feature Extraction with RFE
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# load data
# url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['30 stock m', '30 stock b', '30 mom m', '30 mom b', '30 diff m', '30 diff b','30 stock pt','30 mom pt','30 diff pt','buy']
dataframe = read_csv('2017-09-01 3.csv', names=names)
array = dataframe.values
X = array[:,0:9]
Y = array[:,9]
# feature extraction
model = LogisticRegression()
rfe = RFE(model, 1)
fit = rfe.fit(X, Y)
print( ("Num Features: %d") % fit.n_features_ )
print( ("Selected Features: %s") % fit.support_ )
print( ("Feature Ranking: %s") % fit.ranking_ )

Num Features: 1
Selected Features: [False  True False False False False False False False]
Feature Ranking: [6 1 4 7 3 8 2 5 9]


In [21]:
# Feature Extraction with PCA
import numpy
from pandas import read_csv
from sklearn.decomposition import PCA
# load data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['30 stock m', '30 stock b', '30 mom m', '30 mom b','30 vol','buy']
dataframe = read_csv('8-31-2017.csv', names=names)
array = dataframe.values
X = array[:,0:5]
Y = array[:,5]
# feature extraction
pca = PCA(n_components=3)
fit = pca.fit(X)
# summarize components
print( ("Explained Variance: %s") % fit.explained_variance_ratio_ )
print(fit.components_)

Explained Variance: [  9.99999998e-01   1.59901812e-09   2.95574355e-10]
[[ -1.06351828e-07   5.66252514e-06   2.69165098e-07   1.84777817e-06
    1.00000000e+00]
 [ -1.32847395e-02  -9.92314154e-01  -3.73854923e-02  -1.17211179e-01
    5.84423412e-06]
 [ -5.06884349e-02   1.16887753e-01   2.42166676e-02  -9.91555086e-01
    1.15838492e-06]]


In [8]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
import pandas
import numpy
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# load data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['stock slope','stock half slope', 'stock tenth slope', 'mom slope', 'mom half slope', 'mom tenth slope', 'stock mom slope diff', 'slope diff', 'stock half mom slope diff', 'slope half diff', 'stock tenth mom slope diff', 'slope tenth diff', 'buy']
dataframe = pandas.read_csv('2017-09-01.csv.csv', names=names)
array = dataframe.values
X = array[:,0:12]
Y = array[:,12]
# feature extraction
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)
# summarize scores
numpy.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)
# summarize selected features
print(features[0:5,:])

[ 0.09   0.019  0.78   0.012  0.274  0.191  0.005  0.009  0.01   0.212
  0.509  0.049]
[[  1.102e-02   1.162e-02   8.894e-03   8.087e-03]
 [  2.360e-03   1.049e-02   1.552e-02   1.243e-02]
 [  9.322e-02   4.505e-02   5.400e-02   8.837e-02]
 [  0.000e+00   4.794e-03   4.677e-05   1.869e-04]
 [  2.536e-04   0.000e+00   0.000e+00   2.767e-02]]


In [None]:
# http://www.ritchieng.com/machine-learning-evaluate-classification-model/

In [36]:
# Classification accuracy


# read the data into a Pandas DataFrame
import pandas as pd

url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
col_names = ['30 stock m', '30 stock b', '30 mom m', '30 mom b','30 diff higher stock','30 mom higher stock','both above','both below','30 sum stock diff','30 sum stock mom','buy']
pima = pd.read_csv('8-31-2017.csv', names=col_names)

# print the first 5 rows of data from the dataframe
pima.head() #uncomment to see first 5 rows of data

Unnamed: 0,30 stock m,30 stock b,30 mom m,30 mom b,30 diff higher stock,30 mom higher stock,both above,both below,30 sum stock diff,30 sum stock mom,buy
0,-0.061032,-0.453012,-0.019646,0.841038,600,0,,,,,
1,0.091194,-5.800931,-0.728834,1.916138,1600,0,,,,,
2,-0.2073,-5.695017,0.177447,-0.032307,100,0,,,,,
3,-0.004533,0.635315,-0.006813,0.252167,3303,0,,,,,
4,0.140502,0.167326,0.143062,0.368725,20614,1,,,,,


In [37]:
# define X and y
#feature_cols = ['stock tenth slope', 'slope half diff', 'mom half slope', 'slope diff', 'mom tenth slope']
feature_cols = ['30 sum stock diff','30 sum stock mom', '30 mom higher stock']

# X is a matrix, hence we use [] to access the features we want in feature_cols
X = pima[feature_cols]

# y is a vector, hence we use dot to access 'label'
y = pima.buy


# split X and y into training and testing sets
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)



# train a logistic regression model on the training set
from sklearn.linear_model import LogisticRegression

# instantiate model
logreg = LogisticRegression()

# fit model
logreg.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [26]:
# make class predictions for the testing set
y_pred_class = logreg.predict(X_test)

# calculate accuracy
########## Null accuracy: accuracy that could be achieved by always predicting the most frequent class
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))

0.76


In [27]:
# examine the class distribution of the testing set (using a Pandas Series method)
y_test.value_counts()

0    18
1     7
Name: buy, dtype: int64

In [10]:
# calculate the percentage of ones
# because y_test only contains ones and zeros, we can simply calculate the mean = percentage of ones
y_test.mean()

0.22916666666666666

In [28]:
# calculate the percentage of zeros
1 - y_test.mean()

0.71999999999999997

In [29]:
# calculate null accuracy in a single line of code
# only for binary classification problems coded as 0/1
########## This means that a dumb model that always predicts 0 would be right XX% of the time
max(y_test.mean(), 1 - y_test.mean())

0.71999999999999997

In [30]:
# calculate null accuracy (for multi-class classification problems)
y_test.value_counts().head(1) / len(y_test)

0    0.72
Name: buy, dtype: float64

In [31]:
# print the first 25 true and predicted responses
######### Comparing the true and predicted response values
print('True:', y_test.values[0:25])
print('False:', y_pred_class[0:25])

True: [0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0]
False: [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]


In [32]:
# confusion matrix
# IMPORTANT: first argument is true values, second argument is predicted values
# this produces a 2x2 numpy array (matrix)
print(metrics.confusion_matrix(y_test, y_pred_class))

[[18  0]
 [ 6  1]]
