
pima_indian

April 4, 2019

In [1]: %matplotlib inline


import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]: data = 'Documents/ML/pima-data.csv'


diabetes = pd.read_csv(data)
diabetes.head()

Out[2]: num_preg glucose_conc diastolic_bp thickness insulin bmi diab_pred \


0 6 148 72 35 0 33.6 0.627
1 1 85 66 29 0 26.6 0.351
2 8 183 64 0 0 23.3 0.672
3 1 89 66 23 94 28.1 0.167
4 0 137 40 35 168 43.1 2.288

age skin diabetes


0 50 1.3790 True
1 31 1.1426 False
2 32 0.0000 True
3 21 0.9062 False
4 33 1.3790 True

In [3]: diabetes.describe()

Out[3]: num_preg glucose_conc diastolic_bp thickness insulin \


count 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479
std 3.369578 31.972618 19.355807 15.952218 115.244002
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000
75% 6.000000 140.250000 80.000000 32.000000 127.250000
max 17.000000 199.000000 122.000000 99.000000 846.000000

bmi diab_pred age skin


count 768.000000 768.000000 768.000000 768.000000
mean 31.992578 0.471876 33.240885 0.809136

std 7.884160 0.331329 11.760232 0.628517
min 0.000000 0.078000 21.000000 0.000000
25% 27.300000 0.243750 24.000000 0.000000
50% 32.000000 0.372500 29.000000 0.906200
75% 36.600000 0.626250 41.000000 1.260800
max 67.100000 2.420000 81.000000 3.900600

In [4]: diabetes.shape

Out[4]: (768, 10)

In [5]: diabetes.isnull().values.any()

Out[5]: False

No cell is literally null; as we will see below, however, several columns use 0 as a placeholder for missing values.

In [6]: def plot_corr(df, size=15):
            """Plot the correlation matrix of df as a heatmap."""
            corr = df.corr()  # use the DataFrame passed in, not the global one
            fig, ax = plt.subplots(figsize=(size, size))
            ax.matshow(corr)
            plt.xticks(range(len(corr.columns)), corr.columns)
            plt.yticks(range(len(corr.columns)), corr.columns)

In [7]: plot_corr(diabetes)

[Output: correlation-matrix heatmap of all columns]
In [8]: diabetes.corr()

Out[8]: num_preg glucose_conc diastolic_bp thickness insulin \


num_preg 1.000000 0.129459 0.141282 -0.081672 -0.073535
glucose_conc 0.129459 1.000000 0.152590 0.057328 0.331357
diastolic_bp 0.141282 0.152590 1.000000 0.207371 0.088933
thickness -0.081672 0.057328 0.207371 1.000000 0.436783
insulin -0.073535 0.331357 0.088933 0.436783 1.000000
bmi 0.017683 0.221071 0.281805 0.392573 0.197859
diab_pred -0.033523 0.137337 0.041265 0.183928 0.185071
age 0.544341 0.263514 0.239528 -0.113970 -0.042163
skin -0.081672 0.057328 0.207371 1.000000 0.436783
diabetes 0.221898 0.466581 0.065068 0.074752 0.130548

bmi diab_pred age skin diabetes

num_preg 0.017683 -0.033523 0.544341 -0.081672 0.221898
glucose_conc 0.221071 0.137337 0.263514 0.057328 0.466581
diastolic_bp 0.281805 0.041265 0.239528 0.207371 0.065068
thickness 0.392573 0.183928 -0.113970 1.000000 0.074752
insulin 0.197859 0.185071 -0.042163 0.436783 0.130548
bmi 1.000000 0.140647 0.036242 0.392573 0.292695
diab_pred 0.140647 1.000000 0.033561 0.183928 0.173844
age 0.036242 0.033561 1.000000 -0.113970 0.238356
skin 0.392573 0.183928 -0.113970 1.000000 0.074752
diabetes 0.292695 0.173844 0.238356 0.074752 1.000000

In [9]: # skin duplicates thickness (their correlation is exactly 1.0), so drop the redundant column
        del diabetes['skin']

In [10]: diabetes.head()

Out[10]: num_preg glucose_conc diastolic_bp thickness insulin bmi diab_pred \


0 6 148 72 35 0 33.6 0.627
1 1 85 66 29 0 26.6 0.351
2 8 183 64 0 0 23.3 0.672
3 1 89 66 23 94 28.1 0.167
4 0 137 40 35 168 43.1 2.288

age diabetes
0 50 True
1 31 False
2 32 True
3 21 False
4 33 True

In [11]: plot_corr(diabetes)

[Output: correlation-matrix heatmap after dropping skin]
In [12]: diabetes_map = {True:1, False:0}

In [13]: diabetes['diabetes'] = diabetes['diabetes'].map(diabetes_map)

In [14]: diabetes.head()

Out[14]: num_preg glucose_conc diastolic_bp thickness insulin bmi diab_pred \


0 6 148 72 35 0 33.6 0.627
1 1 85 66 29 0 26.6 0.351
2 8 183 64 0 0 23.3 0.672
3 1 89 66 23 94 28.1 0.167
4 0 137 40 35 168 43.1 2.288

age diabetes
0 50 1

1 31 0
2 32 1
3 21 0
4 33 1
In [15]: num_true = len(diabetes.loc[diabetes['diabetes'] == 1])
         num_false = len(diabetes.loc[diabetes['diabetes'] == 0])
         print("Number of True Cases: {0} ({1:2.2f}%)".format(num_true, (num_true/(num_true + num_false)) * 100))
         print("Number of False Cases: {0} ({1:2.2f}%)".format(num_false, (num_false/(num_true + num_false)) * 100))
Number of True Cases: 268 (34.90%)
Number of False Cases: 500 (65.10%)

In [16]: from sklearn.model_selection import train_test_split

         features = ['num_preg', 'glucose_conc', 'diastolic_bp', 'thickness', 'insulin', 'bmi', 'diab_pred', 'age']
         predict = ['diabetes']

         x = diabetes[features].values
         y = diabetes[predict].values

         split_test_size = 0.30

         x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=split_test_size, random_state=42)

In [17]: print("{0:1.2f}% in training set".format((len(x_train)/len(diabetes.index)) * 100))
print("{0:1.2f}% in test set".format((len(x_test)/len(diabetes.index)) * 100))
69.92% in training set
30.08% in test set

In [18]: print("Total number of true: {0}".format(len(diabetes.loc[diabetes['diabetes']==1])))


print("Total number of false: {0}".format(len(diabetes.loc[diabetes['diabetes']==0])))
print("")
print("True in training set: {0}".format(len(y_train[y_train[:]==1])))
print("False in training set: {0}".format(len(y_train[y_train[:]==0])))
print("")
print("True in test set: {0}".format(len(y_test[y_test[:]==1])))
print("False in test set: {0}".format(len(y_test[y_test[:]==0])))
Total number of true: 268
Total number of false: 500

True in training set: 188
False in training set: 349

True in test set: 80
False in test set: 151

In [19]: diabetes.columns

Out[19]: Index(['num_preg', 'glucose_conc', 'diastolic_bp', 'thickness', 'insulin',


'bmi', 'diab_pred', 'age', 'diabetes'],
dtype='object')

In [20]: print("Rows missing glucose_conc: {0}".format(len(diabetes.loc[diabetes['glucose_conc']


print("Rows missing diastolic_bp: {0}".format(len(diabetes.loc[diabetes['diastolic_bp']
print("Rows missing thickness: {0}".format(len(diabetes.loc[diabetes['thickness'] ==
print("Rows missing bmi: {0}".format(len(diabetes.loc[diabetes['bmi'] == 0])))
print("Rows missing insulin: {0}".format(len(diabetes.loc[diabetes['insulin'] == 0
print("Rows missing diab_pred: {0}".format(len(diabetes.loc[diabetes['diab_pred'] ==
print("Rows missing age: {0}".format(len(diabetes.loc[diabetes['age'] == 0])))

Rows missing glucose_conc: 5
Rows missing diastolic_bp: 35
Rows missing thickness: 227
Rows missing bmi: 11
Rows missing insulin: 374
Rows missing diab_pred: 0
Rows missing age: 0

In [21]: from sklearn.preprocessing import Imputer

         # Treat 0 as a missing-value marker and replace it with the column mean.
         fill_0 = Imputer(missing_values=0, strategy='mean', axis=0)

         x_train = fill_0.fit_transform(x_train)
         x_test = fill_0.fit_transform(x_test)

/home/edutech/anaconda3/lib/python3.7/site-packages/sklearn/utils/deprecation.py:58: DeprecationWarning: Class Imputer is deprecated; Imputer was deprecated in version 0.20 and will be removed in 0.22. Import impute.SimpleImputer from sklearn instead.
  warnings.warn(msg, category=DeprecationWarning)
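As the warning says, Imputer is removed in scikit-learn 0.22 and later. A minimal sketch of the same preprocessing with its replacement, sklearn.impute.SimpleImputer (note that only transform is applied to the test set, so test rows are filled with the means learned from the training data instead of being refit as above, which avoids test-set leakage):

from sklearn.impute import SimpleImputer

# Same idea as Imputer: treat 0 as missing and fill with the column mean.
fill_0 = SimpleImputer(missing_values=0, strategy='mean')

x_train = fill_0.fit_transform(x_train)  # learn per-column means on the training data
x_test = fill_0.transform(x_test)        # reuse the training means on the test data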

In [22]: print("Rows missing glucose_conc: {0}".format(len(diabetes.loc[diabetes['glucose_conc']


print("Rows missing diastolic_bp: {0}".format(len(diabetes.loc[diabetes['diastolic_bp']
print("Rows missing thickness: {0}".format(len(diabetes.loc[diabetes['thickness'] ==
print("Rows missing bmi: {0}".format(len(diabetes.loc[diabetes['bmi'] == 0])))
print("Rows missing insulin: {0}".format(len(diabetes.loc[diabetes['insulin'] == 0
print("Rows missing diab_pred: {0}".format(len(diabetes.loc[diabetes['diab_pred'] ==
print("Rows missing age: {0}".format(len(diabetes.loc[diabetes['age'] == 0])))

Rows missing glucose_conc: 5
Rows missing diastolic_bp: 35
Rows missing thickness: 227
Rows missing bmi: 11
Rows missing insulin: 374
Rows missing diab_pred: 0
Rows missing age: 0

The counts are unchanged because the imputation above was applied to the x_train and x_test arrays, not to the diabetes DataFrame itself.
1 Naive Bayes - Gaussian
In [23]: from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()

nb_model.fit(x_train, y_train.ravel())

Out[23]: GaussianNB(priors=None, var_smoothing=1e-09)

In [24]: nb_predict_train = nb_model.predict(x_train)

from sklearn import metrics

print("Accuracy of training data using nb: {0:0.4f}".format(metrics.accuracy_score(y_tr

Accuracy of training data using nb: 0.7542

In [25]: nb_predict_test = nb_model.predict(x_test)

print("Accuracy of testing data using nb: {0:0.4f}".format(metrics.accuracy_score(y_te

Accuracy of testing data using nb: 0.7359

In [26]: print("Confusion Matrix:")


print("{0}".format(metrics.confusion_matrix(y_test, nb_predict_test)))
print("")

print("Classification Report:")
print("{0}".format(metrics.classification_report(y_test, nb_predict_test)))

Confusion Matrix:
[[118 33]
[ 28 52]]

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.78      0.79       151
           1       0.61      0.65      0.63        80

   micro avg       0.74      0.74      0.74       231
   macro avg       0.71      0.72      0.71       231
weighted avg       0.74      0.74      0.74       231

2 Random Forest
In [27]: from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(x_train,y_train.ravel())
/home/edutech/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:246: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.

Out[27]: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',


max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
oob_score=False, random_state=42, verbose=0, warm_start=False)
In [28]: rf_predict_train = rf_model.predict(x_train)
print("Accuracy on training data using RF: {0:.4f}".format(metrics.accuracy_score(y_tra
Accuracy on training data using RF: 0.9870

In [29]: rf_predict_test = rf_model.predict(x_test)


print("Accuracy on test data using RF: {0:.4f}".format(metrics.accuracy_score(y_test, r
Accuracy on test data using RF: 0.7100

In [30]: print("Confusion Matrix of Test data")


print(metrics.confusion_matrix(y_test,rf_predict_test))
print("")
print("Classification Report")
print(metrics.classification_report(y_test,rf_predict_test))
Confusion Matrix of Test data
[[121 30]
[ 37 43]]

Classification Report
              precision    recall  f1-score   support

           0       0.77      0.80      0.78       151
           1       0.59      0.54      0.56        80

   micro avg       0.71      0.71      0.71       231
   macro avg       0.68      0.67      0.67       231
weighted avg       0.70      0.71      0.71       231

Accuracy is 98.70% on the training data but only 71% on the test data: the model is overfitting the training data.
One way to counter overfitting is regularization; the scikit-learn estimators expose hyperparameters for it (for example, C in LogisticRegression). Another is cross validation, which scores the model on several train/test splits instead of a single one. The two are not mutually exclusive and can be used together; the sketch below shows cross validation in practice.
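As an aside, here is a minimal sketch of 10-fold cross validation applied to the random forest, assuming the x and y arrays built earlier (cross_val_score is scikit-learn's standard helper for this):

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Fit on 9 folds, score on the held-out fold, and repeat for all 10 splits;
# the mean and spread of these scores estimate generalization far better
# than a single train/test split does.
rf = RandomForestClassifier(random_state=42)
scores = cross_val_score(rf, x, y.ravel(), cv=10)
print("Mean CV accuracy: {0:.4f} (+/- {1:.4f})".format(scores.mean(), scores.std()))

Now we implement logistic regression and observe whether overfitting is detected there as well.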

3 Logistic Regression
In [31]: from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(random_state=42)
lr_model.fit(x_train, y_train.ravel())
lr_predict_test = lr_model.predict(x_test)

print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, lr_predict_test)))


print("")
print("Confusion Matrix")
print(metrics.confusion_matrix(y_test, lr_predict_test))
print("")
print("Classification Report")
print(metrics.classification_report(y_test,lr_predict_test))
Accuracy: 0.7532

Confusion Matrix
[[128 23]
[ 34 46]]

Classification Report
              precision    recall  f1-score   support

           0       0.79      0.85      0.82       151
           1       0.67      0.57      0.62        80

   micro avg       0.75      0.75      0.75       231
   macro avg       0.73      0.71      0.72       231
weighted avg       0.75      0.75      0.75       231

/home/edutech/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.

Let us now sweep the regularization hyperparameter C and find the configuration that gives the highest recall score.

In [32]: C_start = 0.1
         C_end = 5
         C_inc = 0.1

         C_values, recall_scores = [], []

         C_val = C_start
         best_recall_score = 0

         while C_val < C_end:
             C_values.append(C_val)
             lr_model_loop = LogisticRegression(C=C_val, random_state=42)
             lr_model_loop.fit(x_train, y_train.ravel())
             lr_predict_loop_test = lr_model_loop.predict(x_test)
             recall_score = metrics.recall_score(y_test, lr_predict_loop_test)
             recall_scores.append(recall_score)

             if recall_score > best_recall_score:
                 best_recall_score = recall_score
                 best_lr_predict_test = lr_predict_loop_test

             C_val = C_val + C_inc

         best_score_C_val = C_values[recall_scores.index(best_recall_score)]
         print("1st max value of {0:.3f} occurred at C = {1:.3f}".format(best_recall_score, best_score_C_val))

         plt.plot(C_values, recall_scores, "-")
         plt.xlabel("C_values")
         plt.ylabel("recall_score")

(The same FutureWarning about the default solver was emitted on every loop iteration; the repeated output is omitted.)

1st max value of 0.613 occurred at C = 1.400

Out[32]: Text(0, 0.5, 'recall_score')

[Output: plot of recall_score versus C_values]
Even after sweeping the regularization parameter and taking the best value, the recall is only 61.3%.
A main reason is that the classes are imbalanced: roughly 35% of the examples are diabetic and 65% are not, so the model is biased toward the majority class. LogisticRegression has a hyperparameter for exactly this situation, class_weight. Let us set it to 'balanced' and see whether the result changes; the sketch below shows what 'balanced' actually computes.
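For intuition, class_weight='balanced' reweights each class inversely to its frequency. A minimal sketch of the same computation, using scikit-learn's compute_class_weight utility on the training labels:

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# balanced weight for class c = n_samples / (n_classes * n_samples_of_c),
# so the minority (diabetic) class receives the larger weight.
classes = np.unique(y_train)
weights = compute_class_weight('balanced', classes=classes, y=y_train.ravel())
print(dict(zip(classes, weights)))  # roughly {0: 0.77, 1: 1.43} for this split

Each diabetic example then contributes proportionally more to the loss, which pushes the model toward higher recall on the positive class at the cost of some precision.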

In [41]: C_start = 0.1
         C_end = 5
         C_inc = 0.1

         C_values, recall_scores = [], []

         C_val = C_start
         best_recall_score = 0

         while C_val < C_end:
             C_values.append(C_val)
             lr_model_loop = LogisticRegression(C=C_val, class_weight='balanced', random_state=42)
             lr_model_loop.fit(x_train, y_train.ravel())
             lr_predict_loop_test = lr_model_loop.predict(x_test)
             recall_score = metrics.recall_score(y_test, lr_predict_loop_test)
             recall_scores.append(recall_score)

             if recall_score > best_recall_score:
                 best_recall_score = recall_score
                 best_lr_predict_test = lr_predict_loop_test

             C_val = C_val + C_inc

         best_score_C_val = C_values[recall_scores.index(best_recall_score)]
         print("1st max value of {0:.3f} occurred at C = {1:.3f}".format(best_recall_score, best_score_C_val))

         plt.plot(C_values, recall_scores, "-")
         plt.xlabel("C_values")
         plt.ylabel("recall_score")

1st max value of 0.738 occurred at C = 0.300

(Repeated FutureWarning output about the default solver omitted, as above.)

Out[41]: Text(0, 0.5, 'recall_score')

After balancing the class weights, the best recall score rises to 73.8%. We now refit a single model with the best C value found above.
In [34]: from sklearn.linear_model import LogisticRegression

         lr_model = LogisticRegression(class_weight='balanced', C=best_score_C_val, random_state=42)
         lr_model.fit(x_train, y_train.ravel())
         lr_predict_test = lr_model.predict(x_test)

print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, lr_predict_test)))


print("")
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, lr_predict_test))
print("")
print("Classification Report")
print(metrics.classification_report(y_test, lr_predict_test))
print("Recall Score: {0:.4f}".format(metrics.recall_score(y_test, lr_predict_test)))

Accuracy: 0.7143

Confusion Matrix:
[[106 45]
[ 21 59]]

Classification Report
              precision    recall  f1-score   support

           0       0.83      0.70      0.76       151
           1       0.57      0.74      0.64        80

   micro avg       0.71      0.71      0.71       231
   macro avg       0.70      0.72      0.70       231
weighted avg       0.74      0.71      0.72       231

Recall Score: 0.7375

/home/edutech/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.

4 Logistic Regression with Cross Validation


In [35]: from sklearn.linear_model import LogisticRegressionCV

         lr_cv_model = LogisticRegressionCV(n_jobs=-1, random_state=42, Cs=3, cv=10, refit=False,
                                            class_weight='balanced')
         lr_cv_model.fit(x_train, y_train.ravel())

Out[35]: LogisticRegressionCV(Cs=3, class_weight='balanced', cv=10, dual=False,


fit_intercept=True, intercept_scaling=1.0, max_iter=100,
multi_class='warn', n_jobs=-1, penalty='l2', random_state=42,
refit=False, scoring=None, solver='lbfgs', tol=0.0001,
verbose=0)

In [36]: lr_cv_predict_test = lr_cv_model.predict(x_test)

print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, lr_cv_predict_test)))


print("")
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, lr_cv_predict_test))
print("")
print("Classification Report:")
print(metrics.classification_report(y_test, lr_cv_predict_test))
print("")
print("Recall Score:")
print(metrics.recall_score(y_test, lr_cv_predict_test))

Accuracy: 0.7013

Confusion Matrix:
[[108 43]
[ 26 54]]

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.72      0.76       151
           1       0.56      0.68      0.61        80

   micro avg       0.70      0.70      0.70       231
   macro avg       0.68      0.70      0.68       231
weighted avg       0.72      0.70      0.71       231

Recall Score:
0.675

