In [1]:
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
In [2]:
data = pd.read_csv("https://raw.githubusercontent.com/chaitanyabaranwal/ParkinsonAnalysis/master/parkinsons.csv")
In [3]:
data.shape
Out[3]:
(195, 24)
In [4]:
data.head()
Out[4]:
name MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Jitter(%) MDVP:Jitter(Abs) MDVP:RAP MDVP:PPQ Jitter:DDP MDVP:Shimmer MDVP:Shimmer(dB) Shimmer:APQ3 Shimmer:APQ5 MDVP:APQ Shimmer:DDA NHR HNR RPDE DFA spread1 spread2 D2 PPE status
0 phon_R01_S01_1 119.992 157.302 74.997 0.00784 0.00007 0.00370 0.00554 0.01109 0.04374 0.426 0.02182 0.03130 0.02971 0.06545 0.02211 21.033 0.414783 0.815285 -4.813031 0.266482 2.301442 0.284654 1
1 phon_R01_S01_2 122.400 148.650 113.819 0.00968 0.00008 0.00465 0.00696 0.01394 0.06134 0.626 0.03134 0.04518 0.04368 0.09403 0.01929 19.085 0.458359 0.819521 -4.075192 0.335590 2.486855 0.368674 1
2 phon_R01_S01_3 116.682 131.111 111.555 0.01050 0.00009 0.00544 0.00781 0.01633 0.05233 0.482 0.02757 0.03858 0.03590 0.08270 0.01309 20.651 0.429895 0.825288 -4.443179 0.311173 2.342259 0.332634 1
3 phon_R01_S01_4 116.676 137.871 111.366 0.00997 0.00009 0.00502 0.00698 0.01505 0.05492 0.517 0.02924 0.04005 0.03772 0.08771 0.01353 20.644 0.434969 0.819235 -4.117501 0.334147 2.405554 0.368975 1
4 phon_R01_S01_5 116.014 141.781 110.655 0.01284 0.00011 0.00655 0.00908 0.01966 0.06425 0.584 0.03490 0.04825 0.04465 0.10470 0.01767 19.649 0.417356 0.823484 -3.747787 0.234513 2.332180 0.410335 1
In [5]:
features = data.loc[:,data.columns!='status'].values[:,1:]
In [6]:
features
Out[6]:
array([[119.992, 157.30200000000002, 74.997, ..., 0.266482,
        2.3014419999999998, 0.284654],
       [122.4, 148.65, 113.819, ..., 0.33559, 2.486855, 0.368674],
       [116.682, 131.111, 111.555, ..., 0.311173, 2.342259,
        0.33263400000000004],
       ...,
       [174.688, 240.005, 74.28699999999999, ..., 0.158453,
        2.6797720000000003, 0.13172799999999998],
       [198.764, 396.961, 74.904, ..., 0.207454, 2.138608, 0.123306],
       [214.28900000000002, 260.277, 77.973, ..., 0.190667, 2.555477,
        0.148569]], dtype=object)
In [7]:
labels = data.loc[:,'status'].values
In [8]:
labels
Out[8]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
In [9]:
labels[labels==1].shape[0]
Out[9]:
147
In [10]:
labels[labels==0].shape[0]
Out[10]:
48
In [11]:
scalar = MinMaxScaler((-1,1))
In [12]:
x = scalar.fit_transform(features)
y = labels
In [13]:
x
Out[13]:
array([[-0.63138346, -0.77481654, -0.89037042, ...,  0.17153026,
        -0.21867743, -0.0053808 ],
       [-0.6033463 , -0.81013911, -0.4433544 , ...,  0.48267409,
        -0.05370956,  0.34265204],
       [-0.66992292, -0.88174367, -0.46942324, ...,  0.37274182,
        -0.18236124,  0.19336492],
       ...,
       [ 0.00546073, -0.43717403, -0.89854572, ..., -0.31484696,
         0.11793486, -0.63884033],
       [ 0.28578581,  0.20361309, -0.89144127, ..., -0.09423055,
        -0.36355605, -0.67372646],
       [ 0.46654868, -0.35441175, -0.85610326, ..., -0.16981039,
         0.00734563, -0.5690805 ]])
In [14]:
y
Out[14]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
In [15]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=32,random_state=42)
In [16]:
from xgboost import XGBClassifier
In [17]:
xgb = XGBClassifier()
xgb.fit(x_train,y_train)
y_pred = xgb.predict(x_test)
In [18]:
accuracy_score(y_test,y_pred)
Out[18]:
0.96875