Data handling
Exploring the data with pandas
# Import the required libraries.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Fetch the dataset prepared on GitHub.
!git clone https://github.com/taehojo/data.git
Cloning into 'data'...
# Load the Pima Indians diabetes dataset.
df = pd.read_csv('./data/pima-indians-diabetes3.csv')
df.head()
| | pregnant | plasma | pressure | thickness | insulin | bmi | pedigree | age | diabetes |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
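Before digging in, it can help to confirm the column types and that no values are missing; the two lines below are not in the original notebook but use the same df.
# Quick structural check: 768 rows, 9 numeric columns, no NaN values expected.
df.info()
df.isnull().sum()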
df['diabetes'].value_counts()
0 500
1 268
Name: diabetes, dtype: int64
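Of the 768 samples, 500 are non-diabetic (0) and 268 are diabetic (1), roughly a 65% to 35% split. The same check as a ratio, as an optional sketch:
# Class balance as a fraction of all samples.
df['diabetes'].value_counts(normalize=True).round(2)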
# Print summary statistics for each column.
df.describe()
| | pregnant | plasma | pressure | thickness | insulin | bmi | pedigree | age | diabetes |
|---|---|---|---|---|---|---|---|---|---|
| count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
| mean | 3.845052 | 120.894531 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 | 0.348958 |
| std | 3.369578 | 31.972618 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 | 0.476951 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 |
| 25% | 1.000000 | 99.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 | 0.000000 |
| 50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
| 75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
| max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
Correlation
Since the goal is to predict whether a Pima Indian patient develops diabetes,
we pay particular attention to how the diabetes column (whether diabetes occurred) correlates with the other columns.
round(df.corr(),2)
| | pregnant | plasma | pressure | thickness | insulin | bmi | pedigree | age | diabetes |
|---|---|---|---|---|---|---|---|---|---|
| pregnant | 1.00 | 0.13 | 0.14 | -0.08 | -0.07 | 0.02 | -0.03 | 0.54 | 0.22 |
| plasma | 0.13 | 1.00 | 0.15 | 0.06 | 0.33 | 0.22 | 0.14 | 0.26 | 0.47 |
| pressure | 0.14 | 0.15 | 1.00 | 0.21 | 0.09 | 0.28 | 0.04 | 0.24 | 0.07 |
| thickness | -0.08 | 0.06 | 0.21 | 1.00 | 0.44 | 0.39 | 0.18 | -0.11 | 0.07 |
| insulin | -0.07 | 0.33 | 0.09 | 0.44 | 1.00 | 0.20 | 0.19 | -0.04 | 0.13 |
| bmi | 0.02 | 0.22 | 0.28 | 0.39 | 0.20 | 1.00 | 0.14 | 0.04 | 0.29 |
| pedigree | -0.03 | 0.14 | 0.04 | 0.18 | 0.19 | 0.14 | 1.00 | 0.03 | 0.17 |
| age | 0.54 | 0.26 | 0.24 | -0.11 | -0.04 | 0.04 | 0.03 | 1.00 | 0.24 |
| diabetes | 0.22 | 0.47 | 0.07 | 0.07 | 0.13 | 0.29 | 0.17 | 0.24 | 1.00 |
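Since only the relationship with diabetes matters here, that single column can be pulled out and sorted; the line below is an optional sketch using the same df.
# Correlation of each feature with the target, strongest first.
df.corr()['diabetes'].sort_values(ascending=False).round(2)
plasma and bmi show the strongest correlations with the target, which is why they are examined more closely below.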
colormap = plt.cm.gist_heat  # Choose the colormap for the heatmap.
plt.figure(figsize=(12, 10))  # Set the figure size.
sns.heatmap(df.corr(), linewidths=0.1, vmax=0.5, cmap=colormap, linecolor='white', annot=True)
plt.show()

Extracting the important features
# Histogram of plasma values for the normal and diabetic groups.
plt.hist(x=[df[df.diabetes==0]['plasma'], df[df.diabetes==1]['plasma']], bins=30, label=['normal', 'diabetes'])
plt.legend()
plt.show()

# Same plasma histogram, but with the two groups stacked.
plt.hist(x=[df[df.diabetes==0]['plasma'], df[df.diabetes==1]['plasma']], bins=30, histtype='barstacked', label=['normal', 'diabetes'])
plt.legend()
plt.show()

# Stacked histogram of bmi values for the normal and diabetic groups.
plt.hist(x=[df[df.diabetes==0]['bmi'], df[df.diabetes==1]['bmi']], bins=30, histtype='barstacked', label=['normal', 'diabetes'])
plt.legend()
plt.show()
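The histograms suggest that both plasma and bmi tend to be higher in the diabetic group. To put rough numbers on that impression, here is an optional sketch comparing the group means (same df as above):
# Mean plasma and bmi for each diabetes class.
df.groupby('diabetes')[['plasma', 'bmi']].mean().round(1)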

Running the Pima Indian diabetes prediction
# Import the machine learning (Keras) libraries.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# Fetch the dataset prepared on GitHub (already cloned above, so git just reports that).
!git clone https://github.com/taehojo/data.git
fatal: destination path 'data' already exists and is not an empty directory.
# Load the Pima Indians diabetes dataset.
df = pd.read_csv('./data/pima-indians-diabetes3.csv')
X = df.iloc[:, 0:8]  # Features (columns 0 through 7) go into X.
y = df.iloc[:, 8]    # The diabetes label (column 8) goes into y.
# Set up the model.
model = Sequential()
model.add(Dense(12, input_dim=8, activation='relu', name='Dense_1'))
model.add(Dense(8, activation='relu', name='Dense_2'))
model.add(Dense(1, activation='sigmoid', name='Dense_3'))
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
Dense_1 (Dense) (None, 12) 108
Dense_2 (Dense) (None, 8) 104
Dense_3 (Dense) (None, 1) 9
=================================================================
Total params: 221
Trainable params: 221
Non-trainable params: 0
_________________________________________________________________
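The Param # column is simply weights plus biases for each Dense layer: 8×12+12 = 108, 12×8+8 = 104, and 8×1+1 = 9, which sum to 221. A quick sanity check in plain Python:
# Recompute the parameter counts: inputs*units + units (bias) per Dense layer.
layers = [(8, 12), (12, 8), (8, 1)]
params = [n_in * n_out + n_out for n_in, n_out in layers]
print(params, sum(params))  # [108, 104, 9] 221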
# Compile the model.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train the model.
history = model.fit(X, y, epochs=100, batch_size=5)
Epoch 1/100
154/154 [==============================] - 3s 5ms/step - loss: 1.4316 - accuracy: 0.5859
Epoch 2/100
154/154 [==============================] - 1s 4ms/step - loss: 0.9464 - accuracy: 0.6198
Epoch 3/100
154/154 [==============================] - 1s 4ms/step - loss: 0.8328 - accuracy: 0.6589
Epoch 4/100
154/154 [==============================] - 1s 4ms/step - loss: 0.7819 - accuracy: 0.6849
Epoch 5/100
154/154 [==============================] - 1s 4ms/step - loss: 0.7723 - accuracy: 0.6823
Epoch 6/100
154/154 [==============================] - 1s 4ms/step - loss: 0.7317 - accuracy: 0.6745
Epoch 7/100
154/154 [==============================] - 1s 4ms/step - loss: 0.7479 - accuracy: 0.6628
Epoch 8/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6975 - accuracy: 0.6823
Epoch 9/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6726 - accuracy: 0.7135
Epoch 10/100
154/154 [==============================] - 1s 4ms/step - loss: 0.7023 - accuracy: 0.6888
Epoch 11/100
154/154 [==============================] - 1s 4ms/step - loss: 0.7281 - accuracy: 0.6849
Epoch 12/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6596 - accuracy: 0.6927
Epoch 13/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6803 - accuracy: 0.6823
Epoch 14/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6663 - accuracy: 0.6953
Epoch 15/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6445 - accuracy: 0.7044
Epoch 16/100
154/154 [==============================] - 1s 5ms/step - loss: 0.6444 - accuracy: 0.6875
Epoch 17/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6272 - accuracy: 0.6979
Epoch 18/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6452 - accuracy: 0.6875
Epoch 19/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6277 - accuracy: 0.7005
Epoch 20/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6343 - accuracy: 0.7018
Epoch 21/100
154/154 [==============================] - 1s 3ms/step - loss: 0.6115 - accuracy: 0.7174
Epoch 22/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6135 - accuracy: 0.7057
Epoch 23/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5833 - accuracy: 0.7083
Epoch 24/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5899 - accuracy: 0.7044
Epoch 25/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5826 - accuracy: 0.7188
Epoch 26/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5798 - accuracy: 0.7227
Epoch 27/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5617 - accuracy: 0.7174
Epoch 28/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6035 - accuracy: 0.7201
Epoch 29/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5758 - accuracy: 0.7188
Epoch 30/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5691 - accuracy: 0.7135
Epoch 31/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5973 - accuracy: 0.7188
Epoch 32/100
154/154 [==============================] - 1s 3ms/step - loss: 0.6078 - accuracy: 0.7057
Epoch 33/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5485 - accuracy: 0.7214
Epoch 34/100
154/154 [==============================] - 1s 3ms/step - loss: 0.6110 - accuracy: 0.7174
Epoch 35/100
154/154 [==============================] - 1s 3ms/step - loss: 0.6024 - accuracy: 0.7148
Epoch 36/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5678 - accuracy: 0.7174
Epoch 37/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5779 - accuracy: 0.7070
Epoch 38/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5524 - accuracy: 0.7305
Epoch 39/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5767 - accuracy: 0.7148
Epoch 40/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5720 - accuracy: 0.7201
Epoch 41/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5335 - accuracy: 0.7370
Epoch 42/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5865 - accuracy: 0.7174
Epoch 43/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5595 - accuracy: 0.7279
Epoch 44/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5475 - accuracy: 0.7201
Epoch 45/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5507 - accuracy: 0.7448
Epoch 46/100
154/154 [==============================] - 1s 5ms/step - loss: 0.5818 - accuracy: 0.7227
Epoch 47/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5629 - accuracy: 0.7188
Epoch 48/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5516 - accuracy: 0.7266
Epoch 49/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5475 - accuracy: 0.7396
Epoch 50/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5574 - accuracy: 0.7135
Epoch 51/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5501 - accuracy: 0.7318
Epoch 52/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5468 - accuracy: 0.7344
Epoch 53/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5508 - accuracy: 0.7344
Epoch 54/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5659 - accuracy: 0.7135
Epoch 55/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5476 - accuracy: 0.7214
Epoch 56/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5596 - accuracy: 0.7253
Epoch 57/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5302 - accuracy: 0.7370
Epoch 58/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5264 - accuracy: 0.7500
Epoch 59/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5513 - accuracy: 0.7331
Epoch 60/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5599 - accuracy: 0.7174
Epoch 61/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5705 - accuracy: 0.7214
Epoch 62/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5520 - accuracy: 0.7266
Epoch 63/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5375 - accuracy: 0.7383
Epoch 64/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5342 - accuracy: 0.7344
Epoch 65/100
154/154 [==============================] - 1s 5ms/step - loss: 0.5320 - accuracy: 0.7526
Epoch 66/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5284 - accuracy: 0.7409
Epoch 67/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5327 - accuracy: 0.7565
Epoch 68/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5282 - accuracy: 0.7357
Epoch 69/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5186 - accuracy: 0.7461
Epoch 70/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5337 - accuracy: 0.7513
Epoch 71/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5272 - accuracy: 0.7461
Epoch 72/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5322 - accuracy: 0.7656
Epoch 73/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5296 - accuracy: 0.7461
Epoch 74/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5300 - accuracy: 0.7370
Epoch 75/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5376 - accuracy: 0.7422
Epoch 76/100
154/154 [==============================] - 1s 5ms/step - loss: 0.5430 - accuracy: 0.7292
Epoch 77/100
154/154 [==============================] - 1s 5ms/step - loss: 0.5305 - accuracy: 0.7435
Epoch 78/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5299 - accuracy: 0.7305
Epoch 79/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5170 - accuracy: 0.7526
Epoch 80/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5277 - accuracy: 0.7409
Epoch 81/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5284 - accuracy: 0.7552
Epoch 82/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5331 - accuracy: 0.7344
Epoch 83/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5130 - accuracy: 0.7422
Epoch 84/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5421 - accuracy: 0.7292
Epoch 85/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5067 - accuracy: 0.7760
Epoch 86/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5174 - accuracy: 0.7604
Epoch 87/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5390 - accuracy: 0.7161
Epoch 88/100
154/154 [==============================] - 1s 5ms/step - loss: 0.5227 - accuracy: 0.7461
Epoch 89/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5132 - accuracy: 0.7513
Epoch 90/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5214 - accuracy: 0.7448
Epoch 91/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5108 - accuracy: 0.7409
Epoch 92/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5188 - accuracy: 0.7487
Epoch 93/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5093 - accuracy: 0.7565
Epoch 94/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5217 - accuracy: 0.7513
Epoch 95/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5299 - accuracy: 0.7318
Epoch 96/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5127 - accuracy: 0.7461
Epoch 97/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5124 - accuracy: 0.7448
Epoch 98/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5185 - accuracy: 0.7591
Epoch 99/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5068 - accuracy: 0.7565
Epoch 100/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5291 - accuracy: 0.7513
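Training accuracy ends up around 75% after 100 epochs. Because model.fit() returns a History object, the loss and accuracy curves can be plotted from history.history; the sketch below is not part of the original notebook but reuses the matplotlib import from above.
# Plot the training loss and accuracy recorded over the 100 epochs.
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['accuracy'], label='accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()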