9 분 소요

판다스를 활용한 데이터 조사

# 필요한 라이브러리를 불러옵니다. 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Fetch the prepared dataset repository from GitHub (IPython shell escape).
!git clone https://github.com/taehojo/data.git
Cloning into 'data'...
# Load the Pima Indians diabetes dataset from the cloned repository.
df = pd.read_csv('./data/pima-indians-diabetes3.csv')
# Preview the first rows to check the column layout.
df.head()
pregnant plasma pressure thickness insulin bmi pedigree age diabetes
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
# Count how many samples fall into each value of the diabetes label column.
df['diabetes'].value_counts()
0    500
1    268
Name: diabetes, dtype: int64
# Print summary statistics for every column.
df.describe()
pregnant plasma pressure thickness insulin bmi pedigree age diabetes
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000

상관관계

이 데이터셋의 목적은 피마 인디언의 당뇨병 발병 여부를 예측하는 것이므로, diabetes(= 당뇨병 발병 여부) 컬럼이 다른 컬럼들과 갖는 상관관계를 눈여겨보기로 한다.

# Pairwise correlation between all columns, rounded to two decimals.
df.corr().round(2)
pregnant plasma pressure thickness insulin bmi pedigree age diabetes
pregnant 1.00 0.13 0.14 -0.08 -0.07 0.02 -0.03 0.54 0.22
plasma 0.13 1.00 0.15 0.06 0.33 0.22 0.14 0.26 0.47
pressure 0.14 0.15 1.00 0.21 0.09 0.28 0.04 0.24 0.07
thickness -0.08 0.06 0.21 1.00 0.44 0.39 0.18 -0.11 0.07
insulin -0.07 0.33 0.09 0.44 1.00 0.20 0.19 -0.04 0.13
bmi 0.02 0.22 0.28 0.39 0.20 1.00 0.14 0.04 0.29
pedigree -0.03 0.14 0.04 0.18 0.19 0.14 1.00 0.03 0.17
age 0.54 0.26 0.24 -0.11 -0.04 0.04 0.03 1.00 0.24
diabetes 0.22 0.47 0.07 0.07 0.13 0.29 0.17 0.24 1.00
# Draw the correlation matrix as an annotated heatmap.
colormap = plt.cm.gist_heat  # colour scheme used for the cells
plt.figure(figsize=(12, 10))  # size of the figure
sns.heatmap(
    df.corr(),
    annot=True,
    cmap=colormap,
    vmax=0.5,
    linewidths=0.1,
    linecolor='white',
)
plt.show()

png

중요한 데이터 추출하기

# Histogram of plasma, one series per diabetes label (0 -> 'normal', 1 -> 'diabetes').
plt.hist(
    x=[
        df.loc[df['diabetes'] == 0, 'plasma'],
        df.loc[df['diabetes'] == 1, 'plasma'],
    ],
    bins=30,
    label=['normal', 'diabetes'],
)
plt.legend()
<matplotlib.legend.Legend at 0x205f3831820>

png

# Same plasma histogram, with the two label groups stacked in each bin.
plt.hist(
    x=[
        df.loc[df['diabetes'] == 0, 'plasma'],
        df.loc[df['diabetes'] == 1, 'plasma'],
    ],
    bins=30,
    histtype='barstacked',
    label=['normal', 'diabetes'],
)
plt.legend()
<matplotlib.legend.Legend at 0x205f216abe0>

png

# Stacked histogram of bmi, split by diabetes label.
plt.hist(
    x=[
        df.loc[df['diabetes'] == 0, 'bmi'],
        df.loc[df['diabetes'] == 1, 'bmi'],
    ],
    bins=30,
    histtype='barstacked',
    label=['normal', 'diabetes'],
)
plt.legend()
<matplotlib.legend.Legend at 0x205f2256b50>

png

피마 인디언의 당뇨병 예측 실행

# 머신러닝 라이브러리를 불러옵니다.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# 깃허브에 준비된 데이터를 가져온다. 
!git clone https://github.com/taehojo/data.git

# 피마 인디언 당뇨병 데이터셋을 불러온다.
df = pd.read_csv('./data/pima-indians-diabetes3.csv')

X = df.iloc[:, 0:8] # 세부 정보(0번째 컬럼~7번째 컬럼)를 X로 지정한다. 
y = df.iloc[:, 8] # 당뇨병 여부(8번째 컬럼)를 y로 지정한다. 
fatal: destination path 'data' already exists and is not an empty directory.
# Define the model: three Dense layers added one after another.
model = Sequential()
# First layer: takes the 8 feature columns and has 12 units with ReLU.
model.add(Dense(12, input_dim=8, activation='relu', name='Dense_1'))
# Second layer: 8 units with ReLU.
model.add(Dense(8, activation='relu', name='Dense_2'))
# Output layer: a single sigmoid unit.
model.add(Dense(1, activation='sigmoid', name='Dense_3'))
# Print the layer shapes and parameter counts.
model.summary()
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 Dense_1 (Dense)             (None, 12)                108       
                                                                 
 Dense_2 (Dense)             (None, 8)                 104       
                                                                 
 Dense_3 (Dense)             (None, 1)                 9         
                                                                 
=================================================================
Total params: 221
Trainable params: 221
Non-trainable params: 0
_________________________________________________________________
# Compile the model: binary cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 100 epochs with a batch size of 5.
# No validation data is passed here, so the accuracy printed per epoch is
# computed on the same X, y used for training.
history = model.fit(X, y, epochs=100, batch_size=5)
Epoch 1/100
154/154 [==============================] - 3s 5ms/step - loss: 1.4316 - accuracy: 0.5859
Epoch 2/100
154/154 [==============================] - 1s 4ms/step - loss: 0.9464 - accuracy: 0.6198
Epoch 3/100
154/154 [==============================] - 1s 4ms/step - loss: 0.8328 - accuracy: 0.6589
Epoch 4/100
154/154 [==============================] - 1s 4ms/step - loss: 0.7819 - accuracy: 0.6849
Epoch 5/100
154/154 [==============================] - 1s 4ms/step - loss: 0.7723 - accuracy: 0.6823
Epoch 6/100
154/154 [==============================] - 1s 4ms/step - loss: 0.7317 - accuracy: 0.6745
Epoch 7/100
154/154 [==============================] - 1s 4ms/step - loss: 0.7479 - accuracy: 0.6628
Epoch 8/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6975 - accuracy: 0.6823
Epoch 9/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6726 - accuracy: 0.7135
Epoch 10/100
154/154 [==============================] - 1s 4ms/step - loss: 0.7023 - accuracy: 0.6888
Epoch 11/100
154/154 [==============================] - 1s 4ms/step - loss: 0.7281 - accuracy: 0.6849
Epoch 12/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6596 - accuracy: 0.6927
Epoch 13/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6803 - accuracy: 0.6823
Epoch 14/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6663 - accuracy: 0.6953
Epoch 15/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6445 - accuracy: 0.7044
Epoch 16/100
154/154 [==============================] - 1s 5ms/step - loss: 0.6444 - accuracy: 0.6875
Epoch 17/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6272 - accuracy: 0.6979
Epoch 18/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6452 - accuracy: 0.6875
Epoch 19/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6277 - accuracy: 0.7005
Epoch 20/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6343 - accuracy: 0.7018
Epoch 21/100
154/154 [==============================] - 1s 3ms/step - loss: 0.6115 - accuracy: 0.7174
Epoch 22/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6135 - accuracy: 0.7057
Epoch 23/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5833 - accuracy: 0.7083
Epoch 24/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5899 - accuracy: 0.7044
Epoch 25/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5826 - accuracy: 0.7188
Epoch 26/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5798 - accuracy: 0.7227
Epoch 27/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5617 - accuracy: 0.7174
Epoch 28/100
154/154 [==============================] - 1s 4ms/step - loss: 0.6035 - accuracy: 0.7201
Epoch 29/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5758 - accuracy: 0.7188
Epoch 30/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5691 - accuracy: 0.7135
Epoch 31/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5973 - accuracy: 0.7188
Epoch 32/100
154/154 [==============================] - 1s 3ms/step - loss: 0.6078 - accuracy: 0.7057
Epoch 33/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5485 - accuracy: 0.7214
Epoch 34/100
154/154 [==============================] - 1s 3ms/step - loss: 0.6110 - accuracy: 0.7174
Epoch 35/100
154/154 [==============================] - 1s 3ms/step - loss: 0.6024 - accuracy: 0.7148
Epoch 36/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5678 - accuracy: 0.7174
Epoch 37/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5779 - accuracy: 0.7070
Epoch 38/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5524 - accuracy: 0.7305
Epoch 39/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5767 - accuracy: 0.7148
Epoch 40/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5720 - accuracy: 0.7201
Epoch 41/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5335 - accuracy: 0.7370
Epoch 42/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5865 - accuracy: 0.7174
Epoch 43/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5595 - accuracy: 0.7279
Epoch 44/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5475 - accuracy: 0.7201
Epoch 45/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5507 - accuracy: 0.7448
Epoch 46/100
154/154 [==============================] - 1s 5ms/step - loss: 0.5818 - accuracy: 0.7227
Epoch 47/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5629 - accuracy: 0.7188
Epoch 48/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5516 - accuracy: 0.7266
Epoch 49/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5475 - accuracy: 0.7396
Epoch 50/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5574 - accuracy: 0.7135
Epoch 51/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5501 - accuracy: 0.7318
Epoch 52/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5468 - accuracy: 0.7344
Epoch 53/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5508 - accuracy: 0.7344
Epoch 54/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5659 - accuracy: 0.7135
Epoch 55/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5476 - accuracy: 0.7214
Epoch 56/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5596 - accuracy: 0.7253
Epoch 57/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5302 - accuracy: 0.7370
Epoch 58/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5264 - accuracy: 0.7500
Epoch 59/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5513 - accuracy: 0.7331
Epoch 60/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5599 - accuracy: 0.7174
Epoch 61/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5705 - accuracy: 0.7214
Epoch 62/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5520 - accuracy: 0.7266
Epoch 63/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5375 - accuracy: 0.7383
Epoch 64/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5342 - accuracy: 0.7344
Epoch 65/100
154/154 [==============================] - 1s 5ms/step - loss: 0.5320 - accuracy: 0.7526
Epoch 66/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5284 - accuracy: 0.7409
Epoch 67/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5327 - accuracy: 0.7565
Epoch 68/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5282 - accuracy: 0.7357
Epoch 69/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5186 - accuracy: 0.7461
Epoch 70/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5337 - accuracy: 0.7513
Epoch 71/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5272 - accuracy: 0.7461
Epoch 72/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5322 - accuracy: 0.7656
Epoch 73/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5296 - accuracy: 0.7461
Epoch 74/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5300 - accuracy: 0.7370
Epoch 75/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5376 - accuracy: 0.7422
Epoch 76/100
154/154 [==============================] - 1s 5ms/step - loss: 0.5430 - accuracy: 0.7292
Epoch 77/100
154/154 [==============================] - 1s 5ms/step - loss: 0.5305 - accuracy: 0.7435
Epoch 78/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5299 - accuracy: 0.7305
Epoch 79/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5170 - accuracy: 0.7526
Epoch 80/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5277 - accuracy: 0.7409
Epoch 81/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5284 - accuracy: 0.7552
Epoch 82/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5331 - accuracy: 0.7344
Epoch 83/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5130 - accuracy: 0.7422
Epoch 84/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5421 - accuracy: 0.7292
Epoch 85/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5067 - accuracy: 0.7760
Epoch 86/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5174 - accuracy: 0.7604
Epoch 87/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5390 - accuracy: 0.7161
Epoch 88/100
154/154 [==============================] - 1s 5ms/step - loss: 0.5227 - accuracy: 0.7461
Epoch 89/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5132 - accuracy: 0.7513
Epoch 90/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5214 - accuracy: 0.7448
Epoch 91/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5108 - accuracy: 0.7409
Epoch 92/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5188 - accuracy: 0.7487
Epoch 93/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5093 - accuracy: 0.7565
Epoch 94/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5217 - accuracy: 0.7513
Epoch 95/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5299 - accuracy: 0.7318
Epoch 96/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5127 - accuracy: 0.7461
Epoch 97/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5124 - accuracy: 0.7448
Epoch 98/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5185 - accuracy: 0.7591
Epoch 99/100
154/154 [==============================] - 1s 3ms/step - loss: 0.5068 - accuracy: 0.7565
Epoch 100/100
154/154 [==============================] - 1s 4ms/step - loss: 0.5291 - accuracy: 0.7513

업데이트: