Multiclass Classification (Reuters Newswire Classification)

진박사. 2021. 4. 26. 13:07
from keras.datasets import reuters

(train_data, train_labels), (test_data, test_labels) = reuters.load_data(num_words=10000)  # keep only the 10,000 most frequent words

# (Optional) Decode the first newswire back into words for inspection
'''
word_index = reuters.get_word_index()
reverse_word_index = dict(
    [(value, key) for (key, value) in word_index.items()])
decoded_review = ' '.join(
    [reverse_word_index.get(i - 3, '?') for i in train_data[0]])
'''

import numpy as np

# Preprocessing
def vectorize_sequence(sequences, dimension=10000):  # one-hot (multi-hot) encoding
    results = np.zeros((len(sequences), dimension))  # create a len(sequences) x dimension matrix filled with zeros
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1
    return results

x_train = vectorize_sequence(train_data)
x_test = vectorize_sequence(test_data)
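# Quick sanity check (illustrative addition, not in the original post):
# each encoded sample is a 10,000-dimensional vector with 1s at the indices
# of the words it contains.
print(x_train.shape)     # (number of training samples, 10000)
print(x_train[0].sum())  # number of distinct word indices in the first newswire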
# Label preprocessing
def to_one_hot(labels, dimension=46):  # one-hot encode the labels
    results = np.zeros((len(labels), dimension))  # create a len(labels) x dimension matrix filled with zeros
    for i, label in enumerate(labels):
        results[i, label] = 1
    return results
x_train_labels = to_one_hot(train_labels)
x_test_labels = to_one_hot(test_labels)
# Can be replaced with Keras' to_categorical function
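# As noted above, the same encoding can be done with Keras' built-in helper
# (a minimal equivalent sketch):
# from keras.utils import to_categorical
# x_train_labels = to_categorical(train_labels)
# x_test_labels = to_categorical(test_labels)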

from keras import models
from keras import layers

model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(10000,)))  # with 46 output classes, hidden layers should be wider than 46
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(46, activation='softmax'))
# softmax works well for multiclass classification - a 46-dimensional output vector whose 46 values sum to 1.0
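# Tiny numeric illustration of softmax (added sketch, not from the original post):
# larger scores get larger probabilities, and the probabilities sum to 1.
scores = np.array([2.0, 1.0, 0.1])
probs = np.exp(scores) / np.sum(np.exp(scores))
print(probs, probs.sum())  # approximately [0.66 0.24 0.10] 1.0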

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

# Split into validation and training sets
x_val = x_train[:1000]
x_train = x_train[1000:]
y_val = x_train_labels[:1000]
y_train = x_train_labels[1000:]


history = model.fit(x_train,
                    y_train,
                    #epochs=20  # overfits
                    epochs=9,
                    batch_size=512,
                    validation_data=(x_val, y_val))


results = model.evaluate(x_test, x_test_labels)  # returns [test loss, test accuracy]
print('Accuracy : ', results[1]*100, '%')
#predictions = model.predict(x_test)
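# Example use of the commented-out predictions above (illustrative sketch):
# each row of model.predict(x_test) is a 46-dimensional probability
# distribution over topics, so np.argmax gives the most likely topic.
# predictions = model.predict(x_test)
# print(predictions[0].sum())       # ~1.0 (softmax output)
# print(np.argmax(predictions[0]))  # predicted topic index for the first test newswire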

# Visualize the results
import matplotlib.pyplot as plt

history_dict = history.history
loss = history_dict['loss']
val_loss = history_dict['val_loss']
acc = history_dict['acc']
val_acc = history_dict['val_acc']

epochs = range(1, len(loss) + 1)

plt.subplot(121)
plt.plot(epochs, loss, 'bo', label='Training loss')        # blue dots
plt.plot(epochs, val_loss, 'b', label='Validation loss')   # solid blue line
plt.title('Training & Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(122)
plt.plot(epochs, acc, 'ro', label='Training Accuracy')        # red dots
plt.plot(epochs, val_acc, 'r', label='Validation Accuracy')   # solid red line
plt.title('Training & Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

This post was written with reference to the book 케라스 창시자로부터 배우는 딥러닝 (Deep Learning with Python).