헬스장 기구 분류 AI 진행과정

CoderHan 2023. 6. 15. 19:28

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Configure a Korean-capable font so that Hangul in plot titles can be rendered.
fontpath = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=10)
try:
    # Register the font file with matplotlib's font manager so that it can be
    # selected by family name below even if it is not in the font cache yet.
    fm.fontManager.addfont(fontpath)
except OSError as err:
    # Best effort: when the file is missing, matplotlib keeps using its fallback font.
    print(f'Could not register font {fontpath}: {err}')
plt.rc('font', family='NanumGothic')
matplotlib.matplotlib_fname()  # path of the active matplotlibrc; the value is not used here
print(plt.rcParams['font.family'])

먼저 Data 시각화에 사용할 matplotlib와 모듈, 한글폰트 등을 설정해주었다.

!git clone https://github.com/ndb796/bing_image_downloader # library needed for image crawling

import os
import shutil
from bing_image_downloader.bing_image_downloader import downloader


# Root directories of the training and evaluation datasets.
directory_list = [
    './custom_dataset/train/',
    './custom_dataset/test/',
]

# Make sure every dataset root exists; directories that are already there are left untouched.
for directory in directory_list:
    os.makedirs(directory, exist_ok=True)

# Split the images collected for one query into training and evaluation data.
def dataset_split(query, train_cnt):
    """Move the downloaded images of ``query`` into the train/test dataset folders.

    The images are read from the directory named ``query`` (relative to the
    current working directory). In sorted file-name order, the first
    ``train_cnt`` files are moved to ``./custom_dataset/train/<query>/`` and
    the remaining files to ``./custom_dataset/test/<query>/``. The source
    directory is removed afterwards.

    Args:
        query: Search term; used as the source directory name and as the
            per-class sub-directory name.
        train_cnt: Number of files that go to the training set.
    """
    # Create the per-class directory under every dataset root.
    for directory in directory_list:
        os.makedirs(os.path.join(directory, query), exist_ok=True)

    train_dir = os.path.join('./custom_dataset/train/', query)
    test_dir = os.path.join('./custom_dataset/test/', query)

    # os.listdir() returns entries in arbitrary order; sorting makes the split reproducible.
    for cnt, file_name in enumerate(sorted(os.listdir(query))):
        if cnt < train_cnt:
            print(f'[Train Dataset] {file_name}')
            target_dir = train_dir
        else:
            print(f'[Test Dataset] {file_name}')
            target_dir = test_dir
        shutil.move(os.path.join(query, file_name), os.path.join(target_dir, file_name))

    # Every file has been moved out, so drop the download directory.
    shutil.rmtree(query)

이미지 크롤링에 필요한 bingDownloader를 다운로드 해주고 dataset을 담을 폴더의 경로들을 directory_list변수에 담았다.

os를 사용하여 해당 경로에 폴더가 없으면 생성해주는 for문을 작성해준다.

dataset_split함수는 query와 train_cnt를 인자로 받는다. 이 train_cnt는 추후에 몇 개의 데이터를 학습 데이터로 사용할 것인지 결정하는 매개변수이다.

우리는 헬스기구별로 구분할 것이기 때문에 query(헬스기구)에 맞는 폴더를 각각 생성해준다.

그리고 각각 testData와 trainData를 나눠서 각각의 폴더에 담아준다.

여기서 사용한 shutil은 고수준 파일 연산을 지원하는 파이썬 표준 라이브러리인데, 이걸로 파일을 이동하고 원래 있던 폴더를 삭제한다.

이제 이미지를 크롤링해보자

# Search term to crawl; dataset_split() also uses it as the source folder and class folder name.
query = '벤치프레스 머신'
# Download images for the query (limit=40) with output_dir set to the current directory.
downloader.download(query, limit=40,  output_dir='./', adult_filter_off=True, force_replace=False, timeout=60)
# Move the first 30 files to the training set and the remaining ones to the test set.
dataset_split(query, 30)

쿼리는 크롤링할 검색어이다. download의 파라미터를 살펴보면 query는 검색어, limit은 받을 이미지 개수이다. 나머지는 크게 중요하지 않다. 그리고 위에서 선언한 dataset_split 함수로 30개는 train data로, 나머지는 test data로 분류해준다.

import torch
import torch.nn as nn
import torch.optim as optim

import torchvision
from torchvision import datasets, models, transforms

import numpy as np
import time


# Device object: the first CUDA GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

신경망 모델을 지원하는 torch로 학습을 진행할 예정이다.

# Delete every .ipynb_checkpoints directory below the current directory.
# Paths are passed NUL-delimited so that folder names containing spaces
# (e.g. the class folder '벤치프레스 머신') are not split into separate arguments.
!find . -type d -name .ipynb_checkpoints -prune -print0 | xargs -0 -r rm -rf
# Define the transformation pipelines that are applied when the datasets are loaded
transforms_train = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(), # data augmentation
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) # normalization (per-channel mean, std)
])

# Same pipeline for the test data, but without the random flip
transforms_test = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Build the datasets from the train/ and test/ sub-folders of the dataset root
data_dir = './custom_dataset'
train_datasets = datasets.ImageFolder(os.path.join(data_dir, 'train'),transforms_train)
test_datasets = datasets.ImageFolder(os.path.join(data_dir, 'test'), transforms_test)

# Mini-batches of 3 images, shuffled, loaded by 3 worker processes
# NOTE(review): shuffling the test loader is not required for evaluation — confirm it is intended
train_dataloader = torch.utils.data.DataLoader(train_datasets, batch_size=3, shuffle=True, num_workers=3)
test_dataloader = torch.utils.data.DataLoader(test_datasets, batch_size=3, shuffle=True, num_workers=3)

print('학습 데이터셋 크기:', len(train_datasets))
print('테스트 데이터셋 크기:', len(test_datasets))

# Class names as reported by the training dataset
class_names = train_datasets.classes
print('클래스:', class_names)

우리가 수집한 데이터는 우리가 원하는 모습이 아닐 수 있기 때문에 transform을 사용해 우리가 원하는 이미지로

변형 작업을 수행한다. 사이즈는224x224크기로 변경하고 randomHorizontalFlip은 랜덤으로 이미지를 좌우반전 시키는

함수인데 같은 데이터의 여러 모습을 학습할 수 있어 좋다. ToTensor로 픽셀값들을 0~1사이로 변환한다.

그리고 마지막으로 이미지 정규화를 해줄 건데, 이미지 정규화는 각 채널의 평균을 뺀 뒤에 표준편차로 나눠 정규화를 진행하는 것이다. 이 값에 따라서 ToTensor로 0~1 사이의 범위로 만들어진 값들이 -1~1 등 다른 범위로 바뀔 수 있다.

정규화를 하는 이유는 동일한 환경으로 어느정도 맞추는 개념이라고 할 수 있는데

ImageNet 데이터셋에 포함된 수백만 장의 이미지에서 계산한 RGB 각 채널의 평균은 0.485, 0.456, 0.406 그리고 표준편차는 0.229, 0.224, 0.225 이기 때문에 이 값을 사용해주었다.

torchvision의 datasets.ImageFolder는 이미지가 담긴 폴더를 그대로 dataset으로 이용할 수 있게 해준다.

그리고 DataLoader로 train과 test 데이터셋을 각각 불러온다. 모델을 학습할 때 데이터를 미니배치 단위로 전달하고, 매 에폭마다 순서를 섞어서 과적합을 방지한다.

def imshow(input, title):
    """Show a normalized image tensor with matplotlib.

    Args:
        input: torch.Tensor holding the image; its axes are reordered with
            ``transpose((1, 2, 0))`` before plotting (assumes a
            channels-first layout — confirm against callers).
        title: Title placed above the plot.
    """
    channel_mean = np.array([0.485, 0.456, 0.406])
    channel_std = np.array([0.229, 0.224, 0.225])

    # Convert the tensor to a NumPy array and move the first axis to the end.
    pixels = input.numpy().transpose((1, 2, 0))
    # Undo the normalization, then clamp the values to the [0, 1] range.
    pixels = np.clip(pixels * channel_std + channel_mean, 0, 1)

    # Draw the image.
    plt.imshow(pixels)
    plt.title(title)
    plt.show()


# Load the training data one batch at a time
iterator = iter(train_dataloader)

# Build a grid image from the current batch and visualize it
inputs, classes = next(iterator)
len(inputs)  # number of images in the batch; the value is discarded here
out = torchvision.utils.make_grid(inputs)
# Use the class name of every image in the batch as the plot title
imshow(out, title=[class_names[x] for x in classes])

iterator로 학습 데이터를 배치단위로 불러오고 이를 시각화하여 학습 데이터를 확인합니다

클래스에 해당하는 이미지가 제대로 들어왔음을 확인할 수 있습니다.

# Load ResNet-34 with pretrained weights.
# NOTE(review): newer torchvision releases prefer the `weights=` argument over
# `pretrained=True` — confirm against the installed version.
model = models.resnet34(pretrained=True)
num_features = model.fc.in_features
# Transfer learning: replace the final fully connected layer so that it has one
# output per dataset class (taken from class_names instead of a hard-coded 3).
model.fc = nn.Linear(num_features, len(class_names))
model = model.to(device)

criterion = nn.CrossEntropyLoss()
# All model parameters are handed to the optimizer, not only the new last layer.
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

우리가 사용할 모델을 불러옵니다. 모델은 resnet34이고 pretrained 속성은 사전학습을 의미합니다.

resnet34는 이미지 인식을 위한 딥러닝 모델이며 파이토치에서 제공합니다.

전이학습을 할 예정이기 때문에 마지막 부분에 출력 뉴런 수를 3개로 교체하여 마지막 레이어를 다시 학습합니다.

nn.Linear의 첫 번째 인자는 입력 특징(feature)의 수, 두 번째 인자는 출력 특징의 수(여기서는 분류할 클래스의 수)를 의미합니다.

손실은 크로스 엔트로피 손실함수를 사용하고

optim.SGD는 확률적 경사하강법(Stochastic Gradient Descent)으로, 한 번에 들어오는 미니배치 단위로 경사하강법을 적용하여 모델의 파라미터를 최적화하는 옵티마이저이다.

# Evaluation phase: run the model over the test set and report loss and accuracy.
model.eval()  # switch layers such as Dropout/BatchNorm to evaluation behaviour
start_time = time.time()

with torch.no_grad():  # gradients are not needed for evaluation
    running_loss = 0.
    running_corrects = 0

    for inputs, labels in test_dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)  # index of the largest output = predicted class
        loss = criterion(outputs, labels)

        # The loss is a batch mean, so weight it by the batch size before summing.
        running_loss += loss.item() * inputs.size(0)
        # Accumulate a plain Python int so the accuracy below does not depend on
        # tensor-by-int division semantics.
        running_corrects += (preds == labels).sum().item()

        # Visualize the result for the first image of each batch.
        predicted_name = class_names[preds[0].item()]
        actual_name = class_names[labels[0].item()]
        print(f'[예측 결과: {predicted_name}] (실제 정답: {actual_name})')
        imshow(inputs.cpu()[0], title='예측 결과: ' + predicted_name)

    epoch_loss = running_loss / len(test_datasets)
    epoch_acc = running_corrects / len(test_datasets) * 100.
    print('[Test Phase] Loss: {:.4f} Acc: {:.4f}% Time: {:.4f}s'.format(epoch_loss, epoch_acc, time.time() - start_time))

모델을 평가하는 단계이다.

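eval과 no_grad를 둘 다 사용하는 이유는 서로 역할이 다르기 때문이다. model.eval()은 Dropout, BatchNorm 같은 레이어를 평가 모드로 바꿔주고, torch.no_grad()는 기울기 계산을 꺼서 메모리 사용을 줄이고 연산 속도를 빠르게 해준다.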

이런 식으로 이미지를 잘 예측하고 있음을 알 수 있다.

분류하는 클래스의 크기도 작기 때문에 빠르고 높은 결과를 얻을 수 있었다.

이제 이 모델의 예측결과를 활용해 youtube API와 연동하여 효과적인 정보를 전달하는 과정이 남아있다.

오늘은 여기까지!

저작자표시 변경금지 (새창열림)