In this post, I will do animal detection with YOLOv7.
The environment is a MacBook M1 Pro running a virtual machine.
The dataset used is animals.v2-release.voc.

The data came already split, with images and labels together in each split folder.
First, I converted the VOC XML files to YOLO txt files. (labels.cache is a cache file generated during training.)
<annotation>
    <folder></folder>
    <filename>1_jpg.rf.2e6cce3e7cfc0e62b404ba5af96a9c38.jpg</filename>
    <path>1_jpg.rf.2e6cce3e7cfc0e62b404ba5af96a9c38.jpg</path>
    <source>
        <database>roboflow.ai</database>
    </source>
    <size>
        <width>155</width>
        <height>178</height>
        <depth>3</depth>
    </size>
    <segmented>0</segmented>
    <object>
        <name>fox</name>
        <pose>Unspecified</pose>
        <truncated>0</truncated>
        <difficult>0</difficult>
        <occluded>0</occluded>
        <bndbox>
            <xmin>56</xmin>
            <xmax>156</xmax>
            <ymin>18</ymin>
            <ymax>175</ymax>
        </bndbox>
    </object>
</annotation>
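To make the conversion concrete, here is a worked example of the YOLO label line that the sample annotation above should produce. This is just a sketch of the same formula used in the script below; the class index 5 for fox comes from the label dictionary defined there.

# Worked example: convert the fox annotation above to one YOLO label line.
# YOLO expects normalized (class, x_center, y_center, width, height).
img_w, img_h = 155, 178
xmin, xmax, ymin, ymax = 56, 156, 18, 175

x_center = (xmin + xmax) / 2 / img_w   # 0.683871
y_center = (ymin + ymax) / 2 / img_h   # 0.542135
box_w = (xmax - xmin) / img_w          # 0.645161
box_h = (ymax - ymin) / img_h          # 0.882022

print(5, round(x_center, 6), round(y_center, 6), round(box_w, 6), round(box_h, 6))
# -> 5 0.683871 0.542135 0.645161 0.882022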
import os
import glob
import shutil

import cv2
from xml.etree.ElementTree import parse

# data_dir = "./animals.v2-release.voc/valid/"
# image_data = glob.glob(os.path.join(data_dir, "*.jpg"))

label_dict = {"background": 0,
              "cat": 1,
              "chicken": 2,
              "cow": 3,
              "dog": 4,
              "fox": 5,
              "goat": 6,
              "horse": 7,
              "person": 8,
              "racoon": 9,
              "skunk": 10,
              }
# cat, chicken, cow, dog, fox, goat, horse, person, racoon, skunk


class Voc_to_yolo_converter():
    def __init__(self, xml_paths):
        self.xml_path_list = glob.glob(os.path.join(xml_paths, "*.xml"))

    def get_voc_to_yolo(self, mode):
        for xml_path in self.xml_path_list:
            tree = parse(xml_path)
            root = tree.getroot()

            # get file name
            file_name = root.find('filename').text

            # get image size
            size_meta = root.findall('size')
            img_width = int(size_meta[0].find('width').text)
            img_height = int(size_meta[0].find('height').text)

            # object meta
            object_metas = root.findall('object')

            # get box info
            for object_meta in object_metas:
                # label name
                object_label = object_meta.find('name').text

                # bbox
                xmin = int(object_meta.find('bndbox').findtext('xmin'))
                xmax = int(object_meta.find('bndbox').findtext('xmax'))
                ymin = int(object_meta.find('bndbox').findtext('ymin'))
                ymax = int(object_meta.find('bndbox').findtext('ymax'))
                # print(object_label, xmin, ymin, xmax, ymax)

                # voc to yolo: normalized center x/y and width/height
                yolo_x = round(((xmin + xmax) / 2) / img_width, 6)
                yolo_y = round(((ymin + ymax) / 2) / img_height, 6)
                yolo_w = round((xmax - xmin) / img_width, 6)
                yolo_h = round((ymax - ymin) / img_height, 6)

                image_name_temp = file_name.replace(".jpg", ".txt")

                # txt file save folder
                os.makedirs(f"./animals.v2-release.voc/{mode}/labels", exist_ok=True)

                # label
                label = label_dict[object_label]

                # txt save (append one line per object)
                with open(f"./animals.v2-release.voc/{mode}/labels/{image_name_temp}", "a") as f:
                    f.write(f"{label} {yolo_x} {yolo_y} {yolo_w} {yolo_h}\n")


# move images into the images/ folder
def move_image(data, mode):
    for path in data:
        image_folder_path = f"./animals.v2-release.voc/{mode}/images"
        os.makedirs(image_folder_path, exist_ok=True)
        file_name = os.path.basename(path)  # works on macOS/Linux paths as well
        image_path = os.path.join(image_folder_path, file_name)
        shutil.move(path, image_path)


if __name__ == "__main__":
    data_dir = "./animals.v2-release.voc/valid/"
    image_data = glob.glob(os.path.join(data_dir, "*.jpg"))

    # test = Voc_to_yolo_converter(data_dir)
    # test.get_voc_to_yolo(mode="valid")

    move_image(image_data, mode="valid")
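The script imports cv2 but never uses it; one way to put it to work is a quick sanity check that draws the converted YOLO labels back onto an image. The snippet below is a minimal sketch under the same directory layout, and the file name is just the sample image from the annotation above; it is not part of the conversion pipeline.

# Sanity check (sketch): draw converted YOLO boxes back onto one validation image.
# The directory layout and the sample file name are assumptions from the steps above.
import cv2

mode = "valid"
name = "1_jpg.rf.2e6cce3e7cfc0e62b404ba5af96a9c38"
image = cv2.imread(f"./animals.v2-release.voc/{mode}/images/{name}.jpg")
h, w = image.shape[:2]

with open(f"./animals.v2-release.voc/{mode}/labels/{name}.txt") as f:
    for line in f:
        label, cx, cy, bw, bh = line.split()
        cx, cy, bw, bh = float(cx) * w, float(cy) * h, float(bw) * w, float(bh) * h
        # convert YOLO center/size back to pixel corner coordinates
        x1, y1 = int(cx - bw / 2), int(cy - bh / 2)
        x2, y2 = int(cx + bw / 2), int(cy + bh / 2)
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(image, label, (x1, max(y1 - 5, 10)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

cv2.imwrite("check_labels.jpg", image)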
Then I wrote data.yaml.
It is located at yolov7-main/data/data.yaml.
train: ./animals_dataset/train/images
val: ./animals_dataset/valid/images
test: ./animals_dataset/test/images
# number of classes
nc: 11
# classes
names: ["background",
"cat",
"chicken",
"cow",
"dog",
"fox",
"goat",
"horse",
"person",
"racoon",
"skunk"
]
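As a quick consistency check, the following sketch (assuming PyYAML is installed, which the YOLOv7 requirements already include) verifies that nc matches the number of class names and shows whether the split directories exist from the current working directory:

# Sketch: sanity-check data.yaml before training.
import os
import yaml

with open("yolov7-main/data/data.yaml") as f:
    data_cfg = yaml.safe_load(f)

# nc must match the number of class names
assert data_cfg["nc"] == len(data_cfg["names"]), "nc does not match the number of names"

# the split paths are relative, so check them from wherever training will be launched
for split in ("train", "val", "test"):
    print(split, data_cfg[split], os.path.isdir(data_cfg[split]))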
These are the hyperparameters I used.
lr0: 0.001 # initial learning rate (SGD=1E-2, Adam=1E-3)
lrf: 0.1 # final OneCycleLR learning rate (lr0 * lrf)
momentum: 0.937 # SGD momentum/Adam beta1
weight_decay: 0.0005 # optimizer weight decay 5e-4
warmup_epochs: 3.0 # warmup epochs (fractions ok)
warmup_momentum: 0.8 # warmup initial momentum
warmup_bias_lr: 0.1 # warmup initial bias lr
box: 0.05 # box loss gain
cls: 0.3 # cls loss gain
cls_pw: 1.0 # cls BCELoss positive_weight
obj: 0.7 # obj loss gain (scale with pixels)
obj_pw: 1.0 # obj BCELoss positive_weight
iou_t: 0.20 # IoU training threshold
anchor_t: 4.0 # anchor-multiple threshold
# anchors: 3 # anchors per output layer (0 to ignore)
fl_gamma: 0.0 # focal loss gamma (efficientDet default gamma=1.5)
hsv_h: 0.015 # image HSV-Hue augmentation (fraction)
hsv_s: 0.7 # image HSV-Saturation augmentation (fraction)
hsv_v: 0.4 # image HSV-Value augmentation (fraction)
degrees: 0.0 # image rotation (+/- deg)
translate: 0.2 # image translation (+/- fraction)
scale: 0.9 # image scale (+/- gain)
shear: 0.0 # image shear (+/- deg)
perspective: 0.0 # image perspective (+/- fraction), range 0-0.001
flipud: 0.0 # image flip up-down (probability)
fliplr: 0.5 # image flip left-right (probability)
mosaic: 1.0 # image mosaic (probability)
mixup: 0.15 # image mixup (probability)
copy_paste: 0.0 # image copy paste (probability)
paste_in: 0.15 # image copy paste (probability), use 0 for faster training
loss_ota: 1 # use ComputeLossOTA, use 0 for faster training



After training with the configuration above, I could confirm that the model detects the animals of each label well.