Deep Learning and Object Detection Tutorial Series 10-300: Training Your First Faster R-CNN Model with PyTorch

Published: 2024/10/8

@author:runsen

Last time we introduced the Faster R-CNN model; today we will train our first Faster R-CNN model.

This article shows how to use a Faster R-CNN model on a fruit image dataset.

The code is inspired by the PyTorch documentation tutorial and a Kaggle notebook:

  • https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html
  • https://www.kaggle.com/yerramvarun/fine-tuning-faster-rcnn-using-pytorch/

This is the best Faster R-CNN tutorial I have seen so far.

Dataset source: https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection

Since much of the object detection training code is the same for every project and would otherwise have to be written by hand, torchvision provides reference scripts that we can simply clone and copy into the working directory.

git clone https://github.com/pytorch/vision.git
cd vision
git checkout v0.3.0
cd ..
cp vision/references/detection/utils.py ./
cp vision/references/detection/transforms.py ./
cp vision/references/detection/coco_eval.py ./
cp vision/references/detection/engine.py ./
cp vision/references/detection/coco_utils.py ./

In the downloaded dataset, the train and test folders contain the .jpg images together with their matching .xml (Pascal VOC style) annotation files.
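As a quick sanity check (a minimal sketch; the directory path assumes the Kaggle layout used later in this article), you can verify that every image has a matching annotation file:

import os

# path assumes the Kaggle dataset layout used later in this article
files_dir = '../input/fruit-images-for-object-detection/train_zip/train'

jpgs = {f[:-4] for f in os.listdir(files_dir) if f.endswith('.jpg')}
xmls = {f[:-4] for f in os.listdir(files_dir) if f.endswith('.xml')}

print(len(jpgs), 'images,', len(xmls), 'annotations')
print('images without annotations:', sorted(jpgs - xmls))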

import os
import numpy as np
import cv2
import torch
import matplotlib.patches as patches
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
from matplotlib import pyplot as plt
from torch.utils.data import Dataset
from xml.etree import ElementTree as et
from torchvision import transforms as torchtrans


class FruitImagesDataset(torch.utils.data.Dataset):
    def __init__(self, files_dir, width, height, transforms=None):
        self.transforms = transforms
        self.files_dir = files_dir
        self.height = height
        self.width = width
        self.imgs = [image for image in sorted(os.listdir(files_dir))
                     if image[-4:] == '.jpg']
        # index 0 is the background placeholder class
        self.classes = ['_', 'apple', 'banana', 'orange']

    def __getitem__(self, idx):
        img_name = self.imgs[idx]
        image_path = os.path.join(self.files_dir, img_name)

        # reading the image and converting it to the correct size and color
        img = cv2.imread(image_path)
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
        img_res = cv2.resize(img_rgb, (self.width, self.height), interpolation=cv2.INTER_AREA)
        # dividing by 255
        img_res /= 255.0

        # annotation file
        annot_filename = img_name[:-4] + '.xml'
        annot_file_path = os.path.join(self.files_dir, annot_filename)

        boxes = []
        labels = []
        tree = et.parse(annot_file_path)
        root = tree.getroot()

        # cv2 image gives size as height x width
        wt = img.shape[1]
        ht = img.shape[0]

        # box coordinates in the xml files are extracted and rescaled to the resized image
        for member in root.findall('object'):
            labels.append(self.classes.index(member.find('name').text))

            # bounding box
            xmin = int(member.find('bndbox').find('xmin').text)
            xmax = int(member.find('bndbox').find('xmax').text)
            ymin = int(member.find('bndbox').find('ymin').text)
            ymax = int(member.find('bndbox').find('ymax').text)

            xmin_corr = (xmin / wt) * self.width
            xmax_corr = (xmax / wt) * self.width
            ymin_corr = (ymin / ht) * self.height
            ymax_corr = (ymax / ht) * self.height

            boxes.append([xmin_corr, ymin_corr, xmax_corr, ymax_corr])

        # convert boxes into a torch.Tensor
        boxes = torch.as_tensor(boxes, dtype=torch.float32)

        # getting the areas of the boxes
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        # suppose all instances are not crowd
        iscrowd = torch.zeros((boxes.shape[0],), dtype=torch.int64)

        labels = torch.as_tensor(labels, dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["area"] = area
        target["iscrowd"] = iscrowd
        # image_id
        image_id = torch.tensor([idx])
        target["image_id"] = image_id

        if self.transforms:
            sample = self.transforms(image=img_res,
                                     bboxes=target['boxes'],
                                     labels=labels)
            img_res = sample['image']
            target['boxes'] = torch.tensor(sample['bboxes'])

        return img_res, target

    def __len__(self):
        return len(self.imgs)


def torch_to_pil(img):
    return torchtrans.ToPILImage()(img).convert('RGB')


def plot_img_bbox(img, target):
    fig, a = plt.subplots(1, 1)
    fig.set_size_inches(5, 5)
    a.imshow(img)
    for box in target['boxes']:
        x, y, width, height = box[0], box[1], box[2] - box[0], box[3] - box[1]
        rect = patches.Rectangle((x, y),
                                 width, height,
                                 linewidth=2,
                                 edgecolor='r',
                                 facecolor='none')
        a.add_patch(rect)
    plt.show()


def get_transform(train):
    if train:
        return A.Compose([
            A.HorizontalFlip(0.5),
            ToTensorV2(p=1.0)
        ], bbox_params={'format': 'pascal_voc', 'label_fields': ['labels']})
    else:
        return A.Compose([
            ToTensorV2(p=1.0)
        ], bbox_params={'format': 'pascal_voc', 'label_fields': ['labels']})


files_dir = '../input/fruit-images-for-object-detection/train_zip/train'
test_dir = '../input/fruit-images-for-object-detection/test_zip/test'

dataset = FruitImagesDataset(files_dir, 480, 480)

img, target = dataset[78]
print(img.shape, '\n', target)
# no transforms were passed, so img is still a NumPy array and can be plotted directly
plot_img_bbox(img, target)

The output prints the image tensor shape and the target dict, and shows the image with its ground-truth bounding box drawn.

In torchvision, the Faster R-CNN classification head is imported with from torchvision.models.detection.faster_rcnn import FastRCNNPredictor.

import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor


def get_object_detection_model(num_classes):
    # load a model pre-trained on COCO (the corresponding weights are downloaded)
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)

    # number of input features of the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features

    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    return model

Data augmentation for object detection differs from ordinary image augmentation, because here we must make sure the bounding boxes still align correctly with the objects after each transform.

Here, a random horizontal flip transform is added for random image augmentation during training.

def get_transform(train):
    if train:
        return A.Compose([
            A.HorizontalFlip(0.5),
            ToTensorV2(p=1.0)
        ], bbox_params={'format': 'pascal_voc', 'label_fields': ['labels']})
    else:
        return A.Compose([
            ToTensorV2(p=1.0)
        ], bbox_params={'format': 'pascal_voc', 'label_fields': ['labels']})

Now let's prepare the datasets and data loaders for training and testing.

import utils  # from the torchvision reference scripts copied earlier

dataset = FruitImagesDataset(files_dir, 480, 480, transforms=get_transform(train=True))
dataset_test = FruitImagesDataset(files_dir, 480, 480, transforms=get_transform(train=False))

# split the dataset into train and test set
torch.manual_seed(1)
indices = torch.randperm(len(dataset)).tolist()

# train/test split
test_split = 0.2
tsize = int(len(dataset) * test_split)
dataset = torch.utils.data.Subset(dataset, indices[:-tsize])
dataset_test = torch.utils.data.Subset(dataset_test, indices[-tsize:])

# define training and validation data loaders
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=10, shuffle=True, num_workers=4,
    collate_fn=utils.collate_fn)

data_loader_test = torch.utils.data.DataLoader(
    dataset_test, batch_size=10, shuffle=False, num_workers=4,
    collate_fn=utils.collate_fn)
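The collate_fn comes from the utils.py copied from the torchvision reference scripts. Conceptually it just regroups each batch into a tuple of images and a tuple of targets, because detection targets contain a variable number of boxes and cannot be stacked into a single tensor. A minimal sketch of the idea (not necessarily the exact reference implementation):

def collate_fn(batch):
    # batch is a list of (image, target) pairs; zip(*batch) regroups it into
    # (images, targets) so each sample keeps its own variable-sized target dict
    return tuple(zip(*batch))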

Prepare the model:

from engine import train_one_epoch, evaluate  # also from the copied reference scripts

# train on the GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

num_classes = 4  # 3 fruit classes + background

# get the model using our helper function
model = get_object_detection_model(num_classes)

# move model to the right device
model.to(device)

# construct an optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005,
                            momentum=0.9, weight_decay=0.0005)

# and a learning rate scheduler which decreases the learning rate by 10x every 3 epochs
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# training for 10 epochs
num_epochs = 10

for epoch in range(num_epochs):
    # train for one epoch
    train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq=10)
    # update the learning rate
    lr_scheduler.step()
    # evaluate on the test dataset
    evaluate(model, data_loader_test, device=device)
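The original notebook does not show it, but if you want to reuse the fine-tuned model later, saving the state dict after the training loop is enough (the file name below is arbitrary):

# save the fine-tuned weights; the file name is arbitrary and not part of the original notebook
torch.save(model.state_dict(), 'fasterrcnn_fruit.pth')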


torchvision provides a utility for applying NMS (non-maximum suppression) to our predictions; let's use it to build an apply_nms function.

def apply_nms(orig_prediction, iou_thresh=0.3):
    # torchvision returns the indices of the bboxes to keep
    keep = torchvision.ops.nms(orig_prediction['boxes'], orig_prediction['scores'], iou_thresh)

    final_prediction = orig_prediction
    final_prediction['boxes'] = final_prediction['boxes'][keep]
    final_prediction['scores'] = final_prediction['scores'][keep]
    final_prediction['labels'] = final_prediction['labels'][keep]

    return final_prediction


# function to convert a torch tensor back to a PIL image
def torch_to_pil(img):
    return torchtrans.ToPILImage()(img).convert('RGB')

Let's take an image from the test dataset and see how the model performs.

First, we'll see how many bounding boxes the model predicts compared to the ground truth.

# pick one image from the test set
img, target = dataset_test[5]

# put the model in evaluation mode
model.eval()
with torch.no_grad():
    prediction = model([img.to(device)])[0]

print('predicted #boxes: ', len(prediction['labels']))
print('real #boxes: ', len(target['labels']))

predicted #boxes: 14
real #boxes: 1

Now compare the ground truth with the model output:

print('expected output')
plot_img_bbox(torch_to_pil(img), target)

print('model output')
plot_img_bbox(torch_to_pil(img), prediction)

You can see that the model predicts many overlapping bounding boxes for each apple. Let's apply NMS to the prediction and look at the final output.

nms_prediction = apply_nms(prediction, iou_thresh=0.2)
print('nms applied model output')
plot_img_bbox(torch_to_pil(img), nms_prediction)

In terms of algorithm and code logic, this is the best Faster R-CNN tutorial I have seen so far:

https://www.kaggle.com/yerramvarun/fine-tuning-faster-rcnn-using-pytorch/

This Faster R-CNN model is very demanding on hardware; even on the company's GPU it can run out of memory.

A large part of the load comes from num_workers=4 in the DataLoader, which runs data loading in multiple worker processes.
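If you run into out-of-memory errors, a minimal mitigation (assuming the same variables as above; the exact values are illustrative) is to reduce the batch size and the number of worker processes:

# smaller batches and no worker processes trade speed for a much lower memory footprint
data_loader = torch.utils.data.DataLoader(
    dataset, batch_size=2, shuffle=True, num_workers=0,
    collate_fn=utils.collate_fn)

data_loader_test = torch.utils.data.DataLoader(
    dataset_test, batch_size=2, shuffle=False, num_workers=0,
    collate_fn=utils.collate_fn)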

How to fine-tune the Faster R-CNN model and its ResNet-50 backbone, how to change the training configuration (image size, optimizer, learning rate), and how to make better use of albumentations are all worth exploring further; a rough example of a richer augmentation pipeline follows below.
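As an example of the last point, a richer albumentations pipeline might look like the sketch below (the specific transforms and probabilities are illustrative, not taken from the original notebook; the bbox_params must stay the same so the boxes keep following the objects):

def get_transform(train):
    if train:
        return A.Compose([
            A.HorizontalFlip(p=0.5),
            # illustrative extra augmentations, not from the original notebook
            A.RandomBrightnessContrast(p=0.2),
            A.Blur(blur_limit=3, p=0.1),
            ToTensorV2(p=1.0)
        ], bbox_params={'format': 'pascal_voc', 'label_fields': ['labels']})
    else:
        return A.Compose([
            ToTensorV2(p=1.0)
        ], bbox_params={'format': 'pascal_voc', 'label_fields': ['labels']})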

Finally, here is the overall network structure of the Faster R-CNN model as printed by PyTorch (the repeated ResNet Bottleneck blocks are summarized for readability):

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(3 x Bottleneck: 1x1 conv 64, 3x3 conv 64, 1x1 conv 256; each conv followed by FrozenBatchNorm2d and ReLU; downsample 64 -> 256 in block 0)
      (layer2): Sequential(4 x Bottleneck: 1x1 conv 128, 3x3 conv 128 (stride 2 in block 0), 1x1 conv 512; downsample 256 -> 512 in block 0)
      (layer3): Sequential(6 x Bottleneck: 1x1 conv 256, 3x3 conv 256 (stride 2 in block 0), 1x1 conv 1024; downsample 512 -> 1024 in block 0)
      (layer4): Sequential(3 x Bottleneck: 1x1 conv 512, 3x3 conv 512 (stride 2 in block 0), 1x1 conv 2048; downsample 1024 -> 2048 in block 0)
    )
    (fpn): FeaturePyramidNetwork(
      (inner_blocks): ModuleList(
        (0): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1))
        (1): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))
        (2): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
        (3): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1))
      )
      (layer_blocks): ModuleList(
        (0)-(3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (extra_blocks): LastLevelMaxPool()
    )
  )
  (rpn): RegionProposalNetwork(
    (anchor_generator): AnchorGenerator()
    (head): RPNHead(
      (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cls_logits): Conv2d(256, 3, kernel_size=(1, 1), stride=(1, 1))
      (bbox_pred): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1))
    )
  )
  (roi_heads): RoIHeads(
    (box_roi_pool): MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'], output_size=(7, 7), sampling_ratio=2)
    (box_head): TwoMLPHead(
      (fc6): Linear(in_features=12544, out_features=1024, bias=True)
      (fc7): Linear(in_features=1024, out_features=1024, bias=True)
    )
    (box_predictor): FastRCNNPredictor(
      (cls_score): Linear(in_features=1024, out_features=4, bias=True)
      (bbox_pred): Linear(in_features=1024, out_features=16, bias=True)
    )
  )
)

Summary

That covers the full tutorial on training your first Faster R-CNN model with PyTorch. I hope it helps you solve the problems you run into.
