YOLOv5 quantized deployment (with OpenVINO and TensorRT)

2023-11-19

YOLOv5 OpenVINO quantized deployment

First, download the YOLOv5 source code and install the Python dependencies for YOLOv5 and OpenVINO.

git clone https://github.com/ultralytics/yolov5.git 
pip install -r requirements.txt && pip install openvino openvino-dev

Then convert the pretrained PyTorch model to an OpenVINO FP32 IR model with YOLOv5's export.py.

python export.py --weights yolov5n.pt --imgsz 640 --batch-size 1 --include openvino
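If the export succeeds, the generated IR can be sanity-checked before quantization. A minimal sketch (the yolov5n_openvino_model directory is export.py's default output location; the shapes assume yolov5n at 640x640):

from openvino.runtime import Core

core = Core()
model = core.read_model("./yolov5n_openvino_model/yolov5n.xml")
compiled = core.compile_model(model, "CPU")
print("input shape:", compiled.input(0).shape)    # expect [1,3,640,640]
print("output shape:", compiled.output(0).shape)  # expect [1,25200,85]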

The quantization code below is adapted from: https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/111-yolov5-quantization-migration

from pathlib import Path
from utils.dataloaders import create_dataloader
from utils.general import check_dataset
from export import attempt_load, yaml_save
from val import run as validation_fn
from openvino.tools import mo
from openvino.runtime import serialize
from openvino.tools.pot.api import DataLoader
from openvino.tools.pot.engines.ie_engine import IEEngine
from openvino.tools.pot.graph import load_model, save_model
from openvino.tools.pot.pipeline.initializer import create_pipeline
from openvino.tools.pot.graph.model_utils import compress_model_weights


IMAGE_SIZE = 640
MODEL_NAME = "yolov5n"
DATASET_CONFIG = "./data/coco128.yaml"


class YOLOv5POTDataLoader(DataLoader):
    '''Wrap a YOLOv5 dataloader in the POT DataLoader interface.'''
    def __init__(self, data_source):
        super().__init__({})
        self._data_loader = data_source
        self._data_iter = iter(self._data_loader)

    def __len__(self):
        return len(self._data_loader.dataset)

    def __getitem__(self, item):
        try:
            batch_data = next(self._data_iter)
        except StopIteration:
            self._data_iter = iter(self._data_loader)
            batch_data = next(self._data_iter)

        im, target, path, shape = batch_data

        im = im.float()
        im /= 255
        nb, _, height, width = im.shape
        img = im.cpu().detach().numpy()
        target = target.cpu().detach().numpy()

        annotation = dict()
        annotation["image_path"] = path
        annotation["target"] = target
        annotation["batch_size"] = nb
        annotation["shape"] = shape
        annotation["width"] = width
        annotation["height"] = height
        annotation["img"] = img

        return (item, annotation), img


if __name__ == "__main__":
    '''Conversion of the YOLOv5 model to OpenVINO'''
    onnx_path = f"./{MODEL_NAME}.onnx"

    # fp32 IR model
    fp32_path = f"./FP32_openvino_model/{MODEL_NAME}_fp32.xml"

    print(f"Export ONNX to OpenVINO FP32 IR to: {fp32_path}")
    model = mo.convert_model(onnx_path)
    serialize(model, fp32_path)

    # fp16 IR model
    fp16_path = f"./FP16_openvino_model/{MODEL_NAME}_fp16.xml"

    print(f"Export ONNX to OpenVINO FP16 IR to: {fp16_path}")
    model = mo.convert_model(onnx_path, compress_to_fp16=True)
    serialize(model, fp16_path)

    '''Prepare dataset for quantization'''
    data = check_dataset(DATASET_CONFIG)
    data_source = create_dataloader(data["val"], imgsz=640, batch_size=1, stride=32, pad=0.5, workers=0)[0]
    pot_data_loader = YOLOv5POTDataLoader(data_source)

    '''Configure quantization pipeline'''
    algorithms_config = [
        {
            "name": "DefaultQuantization",
            "params": {
                "preset": "mixed",
                "stat_subset_size": 300,
                "target_device": "CPU"
            },
        }
    ]

    engine_config = {"device": "CPU"}

    model_config = {
        "model_name": f"{MODEL_NAME}",
        "model": fp32_path,
        "weights": fp32_path.replace(".xml", ".bin"),
    }

    pot_model = load_model(model_config)

    engine = IEEngine(config=engine_config, data_loader=pot_data_loader)

    pipeline = create_pipeline(algorithms_config, engine)

    '''Perform model optimization'''
    compressed_model = pipeline.run(pot_model)
    compress_model_weights(compressed_model)
    optimized_save_dir = Path(f"./POT_INT8_openvino_model/")
    save_model(compressed_model, optimized_save_dir, model_config["model_name"] + "_int8")
    pot_int8_path = f"{optimized_save_dir}/{MODEL_NAME}_int8.xml"

    '''Compare accuracy FP32 and INT8 models'''
    model = attempt_load(f"./{MODEL_NAME}.pt", device="cpu", inplace=True, fuse=True) 
    metadata = {"stride": int(max(model.stride)), "names": model.names}  # model metadata
    yaml_save(Path(pot_int8_path).with_suffix(".yaml"), metadata)
    yaml_save(Path(fp32_path).with_suffix(".yaml"), metadata)

    print("Checking the accuracy of the original model:")
    fp32_metrics = validation_fn(
        data=DATASET_CONFIG,
        weights=Path(fp32_path).parent,
        batch_size=1,
        workers=0,
        plots=False,
        device="cpu",
        iou_thres=0.65,
    )

    fp32_ap5 = fp32_metrics[0][2]
    fp32_ap_full = fp32_metrics[0][3]
    print(f"mAP@.5 = {fp32_ap5}")
    print(f"mAP@.5:.95 = {fp32_ap_full}")

    print("Checking the accuracy of the POT int8 model:")
    int8_metrics = validation_fn(
        data=DATASET_CONFIG,
        weights=Path(pot_int8_path).parent,
        batch_size=1,
        workers=0,
        plots=False,
        device="cpu",
        iou_thres=0.65,
    )

    pot_int8_ap5 = int8_metrics[0][2]
    pot_int8_ap_full = int8_metrics[0][3]
    print(f"mAP@.5 = {pot_int8_ap5}")
    print(f"mAP@.5:.95 = {pot_int8_ap_full}")

Python inference:

import cv2
import numpy as np
from openvino.inference_engine import IECore


names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
        'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
        'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
        'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
        'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
        'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
        'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
        'cell phone','microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
        'teddy bear', 'hair drier', 'toothbrush']
conf_thres = 0.5
nms_thres = 0.5
model_path = "yolov5n-f32.onnx"  # ONNX inference supports fp32 and fp16
model_xml = r"./POT_INT8_openvino_model/yolov5n_int8.xml"  # IR inference supports fp32, fp16 and int8
model_bin = r"./POT_INT8_openvino_model/yolov5n_int8.bin"
    
    
def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), scaleup=False, stride=32):
    shape = img.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scaleup:  # only scale down, do not scale up (for better test mAP)
        r = min(r, 1.0)
    ratio = r  # width, height ratios
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
    dw /= 2
    dh /= 2
    if shape[::-1] != new_unpad:  # resize
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return img,ratio,(dw,dh)


def iou(b1,b2):
    b1_x1, b1_y1, b1_x2, b1_y2 = b1[0], b1[1], b1[2], b1[3]
    b2_x1, b2_y1, b2_x2, b2_y2 = b2[:,0], b2[:,1], b2[:,2], b2[:,3]

    inter_rect_x1 = np.maximum(b1_x1, b2_x1)
    inter_rect_y1 = np.maximum(b1_y1, b2_y1)
    inter_rect_x2 = np.minimum(b1_x2, b2_x2)
    inter_rect_y2 = np.minimum(b1_y2, b2_y2)

    inter_area = np.maximum(inter_rect_x2 - inter_rect_x1, 0) * np.maximum(inter_rect_y2 - inter_rect_y1, 0)

    area_b1 = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
    area_b2 = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)

    iou = inter_area / np.maximum((area_b1+area_b2-inter_area),1e-6)
    return iou


def non_max_suppression(boxes, conf_thres=0.5, nms_thres=0.4, ratio=1, pad=(20,20)):
    # batch size
    bs = np.shape(boxes)[0]
    # xywh to xyxy
    shape_boxes = np.zeros_like(boxes[:,:,:4])
    shape_boxes[:, :, 0] = boxes[:, :, 0] - boxes[:, :, 2] / 2
    shape_boxes[:, :, 1] = boxes[:, :, 1] - boxes[:, :, 3] / 2
    shape_boxes[:, :, 2] = boxes[:, :, 0] + boxes[:, :, 2] / 2
    shape_boxes[:, :, 3] = boxes[:, :, 1] + boxes[:, :, 3] / 2
    boxes[:, :, :4] = shape_boxes
    boxes[:, :, 5:] *= boxes[:, :, 4:5]

    # output holds per-image predictions; at inference time there is usually a single image
    output = []
    for i in range(bs):
        predictions = boxes[i]  # predicted boxes (xyxy), shape == (num_boxes, 85)
        score = np.max(predictions[:, 5:], axis=-1)
        # score = predictions[:,4]  # objectness confidence, shape == (num_boxes,)
        mask = score > conf_thres  # confidence mask, e.g. [False, False, True, ...]; True rows are kept
        detections = predictions[mask]  # first filtering pass
        class_conf = np.expand_dims(np.max(detections[:,5:],axis=-1),axis=-1)  # class confidence of each box
        class_pred = np.expand_dims(np.argmax(detections[:,5:],axis=-1),axis=-1)  # class index of each box
        # stack results: (num_boxes, 4 box coords + 1 class confidence + 1 class index)
        detections = np.concatenate([detections[:,:4],class_conf,class_pred],axis=-1)  # shape == (num_boxes, 6)

        unique_class = np.unique(detections[:,-1])  # all classes present in the detections
        if len(unique_class)==0:
            continue
        best_box = []
        for c in unique_class:
            # predictions belonging to class c
            cls_mask = detections[:,-1] == c
            detection = detections[cls_mask]

            # sort by class confidence, descending
            scores = detection[:,4]
            arg_sort = np.argsort(scores)[::-1]  # argsort returns indices
            detection = detection[arg_sort]

            while len(detection) != 0:
                best_box.append(detection[0])
                if len(detection) == 1:
                    break
                # IoU between the current highest-confidence box and the remaining boxes
                ious = iou(best_box[-1],detection[1:])
                detection = detection[1:][ious < nms_thres]  # boxes with IoU below nms_thres are kept; each round drops at least one
        output.append(best_box)

    boxes_loc = []
    conf_loc = []
    class_loc = []
    if len(output):
        for i in range(len(output)):
            pred = output[i]
            for i, det in enumerate(pred):
                if len(det):
                    # map the box coordinates back onto the original image
                    det[0] = (det[0] - pad[0]) / ratio
                    det[2] = (det[2] - pad[0]) / ratio
                    det[1] = (det[1] - pad[1]) / ratio
                    det[3] = (det[3] - pad[1]) / ratio
                    boxes_loc.append([det[0],det[1],det[2],det[3]])
                    conf_loc.append(det[4])
                    class_loc.append(det[5])
    return boxes_loc,conf_loc,class_loc

def plot_box(img,boxes,conf,clas_id,line_thickness=3,names=None):
    # draw the bounding box
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    c1, c2 = (int(boxes[0]), int(boxes[1])), (int(boxes[2]),int(boxes[3]))
    cv2.rectangle(img, c1, c2, [0, 0 ,255], thickness=tl, lineType=cv2.LINE_AA)

    # draw the label box
    label = f'{names[int(clas_id)]} {conf:.2f}'
    tf = max(tl - 1, 1)  # font thickness
    t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
    c2 = (c1[0] + t_size[0], c1[1] - t_size[1] - 3)
    cv2.rectangle(img, c1, c2, [255, 0 ,0], -1, cv2.LINE_AA)
    cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)


if __name__ == '__main__':
    ie = IECore()
    #net = ie.read_network(model=model_path)
    net = ie.read_network(model=model_xml, weights=model_bin)
    exec_net = ie.load_network(network=net, device_name="CPU")
    input_layer = next(iter(net.input_info))
    frame = cv2.imread("bus.jpg") 
    img, ratio, (dw,dh) = letterbox(frame)

    blob = cv2.dnn.blobFromImage(np.ascontiguousarray(img), 1/255.0, (img.shape[1], img.shape[0]), swapRB=True, crop=False)  # size is (width, height)
    infer_request_handle=exec_net.start_async(request_id=0,inputs={input_layer: blob})
    if infer_request_handle.wait(-1) == 0:
        res = infer_request_handle.output_blobs["output0"]
        outs = res.buffer
        boxes_loc,conf_loc,class_loc = non_max_suppression(outs, conf_thres=conf_thres, nms_thres=nms_thres,ratio=ratio, pad=(dw,dh))

        for i in range(len(boxes_loc)):
            boxes = boxes_loc[i]
            conf = conf_loc[i]
            clas_id = class_loc[i]
            plot_box(frame, boxes, conf, clas_id, line_thickness=3, names=names)

    cv2.imshow("result", frame)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
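Note that openvino.inference_engine (IECore) is OpenVINO's legacy Python API. Since OpenVINO 2022.1 the same flow can be written against the newer openvino.runtime API; a minimal sketch, reusing model_xml and a blob prepared exactly as above:

from openvino.runtime import Core

core = Core()
model = core.read_model(model_xml)           # the matching .bin is located automatically
compiled = core.compile_model(model, "CPU")
result = compiled([blob])                    # synchronous inference
outs = result[compiled.output(0)]            # same (1, num_boxes, 85) array as above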

Alternatively, use YOLOv5's built-in detect.py:

python detect.py --weights ./POT_INT8_openvino_model

C++ inference (adapted from the article "基于OpenVINO C++ API部署YOLOv5模型", on deploying YOLOv5 with the OpenVINO C++ API):

#include <iostream>
#include <string>
#include <openvino/openvino.hpp> 
#include <opencv2/opencv.hpp>   


/* ---------  Please modify the path of yolov5 model and image -----------*/
std::string model_file = "yolov5n_int8.xml"; // or yolov5n_fp16.xml / yolov5n_fp32.xml / yolov5n-f32.onnx (IR inference supports fp32, fp16 and int8; ONNX inference supports fp32)
std::string image_file = "bus.jpg";

const std::vector<std::string> class_names = {
	"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
	"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
	"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
	"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
	"tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
	"sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
	"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
	"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
	"hair drier", "toothbrush" };


cv::Mat letterbox(cv::Mat& img, std::vector<float>& paddings, std::vector<int> new_shape = { 640, 640 })
{
	// Get current image shape [height, width]
	int img_h = img.rows;
	int img_w = img.cols;

	// Compute scale ratio(new / old) and target resized shape
	float scale = std::min(new_shape[1] * 1.0 / img_h, new_shape[0] * 1.0 / img_w);
	int resize_h = int(round(img_h * scale));
	int resize_w = int(round(img_w * scale));
	paddings[0] = scale;

	// Compute padding
	int pad_h = new_shape[1] - resize_h;
	int pad_w = new_shape[0] - resize_w;

	// Resize and pad image while meeting stride-multiple constraints
	cv::Mat resized_img;
	cv::resize(img, resized_img, cv::Size(resize_w, resize_h));

	// divide padding into 2 sides
	float half_h = pad_h * 1.0 / 2;
	float half_w = pad_w * 1.0 / 2;
	paddings[1] = half_h;
	paddings[2] = half_w;

	// Compute padding border
	int top = int(round(half_h - 0.1));
	int bottom = int(round(half_h + 0.1));
	int left = int(round(half_w - 0.1));
	int right = int(round(half_w + 0.1));

	// Add border
	cv::copyMakeBorder(resized_img, resized_img, top, bottom, left, right, 0, cv::Scalar(114, 114, 114));

	return resized_img;
}


int main(int argc, char* argv[]) 
{
	// -------- Get OpenVINO runtime version --------
	std::cout << ov::get_openvino_version().description << ':' << ov::get_openvino_version().buildNumber << std::endl;

	// -------- Step 1. Initialize OpenVINO Runtime Core --------
	ov::Core core;

	// -------- Step 2. Compile the Model --------
	auto compiled_model = core.compile_model(model_file, "CPU"); 

	// -------- Step 3. Create an Inference Request --------
	ov::InferRequest infer_request = compiled_model.create_infer_request();

	clock_t start = clock();
	// -------- Step 4. Read a picture file and do the preprocess --------
	cv::Mat img = cv::imread(image_file); //Load a picture into memory
	std::vector<float> paddings(3);       //scale, half_h, half_w
	cv::Mat resized_img = letterbox(img, paddings); //resize to (640,640) by letterbox
	cv::Mat blob = cv::dnn::blobFromImage(resized_img, 1 / 255.0, cv::Size(640, 640), cv::Scalar(0, 0, 0), true); 	// BGR->RGB, u8(0-255)->f32(0.0-1.0), HWC->NCHW

	// -------- Step 5. Feed the blob into the input node of YOLOv5 -------	
	auto input_port = compiled_model.input(); // Get input port for model with one input
	ov::Tensor input_tensor(input_port.get_element_type(), input_port.get_shape(), blob.ptr(0)); // Create tensor from external memory
	infer_request.set_input_tensor(input_tensor); // Set input tensor for model with one input

	// -------- Step 6. Start inference --------
	for (size_t i = 0; i < 100; i++) // run 100 iterations for the timing comparison below
		infer_request.infer();

	// -------- Step 7. Get the inference result --------
	auto output = infer_request.get_output_tensor(0);
	auto output_shape = output.get_shape();
	std::cout << "The shape of output tensor:" << output_shape << std::endl;
	cv::Mat output_buffer(output_shape[1], output_shape[2], CV_32F, output.data()); 

	// -------- Step 8. Post-process the inference result -----------
	float conf_threshold = 0.25;
	float nms_threshold = 0.5;
	std::vector<cv::Rect> boxes;
	std::vector<int> class_ids;
	std::vector<float> class_scores;
	std::vector<float> confidences;
	for (int i = 0; i < output_buffer.rows; i++) 
	{
		float confidence = output_buffer.at<float>(i, 4);
		if (confidence < conf_threshold) 
			continue;

		cv::Mat classes_scores = output_buffer.row(i).colRange(5, 85);
		cv::Point class_id;
		double score;
		cv::minMaxLoc(classes_scores, NULL, &score, NULL, &class_id);

		if (score > 0.25)
		{
			float cx = output_buffer.at<float>(i, 0);
			float cy = output_buffer.at<float>(i, 1);
			float w = output_buffer.at<float>(i, 2);
			float h = output_buffer.at<float>(i, 3);
			int left = static_cast<int>((cx - 0.5 * w - paddings[2]) / paddings[0]);
			int top = static_cast<int>((cy - 0.5 * h - paddings[1]) / paddings[0]);
			int width = static_cast<int>(w / paddings[0]);
			int height = static_cast<int>(h / paddings[0]);
			cv::Rect box;
			box.x = left;
			box.y = top;
			box.width = width;
			box.height = height;

			boxes.push_back(box);
			class_ids.push_back(class_id.x);
			class_scores.push_back(score);
			confidences.push_back(confidence);
		}
	}

	// NMS
	std::vector<int> indices;
	cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, nms_threshold, indices);

	clock_t end = clock();
	std::cout << end - start << std::endl;

	// -------- Step 9. Visualize the detection results -----------
	for (size_t i = 0; i < indices.size(); i++) 
	{
		int index = indices[i];
		int class_id = class_ids[index];
		cv::rectangle(img, boxes[index], cv::Scalar(0, 0, 255), 2, 8);
		std::string label = class_names[class_id] + ":" + std::to_string(class_scores[index]);
		cv::putText(img, label, cv::Point(boxes[index].tl().x, boxes[index].tl().y - 10), cv::FONT_HERSHEY_SIMPLEX, .5, cv::Scalar(255, 0, 0));
	}

	cv::imshow("YOLOv5 OpenVINO Inference C++ Demo", img);
	cv::waitKey(0);

	return 0;
}

Timing the C++ demo on an i7-12700 CPU, looping inference 100 times per run over the fp32, fp16, and int8 models (3 runs each):
yolov5n_fp32: 1599ms 2040ms 1514ms
yolov5n_fp16: 1505ms 2078ms 1514ms
yolov5n_int8: 856ms 861ms 852ms

The fp32 and fp16 models take roughly the same time; int8 cuts inference time roughly in half.

YOLOv5 TensorRT quantized deployment

Method 1: hard-coded parsing, converting .wts to .trt
This relies on the excellent project https://github.com/wang-xinyu/tensorrtx/tree/master/yolov5, which provides both Python and C++ inference code and is impressively thorough; with such a good wheel available, why not use it directly?
Environment setup also followed the post "windows上配置TensorRT yolov5-6.0部署 tensorrtx视频流推理" (setting up TensorRT YOLOv5 6.0 deployment and tensorrtx video-stream inference on Windows), with the typical workflow sketched below.
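For reference, the overall tensorrtx workflow is roughly the following sketch, reconstructed from the tensorrtx README; the exact flags and binary names vary between tensorrtx versions:

python gen_wts.py -w yolov5n.pt -o yolov5n.wts    # export .wts weights from the .pt checkpoint
mkdir build && cd build && cmake .. && make       # build the tensorrtx yolov5 project
./yolov5 -s yolov5n.wts yolov5n.engine n          # serialize the engine ('n' selects the model size)
./yolov5 -d yolov5n.engine ../images              # deserialize the engine and detect an image folder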
I measured single-image inference times for the various YOLOv5 models as follows (C++, RTX 3070 GPU, two runs each):
yolov5n-int8: 2ms 1ms
yolov5s-int8: 2ms 1ms
yolov5m-int8: 3ms 2ms
yolov5l-int8: 4ms 3ms
yolov5x-int8: 7ms 6ms

yolov5n-fp16: 1ms 1ms
yolov5s-fp16: 2ms 2ms
yolov5m-fp16: 4ms 3ms
yolov5l-fp16: 6ms 5ms
yolov5x-fp16: 10ms 9ms

yolov5n-fp32: 424ms 2ms
yolov5s-fp32: 389ms 4ms
yolov5m-fp32: 401ms 9ms
yolov5l-fp32: 422ms 17ms
yolov5x-fp32: 30ms 28ms
On my machine the first inference of the fp32 yolov5n-yolov5l models is much slower; I am not sure why (plausibly one-time CUDA context and kernel setup, which is why a warm-up inference before timing is common practice).

Method 2: parsing ONNX with the TensorRT API
The code follows the course 《深度学习-TensorRT模型部署实战》 (Deep Learning: Hands-on TensorRT Model Deployment): first convert the ONNX model into a TensorRT engine, then run inference.

#include <NvInfer.h>
#include <NvInferRuntime.h>

#include "onnx-tensorrt-release-8.0/NvOnnxParser.h"

#include <cuda_runtime.h>

#include <stdio.h>
#include <math.h>

#include <iostream>
#include <fstream>
#include <vector>
#include <memory>
#include <functional>

#include <opencv2/opencv.hpp>

using namespace std;

#define checkRuntime(op)  __check_cuda_runtime((op), #op, __FILE__, __LINE__)

bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line){
    if(code != cudaSuccess){    
        const char* err_name = cudaGetErrorName(code);    
        const char* err_message = cudaGetErrorString(code);  
        printf("runtime error %s:%d  %s failed. \n  code = %s, message = %s\n", file, line, op, err_name, err_message);   
        return false;
    }
    return true;
}

inline const char* severity_string(nvinfer1::ILogger::Severity t){
    switch(t){
        case nvinfer1::ILogger::Severity::kINTERNAL_ERROR: return "internal_error";
        case nvinfer1::ILogger::Severity::kERROR:   return "error";
        case nvinfer1::ILogger::Severity::kWARNING: return "warning";
        case nvinfer1::ILogger::Severity::kINFO:    return "info";
        case nvinfer1::ILogger::Severity::kVERBOSE: return "verbose";
        default: return "unknow";
    }
}

static const char* cocolabels[] = {
    "person", "bicycle", "car", "motorcycle", "airplane",
    "bus", "train", "truck", "boat", "traffic light", "fire hydrant",
    "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse",
    "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis",
    "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
    "skateboard", "surfboard", "tennis racket", "bottle", "wine glass",
    "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich",
    "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake",
    "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv",
    "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave",
    "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase",
    "scissors", "teddy bear", "hair drier", "toothbrush"
};

static std::tuple<uint8_t, uint8_t, uint8_t> hsv2bgr(float h, float s, float v){
    const int h_i = static_cast<int>(h * 6);
    const float f = h * 6 - h_i;
    const float p = v * (1 - s);
    const float q = v * (1 - f*s);
    const float t = v * (1 - (1 - f) * s);
    float r, g, b;
    switch (h_i) {
    case 0:r = v; g = t; b = p;break;
    case 1:r = q; g = v; b = p;break;
    case 2:r = p; g = v; b = t;break;
    case 3:r = p; g = q; b = v;break;
    case 4:r = t; g = p; b = v;break;
    case 5:r = v; g = p; b = q;break;
    default:r = 1; g = 1; b = 1;break;}
    return make_tuple(static_cast<uint8_t>(b * 255), static_cast<uint8_t>(g * 255), static_cast<uint8_t>(r * 255));
}

static std::tuple<uint8_t, uint8_t, uint8_t> random_color(int id){
    float h_plane = ((((unsigned int)id << 2) ^ 0x937151) % 100) / 100.0f;;
    float s_plane = ((((unsigned int)id << 3) ^ 0x315793) % 100) / 100.0f;
    return hsv2bgr(h_plane, s_plane, 1);
}

class TRTLogger : public nvinfer1::ILogger{
public:
    virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override{
        if(severity <= Severity::kWARNING){
            if(severity == Severity::kWARNING){
                printf("\033[33m%s: %s\033[0m\n", severity_string(severity), msg);
            }
            else if(severity <= Severity::kERROR){
                printf("\033[31m%s: %s\033[0m\n", severity_string(severity), msg);
            }
            else{
                printf("%s: %s\n", severity_string(severity), msg);
            }
        }
    }
} logger;

template<typename _T>
shared_ptr<_T> make_nvshared(_T* ptr){
    return shared_ptr<_T>(ptr, [](_T* p){p->destroy();});
}

bool build_model() {
	TRTLogger logger;

	// The basic components needed to build an engine
	auto builder = make_nvshared(nvinfer1::createInferBuilder(logger));
	auto config = make_nvshared(builder->createBuilderConfig());
	auto network = make_nvshared(builder->createNetworkV2(1));

	// The ONNX parser fills the network with the parsed layers, much as if they were added one by one via addConv and friends
	auto parser = make_nvshared(nvonnxparser::createParser(*network, logger));
	if (!parser->parseFromFile("yolov5s-7.0.onnx", 1)) {
		printf("Failed to parse yolov5s.onnx\n");

		// Note: the pointers created above are not released on this path, so it leaks memory; a cleaner fix is left for later
		return false;
	}

	int maxBatchSize = 1;
	printf("Workspace Size = %.2f MB\n", (1 << 28) / 1024.0f / 1024.0f);
	config->setMaxWorkspaceSize(1 << 28);

	// If the model had multiple inputs, each would need its own optimization profile
	auto profile = builder->createOptimizationProfile();
	auto input_tensor = network->getInput(0);
	auto input_dims = input_tensor->getDimensions();

	// Configure the minimum, optimal and maximum input shapes
	input_dims.d[0] = 1;
	profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
	profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);
	input_dims.d[0] = maxBatchSize;
	profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);
	config->addOptimizationProfile(profile);

	auto engine = make_nvshared(builder->buildEngineWithConfig(*network, *config));
	if (engine == nullptr) {
		printf("Build engine failed.\n");
		return false;
	}

	// Serialize the engine and store it to a file
	auto model_data = make_nvshared(engine->serialize());
	FILE* f = fopen("yolov5s-7.0.trt", "wb");
	fwrite(model_data->data(), 1, model_data->size(), f);
	fclose(f);

	// Teardown happens in reverse order of construction (handled here by the shared_ptr deleters)
	printf("Build Done.\n");
	return true;
}

vector<unsigned char> load_file(const string& file){
    ifstream in(file, ios::in | ios::binary);
    if (!in.is_open())
        return {};

    in.seekg(0, ios::end);
    size_t length = in.tellg();

    std::vector<uint8_t> data;
    if (length > 0){
        in.seekg(0, ios::beg);
        data.resize(length);

        in.read((char*)&data[0], length);
    }
    in.close();
    return data;
}

void inference(){
    TRTLogger logger;
    auto engine_data = load_file("yolov5s-7.0.trt");
    auto runtime   = make_nvshared(nvinfer1::createInferRuntime(logger));
    auto engine = make_nvshared(runtime->deserializeCudaEngine(engine_data.data(), engine_data.size()));
    if(engine == nullptr){
        printf("Deserialize cuda engine failed.\n");
        runtime->destroy();
        return;
    }

    if(engine->getNbBindings() != 2){
        printf("你的onnx导出有问题,必须是1个输入和1个输出,你这明显有:%d个输出.\n", engine->getNbBindings() - 1);
        return;
    }

    cudaStream_t stream = nullptr;
    checkRuntime(cudaStreamCreate(&stream));
    auto execution_context = make_nvshared(engine->createExecutionContext());

    int input_batch = 1;
    int input_channel = 3;
    int input_height = 640;
    int input_width = 640;
    int input_numel = input_batch * input_channel * input_height * input_width;
    float* input_data_host = nullptr;
    float* input_data_device = nullptr;
    checkRuntime(cudaMallocHost(&input_data_host, input_numel * sizeof(float)));
    checkRuntime(cudaMalloc(&input_data_device, input_numel * sizeof(float)));

    auto image = cv::imread("bus.jpg");

    float scale_x = input_width / (float)image.cols;
    float scale_y = input_height / (float)image.rows;
    float scale = std::min(scale_x, scale_y);
    float i2d[6], d2i[6];

    i2d[0] = scale;  i2d[1] = 0;  i2d[2] = (-scale * image.cols + input_width + scale  - 1) * 0.5;
    i2d[3] = 0;  i2d[4] = scale;  i2d[5] = (-scale * image.rows + input_height + scale - 1) * 0.5;

    cv::Mat m2x3_i2d(2, 3, CV_32F, i2d);  
    cv::Mat m2x3_d2i(2, 3, CV_32F, d2i);  
    cv::invertAffineTransform(m2x3_i2d, m2x3_d2i); 

    cv::Mat input_image(input_height, input_width, CV_8UC3);
    cv::warpAffine(image, input_image, m2x3_i2d, input_image.size(), cv::INTER_LINEAR, cv::BORDER_CONSTANT, cv::Scalar::all(114));
    cv::imwrite("input-image.jpg", input_image);

    int image_area = input_image.cols * input_image.rows;
    unsigned char* pimage = input_image.data;
    float* phost_b = input_data_host + image_area * 0;
    float* phost_g = input_data_host + image_area * 1;
    float* phost_r = input_data_host + image_area * 2;
    for(int i = 0; i < image_area; ++i, pimage += 3){
        *phost_r++ = pimage[0] / 255.0f;
        *phost_g++ = pimage[1] / 255.0f;
        *phost_b++ = pimage[2] / 255.0f;
    }
    checkRuntime(cudaMemcpyAsync(input_data_device, input_data_host, input_numel * sizeof(float), cudaMemcpyHostToDevice, stream));

    auto output_dims = engine->getBindingDimensions(1);
    int output_numbox = output_dims.d[1];
    int output_numprob = output_dims.d[2];
    int num_classes = output_numprob - 5;
    int output_numel = input_batch * output_numbox * output_numprob;
    float* output_data_host = nullptr;
    float* output_data_device = nullptr;
    checkRuntime(cudaMallocHost(&output_data_host, sizeof(float) * output_numel));
    checkRuntime(cudaMalloc(&output_data_device, sizeof(float) * output_numel));

    auto input_dims = engine->getBindingDimensions(0);
    input_dims.d[0] = input_batch;

    execution_context->setBindingDimensions(0, input_dims);
    float* bindings[] = {input_data_device, output_data_device};
    bool success      = execution_context->enqueueV2((void**)bindings, stream, nullptr);
    checkRuntime(cudaMemcpyAsync(output_data_host, output_data_device, sizeof(float) * output_numel, cudaMemcpyDeviceToHost, stream));
    checkRuntime(cudaStreamSynchronize(stream));

    vector<vector<float>> bboxes;
    float confidence_threshold = 0.25;
    float nms_threshold = 0.5;
    for(int i = 0; i < output_numbox; ++i){
        float* ptr = output_data_host + i * output_numprob;
        float objness = ptr[4];
        if(objness < confidence_threshold)
            continue;

        float* pclass = ptr + 5;
        int label     = std::max_element(pclass, pclass + num_classes) - pclass;
        float prob    = pclass[label];
        float confidence = prob * objness;
        if(confidence < confidence_threshold)
            continue;

        float cx     = ptr[0];
        float cy     = ptr[1];
        float width  = ptr[2];
        float height = ptr[3];

        float left   = cx - width * 0.5;
        float top    = cy - height * 0.5;
        float right  = cx + width * 0.5;
        float bottom = cy + height * 0.5;

        float image_base_left   = d2i[0] * left   + d2i[2];
        float image_base_right  = d2i[0] * right  + d2i[2];
        float image_base_top    = d2i[0] * top    + d2i[5];
        float image_base_bottom = d2i[0] * bottom + d2i[5];
        bboxes.push_back({image_base_left, image_base_top, image_base_right, image_base_bottom, (float)label, confidence});
    }
    printf("decoded bboxes.size = %d\n", bboxes.size());

    std::sort(bboxes.begin(), bboxes.end(), [](vector<float>& a, vector<float>& b){return a[5] > b[5];});
    std::vector<bool> remove_flags(bboxes.size());
    std::vector<vector<float>> box_result;
    box_result.reserve(bboxes.size());

    auto iou = [](const vector<float>& a, const vector<float>& b){
        float cross_left   = std::max(a[0], b[0]);
        float cross_top    = std::max(a[1], b[1]);
        float cross_right  = std::min(a[2], b[2]);
        float cross_bottom = std::min(a[3], b[3]);

        float cross_area = std::max(0.0f, cross_right - cross_left) * std::max(0.0f, cross_bottom - cross_top);
        float union_area = std::max(0.0f, a[2] - a[0]) * std::max(0.0f, a[3] - a[1]) 
                         + std::max(0.0f, b[2] - b[0]) * std::max(0.0f, b[3] - b[1]) - cross_area;
        if(cross_area == 0 || union_area == 0) return 0.0f;
        return cross_area / union_area;
    };

    for(int i = 0; i < bboxes.size(); ++i){
        if(remove_flags[i]) continue;

        auto& ibox = bboxes[i];
        box_result.emplace_back(ibox);
        for(int j = i + 1; j < bboxes.size(); ++j){
            if(remove_flags[j]) continue;

            auto& jbox = bboxes[j];
            if(ibox[4] == jbox[4]){
                if(iou(ibox, jbox) >= nms_threshold)
                    remove_flags[j] = true;
            }
        }
    }
    printf("box_result.size = %d\n", box_result.size());

    for(int i = 0; i < box_result.size(); ++i){
        auto& ibox = box_result[i];
        float left = ibox[0];
        float top = ibox[1];
        float right = ibox[2];
        float bottom = ibox[3];

        int class_label = ibox[4];
        float confidence = ibox[5];
        cv::Scalar color;
        tie(color[0], color[1], color[2]) = random_color(class_label);
        cv::rectangle(image, cv::Point(left, top), cv::Point(right, bottom), color, 3);

        auto name      = cocolabels[class_label];
        auto caption   = cv::format("%s %.2f", name, confidence);
        int text_width = cv::getTextSize(caption, 0, 1, 2, nullptr).width + 10;
        cv::rectangle(image, cv::Point(left-3, top-33), cv::Point(left + text_width, top), color, -1);
        cv::putText(image, caption, cv::Point(left, top-5), 0, 1, cv::Scalar::all(0), 2, 16);
    }
    cv::imwrite("image-draw.jpg", image);

    checkRuntime(cudaStreamDestroy(stream));
    checkRuntime(cudaFreeHost(input_data_host));
    checkRuntime(cudaFreeHost(output_data_host));
    checkRuntime(cudaFree(input_data_device));
    checkRuntime(cudaFree(output_data_device));
}


int main(){
	if (!build_model()) {
		return -1;
	}
    inference();
    return 0;
}

The build_model step (converting ONNX into a TensorRT engine) can also be done with the export.py that ships with yolov5-7.0:

python export.py --weights yolov5s.pt --include engine --device 0
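Alternatively, assuming a TensorRT 8.x Python environment, the same conversion can be scripted with the TensorRT Python API; a minimal sketch mirroring build_model above:

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
# explicit-batch network, matching createNetworkV2(1) in the C++ code
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

with open("yolov5s-7.0.onnx", "rb") as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError("failed to parse the ONNX model")

config = builder.create_builder_config()
config.max_workspace_size = 1 << 28  # 256 MB, same as the C++ version (deprecated in newer TensorRT releases)

engine_bytes = builder.build_serialized_network(network, config)
with open("yolov5s-7.0.trt", "wb") as f:
    f.write(engine_bytes)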