ultralytics 8.3.77 faster YOLOv8-Segment ONNX Runtime example (#19312)

Signed-off-by: Adnan Ekici <53556022+AdnanEkici@users.noreply.github.com>
Signed-off-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Co-authored-by: UltralyticsAssistant <web@ultralytics.com>
Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>
Authored by Adnan Ekici on 2025-02-19 18:05:21 +03:00, committed by GitHub
parent b50a327a04
commit 3bb51b608f
2 changed files with 187 additions and 264 deletions
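
The rewrite below drops the example's hand-rolled NumPy letterbox/NMS/mask pipeline in favor of torch tensors and the ultralytics.utils.ops helpers (non_max_suppression, scale_boxes, crop_mask), and returns standard Results objects. A minimal sketch of producing an ONNX model for the updated script, assuming the ultralytics package is installed (the weight filename is illustrative):

    from ultralytics import YOLO

    model = YOLO("yolov8n-seg.pt")  # loads (and downloads if needed) pretrained segmentation weights
    model.export(format="onnx")  # writes yolov8n-seg.onnx next to the .pt file

The exported model is then passed to the example through --model, e.g. python main.py --model yolov8n-seg.onnx --source bus.jpg (assuming the script is saved as main.py).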

examples/YOLOv8-Segmentation-ONNXRuntime-Python/main.py

@@ -1,25 +1,39 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
 import argparse
+from typing import List, Tuple, Union
 
 import cv2
 import numpy as np
 import onnxruntime as ort
+import torch
+import torch.nn.functional as F
 
+import ultralytics.utils.ops as ops
+from ultralytics.engine.results import Results
 from ultralytics.utils import ASSETS, yaml_load
 from ultralytics.utils.checks import check_yaml
-from ultralytics.utils.plotting import Colors
 
 
 class YOLOv8Seg:
     """YOLOv8 segmentation model."""
 
-    def __init__(self, onnx_model):
+    def __init__(self, onnx_model, conf_threshold=0.4):
         """
-        Initialization.
+        Initializes the object detection model using an ONNX model.
 
         Args:
-            onnx_model (str): Path to the ONNX model.
+            onnx_model (str): Path to the ONNX model file.
+            conf_threshold (float, optional): Confidence threshold for detections. Defaults to 0.4.
+
+        Attributes:
+            session (ort.InferenceSession): ONNX Runtime session for running inference.
+            ndtype (numpy.dtype): Data type for model input (FP16 or FP32).
+            model_height (int): Height of the model's input image.
+            model_width (int): Width of the model's input image.
+            classes (list): List of class names from the COCO dataset.
+            device (str): Specifies whether inference runs on CPU or GPU.
+            conf_threshold (float): Confidence threshold for filtering detections.
         """
         # Build Ort session
         self.session = ort.InferenceSession(
@@ -38,281 +52,190 @@ class YOLOv8Seg:
         # Load COCO class names
         self.classes = yaml_load(check_yaml("coco8.yaml"))["names"]
 
-        # Create color palette
-        self.color_palette = Colors()
+        # Device
+        self.device = "cuda:0" if ort.get_device().lower() == "gpu" else "cpu"
 
-    def __call__(self, im0, conf_threshold=0.4, iou_threshold=0.45, nm=32):
+        # Confidence
+        self.conf_threshold = conf_threshold
+
+    def __call__(self, im0):
         """
-        The whole pipeline: pre-process -> inference -> post-process.
+        Runs inference on the input image using the ONNX model.
 
         Args:
-            im0 (Numpy.ndarray): original input image.
-            conf_threshold (float): confidence threshold for filtering predictions.
-            iou_threshold (float): iou threshold for NMS.
-            nm (int): the number of masks.
+            im0 (numpy.ndarray): The original input image in BGR format.
 
         Returns:
-            boxes (List): list of bounding boxes.
-            segments (List): list of segments.
-            masks (np.ndarray): [N, H, W], output masks.
+            list: Processed detection results after post-processing.
+
+        Example:
+            >>> detector = YOLOv8Seg("yolov8.onnx")
+            >>> results = detector(image)  # Runs inference and returns detections.
         """
         # Pre-process
-        im, ratio, (pad_w, pad_h) = self.preprocess(im0)
+        processed_image = self.preprocess(im0)
 
         # Ort inference
-        preds = self.session.run(None, {self.session.get_inputs()[0].name: im})
+        predictions = self.session.run(None, {self.session.get_inputs()[0].name: processed_image})
 
         # Post-process
-        boxes, segments, masks = self.postprocess(
-            preds,
-            im0=im0,
-            ratio=ratio,
-            pad_w=pad_w,
-            pad_h=pad_h,
-            conf_threshold=conf_threshold,
-            iou_threshold=iou_threshold,
-            nm=nm,
-        )
-        return boxes, segments, masks
+        results = self.postprocess(im0, processed_image, predictions)
 
-    def preprocess(self, img):
+        return results
+
+    def preprocess(self, image, new_shape: Union[Tuple, List] = (640, 640)):
         """
-        Pre-processes the input image.
+        Preprocesses the input image before feeding it into the model.
 
         Args:
-            img (Numpy.ndarray): image about to be processed.
+            image (np.ndarray): The input image in BGR format.
+            new_shape (Tuple or List, optional): The target shape for resizing. Defaults to (640, 640).
 
         Returns:
-            img_process (Numpy.ndarray): image preprocessed for inference.
-            ratio (tuple): width, height ratios in letterbox.
-            pad_w (float): width padding in letterbox.
-            pad_h (float): height padding in letterbox.
-        """
-        # Resize and pad input image using letterbox() (Borrowed from Ultralytics)
-        shape = img.shape[:2]  # original image shape
-        new_shape = (self.model_height, self.model_width)
-        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
-        ratio = r, r
-        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
-        pad_w, pad_h = (new_shape[1] - new_unpad[0]) / 2, (new_shape[0] - new_unpad[1]) / 2  # wh padding
-        if shape[::-1] != new_unpad:  # resize
-            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
-        top, bottom = int(round(pad_h - 0.1)), int(round(pad_h + 0.1))
-        left, right = int(round(pad_w - 0.1)), int(round(pad_w + 0.1))
-        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))
+            np.ndarray: Preprocessed image ready for model inference.
 
-        # Transforms: HWC to CHW -> BGR to RGB -> div(255) -> contiguous -> add axis(optional)
-        img = np.ascontiguousarray(np.einsum("HWC->CHW", img)[::-1], dtype=self.ndtype) / 255.0
-        img_process = img[None] if len(img.shape) == 3 else img
-        return img_process, ratio, (pad_w, pad_h)
-
-    def postprocess(self, preds, im0, ratio, pad_w, pad_h, conf_threshold, iou_threshold, nm=32):
+        Example:
+            >>> processed_img = model.preprocess(image)
         """
-        Post-process the prediction.
+        image, _, _ = self.__resize_and_pad_image(image=image, new_shape=new_shape)
+        image = self.__reshape_image(image=image)
+        processed_image = image[None] if len(image.shape) == 3 else image
+        return processed_image
+
+    def __reshape_image(self, image: np.ndarray) -> np.ndarray:
+        """
+        Reshapes the image by changing its layout and normalizing pixel values.
 
         Args:
-            preds (Numpy.ndarray): predictions come from ort.session.run().
-            im0 (Numpy.ndarray): [h, w, c] original input image.
-            ratio (tuple): width, height ratios in letterbox.
-            pad_w (float): width padding in letterbox.
-            pad_h (float): height padding in letterbox.
-            conf_threshold (float): conf threshold.
-            iou_threshold (float): iou threshold.
-            nm (int): the number of masks.
+            image (np.ndarray): The image to be reshaped.
 
         Returns:
-            boxes (List): list of bounding boxes.
-            segments (List): list of segments.
-            masks (np.ndarray): [N, H, W], output masks.
+            np.ndarray: Reshaped and normalized image.
+
+        Example:
+            >>> reshaped_img = model.__reshape_image(image)
         """
-        x, protos = preds[0], preds[1]  # Two outputs: predictions and protos
+        image = image.transpose([2, 0, 1])
+        image = image[np.newaxis, ...]
+        image = np.ascontiguousarray(image).astype(np.float32) / 255
+        return image
 
-        # Transpose dim 1: (Batch_size, xywh_conf_cls_nm, Num_anchors) -> (Batch_size, Num_anchors, xywh_conf_cls_nm)
-        x = np.einsum("bcn->bnc", x)
-
-        # Predictions filtering by conf-threshold
-        x = x[np.amax(x[..., 4:-nm], axis=-1) > conf_threshold]
-
-        # Create a new matrix which merge these(box, score, cls, nm) into one
-        # For more details about `numpy.c_()`: https://numpy.org/doc/1.26/reference/generated/numpy.c_.html
-        x = np.c_[x[..., :4], np.amax(x[..., 4:-nm], axis=-1), np.argmax(x[..., 4:-nm], axis=-1), x[..., -nm:]]
-
-        # NMS filtering
-        x = x[cv2.dnn.NMSBoxes(x[:, :4], x[:, 4], conf_threshold, iou_threshold)]
-
-        # Decode and return
-        if len(x) > 0:
-            # Bounding boxes format change: cxcywh -> xyxy
-            x[..., [0, 1]] -= x[..., [2, 3]] / 2
-            x[..., [2, 3]] += x[..., [0, 1]]
-
-            # Rescales bounding boxes from model shape(model_height, model_width) to the shape of original image
-            x[..., :4] -= [pad_w, pad_h, pad_w, pad_h]
-            x[..., :4] /= min(ratio)
-
-            # Bounding boxes boundary clamp
-            x[..., [0, 2]] = x[:, [0, 2]].clip(0, im0.shape[1])
-            x[..., [1, 3]] = x[:, [1, 3]].clip(0, im0.shape[0])
-
-            # Process masks
-            masks = self.process_mask(protos[0], x[:, 6:], x[:, :4], im0.shape)
-
-            # Masks -> Segments(contours)
-            segments = self.masks2segments(masks)
-            return x[..., :6], segments, masks  # boxes, segments, masks
-        else:
-            return [], [], []
-
-    @staticmethod
-    def masks2segments(masks):
+    def __resize_and_pad_image(
+        self, image: np.ndarray, new_shape: Union[Tuple, List] = (640, 640), color: Union[Tuple, List] = (114, 114, 114)
+    ):
         """
-        Takes a list of masks(n,h,w) and returns a list of segments(n,xy), from
-        https://github.com/ultralytics/ultralytics/blob/main/ultralytics/utils/ops.py.
+        Resizes and pads the input image while maintaining the aspect ratio.
 
         Args:
-            masks (numpy.ndarray): the output of the model, which is a tensor of shape (batch_size, 160, 160).
+            image (np.ndarray): The input image.
+            new_shape (Tuple or List, optional): Target shape (width, height). Defaults to (640, 640).
+            color (Tuple or List, optional): Padding color. Defaults to (114, 114, 114).
 
         Returns:
-            segments (List): list of segment masks.
-        """
-        segments = []
-        for x in masks.astype("uint8"):
-            c = cv2.findContours(x, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[0]  # CHAIN_APPROX_SIMPLE
-            if c:
-                c = np.array(c[np.array([len(x) for x in c]).argmax()]).reshape(-1, 2)
-            else:
-                c = np.zeros((0, 2))  # no segments found
-            segments.append(c.astype("float32"))
-        return segments
+            Tuple[np.ndarray, float, float]: The resized image along with padding values.
 
-    @staticmethod
-    def crop_mask(masks, boxes):
+        Example:
+            >>> resized_img, dw, dh = model.__resize_and_pad_image(image)
         """
-        Takes a mask and a bounding box, and returns a mask that is cropped to the bounding box, from
-        https://github.com/ultralytics/ultralytics/blob/main/ultralytics/utils/ops.py.
+        shape = image.shape[:2]  # original image shape
+
+        if isinstance(new_shape, int):
+            new_shape = (new_shape, new_shape)
+
+        # Scale ratio (new / old)
+        ratio = min(new_shape[0] / shape[1], new_shape[1] / shape[0])
+        new_unpad = int(round(shape[1] * ratio)), int(round(shape[0] * ratio))
+        delta_width, delta_height = new_shape[0] - new_unpad[0], new_shape[1] - new_unpad[1]
+
+        # Divide padding into 2 sides
+        delta_width /= 2
+        delta_height /= 2
+
+        image = cv2.resize(image, new_unpad, interpolation=cv2.INTER_LINEAR) if shape[::-1] != new_unpad else image
+        top, bottom = int(round(delta_height - 0.1)), int(round(delta_height + 0.1))
+        left, right = int(round(delta_width - 0.1)), int(round(delta_width + 0.1))
+        image = cv2.copyMakeBorder(image, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
+        return image, delta_width, delta_height
+
+    def postprocess(self, image, processed_image, predictions):
+        """
+        Post-processes model predictions to extract meaningful results.
 
         Args:
-            masks (Numpy.ndarray): [n, h, w] tensor of masks.
-            boxes (Numpy.ndarray): [n, 4] tensor of bbox coordinates in relative point form.
+            image (np.ndarray): The original input image.
+            processed_image (np.ndarray): The preprocessed image used for inference.
+            predictions (list): Model output predictions.
 
         Returns:
-            (Numpy.ndarray): The masks are being cropped to the bounding box.
+            list: Processed detection results.
+
+        Example:
+            >>> results = model.postprocess(image, processed_image, predictions)
         """
-        n, h, w = masks.shape
-        x1, y1, x2, y2 = np.split(boxes[:, :, None], 4, 1)
-        r = np.arange(w, dtype=x1.dtype)[None, None, :]
-        c = np.arange(h, dtype=x1.dtype)[None, :, None]
-        return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
-
-    def process_mask(self, protos, masks_in, bboxes, im0_shape):
-        """
-        Takes the output of the mask head, and applies the mask to the bounding boxes. This produces masks of higher
-        quality but is slower, from https://github.com/ultralytics/ultralytics/blob/main/ultralytics/utils/ops.py.
-
-        Args:
-            protos (numpy.ndarray): [mask_dim, mask_h, mask_w].
-            masks_in (numpy.ndarray): [n, mask_dim], n is number of masks after nms.
-            bboxes (numpy.ndarray): bboxes re-scaled to original image shape.
-            im0_shape (tuple): the size of the input image (h,w,c).
-
-        Returns:
-            (numpy.ndarray): The upsampled masks.
-        """
-        c, mh, mw = protos.shape
-        masks = np.matmul(masks_in, protos.reshape((c, -1))).reshape((-1, mh, mw)).transpose(1, 2, 0)  # HWN
-        masks = np.ascontiguousarray(masks)
-        masks = self.scale_mask(masks, im0_shape)  # re-scale mask from P3 shape to original input image shape
-        masks = np.einsum("HWN -> NHW", masks)  # HWN -> NHW
-        masks = self.crop_mask(masks, bboxes)
-        return np.greater(masks, 0.5)
-
-    @staticmethod
-    def scale_mask(masks, im0_shape, ratio_pad=None):
-        """
-        Takes a mask, and resizes it to the original image size, from
-        https://github.com/ultralytics/ultralytics/blob/main/ultralytics/utils/ops.py.
-
-        Args:
-            masks (np.ndarray): resized and padded masks/images, [h, w, num]/[h, w, 3].
-            im0_shape (tuple): the original image shape.
-            ratio_pad (tuple): the ratio of the padding to the original image.
-
-        Returns:
-            masks (np.ndarray): The masks that are being returned.
-        """
-        im1_shape = masks.shape[:2]
-        if ratio_pad is None:  # calculate from im0_shape
-            gain = min(im1_shape[0] / im0_shape[0], im1_shape[1] / im0_shape[1])  # gain = old / new
-            pad = (im1_shape[1] - im0_shape[1] * gain) / 2, (im1_shape[0] - im0_shape[0] * gain) / 2  # wh padding
-        else:
-            pad = ratio_pad[1]
-
-        # Calculate tlbr of mask
-        top, left = int(round(pad[1] - 0.1)), int(round(pad[0] - 0.1))  # y, x
-        bottom, right = int(round(im1_shape[0] - pad[1] + 0.1)), int(round(im1_shape[1] - pad[0] + 0.1))
-        if len(masks.shape) < 2:
-            raise ValueError(f'"len of masks shape" should be 2 or 3, but got {len(masks.shape)}')
-        masks = masks[top:bottom, left:right]
-        masks = cv2.resize(
-            masks, (im0_shape[1], im0_shape[0]), interpolation=cv2.INTER_LINEAR
-        )  # INTER_CUBIC would be better
-        if len(masks.shape) == 2:
-            masks = masks[:, :, None]
-        return masks
-
-    def draw_and_visualize(self, im, bboxes, segments, vis=False, save=True):
-        """
-        Draw and visualize results.
-
-        Args:
-            im (np.ndarray): original image, shape [h, w, c].
-            bboxes (numpy.ndarray): [n, 4], n is number of bboxes.
-            segments (List): list of segment masks.
-            vis (bool): imshow using OpenCV.
-            save (bool): save image annotated.
-
-        Returns:
-            None
-        """
-        # Draw rectangles and polygons
-        im_canvas = im.copy()
-        for (*box, conf, cls_), segment in zip(bboxes, segments):
-            # draw contour and fill mask
-            cv2.polylines(im, np.int32([segment]), True, (255, 255, 255), 2)  # white borderline
-            cv2.fillPoly(im_canvas, np.int32([segment]), self.color_palette(int(cls_), bgr=True))
-
-            # draw bbox rectangle
-            cv2.rectangle(
-                im,
-                (int(box[0]), int(box[1])),
-                (int(box[2]), int(box[3])),
-                self.color_palette(int(cls_), bgr=True),
-                1,
-                cv2.LINE_AA,
-            )
-            cv2.putText(
-                im,
-                f"{self.classes[cls_]}: {conf:.3f}",
-                (int(box[0]), int(box[1] - 9)),
-                cv2.FONT_HERSHEY_SIMPLEX,
-                0.7,
-                self.color_palette(int(cls_), bgr=True),
-                2,
-                cv2.LINE_AA,
-            )
-
-        # Mix image
-        im = cv2.addWeighted(im_canvas, 0.3, im, 0.7, 0)
-
-        # Show image
-        if vis:
-            cv2.imshow("demo", im)
-            cv2.waitKey(0)
-            cv2.destroyAllWindows()
-
-        # Save image
-        if save:
-            cv2.imwrite("demo.jpg", im)
+        torch_tensor_predictions = [torch.from_numpy(output) for output in predictions]
+        torch_tensor_boxes_confidence_category_predictions = torch_tensor_predictions[0]
+        masks_predictions_tensor = torch_tensor_predictions[1].to(self.device)
+
+        nms_boxes_confidence_category_predictions_tensor = ops.non_max_suppression(
+            torch_tensor_boxes_confidence_category_predictions,
+            conf_thres=self.conf_threshold,
+            nc=len(self.classes),
+            agnostic=False,
+            max_det=100,
+            max_time_img=0.001,
+            max_nms=1000,
+        )
+
+        results = []
+        for idx, predictions in enumerate(nms_boxes_confidence_category_predictions_tensor):
+            predictions = predictions.to(self.device)
+            masks = self.__process_mask(
+                masks_predictions_tensor[idx],
+                predictions[:, 6:],
+                predictions[:, :4],
+                processed_image.shape[2:],
+                upsample=True,
+            )  # HWC
+            predictions[:, :4] = ops.scale_boxes(processed_image.shape[2:], predictions[:, :4], image.shape)
+            results.append(Results(image, path="", names=self.classes, boxes=predictions[:, :6], masks=masks))
+
+        return results
+
+    def __process_mask(self, protos, masks_in, bboxes, shape, upsample=False):
+        """
+        Processes segmentation masks from the model output.
+
+        Args:
+            protos (torch.Tensor): The prototype mask predictions from the model.
+            masks_in (torch.Tensor): The raw mask predictions.
+            bboxes (torch.Tensor): Bounding boxes for the detected objects.
+            shape (Tuple): Target shape for mask resizing.
+            upsample (bool, optional): Whether to upscale masks to match the original image size. Defaults to False.
+
+        Returns:
+            torch.Tensor: Processed binary masks.
+
+        Example:
+            >>> masks = model.__process_mask(protos, masks_in, bboxes, shape, upsample=True)
+        """
+        c, mh, mw = protos.shape  # CHW
+        ih, iw = shape
+        masks = (masks_in @ protos.float().view(c, -1)).sigmoid().view(-1, mh, mw)  # CHW
+
+        width_ratio = mw / iw
+        height_ratio = mh / ih
+
+        downsampled_bboxes = bboxes.clone()
+        downsampled_bboxes[:, 0] *= width_ratio
+        downsampled_bboxes[:, 2] *= width_ratio
+        downsampled_bboxes[:, 3] *= height_ratio
+        downsampled_bboxes[:, 1] *= height_ratio
+
+        masks = ops.crop_mask(masks, downsampled_bboxes)  # CHW
+        if upsample:
+            masks = F.interpolate(masks[None], shape, mode="bilinear", align_corners=False)[0]  # CHW
+        return masks.gt_(0.5).to(self.device)
 
 
 if __name__ == "__main__":
@@ -321,18 +244,18 @@ if __name__ == "__main__":
     parser.add_argument("--model", type=str, required=True, help="Path to ONNX model")
     parser.add_argument("--source", type=str, default=str(ASSETS / "bus.jpg"), help="Path to input image")
     parser.add_argument("--conf", type=float, default=0.25, help="Confidence threshold")
-    parser.add_argument("--iou", type=float, default=0.45, help="NMS IoU threshold")
     args = parser.parse_args()
 
     # Build model
-    model = YOLOv8Seg(args.model)
+    model = YOLOv8Seg(args.model, args.conf)
 
     # Read image by OpenCV
     img = cv2.imread(args.source)
+    img = cv2.resize(img, (640, 640))  # Can be changed based on your model's expected size
 
     # Inference
-    boxes, segments, _ = model(img, conf_threshold=args.conf, iou_threshold=args.iou)
-
-    # Draw bboxes and polygons
-    if len(boxes) > 0:
-        model.draw_and_visualize(img, boxes, segments, vis=False, save=True)
+    results = model(img)
+    cv2.imshow("Segmented Image", results[0].plot())
+    cv2.waitKey(0)
+    cv2.destroyAllWindows()
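
For reference, a usage sketch of the rewritten class outside the argparse entry point; the module name main and the file names are illustrative:

    import cv2

    from main import YOLOv8Seg  # the example script shown in the diff above

    model = YOLOv8Seg("yolov8n-seg.onnx", conf_threshold=0.4)
    img = cv2.resize(cv2.imread("bus.jpg"), (640, 640))  # resize to the model's expected input size
    results = model(img)  # a list of ultralytics Results objects
    cv2.imwrite("segmented.jpg", results[0].plot())  # Results.plot() replaces the removed draw_and_visualize()

Returning Results objects gives callers plotting, saving, and box/mask accessors for free instead of the deleted manual OpenCV drawing code.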

ultralytics/__init__.py

@@ -1,6 +1,6 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
-__version__ = "8.3.76"
+__version__ = "8.3.77"
 
 import os
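
Since this release ships the rewritten example, a quick way to confirm the installed package matches (a sketch; check_version treats a bare version string as a minimum requirement):

    import ultralytics
    from ultralytics.utils.checks import check_version

    print(ultralytics.__version__)  # expected: 8.3.77 or newer
    assert check_version(ultralytics.__version__, "8.3.77")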