Add YoloTiny adapter
SergeyBoRss committed Dec 15, 2024
1 parent 04a416a commit a8b22fa
Showing 4 changed files with 127 additions and 6 deletions.
@@ -340,6 +340,7 @@ ssd300 | - | - | Bounding box: (380,165), (595,425) | Boundin
ssd512 | - | - | Bounding box: (377,163), (595,425) | Bounding box: (380,165), (595,425) |
ssd_mobilenet_v1_fpn_coco | - | - | Bounding boxes: (295, 131), (439, 291),<br> (375, 217), (582, 425),<br> (436, 153), (611, 301) | Bounding boxes: (295, 131), (439, 291),<br> (375, 217), (582, 425),<br> (436, 153), (611, 301) |
ssdlite_mobilenet_v2 | - | - | - | - |
yolo-v3-tiny-tf | - | - | Bounding boxes: (127, 161), (228, 325),<br> (43, 139), (127, 286), <br> (212, 147), (345, 341), <br> (175, 110), (251, 243) | Bounding boxes: (127, 161), (228, 325),<br> (43, 139), (127, 286), <br> (212, 147), (345, 341), <br> (175, 110), (251, 243) |

### Test image #2

@@ -366,6 +367,7 @@ ssd300 | - | - | Bounding box: (68,100), (336,452) | Bounding
ssd512 | - | - | Bounding box: (75,100), (355,445) | Bounding box: (75,100), (355,445)|
ssd_mobilenet_v1_fpn_coco | - | - | Bounding box: (89, 98), (345, 440)| Bounding box: (89, 98), (345, 440)|
ssdlite_mobilenet_v2 | - | - | Bounding box: (47, 59), (206, 272)| Bounding box: (47, 59), (206, 272)|
yolo-v3-tiny-tf | - | - | Bounding box: (39, 36), (324, 452)| Bounding box: (39, 36), (324, 452)|

### Test image #3

@@ -392,6 +394,7 @@ ssd300 | - | - | Bounding box: (80,155), (270,375) | Boundin
ssd512 | - | - | Bounding box: (75,170), (172,370) | Bounding box: (73,170), (173,371) |
ssd_mobilenet_v1_fpn_coco | - | - | Bounding box: (90, 135), (260, 375)| Bounding box: (90, 135), (260, 375)|
ssdlite_mobilenet_v2 | - | - | Bounding boxes: (74, 155), (242, 226), (75, 102), (242, 225)| Bounding boxes: (74, 155), (242, 226), (75, 102), (242, 225)|
yolo-v3-tiny-tf | - | - | Bounding boxes: (134, 105), (288, 319), <br>(127, 280), (299, 330)| Bounding boxes: (134, 105), (288, 319), <br>(127, 280), (299, 330)|

### Test image #4
Data source: [MS COCO][ms_coco]
@@ -424,6 +427,7 @@ pelee-coco |-|-| Bounding box:<br>TV (103, 41), (402, 289)<br>MOUSE (not detected)
retinanet-tf |-|-| Bounding box:<br>TV (104, 40), (390, 298)<br>MOUSE (507, 337), (559, 373)<br>KEYBOARD (231, 331), (497, 455) | Bounding box:<br>TV (104, 40), (390, 298)<br>MOUSE (507, 337), (559, 373)<br>KEYBOARD (231, 331), (497, 455)<br>|
ssd_resnet50_v1_fpn_coco |-|-| Bounding box:<br>TV (113, 40), (396, 305)<br>MOUSE (508, 337), (559, 373)<br>KEYBOARD (223, 340), (499, 461) | Bounding box:<br>TV (113, 40), (396, 305)<br>MOUSE (508, 337), (559, 373)<br>KEYBOARD (223, 340), (499, 461)<br>|
ssdlite_mobilenet_v2 |-|-| Bounding box:<br>TV (45, 23), (182, 181)<br>MOUSE (238, 209), (261, 229)<br>KEYBOARD (108, 212), (235, 287) | Bounding box:<br>TV (45, 23), (182, 181)<br>MOUSE (238, 209), (261, 229)<br>KEYBOARD (108, 212), (235, 287)<br>|
yolo-v3-tiny-tf |-|-| Bounding box:<br>TV (87, 29), (265, 267)<br>MOUSE (330, 292), (362, 332)<br>KEYBOARD (156, 305), (313, 388) | Bounding box:<br>TV (87, 29), (265, 267)<br>MOUSE (330, 292), (362, 332)<br>KEYBOARD (156, 305), (313, 388)<br>|

### Test image #5
Data source: [MS COCO][ms_coco]
@@ -448,6 +452,7 @@ pelee-coco |-|-| Bounding box:<br>PERSON (95, 72), (207, 397)
retinanet-tf |-|-| Bounding box:<br>PERSON (90, 73), (205, 384)<br>HORSE (145, 61), (542, 378) | Bounding box:<br>PERSON (90, 73), (205, 384)<br>HORSE (145, 61), (542, 378)<br>|
ssd_resnet50_v1_fpn_coco |-|-| Bounding box:<br>PERSON (not detected)<br>HORSE (134, 57), (534, 389) | Bounding box:<br>PERSON (not detected)<br>HORSE (134, 57), (534, 389)<br>|
ssdlite_mobilenet_v2 |-|-| Bounding box:<br>PERSON (43, 48), (98, 281)<br>HORSE (57, 42), (251, 271) | Bounding box:<br>PERSON (43, 48), (98, 281)<br>HORSE (57, 42), (251, 271)<br>|
yolo-v3-tiny-tf |-|-| Bounding box:<br>HORSE (74, 44), (352, 382) | Bounding box:<br>HORSE (74, 44), (352, 382)<br>|


### Test image #6
@@ -467,6 +472,7 @@ Bounding box (upper left and bottom right corners):<br>AEROPLANE (131, 21), (24
efficientdet-d0-tf |-|-| Bounding box:<br>AIRPLANE (64, 173), (449, 333)<br>| Bounding box:<br>AIRPLANE (64, 173), (449, 333)<br>|
efficientdet-d1-tf |-|-| Bounding box:<br>AIRPLANE (71, 212), (551, 412)<br>| Bounding box:<br>AIRPLANE (71, 212), (551, 412)<br>|
yolo-v1-tiny-tf |-|-| Bounding box:<br>AEROPLANE (131, 21), (248, 414)<br>| Bounding box:<br>AEROPLANE (131, 21), (248, 414)<br>|
yolo-v3-tiny-tf |-|-| Bounding box:<br>AEROPLANE (-16, 138), (438, 281)<br>| Bounding box:<br>AEROPLANE (-16, 138), (438, 281)<br>|

### Test image #7
Data source: [WIDER FACE Dataset][wider_face_dataset]
2 changes: 1 addition & 1 deletion src/inference/inference_openvino_async_mode.py
@@ -137,7 +137,7 @@ def cli_argument_parser():
'person-detection-action-recognition-teacher', 'driver-action-recognition-encoder',
'reidentification', 'driver-action-recognition-decoder', 'action-recognition-decoder',
'face-detection', 'mask-rcnn', 'yolo_tiny_voc', 'yolo_v2_voc', 'yolo_v2_coco',
'yolo_v2_tiny_coco', 'yolo_v3', 'yolo_v3_tf'],
'yolo_v2_tiny_coco', 'yolo_v3', 'yolo_v3_tf', 'retinanet-tf', 'yolo_v3_tiny'],
default='feedforward',
type=str,
dest='task')
2 changes: 1 addition & 1 deletion src/inference/inference_openvino_sync_mode.py
@@ -122,7 +122,7 @@ def cli_argument_parser():
'action-recognition-encoder', 'driver-action-recognition-encoder', 'reidentification',
'driver-action-recognition-decoder', 'action-recognition-decoder', 'face-detection',
'mask-rcnn', 'yolo_tiny_voc', 'yolo_v2_voc', 'yolo_v2_coco', 'yolo_v2_tiny_coco',
'yolo_v3', 'yolo_v3_tf'],
'yolo_v3', 'yolo_v3_tf', 'retinanet-tf', 'yolo_v3_tiny'],
default='feedforward',
type=str,
dest='task')
123 changes: 119 additions & 4 deletions src/inference/io_adapter.py
@@ -336,6 +336,8 @@ def get_io_adapter(args, io_model_wrapper, transformer):
return MiniFASNetV2TFLiteCppIO(args, io_model_wrapper, transformer)
elif task == 'retinanet-tf':
return RetinaNetDetectionIO(args, io_model_wrapper, transformer)
elif task == 'yolo_v3_tiny':
return YoloV3TinyIO(args, io_model_wrapper, transformer)


class FeedForwardIO(IOAdapter):
@@ -1797,7 +1799,7 @@ def _get_anchors(self):
def _get_shapes(self):
pass

def __non_max_supression(self, predictions, score_threshold, nms_threshold):
def _non_max_supression(self, predictions, score_threshold, nms_threshold):
predictions.sort(key=lambda prediction: prediction[0], reverse=True)
valid_detections = []
while len(predictions) > 0:
@@ -1832,7 +1834,7 @@ def __non_max_supression(self, predictions, score_threshold, nms_threshold):
return valid_detections

@staticmethod
def __print_detections(detections, labels_map, image, scales, orig_shape, batch, log):
def _print_detections(detections, labels_map, image, scales, orig_shape, batch, log):
image = cv2.resize(image, orig_shape)
for detection in detections:
left = int(detection[2][0] * scales['W'])
@@ -1843,8 +1845,8 @@ def __print_detections(detections, labels_map, image, scales, orig_shape, batch,
color = (min(int(class_id / 25 % 5) * 50, 255), min(int(class_id / 5 % 5) * 50, 255),
min(int(class_id % 5) * 50, 255))
log.info('Bounding boxes for image {0} for object {1}'.format(batch, class_id))
log.info('Top left: ({0}, {1})'.format(top, left))
log.info('Bottom right: ({0}, {1})'.format(bottom, right))
log.info('Top left: ({0}, {1})'.format(left, top))
log.info('Bottom right: ({0}, {1})'.format(right, bottom))
label = '<' + labels_map[class_id] + '>'
image = cv2.rectangle(image, (left, top), (right, bottom), color, 3)
label_size, base_line = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.75, 1)
@@ -2055,6 +2057,119 @@ def _get_cell_predictions(self, cx, cy, dx, dy, detection, anchor_box_number, im

return predictions

class YoloV3TinyIO(yolo):
def __init__(self, args, io_model_wrapper, transformer):
super().__init__(args, io_model_wrapper, transformer)
self.load_labels_map('mscoco_names.txt')

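# Standard yolo-v3-tiny anchor priors as (width, height) pairs in input-image
# pixels: the first set feeds the coarse 13x13 output, the second the 26x26 one.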
def _get_anchors(self):
return [
((81, 82), (135, 169), (344, 319)),
((23, 27), (37, 58), (81, 82)),
]

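# Per-scale output layout: (3 anchors, 4 box offsets + objectness + 80 COCO
# classes = 85 attributes, grid size, grid size).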
def _get_shapes(self):
return [
(3, 85, 13, 13),
(3, 85, 26, 26),
]

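# Decode both output grids into candidate detections, prune them with NMS,
# then draw and save the surviving boxes for each image in the batch.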
def process_output(self, result, log):
if self._is_result_invalid(result):
log.warning('Model output is processed only when the number of iterations is 1')
return

anchors = self._get_anchors()
shapes = self._get_shapes()
outputs = [
result.get('conv2d_9/Conv2D/YoloRegion'),
result.get('conv2d_12/Conv2D/YoloRegion'),
]

if outputs[0] is None or outputs[1] is None:
log.warning('Expected output layers not found in the result')
return

input_layer_name = next(iter(self._input))
input_ = self._input[input_layer_name]
ib, h, w, c = input_.shape

b = outputs[0].shape[0]
images = np.empty((b, h, w, c), dtype=input_.dtype)

for i in range(b):
images[i] = input_[i % ib]

for batch in range(b):
image = images[batch].copy()
predictions = []
orig_h, orig_w = self._original_shapes[next(iter(self._original_shapes))][batch % ib]
scales = {'W': orig_w / w, 'H': orig_h / h}

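# Walk every output scale, anchor and grid cell, decoding each cell's raw
# offsets into candidate boxes.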
for output, shape, anchor_set in zip(outputs, shapes, anchors):
num_anchors, num_attributes, grid_size_x, grid_size_y = shape
output = output[batch].reshape(num_anchors, num_attributes, grid_size_x, grid_size_y)

for anchor_idx in range(num_anchors):
for cx in range(grid_size_x):
for cy in range(grid_size_y):
detection = output[anchor_idx, :, cy, cx]
prediction = self._get_cell_predictions(
cx, cy, grid_size_x, grid_size_y,
detection, anchor_idx,
h, w,
anchor_set,
scales
)

if prediction:
predictions.extend(prediction)

valid_detections = self._non_max_supression(predictions, self._threshold, 0.2)

processed_image = self._print_detections(
valid_detections,
self._labels_map,
image,
scales,
(orig_w, orig_h),
batch,
log,
)

out_img = Path(__file__).parent / f'out_detection_{batch + 1}.bmp'
cv2.imwrite(str(out_img), processed_image)
log.info(f"Result image was saved to {out_img}")

def _get_cell_predictions(self, cx, cy, dx, dy, detection, anchor_box_number, image_height, image_width, anchors, scales):
tx, ty, tw, th, box_score = detection[:5]
class_logits = detection[5:]

bbox_center_x = (cx + self._sigmoid(tx)) * (image_width / dx)
bbox_center_y = (cy + self._sigmoid(ty)) * (image_height / dy)

prior_width, prior_height = anchors[anchor_box_number]
bbox_width = np.exp(tw) * prior_width * scales['W']
bbox_height = np.exp(th) * prior_height * scales['H']

box_confidence = self._sigmoid(box_score)

class_probs = self._sigmoid(class_logits)
class_confidences = box_confidence * class_probs

predictions = []
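# Keep every class whose combined confidence clears the hard-coded 0.5
# cut-off; overlapping boxes are pruned later by NMS in process_output.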
for class_id, confidence in enumerate(class_confidences):
if confidence >= 0.5:
bbox = [
float(bbox_center_x - bbox_width / 2),
float(bbox_center_y - bbox_height / 2),
float(bbox_width),
float(bbox_height),
]
predictions.append([confidence, class_id, bbox])

return predictions if predictions else None


class YoloV7(IOAdapter):
def process_output(self, result, log, threshold=0.5):
Expand Down

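For readers who want to sanity-check the decode in `_get_cell_predictions` outside the adapter, here is a minimal self-contained sketch of the same YOLOv3 box parameterization. The offsets, cell indices, and 416x416 input size are illustrative assumptions for this sketch, not values taken from the commit; only the anchor pair comes from `_get_anchors` above.

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# Illustrative raw outputs for one cell of the coarse 13x13 grid (made-up values).
tx, ty, tw, th, objectness = 0.4, -0.2, 0.3, 0.1, 2.0
cx, cy = 6, 6                # cell indices on the grid
grid = 13                    # 13x13 output scale
input_w = input_h = 416      # assumed yolo-v3-tiny input resolution
prior_w, prior_h = 81, 82    # first anchor of the coarse scale

# Center: sigmoid-squashed offsets inside the cell, mapped to input pixels.
center_x = (cx + sigmoid(tx)) * (input_w / grid)
center_y = (cy + sigmoid(ty)) * (input_h / grid)

# Size: exponentiated offsets scale the anchor priors (already in pixels).
box_w = np.exp(tw) * prior_w
box_h = np.exp(th) * prior_h

# Objectness gates the per-class probabilities, as in the adapter.
box_conf = sigmoid(objectness)

print(f'center=({center_x:.1f}, {center_y:.1f}) '
      f'size=({box_w:.1f}, {box_h:.1f}) conf={box_conf:.2f}')
```

Unlike this sketch, the adapter also multiplies the decoded width and height by `scales['W']` and `scales['H']`, mapping boxes toward the original image resolution before NMS runs.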