DepthAI v2 has been superseded by DepthAI v3. You are viewing legacy documentation.
RGB & TinyYolo with spatial data

This example shows how to run YOLO on the RGB input frame and how to display the RGB preview, the detections, the depth map, and the spatial information (X, Y, Z) at the same time. It is similar to the RGB & MobilenetSSD with spatial data example, except that it runs a Tiny YOLO network. The X, Y, Z coordinates are relative to the center of the depth map. Correct decoding requires configurable, network-dependent parameters (a configuration sketch follows the list):
  • setNumClasses() - number of YOLO classes
  • setCoordinateSize() - size of each box coordinate set (4 for YOLO: x, y, w, h)
  • setAnchors() - YOLO anchors
  • setAnchorMasks() - anchorMasks26, anchorMasks13 (anchorMasks52 additionally for full YOLOv4)
  • setIouThreshold() - intersection-over-union threshold
  • setConfidenceThreshold() - confidence threshold above which objects are detected
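
For the Tiny YOLO v3/v4 models used here, the decoding configuration looks as follows. This is a minimal sketch using the same values as the full source below; the node name yolo is arbitrary:

# Minimal sketch: YOLO decoding parameters for Tiny YOLO v3/v4
# (values match the full example further down this page)
import depthai as dai

pipeline = dai.Pipeline()
yolo = pipeline.create(dai.node.YoloSpatialDetectionNetwork)
yolo.setNumClasses(80)       # 80 COCO classes
yolo.setCoordinateSize(4)    # x, y, w, h per bounding box
yolo.setAnchors([10,14, 23,27, 37,58, 81,82, 135,169, 344,319])
yolo.setAnchorMasks({"side26": [1,2,3], "side13": [3,4,5]})
yolo.setIouThreshold(0.5)
yolo.setConfidenceThreshold(0.5)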

Similar samples:
  • RGB & MobilenetSSD with spatial data

Demo

Setup

This example requires the DepthAI v2 API, see installation instructions.
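
If your environment is not set up yet, the v2 library is published on PyPI, and the install_requirements.py script referenced by the example's error message downloads the model blobs. A sketch, assuming you run it from the examples directory of a depthai-python checkout:

python3 -m pip install depthai
python3 install_requirements.py   # also fetches the .blob model files used below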

Source code

Python

#!/usr/bin/env python3

from pathlib import Path
import sys
import cv2
import depthai as dai
import numpy as np
import time

'''
Spatial Tiny-yolo example
  Performs inference on RGB camera and retrieves spatial location coordinates: x,y,z relative to the center of depth map.
  Can be used for tiny-yolo-v3 or tiny-yolo-v4 networks
'''

# Get argument first
nnBlobPath = str((Path(__file__).parent / Path('../models/yolo-v4-tiny-tf_openvino_2021.4_6shave.blob')).resolve().absolute())
if 1 < len(sys.argv):
    arg = sys.argv[1]
    if arg == "yolo3":
        nnBlobPath = str((Path(__file__).parent / Path('../models/yolo-v3-tiny-tf_openvino_2021.4_6shave.blob')).resolve().absolute())
    elif arg == "yolo4":
        nnBlobPath = str((Path(__file__).parent / Path('../models/yolo-v4-tiny-tf_openvino_2021.4_6shave.blob')).resolve().absolute())
    else:
        nnBlobPath = arg
else:
    print("Using Tiny YoloV4 model. If you wish to use Tiny YOLOv3, call 'tiny_yolo.py yolo3'")

if not Path(nnBlobPath).exists():
    raise FileNotFoundError(f'Required file/s not found, please run "{sys.executable} install_requirements.py"')

# Tiny yolo v3/4 label texts
labelMap = [
    "person",         "bicycle",    "car",           "motorbike",     "aeroplane",   "bus",           "train",
    "truck",          "boat",       "traffic light", "fire hydrant",  "stop sign",   "parking meter", "bench",
    "bird",           "cat",        "dog",           "horse",         "sheep",       "cow",           "elephant",
    "bear",           "zebra",      "giraffe",       "backpack",      "umbrella",    "handbag",       "tie",
    "suitcase",       "frisbee",    "skis",          "snowboard",     "sports ball", "kite",          "baseball bat",
    "baseball glove", "skateboard", "surfboard",     "tennis racket", "bottle",      "wine glass",    "cup",
    "fork",           "knife",      "spoon",         "bowl",          "banana",      "apple",         "sandwich",
    "orange",         "broccoli",   "carrot",        "hot dog",       "pizza",       "donut",         "cake",
    "chair",          "sofa",       "pottedplant",   "bed",           "diningtable", "toilet",        "tvmonitor",
    "laptop",         "mouse",      "remote",        "keyboard",      "cell phone",  "microwave",     "oven",
    "toaster",        "sink",       "refrigerator",  "book",          "clock",       "vase",          "scissors",
    "teddy bear",     "hair drier", "toothbrush"
]

syncNN = True

# Create pipeline
pipeline = dai.Pipeline()

# Define sources and outputs
camRgb = pipeline.create(dai.node.ColorCamera)
spatialDetectionNetwork = pipeline.create(dai.node.YoloSpatialDetectionNetwork)
monoLeft = pipeline.create(dai.node.MonoCamera)
monoRight = pipeline.create(dai.node.MonoCamera)
stereo = pipeline.create(dai.node.StereoDepth)
nnNetworkOut = pipeline.create(dai.node.XLinkOut)

xoutRgb = pipeline.create(dai.node.XLinkOut)
xoutNN = pipeline.create(dai.node.XLinkOut)
xoutDepth = pipeline.create(dai.node.XLinkOut)

xoutRgb.setStreamName("rgb")
xoutNN.setStreamName("detections")
xoutDepth.setStreamName("depth")
nnNetworkOut.setStreamName("nnNetwork")

# Properties
camRgb.setPreviewSize(416, 416)
camRgb.setResolution(dai.ColorCameraProperties.SensorResolution.THE_1080_P)
camRgb.setInterleaved(False)
camRgb.setColorOrder(dai.ColorCameraProperties.ColorOrder.BGR)

monoLeft.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
monoLeft.setCamera("left")
monoRight.setResolution(dai.MonoCameraProperties.SensorResolution.THE_400_P)
monoRight.setCamera("right")

# Setting node configs
stereo.setDefaultProfilePreset(dai.node.StereoDepth.PresetMode.HIGH_DENSITY)
# Align depth map to the perspective of RGB camera, on which inference is done
stereo.setDepthAlign(dai.CameraBoardSocket.CAM_A)
stereo.setOutputSize(monoLeft.getResolutionWidth(), monoLeft.getResolutionHeight())
stereo.setSubpixel(True)

spatialDetectionNetwork.setBlobPath(nnBlobPath)
spatialDetectionNetwork.setConfidenceThreshold(0.5)
spatialDetectionNetwork.input.setBlocking(False)
spatialDetectionNetwork.setBoundingBoxScaleFactor(0.5)
spatialDetectionNetwork.setDepthLowerThreshold(100)
spatialDetectionNetwork.setDepthUpperThreshold(5000)

# Yolo specific parameters
spatialDetectionNetwork.setNumClasses(80)
spatialDetectionNetwork.setCoordinateSize(4)
spatialDetectionNetwork.setAnchors([10,14, 23,27, 37,58, 81,82, 135,169, 344,319])
spatialDetectionNetwork.setAnchorMasks({ "side26": [1,2,3], "side13": [3,4,5] })
spatialDetectionNetwork.setIouThreshold(0.5)

# Linking
monoLeft.out.link(stereo.left)
monoRight.out.link(stereo.right)

camRgb.preview.link(spatialDetectionNetwork.input)
if syncNN:
    spatialDetectionNetwork.passthrough.link(xoutRgb.input)
else:
    camRgb.preview.link(xoutRgb.input)

spatialDetectionNetwork.out.link(xoutNN.input)

stereo.depth.link(spatialDetectionNetwork.inputDepth)
spatialDetectionNetwork.passthroughDepth.link(xoutDepth.input)
spatialDetectionNetwork.outNetwork.link(nnNetworkOut.input)

# Connect to device and start pipeline
with dai.Device(pipeline) as device:

    # Output queues will be used to get the rgb frames and nn data from the outputs defined above
    previewQueue = device.getOutputQueue(name="rgb", maxSize=4, blocking=False)
    detectionNNQueue = device.getOutputQueue(name="detections", maxSize=4, blocking=False)
    depthQueue = device.getOutputQueue(name="depth", maxSize=4, blocking=False)
    networkQueue = device.getOutputQueue(name="nnNetwork", maxSize=4, blocking=False)

    startTime = time.monotonic()
    counter = 0
    fps = 0
    color = (255, 255, 255)
    printOutputLayersOnce = True

    while True:
        inPreview = previewQueue.get()
        inDet = detectionNNQueue.get()
        depth = depthQueue.get()
        inNN = networkQueue.get()

        if printOutputLayersOnce:
            toPrint = 'Output layer names:'
            for ten in inNN.getAllLayerNames():
                toPrint = f'{toPrint} {ten},'
            print(toPrint)
            printOutputLayersOnce = False

        frame = inPreview.getCvFrame()
        depthFrame = depth.getFrame()  # depthFrame values are in millimeters

        # Subsample the depth frame (every 4th row) for faster percentile computation,
        # then stretch the valid depth range to 0..255 for visualization
        depth_downscaled = depthFrame[::4]
        if np.all(depth_downscaled == 0):
            min_depth = 0  # Set a default minimum depth value when all elements are zero
        else:
            min_depth = np.percentile(depth_downscaled[depth_downscaled != 0], 1)
        max_depth = np.percentile(depth_downscaled, 99)
        depthFrameColor = np.interp(depthFrame, (min_depth, max_depth), (0, 255)).astype(np.uint8)
        depthFrameColor = cv2.applyColorMap(depthFrameColor, cv2.COLORMAP_HOT)

        counter += 1
        current_time = time.monotonic()
        if (current_time - startTime) > 1:
            fps = counter / (current_time - startTime)
            counter = 0
            startTime = current_time

        detections = inDet.detections

        # If the frame is available, draw bounding boxes on it and show the frame
        height = frame.shape[0]
        width = frame.shape[1]
        for detection in detections:
            # Draw the depth ROI used for the spatial calculation on the depth map
            roiData = detection.boundingBoxMapping
            roi = roiData.roi
            roi = roi.denormalize(depthFrameColor.shape[1], depthFrameColor.shape[0])
            topLeft = roi.topLeft()
            bottomRight = roi.bottomRight()
            xmin = int(topLeft.x)
            ymin = int(topLeft.y)
            xmax = int(bottomRight.x)
            ymax = int(bottomRight.y)
            cv2.rectangle(depthFrameColor, (xmin, ymin), (xmax, ymax), color, 1)

            # Denormalize bounding box
            x1 = int(detection.xmin * width)
            x2 = int(detection.xmax * width)
            y1 = int(detection.ymin * height)
            y2 = int(detection.ymax * height)
            try:
                label = labelMap[detection.label]
            except IndexError:
                label = detection.label
            cv2.putText(frame, str(label), (x1 + 10, y1 + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.putText(frame, "{:.2f}".format(detection.confidence*100), (x1 + 10, y1 + 35), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.putText(frame, f"X: {int(detection.spatialCoordinates.x)} mm", (x1 + 10, y1 + 50), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.putText(frame, f"Y: {int(detection.spatialCoordinates.y)} mm", (x1 + 10, y1 + 65), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
            cv2.putText(frame, f"Z: {int(detection.spatialCoordinates.z)} mm", (x1 + 10, y1 + 80), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)

            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)

        cv2.putText(frame, "NN fps: {:.2f}".format(fps), (2, frame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color)
        cv2.imshow("depth", depthFrameColor)
        cv2.imshow("rgb", frame)

        if cv2.waitKey(1) == ord('q'):
            break
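
Assuming the script is saved as tiny_yolo.py (the name used in its own help message), it can be run with an optional argument that selects the model:

python3 tiny_yolo.py                     # default: Tiny YOLOv4
python3 tiny_yolo.py yolo3               # Tiny YOLOv3
python3 tiny_yolo.py path/to/model.blob  # custom blob

Press q in one of the preview windows to exit.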

C++

#include <atomic>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <sstream>

#include "utility.hpp"

// Includes common necessary includes for development using depthai library
#include "depthai/depthai.hpp"

static const std::vector<std::string> labelMap = {
    "person",        "bicycle",      "car",           "motorbike",     "aeroplane",   "bus",         "train",       "truck",        "boat",
    "traffic light", "fire hydrant", "stop sign",     "parking meter", "bench",       "bird",        "cat",         "dog",          "horse",
    "sheep",         "cow",          "elephant",      "bear",          "zebra",       "giraffe",     "backpack",    "umbrella",     "handbag",
    "tie",           "suitcase",     "frisbee",       "skis",          "snowboard",   "sports ball", "kite",        "baseball bat", "baseball glove",
    "skateboard",    "surfboard",    "tennis racket", "bottle",        "wine glass",  "cup",         "fork",        "knife",        "spoon",
    "bowl",          "banana",       "apple",         "sandwich",      "orange",      "broccoli",    "carrot",      "hot dog",      "pizza",
    "donut",         "cake",         "chair",         "sofa",          "pottedplant", "bed",         "diningtable", "toilet",       "tvmonitor",
    "laptop",        "mouse",        "remote",        "keyboard",      "cell phone",  "microwave",   "oven",        "toaster",      "sink",
    "refrigerator",  "book",         "clock",         "vase",          "scissors",    "teddy bear",  "hair drier",  "toothbrush"};

static std::atomic<bool> syncNN{true};

int main(int argc, char** argv) {
    using namespace std;
    using namespace std::chrono;
    std::string nnPath(BLOB_PATH);

    // If path to blob specified, use that
    if(argc > 1) {
        nnPath = std::string(argv[1]);
    }

    // Print which blob we are using
    printf("Using blob at path: %s\n", nnPath.c_str());

    // Create pipeline
    dai::Pipeline pipeline;

    // Define sources and outputs
    auto camRgb = pipeline.create<dai::node::ColorCamera>();
    auto spatialDetectionNetwork = pipeline.create<dai::node::YoloSpatialDetectionNetwork>();
    auto monoLeft = pipeline.create<dai::node::MonoCamera>();
    auto monoRight = pipeline.create<dai::node::MonoCamera>();
    auto stereo = pipeline.create<dai::node::StereoDepth>();

    auto xoutRgb = pipeline.create<dai::node::XLinkOut>();
    auto xoutNN = pipeline.create<dai::node::XLinkOut>();
    auto xoutDepth = pipeline.create<dai::node::XLinkOut>();
    auto nnNetworkOut = pipeline.create<dai::node::XLinkOut>();

    xoutRgb->setStreamName("rgb");
    xoutNN->setStreamName("detections");
    xoutDepth->setStreamName("depth");
    nnNetworkOut->setStreamName("nnNetwork");

    // Properties
    camRgb->setPreviewSize(416, 416);
    camRgb->setResolution(dai::ColorCameraProperties::SensorResolution::THE_1080_P);
    camRgb->setInterleaved(false);
    camRgb->setColorOrder(dai::ColorCameraProperties::ColorOrder::BGR);

    monoLeft->setResolution(dai::MonoCameraProperties::SensorResolution::THE_400_P);
    monoLeft->setCamera("left");
    monoRight->setResolution(dai::MonoCameraProperties::SensorResolution::THE_400_P);
    monoRight->setCamera("right");

    // Setting node configs
    stereo->setDefaultProfilePreset(dai::node::StereoDepth::PresetMode::HIGH_DENSITY);
    // Align depth map to the perspective of RGB camera, on which inference is done
    stereo->setDepthAlign(dai::CameraBoardSocket::CAM_A);
    stereo->setOutputSize(monoLeft->getResolutionWidth(), monoLeft->getResolutionHeight());

    spatialDetectionNetwork->setBlobPath(nnPath);
    spatialDetectionNetwork->setConfidenceThreshold(0.5f);
    spatialDetectionNetwork->input.setBlocking(false);
    spatialDetectionNetwork->setBoundingBoxScaleFactor(0.5);
    spatialDetectionNetwork->setDepthLowerThreshold(100);
    spatialDetectionNetwork->setDepthUpperThreshold(5000);

    // Yolo specific parameters
    spatialDetectionNetwork->setNumClasses(80);
    spatialDetectionNetwork->setCoordinateSize(4);
    spatialDetectionNetwork->setAnchors({10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319});
    spatialDetectionNetwork->setAnchorMasks({{"side26", {1, 2, 3}}, {"side13", {3, 4, 5}}});
    spatialDetectionNetwork->setIouThreshold(0.5f);

    // Linking
    monoLeft->out.link(stereo->left);
    monoRight->out.link(stereo->right);

    camRgb->preview.link(spatialDetectionNetwork->input);
    if(syncNN) {
        spatialDetectionNetwork->passthrough.link(xoutRgb->input);
    } else {
        camRgb->preview.link(xoutRgb->input);
    }

    spatialDetectionNetwork->out.link(xoutNN->input);

    stereo->depth.link(spatialDetectionNetwork->inputDepth);
    spatialDetectionNetwork->passthroughDepth.link(xoutDepth->input);
    spatialDetectionNetwork->outNetwork.link(nnNetworkOut->input);

    // Connect to device and start pipeline
    dai::Device device(pipeline);

    // Output queues will be used to get the rgb frames and nn data from the outputs defined above
    auto previewQueue = device.getOutputQueue("rgb", 4, false);
    auto detectionNNQueue = device.getOutputQueue("detections", 4, false);
    auto depthQueue = device.getOutputQueue("depth", 4, false);
    auto networkQueue = device.getOutputQueue("nnNetwork", 4, false);

    auto startTime = steady_clock::now();
    int counter = 0;
    float fps = 0;
    auto color = cv::Scalar(255, 255, 255);
    bool printOutputLayersOnce = true;

    while(true) {
        auto imgFrame = previewQueue->get<dai::ImgFrame>();
        auto inDet = detectionNNQueue->get<dai::SpatialImgDetections>();
        auto depth = depthQueue->get<dai::ImgFrame>();
        auto inNN = networkQueue->get<dai::NNData>();

        if(printOutputLayersOnce && inNN) {
            std::cout << "Output layer names: ";
            for(const auto& ten : inNN->getAllLayerNames()) {
                std::cout << ten << ", ";
            }
            std::cout << std::endl;
            printOutputLayersOnce = false;
        }

        cv::Mat frame = imgFrame->getCvFrame();
        cv::Mat depthFrame = depth->getFrame();  // depthFrame values are in millimeters

        // Normalize and colorize the depth map for visualization
        cv::Mat depthFrameColor;
        cv::normalize(depthFrame, depthFrameColor, 255, 0, cv::NORM_INF, CV_8UC1);
        cv::equalizeHist(depthFrameColor, depthFrameColor);
        cv::applyColorMap(depthFrameColor, depthFrameColor, cv::COLORMAP_HOT);

        counter++;
        auto currentTime = steady_clock::now();
        auto elapsed = duration_cast<duration<float>>(currentTime - startTime);
        if(elapsed > seconds(1)) {
            fps = counter / elapsed.count();
            counter = 0;
            startTime = currentTime;
        }

        auto detections = inDet->detections;

        for(const auto& detection : detections) {
            // Draw the depth ROI used for the spatial calculation on the depth map
            auto roiData = detection.boundingBoxMapping;
            auto roi = roiData.roi;
            roi = roi.denormalize(depthFrameColor.cols, depthFrameColor.rows);
            auto topLeft = roi.topLeft();
            auto bottomRight = roi.bottomRight();
            auto xmin = (int)topLeft.x;
            auto ymin = (int)topLeft.y;
            auto xmax = (int)bottomRight.x;
            auto ymax = (int)bottomRight.y;
            cv::rectangle(depthFrameColor, cv::Rect(cv::Point(xmin, ymin), cv::Point(xmax, ymax)), color, 1);

            // Denormalize bounding box to frame coordinates
            int x1 = detection.xmin * frame.cols;
            int y1 = detection.ymin * frame.rows;
            int x2 = detection.xmax * frame.cols;
            int y2 = detection.ymax * frame.rows;

            uint32_t labelIndex = detection.label;
            std::string labelStr = to_string(labelIndex);
            if(labelIndex < labelMap.size()) {
                labelStr = labelMap[labelIndex];
            }
            cv::putText(frame, labelStr, cv::Point(x1 + 10, y1 + 20), cv::FONT_HERSHEY_TRIPLEX, 0.5, 255);
            std::stringstream confStr;
            confStr << std::fixed << std::setprecision(2) << detection.confidence * 100;
            cv::putText(frame, confStr.str(), cv::Point(x1 + 10, y1 + 35), cv::FONT_HERSHEY_TRIPLEX, 0.5, 255);

            std::stringstream depthX;
            depthX << "X: " << (int)detection.spatialCoordinates.x << " mm";
            cv::putText(frame, depthX.str(), cv::Point(x1 + 10, y1 + 50), cv::FONT_HERSHEY_TRIPLEX, 0.5, 255);
            std::stringstream depthY;
            depthY << "Y: " << (int)detection.spatialCoordinates.y << " mm";
            cv::putText(frame, depthY.str(), cv::Point(x1 + 10, y1 + 65), cv::FONT_HERSHEY_TRIPLEX, 0.5, 255);
            std::stringstream depthZ;
            depthZ << "Z: " << (int)detection.spatialCoordinates.z << " mm";
            cv::putText(frame, depthZ.str(), cv::Point(x1 + 10, y1 + 80), cv::FONT_HERSHEY_TRIPLEX, 0.5, 255);

            cv::rectangle(frame, cv::Rect(cv::Point(x1, y1), cv::Point(x2, y2)), color, 2);
        }

        std::stringstream fpsStr;
        fpsStr << std::fixed << std::setprecision(2) << fps;
        cv::putText(frame, fpsStr.str(), cv::Point(2, imgFrame->getHeight() - 4), cv::FONT_HERSHEY_TRIPLEX, 0.4, color);

        cv::imshow("depth", depthFrameColor);
        cv::imshow("rgb", frame);

        int key = cv::waitKey(1);
        if(key == 'q' || key == 'Q') {
            return 0;
        }
    }
    return 0;
}

Pipeline
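
Both versions of the example build the same graph: ColorCamera.preview (416x416, BGR) feeds YoloSpatialDetectionNetwork.input; the left and right MonoCamera streams feed StereoDepth, whose depth output (aligned to the RGB camera) feeds the network's inputDepth. Four XLinkOut nodes stream the results to the host: "rgb" (the network's passthrough frame), "detections" (spatial detection results), "depth" (passthrough depth), and "nnNetwork" (raw network output).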
