Spatial Detection Network

Supported on: RVC2, RVC4

This example builds a pipeline that runs YOLOv6-Nano spatial object detection on the RGB stream, fusing each detection with a depth source (stereo depth by default, or neural depth via --depthSource neural). Results are visualized by a custom host-side node: bounding boxes, labels, confidences, and X/Y/Z spatial coordinates on the RGB frame, plus the corresponding depth ROIs on a colorized depth frame.

This example requires the DepthAI v3 API; see the installation instructions.

Pipeline

The two mono cameras feed the depth source (a StereoDepth node by default, or NeuralDepth with --depthSource neural); the RGB camera and the depth output feed the SpatialDetectionNetwork, whose detections and passthrough frames are consumed by the host-side SpatialVisualizer node, as sketched below.
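
A minimal sketch of this wiring, distilled from the full listing below; it omits the visualizer and p.run(), so it only constructs the graph:

import depthai as dai

with dai.Pipeline() as p:
    # RGB camera for detection, two mono cameras for stereo depth
    camRgb = p.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_A)
    monoLeft = p.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_B)
    monoRight = p.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_C)

    stereo = p.create(dai.node.StereoDepth)
    monoLeft.requestOutput((640, 400)).link(stereo.left)
    monoRight.requestOutput((640, 400)).link(stereo.right)

    # Fuses each 2D detection with depth to produce X/Y/Z coordinates
    spatialDetectionNetwork = p.create(dai.node.SpatialDetectionNetwork).build(
        camRgb, stereo, dai.NNModelDescription("yolov6-nano")
    )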

Source code

Python

#!/usr/bin/env python3

import argparse
import cv2
import depthai as dai
import numpy as np

NEURAL_FPS = 8
STEREO_DEFAULT_FPS = 20

parser = argparse.ArgumentParser()
parser.add_argument(
    "--depthSource", type=str, default="stereo", choices=["stereo", "neural"]
)
args = parser.parse_args()
# For better results on OAK4, use a segmentation model like "luxonis/yolov8-instance-segmentation-large:coco-640x480"
# for depth estimation over the object's mask instead of the full bounding box.
modelDescription = dai.NNModelDescription("yolov6-nano")
size = (640, 400)

if args.depthSource == "stereo":
    fps = STEREO_DEFAULT_FPS
else:
    fps = NEURAL_FPS

class SpatialVisualizer(dai.node.HostNode):
    def __init__(self):
        super().__init__()
        self.sendProcessingToPipeline(True)

    def build(self, depth: dai.Node.Output, detections: dai.Node.Output, rgb: dai.Node.Output):
        self.link_args(depth, detections, rgb)  # Must match the inputs to the process method

    def process(self, depthPreview, detections, rgbPreview):
        depthPreview = depthPreview.getCvFrame()
        rgbPreview = rgbPreview.getCvFrame()
        depthFrameColor = self.processDepthFrame(depthPreview)
        self.displayResults(rgbPreview, depthFrameColor, detections.detections)

    def processDepthFrame(self, depthFrame):
        # Normalize depth to 0-255 using robust (1st/99th percentile) limits, then colorize
        depthDownscaled = depthFrame[::4]
        if np.all(depthDownscaled == 0):
            minDepth = 0
        else:
            minDepth = np.percentile(depthDownscaled[depthDownscaled != 0], 1)
        maxDepth = np.percentile(depthDownscaled, 99)
        depthFrameColor = np.interp(depthFrame, (minDepth, maxDepth), (0, 255)).astype(np.uint8)
        return cv2.applyColorMap(depthFrameColor, cv2.COLORMAP_HOT)

    def displayResults(self, rgbFrame, depthFrameColor, detections):
        height, width, _ = rgbFrame.shape
        for detection in detections:
            self.drawBoundingBoxes(depthFrameColor, detection)
            self.drawDetections(rgbFrame, detection, width, height)

        cv2.imshow("Depth frame", depthFrameColor)
        cv2.imshow("Color frame", rgbFrame)
        if cv2.waitKey(1) == ord('q'):
            self.stopPipeline()

    def drawBoundingBoxes(self, depthFrameColor, detection):
        # Draw the depth ROI that was used to compute the spatial coordinates
        roiData = detection.boundingBoxMapping
        roi = roiData.roi
        roi = roi.denormalize(depthFrameColor.shape[1], depthFrameColor.shape[0])
        topLeft = roi.topLeft()
        bottomRight = roi.bottomRight()
        cv2.rectangle(depthFrameColor, (int(topLeft.x), int(topLeft.y)), (int(bottomRight.x), int(bottomRight.y)), (255, 255, 255), 1)

    def drawDetections(self, frame, detection, frameWidth, frameHeight):
        x1 = int(detection.xmin * frameWidth)
        x2 = int(detection.xmax * frameWidth)
        y1 = int(detection.ymin * frameHeight)
        y2 = int(detection.ymax * frameHeight)
        label = detection.labelName
        color = (255, 255, 255)
        cv2.putText(frame, str(label), (x1 + 10, y1 + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
        cv2.putText(frame, "{:.2f}".format(detection.confidence * 100), (x1 + 10, y1 + 35), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
        cv2.putText(frame, f"X: {int(detection.spatialCoordinates.x)} mm", (x1 + 10, y1 + 50), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
        cv2.putText(frame, f"Y: {int(detection.spatialCoordinates.y)} mm", (x1 + 10, y1 + 65), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
        cv2.putText(frame, f"Z: {int(detection.spatialCoordinates.z)} mm", (x1 + 10, y1 + 80), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
        cv2.rectangle(frame, (x1, y1), (x2, y2), color, 1)

# Creates the pipeline and a default device implicitly
with dai.Pipeline() as p:
    # Define sources and outputs
    platform = p.getDefaultDevice().getPlatform()

    camRgb = p.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_A, sensorFps=fps)
    monoLeft = p.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_B, sensorFps=fps)
    monoRight = p.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_C, sensorFps=fps)
    if args.depthSource == "stereo":
        depthSource = p.create(dai.node.StereoDepth)
        depthSource.setExtendedDisparity(True)
        monoLeft.requestOutput(size).link(depthSource.left)
        monoRight.requestOutput(size).link(depthSource.right)
    elif args.depthSource == "neural":
        depthSource = p.create(dai.node.NeuralDepth).build(
            monoLeft.requestFullResolutionOutput(),
            monoRight.requestFullResolutionOutput(),
            dai.DeviceModelZoo.NEURAL_DEPTH_LARGE,
        )
    else:
        raise ValueError(f"Invalid depth source: {args.depthSource}")

    spatialDetectionNetwork = p.create(dai.node.SpatialDetectionNetwork).build(
        camRgb, depthSource, modelDescription
    )
    visualizer = p.create(SpatialVisualizer)

    spatialDetectionNetwork.spatialLocationCalculator.initialConfig.setSegmentationPassthrough(False)
    spatialDetectionNetwork.input.setBlocking(False)
    spatialDetectionNetwork.setDepthLowerThreshold(100)   # ignore depth closer than 100 mm
    spatialDetectionNetwork.setDepthUpperThreshold(5000)  # ignore depth farther than 5000 mm

    visualizer.build(
        spatialDetectionNetwork.passthroughDepth,
        spatialDetectionNetwork.out,
        spatialDetectionNetwork.passthrough,
    )

    print("Starting pipeline with depth source:", args.depthSource)

    p.run()
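
The listing above subclasses dai.node.HostNode so that visualization runs as part of the pipeline. As a variation, you can poll the same outputs from plain host code with output queues. A minimal sketch, assuming the v3 output-queue API (createOutputQueue, Pipeline.start/isRunning, as used elsewhere in the v3 examples) and the nodes built as in the listing:

    # Instead of visualizer.build(...) and p.run():
    detQ = spatialDetectionNetwork.out.createOutputQueue()
    depthQ = spatialDetectionNetwork.passthroughDepth.createOutputQueue()
    rgbQ = spatialDetectionNetwork.passthrough.createOutputQueue()

    p.start()
    while p.isRunning():
        dets = detQ.get()                       # dai.SpatialImgDetections
        depthFrame = depthQ.get().getCvFrame()  # aligned depth as a numpy array
        rgbFrame = rgbQ.get().getCvFrame()
        for d in dets.detections:
            # Spatial coordinates are reported in millimeters
            print(d.labelName, int(d.spatialCoordinates.z), "mm")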

C++

#include <argparse/argparse.hpp>
#include <iostream>
#include <memory>
#include <opencv2/opencv.hpp>
#include <vector>

#include "depthai/depthai.hpp"

constexpr float NEURAL_FPS = 8.0f;
constexpr float STEREO_DEFAULT_FPS = 20.0f;

// Custom host node for spatial visualization
class SpatialVisualizer : public dai::NodeCRTP<dai::node::HostNode, SpatialVisualizer> {
   public:
    Input& depthInput = inputs["depth"];
    Input& detectionsInput = inputs["detections"];
    Input& rgbInput = inputs["rgb"];

    std::vector<std::string> labelMap;

    std::shared_ptr<SpatialVisualizer> build(Output& depth, Output& detections, Output& rgb) {
        depth.link(depthInput);
        detections.link(detectionsInput);
        rgb.link(rgbInput);
        sendProcessingToPipeline(true);
        return std::static_pointer_cast<SpatialVisualizer>(this->shared_from_this());
    }

    std::shared_ptr<dai::Buffer> processGroup(std::shared_ptr<dai::MessageGroup> in) override {
        auto depthFrame = in->get<dai::ImgFrame>("depth");
        auto detections = in->get<dai::SpatialImgDetections>("detections");
        auto rgbFrame = in->get<dai::ImgFrame>("rgb");

        cv::Mat depthCv = depthFrame->getCvFrame();
        cv::Mat rgbCv = rgbFrame->getCvFrame();
        cv::Mat depthFrameColor = processDepthFrame(depthCv);
        displayResults(rgbCv, depthFrameColor, detections->detections);

        return nullptr;
    }

   private:
    cv::Mat processDepthFrame(const cv::Mat& depthFrame) {
        // Downscale depth frame
        cv::Mat depthDownscaled;
        cv::resize(depthFrame, depthDownscaled, cv::Size(), 0.25, 0.25);

        // Find min and max depth values among valid (non-zero) pixels
        double minDepth = 0, maxDepth = 0;
        cv::Mat mask = (depthDownscaled != 0);
        if(cv::countNonZero(mask) > 0) {
            cv::minMaxLoc(depthDownscaled, &minDepth, &maxDepth, nullptr, nullptr, mask);
        }
        if(maxDepth <= minDepth) maxDepth = minDepth + 1;  // avoid division by zero on empty/flat frames

        // Normalize depth frame to 0-255
        cv::Mat depthFrameColor;
        depthFrame.convertTo(depthFrameColor, CV_8UC1, 255.0 / (maxDepth - minDepth), -minDepth * 255.0 / (maxDepth - minDepth));

        // Apply color map
        cv::Mat colorized;
        cv::applyColorMap(depthFrameColor, colorized, cv::COLORMAP_HOT);
        return colorized;
    }

    void displayResults(cv::Mat& rgbFrame, cv::Mat& depthFrameColor, const std::vector<dai::SpatialImgDetection>& detections) {
        int height = rgbFrame.rows;
        int width = rgbFrame.cols;

        for(const auto& detection : detections) {
            drawBoundingBoxes(depthFrameColor, detection);
            drawDetections(rgbFrame, detection, width, height);
        }

        cv::imshow("depth", depthFrameColor);
        cv::imshow("rgb", rgbFrame);

        if(cv::waitKey(1) == 'q') {
            stopPipeline();
        }
    }

    void drawBoundingBoxes(cv::Mat& depthFrameColor, const dai::SpatialImgDetection& detection) {
        // Draw the depth ROI that was used to compute the spatial coordinates
        auto roi = detection.boundingBoxMapping.roi;
        roi = roi.denormalize(depthFrameColor.cols, depthFrameColor.rows);
        auto topLeft = roi.topLeft();
        auto bottomRight = roi.bottomRight();
        cv::rectangle(depthFrameColor,
                      cv::Point(static_cast<int>(topLeft.x), static_cast<int>(topLeft.y)),
                      cv::Point(static_cast<int>(bottomRight.x), static_cast<int>(bottomRight.y)),
                      cv::Scalar(255, 255, 255),
                      1);
    }

    void drawDetections(cv::Mat& frame, const dai::SpatialImgDetection& detection, int frameWidth, int frameHeight) {
        int x1 = static_cast<int>(detection.xmin * frameWidth);
        int x2 = static_cast<int>(detection.xmax * frameWidth);
        int y1 = static_cast<int>(detection.ymin * frameHeight);
        int y2 = static_cast<int>(detection.ymax * frameHeight);

        std::string label;
        try {
            // .at() throws std::out_of_range for an unknown label index,
            // in which case we fall back to the raw index below
            label = labelMap.at(detection.label);
        } catch(...) {
            label = std::to_string(detection.label);
        }

        cv::Scalar color(255, 255, 255);
        cv::putText(frame, label, cv::Point(x1 + 10, y1 + 20), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
        cv::putText(frame, std::to_string(detection.confidence * 100), cv::Point(x1 + 10, y1 + 35), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
        cv::putText(frame,
                    "X: " + std::to_string(static_cast<int>(detection.spatialCoordinates.x)) + " mm",
                    cv::Point(x1 + 10, y1 + 50),
                    cv::FONT_HERSHEY_TRIPLEX,
                    0.5,
                    color);
        cv::putText(frame,
                    "Y: " + std::to_string(static_cast<int>(detection.spatialCoordinates.y)) + " mm",
                    cv::Point(x1 + 10, y1 + 65),
                    cv::FONT_HERSHEY_TRIPLEX,
                    0.5,
                    color);
        cv::putText(frame,
                    "Z: " + std::to_string(static_cast<int>(detection.spatialCoordinates.z)) + " mm",
                    cv::Point(x1 + 10, y1 + 80),
                    cv::FONT_HERSHEY_TRIPLEX,
                    0.5,
                    color);
        cv::rectangle(frame, cv::Point(x1, y1), cv::Point(x2, y2), color, 1);
    }
};

int main(int argc, char** argv) {
    // Initialize argument parser
    argparse::ArgumentParser program("spatial_detection", "1.0.0");
    program.add_description("Spatial detection network example with configurable depth source");
    program.add_argument("--depthSource").default_value(std::string("stereo")).help("Depth source: stereo, neural, tof");

    // Parse arguments
    try {
        program.parse_args(argc, argv);
    } catch(const std::runtime_error& err) {
        std::cerr << err.what() << '\n';
        std::cerr << program;
        return EXIT_FAILURE;
    }

    // Get arguments
    std::string depthSourceArg = program.get<std::string>("--depthSource");

    // Validate depth source argument
    if(depthSourceArg != "stereo" && depthSourceArg != "neural" && depthSourceArg != "tof") {
        std::cerr << "Invalid depth source: " << depthSourceArg << '\n';
        std::cerr << "Valid options are: stereo, neural, tof" << '\n';
        return EXIT_FAILURE;
    }

    try {
        float fps = STEREO_DEFAULT_FPS;
        if(depthSourceArg == "neural") {
            fps = NEURAL_FPS;
        }

        // Create pipeline
        dai::Pipeline pipeline;

        const std::pair<int, int> size = {640, 400};

        // Define sources and outputs
        auto camRgb = pipeline.create<dai::node::Camera>();
        camRgb->build(dai::CameraBoardSocket::CAM_A, std::nullopt, fps);

        auto platform = pipeline.getDefaultDevice()->getPlatform();

        // Create depth source based on argument
        dai::node::DepthSource depthSource;

        if(depthSourceArg == "stereo") {
            auto monoLeft = pipeline.create<dai::node::Camera>();
            auto monoRight = pipeline.create<dai::node::Camera>();
            auto stereo = pipeline.create<dai::node::StereoDepth>();

            monoLeft->build(dai::CameraBoardSocket::CAM_B, std::nullopt, fps);
            monoRight->build(dai::CameraBoardSocket::CAM_C, std::nullopt, fps);

            stereo->setExtendedDisparity(true);
            monoLeft->requestOutput(size, std::nullopt, dai::ImgResizeMode::CROP)->link(stereo->left);
            monoRight->requestOutput(size, std::nullopt, dai::ImgResizeMode::CROP)->link(stereo->right);

            depthSource = stereo;
        } else if(depthSourceArg == "neural") {
            auto monoLeft = pipeline.create<dai::node::Camera>();
            auto monoRight = pipeline.create<dai::node::Camera>();

            monoLeft->build(dai::CameraBoardSocket::CAM_B, std::nullopt, fps);
            monoRight->build(dai::CameraBoardSocket::CAM_C, std::nullopt, fps);

            auto neuralDepth = pipeline.create<dai::node::NeuralDepth>();
            neuralDepth->build(*monoLeft->requestFullResolutionOutput(), *monoRight->requestFullResolutionOutput(), dai::DeviceModelZoo::NEURAL_DEPTH_LARGE);

            depthSource = neuralDepth;
        } else if(depthSourceArg == "tof") {
            auto tof = pipeline.create<dai::node::ToF>();
            depthSource = tof;
        }

        // Create spatial detection network using the unified build method with DepthSource variant
        auto spatialDetectionNetwork = pipeline.create<dai::node::SpatialDetectionNetwork>();
        auto visualizer = pipeline.create<SpatialVisualizer>();

        // Configure spatial detection network
        spatialDetectionNetwork->input.setBlocking(false);
        spatialDetectionNetwork->setBoundingBoxScaleFactor(0.5f);
        spatialDetectionNetwork->setDepthLowerThreshold(100);
        spatialDetectionNetwork->setDepthUpperThreshold(5000);

        // Set up model and build with DepthSource variant
        dai::NNModelDescription modelDesc;
        // For better results on OAK4, use a segmentation model like "luxonis/yolov8-instance-segmentation-large:coco-640x480"
        // for depth estimation over the object's mask instead of the full bounding box.
        modelDesc.model = "yolov6-nano";
        spatialDetectionNetwork->build(camRgb, depthSource, modelDesc);

        // Set label map
        visualizer->labelMap = spatialDetectionNetwork->getClasses().value();
        spatialDetectionNetwork->spatialLocationCalculator->initialConfig->setSegmentationPassthrough(false);

        // Linking
        visualizer->build(spatialDetectionNetwork->passthroughDepth, spatialDetectionNetwork->out, spatialDetectionNetwork->passthrough);

        std::cout << "Pipeline starting with depth source: " << depthSourceArg << '\n';

        // Start pipeline
        pipeline.run();

    } catch(const std::exception& e) {
        std::cerr << "Error: " << e.what() << '\n';
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
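
In both listings the X/Y/Z values are millimeters in the camera's coordinate frame, so the straight-line distance to a detection is the Euclidean norm of the three coordinates. A small illustrative helper, in Python to match the first listing (distance_mm is our name for this sketch, not part of the DepthAI API):

import math

def distance_mm(detection) -> float:
    # Straight-line distance (mm) from the camera to the detection's
    # spatial coordinates, as reported by SpatialDetectionNetwork
    c = detection.spatialCoordinates
    return math.sqrt(c.x ** 2 + c.y ** 2 + c.z ** 2)

For example, drawDetections could additionally render f"{distance_mm(detection) / 1000:.2f} m" next to the Z coordinate.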

Need assistance?

Head over to the Discussion Forum for technical support or any other questions you might have.