# Spatial Detection Network

The example creates a pipeline to perform YOLOv6-Nano spatial object detection using RGB and stereo depth streams, visualizes
results with bounding boxes and spatial coordinates on both colorized depth and RGB frames, and uses a custom visualization node.

This example requires the DepthAI v3 API, see [installation instructions](https://docs.luxonis.com/software-v3/depthai.md).

## Pipeline

### examples/spatial_detection.pipeline.json

```json
{"pipeline": {"connections": [{"node1Id": 0, "node1Output": "0", "node1OutputGroup": "dynamicOutputs", "node2Id": 5, "node2Input": "in", "node2InputGroup": ""}, {"node1Id": 1, "node1Output": "0", "node1OutputGroup": "dynamicOutputs", "node2Id": 3, "node2Input": "left", "node2InputGroup": ""}, {"node1Id": 2, "node1Output": "0", "node1OutputGroup": "dynamicOutputs", "node2Id": 3, "node2Input": "right", "node2InputGroup": ""}, {"node1Id": 3, "node1Output": "depth", "node1OutputGroup": "", "node2Id": 13, "node2Input": "in", "node2InputGroup": ""}, {"node1Id": 3, "node1Output": "depth", "node1OutputGroup": "", "node2Id": 4, "node2Input": "inputDepth", "node2InputGroup": ""}, {"node1Id": 6, "node1Output": "out", "node1OutputGroup": "", "node2Id": 4, "node2Input": "inputDetections", "node2InputGroup": ""}, {"node1Id": 5, "node1Output": "passthrough", "node1OutputGroup": "", "node2Id": 4, "node2Input": "inputImg", "node2InputGroup": ""}, {"node1Id": 5, "node1Output": "passthrough", "node1OutputGroup": "", "node2Id": 11, "node2Input": "in", "node2InputGroup": ""}, {"node1Id": 5, "node1Output": "passthrough", "node1OutputGroup": "", "node2Id": 6, "node2Input": "imageIn", "node2InputGroup": ""}, {"node1Id": 5, "node1Output": "passthrough", "node1OutputGroup": "", "node2Id": 3, "node2Input": "inputAlignTo", "node2InputGroup": ""}, {"node1Id": 5, "node1Output": "out", "node1OutputGroup": "", "node2Id": 6, "node2Input": "in", "node2InputGroup": ""}, {"node1Id": 4, "node1Output": "out", "node1OutputGroup": "", "node2Id": 9, "node2Input": "in", "node2InputGroup": ""}], "globalProperties": {"calibData": null, "cameraTuningBlobSize": null, "cameraTuningBlobUri": "", "leonCssFrequencyHz": 700000000.0, "leonMssFrequencyHz": 700000000.0, "pipelineName": null, "pipelineVersion": null, "sippBufferSize": 18432, "sippDmaBufferSize": 16384, "xlinkChunkSize": -1}, "nodes": [[11, {"alias": "", "id": 11, "ioInfo": [[["", "in"], {"blocking": true, "group": "", "id": 43, "name": "in", 
"queueSize": 3, "type": 3, "waitForMessage": false}]], "logLevel": 3, "name": "XLinkOut", "parentId": -1, "properties": {"maxFpsLimit": -1.0, "metadataOnly": false, "streamName": "__x_5__passthrough"}}], [9, {"alias": "", "id": 9, "ioInfo": [[["", "in"], {"blocking": true, "group": "", "id": 42, "name": "in", "queueSize": 3, "type": 3, "waitForMessage": false}]], "logLevel": 3, "name": "XLinkOut", "parentId": -1, "properties": {"maxFpsLimit": -1.0, "metadataOnly": false, "streamName": "__x_4__out"}}], [6, {"alias": "detectionParser", "id": 6, "ioInfo": [[["", "out"], {"blocking": false, "group": "", "id": 41, "name": "out", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "imageIn"], {"blocking": false, "group": "", "id": 40, "name": "imageIn", "queueSize": 1, "type": 3, "waitForMessage": true}], [["", "in"], {"blocking": true, "group": "", "id": 39, "name": "in", "queueSize": 1, "type": 3, "waitForMessage": true}]], "logLevel": 3, "name": "DetectionParser", "parentId": 4, "properties": {"networkInputs": {"images": {"dataType": 1, "dims": [416, 416, 3, 1], "name": "images", "numDimensions": 4, "offset": 0, "order": 17185, "qpScale": 1.0, "qpZp": 0.0, "quantization": false, "strides": []}}, "numFramesPool": 8, "parser": {"anchorMasks": {}, "anchors": [], "anchorsV2": [], "classNames": ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", 
"mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"], "classes": 80, "confidenceThreshold": 0.5, "coordinates": 4, "iouThreshold": 0.5, "nnFamily": 0, "subtype": "yolov6"}}}], [5, {"alias": "neuralNetwork", "id": 5, "ioInfo": [[["", "passthrough"], {"blocking": false, "group": "", "id": 38, "name": "passthrough", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "out"], {"blocking": false, "group": "", "id": 37, "name": "out", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "in"], {"blocking": false, "group": "", "id": 36, "name": "in", "queueSize": 3, "type": 3, "waitForMessage": true}]], "logLevel": 3, "name": "NeuralNetwork", "parentId": 4, "properties": {"backend": "", "backendProperties": {}, "blobSize": 8689834, "blobUri": "asset:__blob", "modelSource": 0, "modelUri": "", "numFrames": 8, "numNCEPerThread": 0, "numShavesPerThread": 0, "numThreads": 0}}], [4, {"alias": "", "id": 4, "ioInfo": [[["", "passthroughDepth"], {"blocking": false, "group": "", "id": 34, "name": "passthroughDepth", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "boundingBoxMapping"], {"blocking": false, "group": "", "id": 33, "name": "boundingBoxMapping", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "spatialLocationCalculatorOutput"], {"blocking": false, "group": "", "id": 35, "name": "spatialLocationCalculatorOutput", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "out"], {"blocking": false, "group": "", "id": 32, "name": "out", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "inputDetections"], {"blocking": true, "group": "", "id": 31, "name": "inputDetections", "queueSize": 1, "type": 3, "waitForMessage": true}], [["", "inputImg"], {"blocking": true, "group": "", "id": 30, "name": "inputImg", "queueSize": 2, "type": 3, "waitForMessage": true}], [["", "inputDepth"], {"blocking": false, 
"group": "", "id": 29, "name": "inputDepth", "queueSize": 4, "type": 3, "waitForMessage": true}]], "logLevel": 3, "name": "SpatialDetectionNetwork", "parentId": -1, "properties": {"calculationAlgorithm": 4, "depthThresholds": {"lowerThreshold": 100, "upperThreshold": 5000}, "detectedBBScaleFactor": 0.5}}], [3, {"alias": "", "id": 3, "ioInfo": [[["", "confidenceMap"], {"blocking": false, "group": "", "id": 28, "name": "confidenceMap", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "debugDispCostDump"], {"blocking": false, "group": "", "id": 27, "name": "debugDispCostDump", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "debugExtDispLrCheckIt2"], {"blocking": false, "group": "", "id": 26, "name": "debugExtDispLrCheckIt2", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "debugDispLrCheckIt2"], {"blocking": false, "group": "", "id": 24, "name": "debugDispLrCheckIt2", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "debugExtDispLrCheckIt1"], {"blocking": false, "group": "", "id": 25, "name": "debugExtDispLrCheckIt1", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "debugDispLrCheckIt1"], {"blocking": false, "group": "", "id": 23, "name": "debugDispLrCheckIt1", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "outConfig"], {"blocking": false, "group": "", "id": 22, "name": "outConfig", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "rectifiedRight"], {"blocking": false, "group": "", "id": 21, "name": "rectifiedRight", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "rectifiedLeft"], {"blocking": false, "group": "", "id": 20, "name": "rectifiedLeft", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "depth"], {"blocking": false, "group": "", "id": 16, "name": "depth", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "right"], {"blocking": true, "group": "", "id": 15, "name": "right", "queueSize": 3, "type": 3, "waitForMessage": false}], [["", "left"], 
{"blocking": true, "group": "", "id": 14, "name": "left", "queueSize": 3, "type": 3, "waitForMessage": false}], [["", "syncedRight"], {"blocking": false, "group": "", "id": 19, "name": "syncedRight", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "syncedLeft"], {"blocking": false, "group": "", "id": 18, "name": "syncedLeft", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "inputAlignTo"], {"blocking": false, "group": "", "id": 13, "name": "inputAlignTo", "queueSize": 1, "type": 3, "waitForMessage": true}], [["", "disparity"], {"blocking": false, "group": "", "id": 17, "name": "disparity", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "inputConfig"], {"blocking": true, "group": "", "id": 12, "name": "inputConfig", "queueSize": 3, "type": 3, "waitForMessage": false}]], "logLevel": 3, "name": "StereoDepth", "parentId": -1, "properties": {"alphaScaling": null, "baseline": null, "depthAlignCamera": -1, "depthAlignmentUseSpecTranslation": null, "disparityToDepthUseSpecTranslation": null, "enableFrameSync": true, "enableRectification": true, "enableRuntimeStereoModeSwitch": false, "focalLength": null, "focalLengthFromCalibration": true, "height": null, "initialConfig": {"algorithmControl": {"centerAlignmentShiftFactor": null, "customDepthUnitMultiplier": 1000.0, "depthAlign": 1, "depthUnit": 2, "disparityShift": 0, "enableExtended": true, "enableLeftRightCheck": true, "enableSubpixel": true, "enableSwLeftRightCheck": false, "leftRightCheckThreshold": 10, "numInvalidateEdgePixels": 0, "subpixelFractionalBits": 5}, "censusTransform": {"enableMeanMode": true, "kernelMask": 0, "kernelSize": -1, "noiseThresholdOffset": 1, "noiseThresholdScale": 1, "threshold": 0}, "confidenceMetrics": {"flatnessConfidenceThreshold": 2, "flatnessConfidenceWeight": 8, "flatnessOverride": false, "motionVectorConfidenceThreshold": 1, "motionVectorConfidenceWeight": 4, "occlusionConfidenceWeight": 20}, "costAggregation": {"divisionFactor": 1, 
"horizontalPenaltyCostP1": 250, "horizontalPenaltyCostP2": 500, "p1Config": {"defaultValue": 11, "edgeThreshold": 15, "edgeValue": 10, "enableAdaptive": true, "smoothThreshold": 5, "smoothValue": 22}, "p2Config": {"defaultValue": 33, "edgeValue": 22, "enableAdaptive": true, "smoothValue": 63}, "verticalPenaltyCostP1": 250, "verticalPenaltyCostP2": 500}, "costMatching": {"confidenceThreshold": 55, "disparityWidth": 1, "enableCompanding": false, "enableSwConfidenceThresholding": false, "invalidDisparityValue": 0, "linearEquationParameters": {"alpha": 0, "beta": 2, "threshold": 127}}, "filtersBackend": 2, "postProcessing": {"adaptiveMedianFilter": {"confidenceThreshold": 200, "enable": true}, "bilateralSigmaValue": 0, "brightnessFilter": {"maxBrightness": 256, "minBrightness": 0}, "decimationFilter": {"decimationFactor": 1, "decimationMode": 0}, "filteringOrder": [3, 1, 2, 4, 5], "holeFilling": {"enable": true, "fillConfidenceThreshold": 200, "highConfidenceThreshold": 210, "invalidateDisparities": true, "minValidDisparity": 1}, "median": 0, "spatialFilter": {"alpha": 0.5, "delta": 0, "enable": false, "holeFillingRadius": 2, "numIterations": 1}, "speckleFilter": {"differenceThreshold": 2, "enable": false, "speckleRange": 50}, "temporalFilter": {"alpha": 0.4000000059604645, "delta": 0, "enable": false, "persistencyMode": 3}, "thresholdFilter": {"maxRange": 65535, "minRange": 0}}}, "mesh": {"meshLeftUri": "", "meshRightUri": "", "meshSize": null, "stepHeight": 16, "stepWidth": 16}, "numFramesPool": 3, "numPostProcessingMemorySlices": -1, "numPostProcessingShaves": -1, "outHeight": 400, "outKeepAspectRatio": true, "outWidth": 640, "rectificationUseSpecTranslation": null, "rectifyEdgeFillColor": 0, "useHomographyRectification": null, "width": null}}], [2, {"alias": "", "id": 2, "ioInfo": [[["dynamicOutputs", "0"], {"blocking": false, "group": "dynamicOutputs", "id": 11, "name": "0", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "raw"], {"blocking": false, 
"group": "", "id": 10, "name": "raw", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "mockIsp"], {"blocking": true, "group": "", "id": 9, "name": "mockIsp", "queueSize": 8, "type": 3, "waitForMessage": false}], [["", "inputControl"], {"blocking": true, "group": "", "id": 8, "name": "inputControl", "queueSize": 3, "type": 3, "waitForMessage": false}]], "logLevel": 3, "name": "Camera", "parentId": -1, "properties": {"boardSocket": 2, "cameraName": "", "fps": -1.0, "imageOrientation": -1, "initialControl": {"aeLockMode": false, "aeMaxExposureTimeUs": 5, "aeRegion": {"height": 0, "priority": 1213374846, "width": 0, "x": 96, "y": 0}, "afRegion": {"height": 33369, "priority": 23712, "width": 58768, "x": 23717, "y": 0}, "antiBandingMode": 0, "autoFocusMode": 3, "awbLockMode": false, "awbMode": 0, "brightness": 0, "captureIntent": 0, "chromaDenoise": 104, "cmdMask": 0, "contrast": 119, "controlMode": 0, "effectMode": 0, "enableHdr": false, "expCompensation": 0, "expManual": {"exposureTimeUs": 0, "frameDurationUs": 0, "sensitivityIso": 576}, "frameSyncMode": 0, "lensPosAutoInfinity": 0, "lensPosAutoMacro": 0, "lensPosition": 0, "lensPositionRaw": 0.0, "lowPowerNumFramesBurst": 129, "lowPowerNumFramesDiscard": 2, "lumaDenoise": 116, "miscControls": [], "saturation": 105, "sceneMode": 0, "sharpness": 100, "strobeConfig": {"activeLevel": 0, "enable": 0, "gpioNumber": 0}, "strobeTimings": {"durationUs": 23712, "exposureBeginOffsetUs": 0, "exposureEndOffsetUs": -2108037792}, "wbColorTemp": 13410}, "isp3aFps": 0, "mockIspHeight": -1, "mockIspWidth": -1, "numFramesPoolIsp": 3, "numFramesPoolPreview": 4, "numFramesPoolRaw": 3, "numFramesPoolStill": 4, "numFramesPoolVideo": 4, "outputRequests": [{"enableUndistortion": null, "fps": {"value": null}, "resizeMode": 0, "size": {"value": {"index": 0, "value": [640, 400]}}, "type": null}], "resolutionHeight": -1, "resolutionWidth": -1}}], [1, {"alias": "", "id": 1, "ioInfo": [[["dynamicOutputs", "0"], {"blocking": false, 
"group": "dynamicOutputs", "id": 7, "name": "0", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "raw"], {"blocking": false, "group": "", "id": 6, "name": "raw", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "mockIsp"], {"blocking": true, "group": "", "id": 5, "name": "mockIsp", "queueSize": 8, "type": 3, "waitForMessage": false}], [["", "inputControl"], {"blocking": true, "group": "", "id": 4, "name": "inputControl", "queueSize": 3, "type": 3, "waitForMessage": false}]], "logLevel": 3, "name": "Camera", "parentId": -1, "properties": {"boardSocket": 1, "cameraName": "", "fps": -1.0, "imageOrientation": -1, "initialControl": {"aeLockMode": false, "aeMaxExposureTimeUs": 537120116, "aeRegion": {"height": 28526, "priority": 1818324338, "width": 43520, "x": 17610, "y": 49227}, "afRegion": {"height": 26999, "priority": 3395843172, "width": 42434, "x": 31337, "y": 25701}, "antiBandingMode": 224, "autoFocusMode": 3, "awbLockMode": false, "awbMode": 68, "brightness": 120, "captureIntent": 0, "chromaDenoise": 67, "cmdMask": 0, "contrast": 70, "controlMode": 161, "effectMode": 120, "enableHdr": false, "expCompensation": 97, "expManual": {"exposureTimeUs": 1684372073, "frameDurationUs": 1952999273, "sensitivityIso": 1701357250}, "frameSyncMode": 202, "lensPosAutoInfinity": 114, "lensPosAutoMacro": 109, "lensPosition": 0, "lensPositionRaw": 0.0, "lowPowerNumFramesBurst": 166, "lowPowerNumFramesDiscard": 109, "lumaDenoise": 202, "miscControls": [], "saturation": 112, "sceneMode": 161, "sharpness": 115, "strobeConfig": {"activeLevel": 0, "enable": 0, "gpioNumber": 0}, "strobeTimings": {"durationUs": 1751607653, "exposureBeginOffsetUs": 51833, "exposureEndOffsetUs": 1755709440}, "wbColorTemp": 39577}, "isp3aFps": 0, "mockIspHeight": -1, "mockIspWidth": -1, "numFramesPoolIsp": 3, "numFramesPoolPreview": 4, "numFramesPoolRaw": 3, "numFramesPoolStill": 4, "numFramesPoolVideo": 4, "outputRequests": [{"enableUndistortion": null, "fps": {"value": null}, 
"resizeMode": 0, "size": {"value": {"index": 0, "value": [640, 400]}}, "type": null}], "resolutionHeight": -1, "resolutionWidth": -1}}], [13, {"alias": "", "id": 13, "ioInfo": [[["", "in"], {"blocking": true, "group": "", "id": 44, "name": "in", "queueSize": 3, "type": 3, "waitForMessage": false}]], "logLevel": 3, "name": "XLinkOut", "parentId": -1, "properties": {"maxFpsLimit": -1.0, "metadataOnly": false, "streamName": "__x_3__depth"}}], [0, {"alias": "", "id": 0, "ioInfo": [[["dynamicOutputs", "0"], {"blocking": false, "group": "dynamicOutputs", "id": 3, "name": "0", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "raw"], {"blocking": false, "group": "", "id": 2, "name": "raw", "queueSize": 8, "type": 0, "waitForMessage": false}], [["", "mockIsp"], {"blocking": true, "group": "", "id": 1, "name": "mockIsp", "queueSize": 8, "type": 3, "waitForMessage": false}], [["", "inputControl"], {"blocking": true, "group": "", "id": 0, "name": "inputControl", "queueSize": 3, "type": 3, "waitForMessage": false}]], "logLevel": 3, "name": "Camera", "parentId": -1, "properties": {"boardSocket": 0, "cameraName": "", "fps": -1.0, "imageOrientation": -1, "initialControl": {"aeLockMode": false, "aeMaxExposureTimeUs": 100663297, "aeRegion": {"height": 0, "priority": 2186963712, "width": 0, "x": 4163, "y": 1}, "afRegion": {"height": 0, "priority": 0, "width": 0, "x": 23712, "y": 0}, "antiBandingMode": 90, "autoFocusMode": 3, "awbLockMode": false, "awbMode": 72, "brightness": -105, "captureIntent": 130, "chromaDenoise": 0, "cmdMask": 0, "contrast": -33, "controlMode": 160, "effectMode": 92, "enableHdr": false, "expCompensation": 39, "expManual": {"exposureTimeUs": 23712, "frameDurationUs": 23712, "sensitivityIso": 2186963820}, "frameSyncMode": 0, "lensPosAutoInfinity": 128, "lensPosAutoMacro": 107, "lensPosition": 0, "lensPositionRaw": 0.0, "lowPowerNumFramesBurst": 0, "lowPowerNumFramesDiscard": 0, "lumaDenoise": 0, "miscControls": [], "saturation": -34, "sceneMode": 107, 
"sharpness": 0, "strobeConfig": {"activeLevel": 168, "enable": 0, "gpioNumber": 113}, "strobeTimings": {"durationUs": 16, "exposureBeginOffsetUs": 23712, "exposureEndOffsetUs": 17}, "wbColorTemp": 0}, "isp3aFps": 0, "mockIspHeight": -1, "mockIspWidth": -1, "numFramesPoolIsp": 3, "numFramesPoolPreview": 4, "numFramesPoolRaw": 3, "numFramesPoolStill": 4, "numFramesPoolVideo": 4, "outputRequests": [{"enableUndistortion": null, "fps": {"value": {"index": 0, "value": 30.0}}, "resizeMode": 0, "size": {"value": {"index": 0, "value": [416, 416]}}, "type": 8}], "resolutionHeight": -1, "resolutionWidth": -1}}]]}}
```

## Source code

#### Python

```python
#!/usr/bin/env python3

import argparse
from pathlib import Path
import cv2
import depthai as dai
import numpy as np

# Frame rates: the neural-depth pipeline is heavier, so it runs slower.
NEURAL_FPS = 8
STEREO_DEFAULT_FPS = 20

# CLI: choose which node produces the depth stream.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--depthSource", type=str, default="stereo", choices=["stereo", "neural"]
)
args = parser.parse_args()

# For better results on OAK4, use a segmentation model like "luxonis/yolov8-instance-segmentation-large:coco-640x480"
# for depth estimation over the objects' masks instead of the full bounding box.
modelDescription = dai.NNModelDescription("yolov6-nano")
size = (640, 400)

# Classic stereo can sustain a higher frame rate than neural depth.
fps = STEREO_DEFAULT_FPS if args.depthSource == "stereo" else NEURAL_FPS

class SpatialVisualizer(dai.node.HostNode):
    """Host node that renders spatial detections.

    Draws each detection (label, confidence, X/Y/Z in mm) on the RGB frame
    and its depth ROI on a heat-colorized depth frame, then shows both
    windows. Pressing 'q' stops the pipeline.
    """

    def __init__(self):
        dai.node.HostNode.__init__(self)
        # Let the pipeline drive process() on the host side.
        self.sendProcessingToPipeline(True)

    def build(self, depth:dai.Node.Output, detections: dai.Node.Output, rgb: dai.Node.Output):
        # The link order must mirror the parameter order of process().
        self.link_args(depth, detections, rgb)

    def process(self, depthPreview, detections, rgbPreview):
        depthMat = depthPreview.getCvFrame()
        rgbMat = rgbPreview.getCvFrame()
        colorizedDepth = self.processDepthFrame(depthMat)
        self.displayResults(rgbMat, colorizedDepth, detections.detections)

    def processDepthFrame(self, depthFrame):
        """Scale raw depth into the 0-255 range and apply the HOT colormap."""
        # Every 4th row is enough for robust percentile statistics.
        sampled = depthFrame[::4]
        validDepths = sampled[sampled != 0]
        # Zero depth marks invalid pixels; keep them out of the lower bound.
        minDepth = np.percentile(validDepths, 1) if validDepths.size else 0
        maxDepth = np.percentile(sampled, 99)
        normalized = np.interp(depthFrame, (minDepth, maxDepth), (0, 255)).astype(np.uint8)
        return cv2.applyColorMap(normalized, cv2.COLORMAP_HOT)

    def displayResults(self, rgbFrame, depthFrameColor, detections):
        """Overlay every detection on both frames and refresh the windows."""
        height, width, _ = rgbFrame.shape
        for det in detections:
            self.drawBoundingBoxes(depthFrameColor, det)
            self.drawDetections(rgbFrame, det, width, height)

        cv2.imshow("Depth frame", depthFrameColor)
        cv2.imshow("Color frame", rgbFrame)
        if cv2.waitKey(1) == ord('q'):
            self.stopPipeline()

    def drawBoundingBoxes(self, depthFrameColor, detection):
        """Draw one detection's spatial-location ROI on the depth frame."""
        roi = detection.boundingBoxMapping.roi.denormalize(
            depthFrameColor.shape[1], depthFrameColor.shape[0]
        )
        tl = roi.topLeft()
        br = roi.bottomRight()
        cv2.rectangle(
            depthFrameColor,
            (int(tl.x), int(tl.y)),
            (int(br.x), int(br.y)),
            (255, 255, 255),
            1,
        )

    def drawDetections(self, frame, detection, frameWidth, frameHeight):
        """Draw one detection's box, label, confidence and XYZ on the RGB frame."""
        x1 = int(detection.xmin * frameWidth)
        x2 = int(detection.xmax * frameWidth)
        y1 = int(detection.ymin * frameHeight)
        y2 = int(detection.ymax * frameHeight)
        color = (255, 255, 255)
        lines = [
            str(detection.labelName),
            "{:.2f}".format(detection.confidence * 100),
            f"X: {int(detection.spatialCoordinates.x)} mm",
            f"Y: {int(detection.spatialCoordinates.y)} mm",
            f"Z: {int(detection.spatialCoordinates.z)} mm",
        ]
        # Text lines are stacked 15 px apart, starting 20 px below the top edge.
        for i, text in enumerate(lines):
            cv2.putText(frame, text, (x1 + 10, y1 + 20 + 15 * i), cv2.FONT_HERSHEY_TRIPLEX, 0.5, color)
        cv2.rectangle(frame, (x1, y1), (x2, y2), color, 1)

# Creates the pipeline and a default device implicitly
with dai.Pipeline() as p:
    # Query the connected device (also ensures the default device exists).
    platform = p.getDefaultDevice().getPlatform()

    # Camera sources: RGB on CAM_A plus the CAM_B/CAM_C stereo pair.
    camRgb = p.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_A, sensorFps=fps)
    monoLeft = p.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_B, sensorFps=fps)
    monoRight = p.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_C, sensorFps=fps)

    # Depth producer: neural-depth model or classic stereo matching.
    if args.depthSource == "neural":
        depthSource = p.create(dai.node.NeuralDepth).build(
            monoLeft.requestFullResolutionOutput(),
            monoRight.requestFullResolutionOutput(),
            dai.DeviceModelZoo.NEURAL_DEPTH_LARGE,
        )
    elif args.depthSource == "stereo":
        depthSource = p.create(dai.node.StereoDepth)
        depthSource.setExtendedDisparity(True)
        monoLeft.requestOutput(size).link(depthSource.left)
        monoRight.requestOutput(size).link(depthSource.right)
    else:
        raise ValueError(f"Invalid depth source: {args.depthSource}")

    spatialDetectionNetwork = p.create(dai.node.SpatialDetectionNetwork).build(
        camRgb, depthSource, modelDescription
    )
    visualizer = p.create(SpatialVisualizer)

    # Don't block the detection input queue; only accept depth measurements
    # between the lower (100) and upper (5000) thresholds.
    spatialDetectionNetwork.input.setBlocking(False)
    spatialDetectionNetwork.setDepthLowerThreshold(100)
    spatialDetectionNetwork.setDepthUpperThreshold(5000)
    spatialDetectionNetwork.spatialLocationCalculator.initialConfig.setSegmentationPassthrough(False)

    # Feed the visualizer: aligned depth, detections, and the RGB passthrough.
    visualizer.build(
        spatialDetectionNetwork.passthroughDepth,
        spatialDetectionNetwork.out,
        spatialDetectionNetwork.passthrough,
    )

    print("Starting pipeline with depth source: ", args.depthSource)

    p.run()
```

#### C++

```cpp
#include <argparse/argparse.hpp>
#include <iostream>
#include <memory>
#include <opencv2/opencv.hpp>
#include <vector>

#include "depthai/depthai.hpp"

constexpr float NEURAL_FPS = 8.0f;
constexpr float STEREO_DEFAULT_FPS = 20.0f;

// Custom host node for spatial visualization
// Custom host node for spatial visualization: draws detection overlays
// (label, confidence, X/Y/Z in mm) on the RGB frame and the spatial-location
// ROIs on a colorized depth frame, then shows both with OpenCV.
class SpatialVisualizer : public dai::NodeCRTP<dai::node::HostNode, SpatialVisualizer> {
   public:
    Input& depthInput = inputs["depth"];
    Input& detectionsInput = inputs["detections"];
    Input& rgbInput = inputs["rgb"];

    // Class names indexed by detection label; populated by the caller.
    std::vector<std::string> labelMap;

    // Links the three outputs to this node's inputs and enables host-side
    // processing. Returns shared_ptr to this node for chaining.
    std::shared_ptr<SpatialVisualizer> build(Output& depth, Output& detections, Output& rgb) {
        depth.link(depthInput);
        detections.link(detectionsInput);
        rgb.link(rgbInput);
        sendProcessingToPipeline(true);
        return std::static_pointer_cast<SpatialVisualizer>(this->shared_from_this());
    }

    // Called with one synced group of messages; renders both windows.
    std::shared_ptr<dai::Buffer> processGroup(std::shared_ptr<dai::MessageGroup> in) override {
        auto depthFrame = in->get<dai::ImgFrame>("depth");
        auto detections = in->get<dai::SpatialImgDetections>("detections");
        auto rgbFrame = in->get<dai::ImgFrame>("rgb");

        cv::Mat depthCv = depthFrame->getCvFrame();
        cv::Mat rgbCv = rgbFrame->getCvFrame();
        cv::Mat depthFrameColor = processDepthFrame(depthCv);
        displayResults(rgbCv, depthFrameColor, detections->detections);

        return nullptr;
    }

   private:
    // Normalizes the depth frame to 8 bits (range taken from the valid,
    // non-zero pixels of a downscaled copy) and applies the HOT colormap.
    cv::Mat processDepthFrame(const cv::Mat& depthFrame) {
        // Downscale depth frame; enough for min/max statistics
        cv::Mat depthDownscaled;
        cv::resize(depthFrame, depthDownscaled, cv::Size(), 0.25, 0.25);

        // Find min and max depth values, ignoring invalid (zero) pixels
        double minDepth = 0, maxDepth = 0;
        cv::Mat mask = (depthDownscaled != 0);
        if(cv::countNonZero(mask) > 0) {
            cv::minMaxLoc(depthDownscaled, &minDepth, &maxDepth, nullptr, nullptr, mask);
        }

        // Normalize depth frame; guard the degenerate case (e.g. a fully
        // invalid frame leaves minDepth == maxDepth) to avoid division by zero
        cv::Mat depthFrameColor;
        if(maxDepth > minDepth) {
            depthFrame.convertTo(depthFrameColor, CV_8UC1, 255.0 / (maxDepth - minDepth), -minDepth * 255.0 / (maxDepth - minDepth));
        } else {
            depthFrameColor = cv::Mat::zeros(depthFrame.size(), CV_8UC1);
        }

        // Apply color map
        cv::Mat colorized;
        cv::applyColorMap(depthFrameColor, colorized, cv::COLORMAP_HOT);
        return colorized;
    }

    // Draws all detections on both frames and refreshes the windows;
    // pressing 'q' stops the pipeline.
    void displayResults(cv::Mat& rgbFrame, cv::Mat& depthFrameColor, const std::vector<dai::SpatialImgDetection>& detections) {
        int height = rgbFrame.rows;
        int width = rgbFrame.cols;

        for(const auto& detection : detections) {
            drawBoundingBoxes(depthFrameColor, detection);
            drawDetections(rgbFrame, detection, width, height);
        }

        cv::imshow("depth", depthFrameColor);
        cv::imshow("rgb", rgbFrame);

        if(cv::waitKey(1) == 'q') {
            stopPipeline();
        }
    }

    // Draws one detection's spatial-location ROI on the depth frame.
    void drawBoundingBoxes(cv::Mat& depthFrameColor, const dai::SpatialImgDetection& detection) {
        auto roi = detection.boundingBoxMapping.roi;
        roi = roi.denormalize(depthFrameColor.cols, depthFrameColor.rows);
        auto topLeft = roi.topLeft();
        auto bottomRight = roi.bottomRight();
        cv::rectangle(depthFrameColor,
                      cv::Point(static_cast<int>(topLeft.x), static_cast<int>(topLeft.y)),
                      cv::Point(static_cast<int>(bottomRight.x), static_cast<int>(bottomRight.y)),
                      cv::Scalar(255, 255, 255),
                      1);
    }

    // Draws one detection's bounding box, label, confidence and spatial
    // coordinates on the RGB frame.
    void drawDetections(cv::Mat& frame, const dai::SpatialImgDetection& detection, int frameWidth, int frameHeight) {
        int x1 = static_cast<int>(detection.xmin * frameWidth);
        int x2 = static_cast<int>(detection.xmax * frameWidth);
        int y1 = static_cast<int>(detection.ymin * frameHeight);
        int y2 = static_cast<int>(detection.ymax * frameHeight);

        // NOTE: std::vector::operator[] does NOT throw on an out-of-range
        // index (it is undefined behavior), so the previous try/catch could
        // never trigger — use an explicit bounds check instead.
        std::string label;
        const auto labelIndex = static_cast<std::size_t>(detection.label);
        if(labelIndex < labelMap.size()) {
            label = labelMap[labelIndex];
        } else {
            label = std::to_string(detection.label);
        }

        cv::Scalar color(255, 255, 255);
        cv::putText(frame, label, cv::Point(x1 + 10, y1 + 20), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
        // Two decimals, matching the Python example's "{:.2f}" formatting
        // (std::to_string would print six decimals).
        cv::putText(frame, cv::format("%.2f", detection.confidence * 100), cv::Point(x1 + 10, y1 + 35), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
        cv::putText(frame,
                    "X: " + std::to_string(static_cast<int>(detection.spatialCoordinates.x)) + " mm",
                    cv::Point(x1 + 10, y1 + 50),
                    cv::FONT_HERSHEY_TRIPLEX,
                    0.5,
                    color);
        cv::putText(frame,
                    "Y: " + std::to_string(static_cast<int>(detection.spatialCoordinates.y)) + " mm",
                    cv::Point(x1 + 10, y1 + 65),
                    cv::FONT_HERSHEY_TRIPLEX,
                    0.5,
                    color);
        cv::putText(frame,
                    "Z: " + std::to_string(static_cast<int>(detection.spatialCoordinates.z)) + " mm",
                    cv::Point(x1 + 10, y1 + 80),
                    cv::FONT_HERSHEY_TRIPLEX,
                    0.5,
                    color);
        cv::rectangle(frame, cv::Point(x1, y1), cv::Point(x2, y2), color, 1);
    }
};

int main(int argc, char** argv) {
    // CLI: choose which node produces the depth stream that feeds the
    // SpatialDetectionNetwork.
    argparse::ArgumentParser program("spatial_detection", "1.0.0");
    program.add_description("Spatial detection network example with configurable depth source");
    program.add_argument("--depthSource").default_value(std::string("stereo")).help("Depth source: stereo, neural, tof");

    try {
        program.parse_args(argc, argv);
    } catch(const std::runtime_error& err) {
        std::cerr << err.what() << '\n';
        std::cerr << program;
        return EXIT_FAILURE;
    }

    const std::string depthSourceArg = program.get<std::string>("--depthSource");

    // Reject anything that is not one of the supported depth sources.
    const bool knownSource = depthSourceArg == "stereo" || depthSourceArg == "neural" || depthSourceArg == "tof";
    if(!knownSource) {
        std::cerr << "Invalid depth source: " << depthSourceArg << '\n';
        std::cerr << "Valid options are: stereo, neural, tof" << '\n';
        return EXIT_FAILURE;
    }

    try {
        // Neural depth is heavier, so it runs at a reduced frame rate.
        const float fps = (depthSourceArg == "neural") ? NEURAL_FPS : STEREO_DEFAULT_FPS;

        // Create pipeline
        dai::Pipeline pipeline;

        const std::pair<int, int> size = {640, 400};

        // RGB camera feeding the detection network
        auto camRgb = pipeline.create<dai::node::Camera>();
        camRgb->build(dai::CameraBoardSocket::CAM_A, std::nullopt, fps);

        auto platform = pipeline.getDefaultDevice()->getPlatform();

        // Build the requested depth producer
        dai::node::DepthSource depthSource;

        if(depthSourceArg == "stereo") {
            auto monoLeft = pipeline.create<dai::node::Camera>();
            auto monoRight = pipeline.create<dai::node::Camera>();
            monoLeft->build(dai::CameraBoardSocket::CAM_B, std::nullopt, fps);
            monoRight->build(dai::CameraBoardSocket::CAM_C, std::nullopt, fps);

            auto stereo = pipeline.create<dai::node::StereoDepth>();
            stereo->setExtendedDisparity(true);
            monoLeft->requestOutput(size, std::nullopt, dai::ImgResizeMode::CROP)->link(stereo->left);
            monoRight->requestOutput(size, std::nullopt, dai::ImgResizeMode::CROP)->link(stereo->right);

            depthSource = stereo;
        } else if(depthSourceArg == "neural") {
            auto monoLeft = pipeline.create<dai::node::Camera>();
            auto monoRight = pipeline.create<dai::node::Camera>();
            monoLeft->build(dai::CameraBoardSocket::CAM_B, std::nullopt, fps);
            monoRight->build(dai::CameraBoardSocket::CAM_C, std::nullopt, fps);

            auto neuralDepth = pipeline.create<dai::node::NeuralDepth>();
            neuralDepth->build(*monoLeft->requestFullResolutionOutput(), *monoRight->requestFullResolutionOutput(), dai::DeviceModelZoo::NEURAL_DEPTH_LARGE);

            depthSource = neuralDepth;
        } else {
            // "tof" — already validated above
            depthSource = pipeline.create<dai::node::ToF>();
        }

        // Spatial detection network (unified build with the DepthSource variant)
        auto spatialDetectionNetwork = pipeline.create<dai::node::SpatialDetectionNetwork>();
        auto visualizer = pipeline.create<SpatialVisualizer>();

        // Non-blocking input; depth samples are scaled to half the bbox and
        // clamped to the 100-5000 threshold range
        spatialDetectionNetwork->input.setBlocking(false);
        spatialDetectionNetwork->setBoundingBoxScaleFactor(0.5f);
        spatialDetectionNetwork->setDepthLowerThreshold(100);
        spatialDetectionNetwork->setDepthUpperThreshold(5000);

        // Set up model and build with DepthSource variant.
        // For better results on OAK4, use a segmentation model like "luxonis/yolov8-instance-segmentation-large:coco-640x480"
        // for depth estimation over the objects mask instead of the full bounding box.
        dai::NNModelDescription modelDesc;
        modelDesc.model = "yolov6-nano";
        spatialDetectionNetwork->build(camRgb, depthSource, modelDesc);

        // Hand the class names to the visualizer
        visualizer->labelMap = spatialDetectionNetwork->getClasses().value();
        spatialDetectionNetwork->spatialLocationCalculator->initialConfig->setSegmentationPassthrough(false);

        // Linking: aligned depth, detections, and the RGB passthrough
        visualizer->build(spatialDetectionNetwork->passthroughDepth, spatialDetectionNetwork->out, spatialDetectionNetwork->passthrough);

        std::cout << "Pipeline starting with depth source: " << depthSourceArg << '\n';

        // Start pipeline
        pipeline.run();

    } catch(const std::exception& e) {
        std::cerr << "Error: " << e.what() << '\n';
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
```

### Need assistance?

Head over to [Discussion Forum](https://discuss.luxonis.com/) for technical support or any other questions you might have.
