DepthAI
Software Stack

ON THIS PAGE

  • Demo
  • Pipeline
  • Source code

Object Tracker

Supported on:RVC2RVC4
This example demonstrates spatial object tracking on the RGB stream using a YOLOv6-nano SpatialDetectionNetwork together with the ObjectTracker node.Detections are decoded on-device and fused with stereo depth to provide per-track X/Y/Z (mm), while the tracker assigns persistent IDs and status visible in the overlay. When a valid spatial state exists, the overlay also displays optional velocity and speed in m/s.The source includes a useSpatialAssociation flag. It is False by default, which means tracklets still carry spatial coordinates, but detection-to-track matching uses the standard 2D association path. Set it to True when depth is reliable and you want XYZ distance to help preserve IDs for overlapping or crossing objects.

Demo

This example requires the DepthAI v3 API, see installation instructions.

Pipeline

Source code

Python

Python
GitHub
1#!/usr/bin/env python3
2
3import cv2
4import depthai as dai
5import time
6
7
8fullFrameTracking = False
9useSpatialAssociation = False
10
11# Create pipeline
12with dai.Pipeline() as pipeline:
13    # Define sources and outputs
14    camRgb = pipeline.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_A)
15    monoLeft = pipeline.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_B)
16    monoRight = pipeline.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_C)
17
18    stereo = pipeline.create(dai.node.StereoDepth)
19    leftOutput = monoLeft.requestOutput((640, 400))
20    rightOutput = monoRight.requestOutput((640, 400))
21    leftOutput.link(stereo.left)
22    rightOutput.link(stereo.right)
23
24    spatialDetectionNetwork = pipeline.create(dai.node.SpatialDetectionNetwork).build(camRgb, stereo, "yolov6-nano")
25    objectTracker = pipeline.create(dai.node.ObjectTracker)
26
27    spatialDetectionNetwork.setConfidenceThreshold(0.6)
28    spatialDetectionNetwork.input.setBlocking(False)
29    spatialDetectionNetwork.setBoundingBoxScaleFactor(0.5)
30    spatialDetectionNetwork.setDepthLowerThreshold(100)
31    spatialDetectionNetwork.setDepthUpperThreshold(5000)
32    labelMap = spatialDetectionNetwork.getClasses()
33
34    objectTracker.setDetectionLabelsToTrack([0])  # track only person
35    # possible tracking types: ZERO_TERM_COLOR_HISTOGRAM, ZERO_TERM_IMAGELESS, SHORT_TERM_IMAGELESS, SHORT_TERM_KCF
36    objectTracker.setTrackerType(dai.TrackerType.SHORT_TERM_IMAGELESS)
37    # take the smallest ID when new object is tracked, possible options: SMALLEST_ID, UNIQUE_ID
38    objectTracker.setTrackerIdAssignmentPolicy(dai.TrackerIdAssignmentPolicy.SMALLEST_ID)
39    if useSpatialAssociation:
40        objectTracker.setSpatialAssociation(True)
41        objectTracker.setSpatialAssociationWeight(0.5)
42        objectTracker.setSpatialDistanceThreshold(1.5)
43        objectTracker.setSpatialDepthAwareScale(0.1)
44
45    preview = objectTracker.passthroughTrackerFrame.createOutputQueue()
46    tracklets = objectTracker.out.createOutputQueue()
47
48    if fullFrameTracking:
49        camRgb.requestFullResolutionOutput().link(objectTracker.inputTrackerFrame)
50        # do not block the pipeline if it's too slow on full frame
51        objectTracker.inputTrackerFrame.setBlocking(False)
52        objectTracker.inputTrackerFrame.setMaxSize(1)
53    else:
54        spatialDetectionNetwork.passthrough.link(objectTracker.inputTrackerFrame)
55
56    spatialDetectionNetwork.passthrough.link(objectTracker.inputDetectionFrame)
57    spatialDetectionNetwork.out.link(objectTracker.inputDetections)
58
59    startTime = time.monotonic()
60    counter = 0
61    fps = 0
62    color = (255, 255, 255)
63    pipeline.start()
64    while(pipeline.isRunning()):
65        imgFrame = preview.get()
66        track = tracklets.get()
67        assert isinstance(imgFrame, dai.ImgFrame), "Expected ImgFrame"
68        assert isinstance(track, dai.Tracklets), "Expected Tracklets"
69
70        counter+=1
71        current_time = time.monotonic()
72        if (current_time - startTime) > 1 :
73            fps = counter / (current_time - startTime)
74            counter = 0
75            startTime = current_time
76
77        frame = imgFrame.getCvFrame()
78        trackletsData = track.tracklets
79        for t in trackletsData:
80            roi = t.roi.denormalize(frame.shape[1], frame.shape[0])
81            x1 = int(roi.topLeft().x)
82            y1 = int(roi.topLeft().y)
83            x2 = int(roi.bottomRight().x)
84            y2 = int(roi.bottomRight().y)
85
86            try:
87                label = labelMap[t.label]
88            except:
89                label = t.label
90
91            cv2.putText(frame, str(label), (x1 + 10, y1 + 20), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
92            cv2.putText(frame, f"ID: {[t.id]}", (x1 + 10, y1 + 35), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
93            cv2.putText(frame, t.status.name, (x1 + 10, y1 + 50), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
94            cv2.rectangle(frame, (x1, y1), (x2, y2), color, cv2.FONT_HERSHEY_SIMPLEX)
95
96            cv2.putText(frame, f"X: {int(t.spatialCoordinates.x)} mm", (x1 + 10, y1 + 65), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
97            cv2.putText(frame, f"Y: {int(t.spatialCoordinates.y)} mm", (x1 + 10, y1 + 80), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
98            cv2.putText(frame, f"Z: {int(t.spatialCoordinates.z)} mm", (x1 + 10, y1 + 95), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
99            if t.velocity is not None and t.speed is not None:
100                cv2.putText(frame, f"Velocity X: {t.velocity.x:.2f} m/s", (x1 + 10, y1 + 110), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
101                cv2.putText(frame, f"Velocity Y: {t.velocity.y:.2f} m/s", (x1 + 10, y1 + 125), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
102                cv2.putText(frame, f"Velocity Z: {t.velocity.z:.2f} m/s", (x1 + 10, y1 + 140), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
103                cv2.putText(frame, f"Speed: {t.speed:.2f} m/s", (x1 + 10, y1 + 155), cv2.FONT_HERSHEY_TRIPLEX, 0.5, 255)
104
105        cv2.putText(frame, "NN fps: {:.2f}".format(fps), (2, frame.shape[0] - 4), cv2.FONT_HERSHEY_TRIPLEX, 0.4, color)
106
107        cv2.imshow("tracker", frame)
108
109        if cv2.waitKey(1) == ord('q'):
110            break

C++

1#include <chrono>
2#include <depthai/depthai.hpp>
3#include <opencv2/opencv.hpp>
4#include <optional>
5
6#include "depthai/pipeline/datatype/Tracklets.hpp"
7
8int main() {
9    bool fullFrameTracking = false;
10    bool useSpatialAssociation = false;
11
12    // Create pipeline
13    dai::Pipeline pipeline;
14
15    // Define sources and outputs
16    auto camRgb = pipeline.create<dai::node::Camera>()->build(dai::CameraBoardSocket::CAM_A);
17    auto monoLeft = pipeline.create<dai::node::Camera>()->build(dai::CameraBoardSocket::CAM_B);
18    auto monoRight = pipeline.create<dai::node::Camera>()->build(dai::CameraBoardSocket::CAM_C);
19
20    // Create stereo node
21    auto stereo = pipeline.create<dai::node::StereoDepth>();
22    auto leftOutput = monoLeft->requestOutput(std::make_pair(640, 400));
23    auto rightOutput = monoRight->requestOutput(std::make_pair(640, 400));
24    leftOutput->link(stereo->left);
25    rightOutput->link(stereo->right);
26
27    // Create spatial detection network
28    dai::NNModelDescription modelDescription{"yolov6-nano"};
29    auto spatialDetectionNetwork = pipeline.create<dai::node::SpatialDetectionNetwork>()->build(camRgb, stereo, modelDescription);
30    spatialDetectionNetwork->setConfidenceThreshold(0.6f);
31    spatialDetectionNetwork->input.setBlocking(false);
32    spatialDetectionNetwork->setBoundingBoxScaleFactor(0.5f);
33    spatialDetectionNetwork->setDepthLowerThreshold(100);
34    spatialDetectionNetwork->setDepthUpperThreshold(5000);
35
36    // Create object tracker
37    auto objectTracker = pipeline.create<dai::node::ObjectTracker>();
38    objectTracker->setDetectionLabelsToTrack({0});  // track only person
39    objectTracker->setTrackerType(dai::TrackerType::SHORT_TERM_IMAGELESS);
40    objectTracker->setTrackerIdAssignmentPolicy(dai::TrackerIdAssignmentPolicy::SMALLEST_ID);
41    if(useSpatialAssociation) {
42        objectTracker->setSpatialAssociation(true);
43        objectTracker->setSpatialAssociationWeight(0.5f);
44        objectTracker->setSpatialDistanceThreshold(1.5f);
45        objectTracker->setSpatialDepthAwareScale(0.1f);
46    }
47
48    // Create output queues
49    auto preview = objectTracker->passthroughTrackerFrame.createOutputQueue();
50    auto tracklets = objectTracker->out.createOutputQueue();
51
52    // Link nodes
53    if(fullFrameTracking) {
54        camRgb->requestFullResolutionOutput()->link(objectTracker->inputTrackerFrame);
55        objectTracker->inputTrackerFrame.setBlocking(false);
56        objectTracker->inputTrackerFrame.setMaxSize(1);
57    } else {
58        spatialDetectionNetwork->passthrough.link(objectTracker->inputTrackerFrame);
59    }
60
61    spatialDetectionNetwork->passthrough.link(objectTracker->inputDetectionFrame);
62    spatialDetectionNetwork->out.link(objectTracker->inputDetections);
63
64    // Start pipeline
65    pipeline.start();
66
67    // FPS calculation variables
68    auto startTime = std::chrono::steady_clock::now();
69    int counter = 0;
70    float fps = 0;
71    cv::Scalar color(255, 255, 255);
72
73    while(pipeline.isRunning()) {
74        auto imgFrame = preview->get<dai::ImgFrame>();
75        auto track = tracklets->get<dai::Tracklets>();
76
77        counter++;
78        auto currentTime = std::chrono::steady_clock::now();
79        auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(currentTime - startTime).count();
80        if(elapsed >= 1) {
81            fps = counter / static_cast<float>(elapsed);
82            counter = 0;
83            startTime = currentTime;
84        }
85
86        cv::Mat frame = imgFrame->getCvFrame();
87        auto trackletsData = track->tracklets;
88
89        for(const auto& t : trackletsData) {
90            auto roi = t.roi.denormalize(frame.cols, frame.rows);
91            int x1 = static_cast<int>(roi.topLeft().x);
92            int y1 = static_cast<int>(roi.topLeft().y);
93            int x2 = static_cast<int>(roi.bottomRight().x);
94            int y2 = static_cast<int>(roi.bottomRight().y);
95
96            std::string label;
97            try {
98                label = spatialDetectionNetwork->getClasses().value()[t.label];
99            } catch(...) {
100                label = std::to_string(t.label);
101            }
102
103            cv::putText(frame, label, cv::Point(x1 + 10, y1 + 20), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
104            cv::putText(frame, "ID: " + std::to_string(t.id), cv::Point(x1 + 10, y1 + 35), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
105            cv::putText(frame,
106                        std::string(t.status == dai::Tracklet::TrackingStatus::LOST ? "LOST" : "TRACKED"),
107                        cv::Point(x1 + 10, y1 + 50),
108                        cv::FONT_HERSHEY_TRIPLEX,
109                        0.5,
110                        color);
111            cv::rectangle(frame, cv::Point(x1, y1), cv::Point(x2, y2), color, cv::FONT_HERSHEY_SIMPLEX);
112
113            cv::putText(frame,
114                        "X: " + std::to_string(static_cast<int>(t.spatialCoordinates.x)) + " mm",
115                        cv::Point(x1 + 10, y1 + 65),
116                        cv::FONT_HERSHEY_TRIPLEX,
117                        0.5,
118                        color);
119            cv::putText(frame,
120                        "Y: " + std::to_string(static_cast<int>(t.spatialCoordinates.y)) + " mm",
121                        cv::Point(x1 + 10, y1 + 80),
122                        cv::FONT_HERSHEY_TRIPLEX,
123                        0.5,
124                        color);
125            cv::putText(frame,
126                        "Z: " + std::to_string(static_cast<int>(t.spatialCoordinates.z)) + " mm",
127                        cv::Point(x1 + 10, y1 + 95),
128                        cv::FONT_HERSHEY_TRIPLEX,
129                        0.5,
130                        color);
131            if(t.velocity && t.speed) {
132                const auto& velocity = *t.velocity;
133                cv::putText(frame, cv::format("Velocity X: %.2f m/s", velocity.x), cv::Point(x1 + 10, y1 + 110), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
134                cv::putText(frame, cv::format("Velocity Y: %.2f m/s", velocity.y), cv::Point(x1 + 10, y1 + 125), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
135                cv::putText(frame, cv::format("Velocity Z: %.2f m/s", velocity.z), cv::Point(x1 + 10, y1 + 140), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
136                cv::putText(frame, cv::format("Speed: %.2f m/s", *t.speed), cv::Point(x1 + 10, y1 + 155), cv::FONT_HERSHEY_TRIPLEX, 0.5, color);
137            }
138        }
139
140        cv::putText(frame, "NN fps: " + std::to_string(fps).substr(0, 4), cv::Point(2, frame.rows - 4), cv::FONT_HERSHEY_TRIPLEX, 0.4, color);
141
142        cv::imshow("tracker", frame);
143
144        if(cv::waitKey(1) == 'q') {
145            break;
146        }
147    }
148
149    return 0;
150}

Need assistance?

Head over to Discussion Forum for technical support or any other questions you might have.