From d154217f9c7fc022572cf3117f1e90d22cb964eb Mon Sep 17 00:00:00 2001 From: Tarun Bommawar Date: Fri, 29 May 2026 17:09:13 -0400 Subject: [PATCH] feat: add MediaPipe hand landmark support --- src/supervision/key_points/core.py | 45 +++++++++++++------------ src/supervision/key_points/skeletons.py | 23 +++++++++++++ tests/helpers.py | 12 ++++++- tests/key_points/test_core.py | 35 +++++++++++++++++++ tests/key_points/test_skeletons.py | 34 +++++++++++++++++++ 5 files changed, 126 insertions(+), 23 deletions(-) diff --git a/src/supervision/key_points/core.py b/src/supervision/key_points/core.py index 823173352b..c0bd7392cf 100644 --- a/src/supervision/key_points/core.py +++ b/src/supervision/key_points/core.py @@ -537,9 +537,10 @@ def from_mediapipe( pose landmark detection inference result. Args: - mediapipe_results: The output results from Mediapipe. It supports pose - and face landmarks from `PoseLandmarker`, `FaceLandmarker` and the - legacy ones from `Pose` and `FaceMesh`. + mediapipe_results: The output results from Mediapipe. It supports pose, + face, and hand landmarks from `PoseLandmarker`, `FaceLandmarker`, + `HandLandmarker`, and the legacy ones from `Pose`, `FaceMesh`, and + `Hands`. resolution_wh: A tuple of the form `(width, height)` representing the resolution of the frame. @@ -606,28 +607,28 @@ def from_mediapipe( ``` """ - if hasattr(mediapipe_results, "pose_landmarks"): + if getattr(mediapipe_results, "pose_landmarks", None) is not None: results = mediapipe_results.pose_landmarks if not isinstance(mediapipe_results.pose_landmarks, list): - if mediapipe_results.pose_landmarks is None: - results = [] - else: - results = [ - [ - landmark - for landmark in mediapipe_results.pose_landmarks.landmark - ] - ] - elif hasattr(mediapipe_results, "face_landmarks"): - results = mediapipe_results.face_landmarks - elif hasattr(mediapipe_results, "multi_face_landmarks"): - if mediapipe_results.multi_face_landmarks is None: - results = [] - else: results = [ - face_landmark.landmark - for face_landmark in mediapipe_results.multi_face_landmarks + [landmark for landmark in mediapipe_results.pose_landmarks.landmark] ] + elif getattr(mediapipe_results, "face_landmarks", None) is not None: + results = mediapipe_results.face_landmarks + elif getattr(mediapipe_results, "hand_landmarks", None) is not None: + results = mediapipe_results.hand_landmarks + elif getattr(mediapipe_results, "multi_face_landmarks", None) is not None: + results = [ + face_landmark.landmark + for face_landmark in mediapipe_results.multi_face_landmarks + ] + elif getattr(mediapipe_results, "multi_hand_landmarks", None) is not None: + results = [ + hand_landmark.landmark + for hand_landmark in mediapipe_results.multi_hand_landmarks + ] + else: + results = [] if len(results) == 0: return cls.empty() @@ -643,7 +644,7 @@ def from_mediapipe( landmark.y * resolution_wh[1], ] prediction_xy.append(keypoint_xy) - prediction_confidence.append(landmark.visibility) + prediction_confidence.append(getattr(landmark, "visibility", 1.0)) xy.append(prediction_xy) confidence.append(prediction_confidence) diff --git a/src/supervision/key_points/skeletons.py b/src/supervision/key_points/skeletons.py index 71edfd657b..71e473710a 100644 --- a/src/supervision/key_points/skeletons.py +++ b/src/supervision/key_points/skeletons.py @@ -24,6 +24,29 @@ class Skeleton(Enum): (17, 15), ) + HAND = ( + (1, 2), + (2, 3), + (3, 4), + (4, 5), + (1, 6), + (6, 7), + (7, 8), + (8, 9), + (1, 10), + (10, 11), + (11, 12), + (12, 13), + (1, 14), + (14, 15), + (15, 16), + (16, 17), + (1, 18), + (18, 19), + (19, 20), + (20, 21), + ) + GHUM = ( (1, 2), (1, 5), diff --git a/tests/helpers.py b/tests/helpers.py index 1ce209b39c..b6506995f2 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -329,6 +329,12 @@ def __init__(self, x, y, visibility=1.0): self.visibility = visibility +class _FakeMediapipeLandmarkWithoutVisibility: + def __init__(self, x, y): + self.x = x + self.y = y + + class _FakeMediapipePose: def __init__(self, landmarks: list[_FakeMediapipeLandmark]): self.landmark = landmarks @@ -341,11 +347,15 @@ def __init__( | _FakeMediapipePose | None = None, face_landmarks: _FakeMediapipeLandmark | None = None, - multi_face_landmarks: list[_FakeMediapipeLandmark] | None = None, + hand_landmarks: list[list[_FakeMediapipeLandmark]] | None = None, + multi_face_landmarks: list[_FakeMediapipePose] | None = None, + multi_hand_landmarks: list[_FakeMediapipePose] | None = None, ): self.pose_landmarks = pose_landmarks self.face_landmarks = face_landmarks + self.hand_landmarks = hand_landmarks self.multi_face_landmarks = multi_face_landmarks + self.multi_hand_landmarks = multi_hand_landmarks def create_yolo_dataset( diff --git a/tests/key_points/test_core.py b/tests/key_points/test_core.py index e0f559f80d..82a496c23e 100644 --- a/tests/key_points/test_core.py +++ b/tests/key_points/test_core.py @@ -8,6 +8,7 @@ from tests.helpers import ( _create_key_points, _FakeMediapipeLandmark, + _FakeMediapipeLandmarkWithoutVisibility, _FakeMediapipePose, _FakeMediapipeResults, _FakeYoloNasKeyPoint, @@ -758,6 +759,40 @@ def test_from_yolo_nas_input(yolo_nas_results, expected_key_points): class_id=None, ), ), + ( + _FakeMediapipeResults( + hand_landmarks=[ + [ + _FakeMediapipeLandmarkWithoutVisibility(0.1, 0.2), + _FakeMediapipeLandmarkWithoutVisibility(0.3, 0.4), + ] + ] + ), + (100, 200), + _create_key_points( + xy=[[[10.0, 40.0], [30.0, 80.0]]], + confidence=[[1.0, 1.0]], + class_id=None, + ), + ), + ( + _FakeMediapipeResults( + multi_hand_landmarks=[ + _FakeMediapipePose( + landmarks=[ + _FakeMediapipeLandmarkWithoutVisibility(0.1, 0.2), + _FakeMediapipeLandmarkWithoutVisibility(0.3, 0.4), + ] + ) + ] + ), + (100, 200), + _create_key_points( + xy=[[[10.0, 40.0], [30.0, 80.0]]], + confidence=[[1.0, 1.0]], + class_id=None, + ), + ), ], ) def test_from_mediapipe_input(mediapipe_results, resolution_wh, expected_key_points): diff --git a/tests/key_points/test_skeletons.py b/tests/key_points/test_skeletons.py index e97562888f..ff9e16bb5b 100644 --- a/tests/key_points/test_skeletons.py +++ b/tests/key_points/test_skeletons.py @@ -58,3 +58,37 @@ def test_skeletons_by_vertex_count_mapping_behaviour(self): # For each vertex count, the stored skeleton should be the last one encountered for vertex_count, skeleton_value in expected_mapping.items(): assert SKELETONS_BY_VERTEX_COUNT[vertex_count] == skeleton_value + + def test_hand_skeleton_definition(self): + """Test MediaPipe hand skeleton definition.""" + hand_skeleton = Skeleton.HAND.value + + assert len(hand_skeleton) == 20 + assert len({vertex for edge in hand_skeleton for vertex in edge}) == 21 + assert SKELETONS_BY_VERTEX_COUNT[21] == hand_skeleton + assert SKELETONS_BY_EDGE_COUNT[20] == hand_skeleton + + def test_hand_skeleton_edges(self): + """Test MediaPipe hand skeleton follows expected finger connections.""" + assert Skeleton.HAND.value == ( + (1, 2), + (2, 3), + (3, 4), + (4, 5), + (1, 6), + (6, 7), + (7, 8), + (8, 9), + (1, 10), + (10, 11), + (11, 12), + (12, 13), + (1, 14), + (14, 15), + (15, 16), + (16, 17), + (1, 18), + (18, 19), + (19, 20), + (20, 21), + )