facebookresearch · cdtwigg · Nov 5, 2025 · Nov 5, 2025
diff --git a/cmake/build_variables.bzl b/cmake/build_variables.bzl
@@ -698,6 +698,7 @@ rasterizer_public_headers = [
     "rasterizer/image.h",
     "rasterizer/rasterizer.h",
     "rasterizer/tensor.h",
+    "rasterizer/text_rasterizer.h",
     "rasterizer/utility.h",
 ]
 
@@ -706,12 +707,14 @@ rasterizer_sources = [
     "rasterizer/geometry.cpp",
     "rasterizer/rasterizer.cpp",
     "rasterizer/image.cpp",
+    "rasterizer/text_rasterizer.cpp",
 ]
 
 rasterizer_test_sources = [
     "test/rasterizer/test_camera.cpp",
     "test/rasterizer/test_geometry.cpp",
     "test/rasterizer/test_software_rasterizer.cpp",
+    "test/rasterizer/test_text_rasterizer.cpp",
 ]
 
 #===========

diff --git a/momentum/marker_tracking/marker_tracker.cpp b/momentum/marker_tracking/marker_tracker.cpp
diff --git a/momentum/marker_tracking/marker_tracker.h b/momentum/marker_tracking/marker_tracker.h
@@ -155,6 +155,7 @@ Eigen::MatrixXf trackPosesPerframe(
 /// too.
 /// @param[in] config Solving options.
 /// @param[in] frameIndices Frame indices of the frames to be solved.
+/// @param[in] isContinuous Whether to use temporal coherence between frames.
 ///
 /// @return The solved motion. It has the same length as markerData. It repeats the same solved pose
 /// within a frame stride.
@@ -163,7 +164,8 @@ Eigen::MatrixXf trackPosesForFrames(
     const momentum::Character& character,
     const Eigen::MatrixXf& initialMotion,
     const TrackingConfig& config,
-    const std::vector<size_t>& frameIndices);
+    const std::vector<size_t>& frameIndices,
+    bool isContinuous = false);
 
 /// Calibrate body proportions and locator offsets of a character from input marker data.
 ///

diff --git a/momentum/rasterizer/text_rasterizer.cpp b/momentum/rasterizer/text_rasterizer.cpp
diff --git a/momentum/rasterizer/text_rasterizer.h b/momentum/rasterizer/text_rasterizer.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <momentum/rasterizer/camera.h>
+#include <momentum/rasterizer/fwd.h>
+#include <momentum/rasterizer/rasterizer.h>
+#include <Eigen/Core>
+#include <gsl/span>
+#include <string>
+
+namespace momentum::rasterizer {
+
+/// Horizontal alignment of rendered text relative to the position it is drawn at.
+enum class HorizontalAlignment {
+  Left, ///< Default used by rasterizeText() and rasterizeText2D().
+  Center,
+  Right,
+};
+
+/// Vertical alignment of rendered text relative to the position it is drawn at.
+enum class VerticalAlignment {
+  Top, ///< Default used by rasterizeText() and rasterizeText2D().
+  Center,
+  Bottom,
+};
+
+/// Rasterize text at 3D world positions
+///
+/// Projects positions through the camera and draws each string there in an embedded bitmap font.
+///
+/// @param positionsWorld 3D positions in world coordinates where text should be rendered
+/// @param texts Text strings to render at each position
+/// @param camera Camera to render from
+/// @param modelMatrix Model transformation matrix
+/// @param nearClip Near clipping distance
+/// @param color RGB color for the text
+/// @param textScale Integer scaling factor for text size (1 = 1 pixel per font pixel)
+/// @param zBuffer Input/output depth buffer (SIMD-aligned)
+/// @param rgbBuffer Optional input/output RGB color buffer
+/// @param depthOffset Offset applied to the depth values (defaults to 0)
+/// @param imageOffset Pixel offset for positioning
+/// @param horizontalAlignment Horizontal text alignment relative to position
+/// @param verticalAlignment Vertical text alignment relative to position
+void rasterizeText(
+    gsl::span<const Eigen::Vector3f> positionsWorld,
+    gsl::span<const std::string> texts,
+    const Camera& camera,
+    const Eigen::Matrix4f& modelMatrix,
+    float nearClip,
+    const Eigen::Vector3f& color,
+    int textScale,
+    Span2f zBuffer,
+    Span3f rgbBuffer = {},
+    float depthOffset = 0,
+    const Eigen::Vector2f& imageOffset = {0, 0},
+    HorizontalAlignment horizontalAlignment = HorizontalAlignment::Left,
+    VerticalAlignment verticalAlignment = VerticalAlignment::Top);
+
+/// Rasterize text directly in 2D image space
+///
+/// Renders text at 2D image positions without camera projection or depth testing.
+///
+/// @param positionsImage 2D positions in image coordinates where text should be rendered
+/// @param texts Text strings to render at each position
+/// @param color RGB color for the text
+/// @param textScale Integer scaling factor for text size (1 = 1 pixel per font pixel)
+/// @param rgbBuffer Input/output RGB color buffer
+/// @param zBuffer Optional depth buffer; when provided, zeros are written where text is drawn
+/// @param imageOffset Pixel offset for positioning
+/// @param horizontalAlignment Horizontal text alignment relative to position
+/// @param verticalAlignment Vertical text alignment relative to position
+void rasterizeText2D(
+    gsl::span<const Eigen::Vector2f> positionsImage,
+    gsl::span<const std::string> texts,
+    const Eigen::Vector3f& color,
+    int textScale,
+    Span3f rgbBuffer,
+    Span2f zBuffer = {},
+    const Eigen::Vector2f& imageOffset = {0, 0},
+    HorizontalAlignment horizontalAlignment = HorizontalAlignment::Left,
+    VerticalAlignment verticalAlignment = VerticalAlignment::Top);
+
+} // namespace momentum::rasterizer
diff --git a/momentum/test/rasterizer/test_text_rasterizer.cpp b/momentum/test/rasterizer/test_text_rasterizer.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <momentum/rasterizer/camera.h>
+#include <momentum/rasterizer/image.h>
+#include <momentum/rasterizer/text_rasterizer.h>
+
+#include <gtest/gtest.h>
+
+using namespace momentum::rasterizer;
+
+TEST(TextRasterizer, BasicText3D) { // 3D path: one string projected through the camera
+  const int width = 200;
+  const int height = 100;
+
+  OpenCVDistortionParametersT<float> distortionParams;
+  auto intrinsics = std::make_shared<OpenCVIntrinsicsModel>(
+      width, height, width / 2.0f, height / 2.0f, width / 2.0f, height / 2.0f, distortionParams);
+
+  Camera camera(intrinsics);
+
+  auto zBuffer = makeRasterizerZBuffer(camera);
+  auto rgbBuffer = makeRasterizerRGBBuffer(camera);
+
+  std::vector<Eigen::Vector3f> positions = {Eigen::Vector3f(0.0f, 0.0f, 1.5f)};
+  std::vector<std::string> texts = {"Hello"};
+
+  rasterizeText(
+      positions,
+      texts,
+      camera,
+      Eigen::Matrix4f::Identity(), // modelMatrix
+      0.1f, // nearClip
+      Eigen::Vector3f(1.0f, 0.0f, 0.0f), // color: pure red
+      1, // textScale
+      zBuffer.view(),
+      rgbBuffer.view());
+
+  int pixelsSet = 0;
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      if (zBuffer(y, x) < FLT_MAX) { // written pixel; assumes z-buffer starts at FLT_MAX (confirm)
+        pixelsSet++;
+        EXPECT_NEAR(rgbBuffer(y, x, 0), 1.0f, 1e-5f);
+        EXPECT_NEAR(rgbBuffer(y, x, 1), 0.0f, 1e-5f);
+        EXPECT_NEAR(rgbBuffer(y, x, 2), 0.0f, 1e-5f);
+        EXPECT_NEAR(zBuffer(y, x), 1.5f, 1e-5f); // z of the single input position
+      }
+    }
+  }
+
+  EXPECT_GT(pixelsSet, 0); // at least some text must have landed in the image
+}
+
+TEST(TextRasterizer, BasicText2D) { // 2D path: text drawn straight into image coordinates
+  const int width = 200;
+  const int height = 100;
+
+  OpenCVDistortionParametersT<float> distortionParams;
+  auto intrinsics = std::make_shared<OpenCVIntrinsicsModel>(
+      width, height, width / 2.0f, height / 2.0f, width / 2.0f, height / 2.0f, distortionParams);
+
+  Camera camera(intrinsics); // only used here to size the two buffers
+
+  auto zBuffer = makeRasterizerZBuffer(camera);
+  auto rgbBuffer = makeRasterizerRGBBuffer(camera);
+
+  std::vector<Eigen::Vector2f> positions = {Eigen::Vector2f(10.0f, 10.0f)};
+  std::vector<std::string> texts = {"Test"};
+
+  rasterizeText2D( // green text, textScale 1, optional depth buffer supplied
+      positions, texts, Eigen::Vector3f(0.0f, 1.0f, 0.0f), 1, rgbBuffer.view(), zBuffer.view());
+
+  int pixelsSet = 0;
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      if (zBuffer(y, x) < FLT_MAX) { // NOTE(review): FLT_MAX relies on a transitive <cfloat>
+        pixelsSet++;
+        EXPECT_NEAR(rgbBuffer(y, x, 0), 0.0f, 1e-5f);
+        EXPECT_NEAR(rgbBuffer(y, x, 1), 1.0f, 1e-5f);
+        EXPECT_NEAR(rgbBuffer(y, x, 2), 0.0f, 1e-5f);
+        EXPECT_NEAR(zBuffer(y, x), 0.0f, 1e-5f); // 2D text is written with depth zero
+      }
+    }
+  }
+
+  EXPECT_GT(pixelsSet, 0); // at least some text must have landed in the image
+}
+
+TEST(TextRasterizer, TextScaling) { // scale 2 should light roughly 4x the pixels of scale 1
+  const int width = 400;
+  const int height = 200;
+
+  OpenCVDistortionParametersT<float> distortionParams;
+  auto intrinsics = std::make_shared<OpenCVIntrinsicsModel>(
+      width, height, width / 2.0f, height / 2.0f, width / 2.0f, height / 2.0f, distortionParams);
+
+  Camera camera(intrinsics); // only used here to size the two buffers
+
+  auto rgbBuffer1 = makeRasterizerRGBBuffer(camera); // receives the scale-1 render
+  auto rgbBuffer2 = makeRasterizerRGBBuffer(camera); // receives the scale-2 render
+
+  std::vector<Eigen::Vector2f> positions = {Eigen::Vector2f(10.0f, 10.0f)};
+  std::vector<std::string> texts = {"A"};
+
+  rasterizeText2D(positions, texts, Eigen::Vector3f(1.0f, 1.0f, 1.0f), 1, rgbBuffer1.view());
+
+  int pixelsScale1 = 0; // lit pixels at textScale = 1
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      if (rgbBuffer1(y, x, 0) > 0.5f) { // white text => red = 1; assumes dark background (confirm)
+        pixelsScale1++;
+      }
+    }
+  }
+
+  rasterizeText2D(positions, texts, Eigen::Vector3f(1.0f, 1.0f, 1.0f), 2, rgbBuffer2.view());
+
+  int pixelsScale2 = 0; // lit pixels at textScale = 2
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      if (rgbBuffer2(y, x, 0) > 0.5f) {
+        pixelsScale2++;
+      }
+    }
+  }
+
+  ASSERT_GT(pixelsScale1, 0); // fatal on purpose: the ratio below divides by this count
+  EXPECT_GT(pixelsScale2, pixelsScale1);
+  EXPECT_NEAR(static_cast<float>(pixelsScale2) / pixelsScale1, 4.0f, 1.0f); // 2x per axis
+}
+
+TEST(TextRasterizer, MultipleTexts) { // two strings rendered by a single rasterizeText2D call
+  const int width = 400;
+  const int height = 200;
+
+  OpenCVDistortionParametersT<float> distortionParams;
+  auto intrinsics = std::make_shared<OpenCVIntrinsicsModel>(
+      width, height, width / 2.0f, height / 2.0f, width / 2.0f, height / 2.0f, distortionParams);
+
+  Camera camera(intrinsics); // only used here to size the two buffers
+
+  auto zBuffer = makeRasterizerZBuffer(camera);
+  auto rgbBuffer = makeRasterizerRGBBuffer(camera);
+
+  std::vector<Eigen::Vector2f> positions = {
+      Eigen::Vector2f(10.0f, 10.0f), Eigen::Vector2f(10.0f, 30.0f)}; // 20 px apart in y
+  std::vector<std::string> texts = {"Line1", "Line2"};
+
+  rasterizeText2D( // magenta text, textScale 1, optional depth buffer supplied
+      positions, texts, Eigen::Vector3f(1.0f, 0.0f, 1.0f), 1, rgbBuffer.view(), zBuffer.view());
+
+  int pixelsSet = 0;
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      if (zBuffer(y, x) < FLT_MAX) { // written pixel; assumes z-buffer starts at FLT_MAX (confirm)
+        pixelsSet++;
+        EXPECT_NEAR(rgbBuffer(y, x, 0), 1.0f, 1e-5f);
+        EXPECT_NEAR(rgbBuffer(y, x, 1), 0.0f, 1e-5f);
+        EXPECT_NEAR(rgbBuffer(y, x, 2), 1.0f, 1e-5f);
+      }
+    }
+  }
+
+  EXPECT_GT(pixelsSet, 0); // NOTE(review): still passes if only one of the strings is drawn
+}
diff --git a/pymomentum/renderer/renderer_pybind.cpp b/pymomentum/renderer/renderer_pybind.cpp
@@ -15,6 +15,7 @@
 #include <momentum/character/skeleton_state.h>
 #include <momentum/rasterizer/camera.h>
 #include <momentum/rasterizer/rasterizer.h>
+#include <momentum/rasterizer/text_rasterizer.h>
 
 #include <pybind11/eigen.h>
 #include <pybind11/pybind11.h>
@@ -645,6 +646,18 @@ PYBIND11_MODULE(renderer, m) {
       .value("Ambient", momentum::rasterizer::LightType::Ambient)
       .value("Directional", momentum::rasterizer::LightType::Directional)
       .value("Point", momentum::rasterizer::LightType::Point);
+
+  py::enum_<momentum::rasterizer::HorizontalAlignment>(
+      m, "HorizontalAlignment", "Horizontal text alignment options.")
+      .value("Left", momentum::rasterizer::HorizontalAlignment::Left) // default for rasterize_text*
+      .value("Center", momentum::rasterizer::HorizontalAlignment::Center)
+      .value("Right", momentum::rasterizer::HorizontalAlignment::Right);
+
+  py::enum_<momentum::rasterizer::VerticalAlignment>(
+      m, "VerticalAlignment", "Vertical text alignment options.")
+      .value("Top", momentum::rasterizer::VerticalAlignment::Top) // default for rasterize_text*
+      .value("Center", momentum::rasterizer::VerticalAlignment::Center)
+      .value("Bottom", momentum::rasterizer::VerticalAlignment::Bottom);
   py::class_<momentum::rasterizer::Light>(
       m,
       "Light",
@@ -1459,4 +1472,65 @@ This is useful for rendering shadows using the classic projection shadows techni
       py::arg("light"),
       py::arg("plane_normal") = std::optional<Eigen::Vector3f>{},
       py::arg("plane_origin") = std::optional<Eigen::Vector3f>{});
+
+  m.def(
+      "rasterize_text",
+      &rasterizeText, // NOTE(review): py::args don't match text_rasterizer.h; wrapper? confirm
+      R"(Rasterize text at 3D world positions.
+
+Projects 3D positions to image space using the camera and renders text strings at those locations using an embedded bitmap font.
+
+:param positions: (nTexts x 3) torch.Tensor of 3D positions in world coordinates.
+:param texts: List of strings to render at each position.
+:param camera: Camera to render from.
+:param z_buffer: Z-buffer to render geometry onto; can be reused for multiple renders.
+:param rgb_buffer: Optional RGB-buffer to render geometry onto.
+:param color: RGB color for the text. Defaults to white (1, 1, 1).
+:param text_scale: Integer scaling factor for text size (1 = 1 pixel per font pixel). Defaults to 1.
+:param horizontal_alignment: Horizontal text alignment (Left, Center, or Right). Defaults to Left.
+:param vertical_alignment: Vertical text alignment (Top, Center, or Bottom). Defaults to Top.
+:param model_matrix: Additional matrix to apply to the model. Defaults to identity matrix.
+:param near_clip: Clip any text closer than this depth. Defaults to 0.1.
+:param depth_offset: Offset the depth values. Defaults to 0.
+:param image_offset: Offset by (x, y) pixels in image space.
+)",
+      py::arg("positions"),
+      py::arg("texts"),
+      py::arg("camera"),
+      py::arg("z_buffer"),
+      py::arg("rgb_buffer") = std::optional<at::Tensor>{},
+      py::kw_only(), // every argument registered after this marker is keyword-only in Python
+      py::arg("color") = Eigen::Vector3f(1.0f, 1.0f, 1.0f),
+      py::arg("text_scale") = 1,
+      py::arg("horizontal_alignment") = momentum::rasterizer::HorizontalAlignment::Left,
+      py::arg("vertical_alignment") = momentum::rasterizer::VerticalAlignment::Top,
+      py::arg("model_matrix") = std::optional<Eigen::Matrix4f>{}, // unset: identity per docstring
+      py::arg("near_clip") = 0.1f,
+      py::arg("depth_offset") = 0.0f,
+      py::arg("image_offset") = std::optional<Eigen::Vector2f>{});
+
+  m.def(
+      "rasterize_text_2d",
+      &rasterizeText2D, // NOTE(review): py::args don't match text_rasterizer.h; wrapper? confirm
+      R"(Rasterize text directly in 2D image space without camera projection or depth testing.
+
+:param positions: (nTexts x 2) torch.Tensor of 2D positions in image coordinates.
+:param texts: List of strings to render at each position.
+:param rgb_buffer: RGB-buffer to render geometry onto.
+:param color: RGB color for the text. Defaults to white (1, 1, 1).
+:param text_scale: Integer scaling factor for text size (1 = 1 pixel per font pixel). Defaults to 1.
+:param horizontal_alignment: Horizontal text alignment (Left, Center, or Right). Defaults to Left.
+:param vertical_alignment: Vertical text alignment (Top, Center, or Bottom). Defaults to Top.
+:param z_buffer: Optional Z-buffer to write zeros to for alpha matting.
+:param image_offset: Offset by (x, y) pixels in image space.
+)",
+      py::arg("positions"),
+      py::arg("texts"),
+      py::arg("rgb_buffer"), // NOTE(review): no py::kw_only() after this, unlike rasterize_text
+      py::arg("color") = Eigen::Vector3f(1.0f, 1.0f, 1.0f),
+      py::arg("text_scale") = 1,
+      py::arg("horizontal_alignment") = momentum::rasterizer::HorizontalAlignment::Left,
+      py::arg("vertical_alignment") = momentum::rasterizer::VerticalAlignment::Top,
+      py::arg("z_buffer") = std::optional<at::Tensor>{}, // optional: may be omitted from Python
+      py::arg("image_offset") = std::optional<Eigen::Vector2f>{});
 }