Commit bcde35e

Implement initializeFiltersContext for CPU device interface
Signed-off-by: Dmitry Rogozhkin <[email protected]>
1 parent: ebcb48d

File tree: 2 files changed (+108, -92 lines)

src/torchcodec/_core/CpuDeviceInterface.cpp

Lines changed: 96 additions & 85 deletions
@@ -13,6 +13,35 @@ static bool g_cpu = registerDeviceInterface(
     torch::kCPU,
     [](const torch::Device& device) { return new CpuDeviceInterface(device); });
 
+ColorConversionLibrary getColorConversionLibrary(
+    const VideoStreamOptions& videoStreamOptions,
+    int width) {
+  // By default, we want to use swscale for color conversion because it is
+  // faster. However, it has width requirements, so we may need to fall back
+  // to filtergraph. We also need to respect what was requested from the
+  // options; we respect the options unconditionally, so it's possible for
+  // swscale's width requirements to be violated. We don't expose the ability to
+  // choose color conversion library publicly; we only use this ability
+  // internally.
+
+  // swscale requires widths to be multiples of 32:
+  // https://stackoverflow.com/questions/74351955/turn-off-sw-scale-conversion-to-planar-yuv-32-byte-alignment-requirements
+  // so we fall back to filtergraph if the width is not a multiple of 32.
+  auto defaultLibrary = (width % 32 == 0)
+      ? ColorConversionLibrary::SWSCALE
+      : ColorConversionLibrary::FILTERGRAPH;
+
+  ColorConversionLibrary colorConversionLibrary =
+      videoStreamOptions.colorConversionLibrary.value_or(defaultLibrary);
+
+  TORCH_CHECK(
+      colorConversionLibrary == ColorConversionLibrary::SWSCALE ||
+          colorConversionLibrary == ColorConversionLibrary::FILTERGRAPH,
+      "Invalid color conversion library: ",
+      static_cast<int>(colorConversionLibrary));
+  return colorConversionLibrary;
+}
+
 } // namespace
 
 CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device)
@@ -22,6 +51,52 @@ CpuDeviceInterface::CpuDeviceInterface(const torch::Device& device)
       device_.type() == torch::kCPU, "Unsupported device: ", device_.str());
 }
 
+std::unique_ptr<FiltersContext> CpuDeviceInterface::initializeFiltersContextInternal(
+    const VideoStreamOptions& videoStreamOptions,
+    const UniqueAVFrame& avFrame,
+    const AVRational& timeBase) {
+  enum AVPixelFormat frameFormat =
+      static_cast<enum AVPixelFormat>(avFrame->format);
+  auto frameDims =
+      getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame);
+  int expectedOutputHeight = frameDims.height;
+  int expectedOutputWidth = frameDims.width;
+
+  std::unique_ptr<FiltersContext> filtersContext =
+      std::make_unique<FiltersContext>();
+
+  filtersContext->inputWidth = avFrame->width;
+  filtersContext->inputHeight = avFrame->height;
+  filtersContext->inputFormat = frameFormat;
+  filtersContext->inputAspectRatio = avFrame->sample_aspect_ratio;
+  filtersContext->outputWidth = expectedOutputWidth;
+  filtersContext->outputHeight = expectedOutputHeight;
+  filtersContext->outputFormat = AV_PIX_FMT_RGB24;
+  filtersContext->timeBase = timeBase;
+
+  std::stringstream filters;
+  filters << "scale=" << expectedOutputWidth << ":" << expectedOutputHeight;
+  filters << ":sws_flags=bilinear";
+
+  filtersContext->filters = filters.str();
+  return filtersContext;
+}
+
+std::unique_ptr<FiltersContext> CpuDeviceInterface::initializeFiltersContext(
+    const VideoStreamOptions& videoStreamOptions,
+    const UniqueAVFrame& avFrame,
+    const AVRational& timeBase) {
+  auto frameDims =
+      getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame);
+  int expectedOutputWidth = frameDims.width;
+
+  if (getColorConversionLibrary(videoStreamOptions, expectedOutputWidth) ==
+      ColorConversionLibrary::SWSCALE) {
+    return nullptr;
+  }
+
+  return initializeFiltersContextInternal(videoStreamOptions, avFrame, timeBase);
+}
+
 // Note [preAllocatedOutputTensor with swscale and filtergraph]:
 // Callers may pass a pre-allocated tensor, where the output.data tensor will
 // be stored. This parameter is honored in any case, but it only leads to a
@@ -56,56 +131,25 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
   }
 
   torch::Tensor outputTensor;
-  // We need to compare the current frame context with our previous frame
-  // context. If they are different, then we need to re-create our colorspace
-  // conversion objects. We create our colorspace conversion objects late so
-  // that we don't have to depend on the unreliable metadata in the header.
-  // And we sometimes re-create them because it's possible for frame
-  // resolution to change mid-stream. Finally, we want to reuse the colorspace
-  // conversion objects as much as possible for performance reasons.
-  enum AVPixelFormat frameFormat =
-      static_cast<enum AVPixelFormat>(avFrame->format);
-  FiltersContext filtersContext;
-
-  filtersContext.inputWidth = avFrame->width;
-  filtersContext.inputHeight = avFrame->height;
-  filtersContext.inputFormat = frameFormat;
-  filtersContext.inputAspectRatio = avFrame->sample_aspect_ratio;
-  filtersContext.outputWidth = expectedOutputWidth;
-  filtersContext.outputHeight = expectedOutputHeight;
-  filtersContext.outputFormat = AV_PIX_FMT_RGB24;
-  filtersContext.timeBase = timeBase;
-
-  std::stringstream filters;
-  filters << "scale=" << expectedOutputWidth << ":" << expectedOutputHeight;
-  filters << ":sws_flags=bilinear";
-
-  filtersContext.filters = filters.str();
-
-  // By default, we want to use swscale for color conversion because it is
-  // faster. However, it has width requirements, so we may need to fall back
-  // to filtergraph. We also need to respect what was requested from the
-  // options; we respect the options unconditionally, so it's possible for
-  // swscale's width requirements to be violated. We don't expose the ability to
-  // choose color conversion library publicly; we only use this ability
-  // internally.
-
-  // swscale requires widths to be multiples of 32:
-  // https://stackoverflow.com/questions/74351955/turn-off-sw-scale-conversion-to-planar-yuv-32-byte-alignment-requirements
-  // so we fall back to filtergraph if the width is not a multiple of 32.
-  auto defaultLibrary = (expectedOutputWidth % 32 == 0)
-      ? ColorConversionLibrary::SWSCALE
-      : ColorConversionLibrary::FILTERGRAPH;
-
   ColorConversionLibrary colorConversionLibrary =
-      videoStreamOptions.colorConversionLibrary.value_or(defaultLibrary);
+      getColorConversionLibrary(videoStreamOptions, expectedOutputWidth);
 
   if (colorConversionLibrary == ColorConversionLibrary::SWSCALE) {
     outputTensor = preAllocatedOutputTensor.value_or(allocateEmptyHWCTensor(
         expectedOutputHeight, expectedOutputWidth, torch::kCPU));
 
+    // We need to compare the current frame context with our previous frame
+    // context. If they are different, then we need to re-create our colorspace
+    // conversion objects. We create our colorspace conversion objects late so
+    // that we don't have to depend on the unreliable metadata in the header.
+    // And we sometimes re-create them because it's possible for frame
+    // resolution to change mid-stream. Finally, we want to reuse the colorspace
+    // conversion objects as much as possible for performance reasons.
+    std::unique_ptr<FiltersContext> filtersContext =
+        initializeFiltersContextInternal(videoStreamOptions, avFrame, timeBase);
+
     if (!swsContext_ || prevFiltersContext_ != filtersContext) {
-      createSwsContext(filtersContext, avFrame->colorspace);
+      createSwsContext(*filtersContext, avFrame->colorspace);
       prevFiltersContext_ = std::move(filtersContext);
     }
     int resultHeight =
@@ -122,25 +166,16 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
 
     frameOutput.data = outputTensor;
   } else if (colorConversionLibrary == ColorConversionLibrary::FILTERGRAPH) {
-    if (!filterGraphContext_ || prevFiltersContext_ != filtersContext) {
-      filterGraphContext_ =
-          std::make_unique<FilterGraph>(filtersContext, videoStreamOptions);
-      prevFiltersContext_ = std::move(filtersContext);
-    }
-    outputTensor = convertAVFrameToTensorUsingFilterGraph(avFrame);
+    TORCH_CHECK_EQ(avFrame->format, AV_PIX_FMT_RGB24);
 
-    // Similarly to above, if this check fails it means the frame wasn't
-    // reshaped to its expected dimensions by filtergraph.
-    auto shape = outputTensor.sizes();
-    TORCH_CHECK(
-        (shape.size() == 3) && (shape[0] == expectedOutputHeight) &&
-            (shape[1] == expectedOutputWidth) && (shape[2] == 3),
-        "Expected output tensor of shape ",
-        expectedOutputHeight,
-        "x",
-        expectedOutputWidth,
-        "x3, got ",
-        shape);
+    std::vector<int64_t> shape = {expectedOutputHeight, expectedOutputWidth, 3};
+    std::vector<int64_t> strides = {avFrame->linesize[0], 3, 1};
+    AVFrame* avFramePtr = avFrame.release();
+    auto deleter = [avFramePtr](void*) {
+      UniqueAVFrame avFrameToDelete(avFramePtr);
+    };
+    outputTensor = torch::from_blob(
+        avFramePtr->data[0], shape, strides, deleter, {torch::kUInt8});
 
     if (preAllocatedOutputTensor.has_value()) {
       // We have already validated that preAllocatedOutputTensor and
@@ -150,11 +185,6 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
     } else {
       frameOutput.data = outputTensor;
     }
-  } else {
-    TORCH_CHECK(
-        false,
-        "Invalid color conversion library: ",
-        static_cast<int>(colorConversionLibrary));
   }
 }
 
@@ -176,25 +206,6 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwsScale(
   return resultHeight;
 }
 
-torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph(
-    const UniqueAVFrame& avFrame) {
-  UniqueAVFrame filteredAVFrame = filterGraphContext_->convert(avFrame);
-
-  TORCH_CHECK_EQ(filteredAVFrame->format, AV_PIX_FMT_RGB24);
-
-  auto frameDims = getHeightAndWidthFromResizedAVFrame(*filteredAVFrame.get());
-  int height = frameDims.height;
-  int width = frameDims.width;
-  std::vector<int64_t> shape = {height, width, 3};
-  std::vector<int64_t> strides = {filteredAVFrame->linesize[0], 3, 1};
-  AVFrame* filteredAVFramePtr = filteredAVFrame.release();
-  auto deleter = [filteredAVFramePtr](void*) {
-    UniqueAVFrame avFrameToDelete(filteredAVFramePtr);
-  };
-  return torch::from_blob(
-      filteredAVFramePtr->data[0], shape, strides, deleter, {torch::kUInt8});
-}
-
 void CpuDeviceInterface::createSwsContext(
     const FiltersContext& filtersContext,
     const enum AVColorSpace colorspace) {
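
For a concrete feel of the selection rule factored out into getColorConversionLibrary(): widths that are multiples of 32 default to the faster swscale path, any other width falls back to filtergraph, and an explicitly requested library always wins. A minimal standalone sketch of that rule, with a hypothetical Library enum and pickLibrary helper that are not part of this commit:

#include <iostream>
#include <optional>

enum class Library { SWSCALE, FILTERGRAPH };

// Mirrors the width-based default: swscale only when width % 32 == 0,
// and an explicit request overrides the default unconditionally.
Library pickLibrary(std::optional<Library> requested, int width) {
  Library fallback =
      (width % 32 == 0) ? Library::SWSCALE : Library::FILTERGRAPH;
  return requested.value_or(fallback);
}

int main() {
  // 1920 is a multiple of 32, so the default is the faster swscale path.
  std::cout << (pickLibrary(std::nullopt, 1920) == Library::SWSCALE) << "\n";
  // 1918 % 32 == 30, so the default falls back to filtergraph.
  std::cout << (pickLibrary(std::nullopt, 1918) == Library::FILTERGRAPH) << "\n";
  // An explicit request is honored even when swscale's width rule is violated.
  std::cout << (pickLibrary(Library::SWSCALE, 1918) == Library::SWSCALE) << "\n";
}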

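The rewritten FILTERGRAPH branch no longer copies pixels: it wraps the already-converted RGB24 frame with torch::from_blob, releases the AVFrame from its unique_ptr, and hands ownership to a custom deleter so the frame is freed only when the tensor's storage is. The row stride is linesize[0] rather than width * 3 because FFmpeg may pad each row. A self-contained sketch of the same ownership pattern; wrapRgb24Frame is an illustrative helper (not in the commit) that assumes libtorch and libavutil headers are available and that the caller transfers ownership of the frame:

#include <torch/torch.h>
#include <vector>
extern "C" {
#include <libavutil/frame.h>
}

// Wrap a packed-RGB24 AVFrame's pixels in an HWC uint8 tensor without
// copying. The deleter frees the frame once the tensor is destroyed.
torch::Tensor wrapRgb24Frame(AVFrame* frame, int height, int width) {
  std::vector<int64_t> shape = {height, width, 3};
  // linesize[0] is the byte stride between rows; it may exceed width * 3.
  std::vector<int64_t> strides = {frame->linesize[0], 3, 1};
  auto deleter = [frame](void*) {
    AVFrame* toFree = frame; // captured copy is const; av_frame_free needs AVFrame**
    av_frame_free(&toFree);
  };
  return torch::from_blob(
      frame->data[0], shape, strides, deleter, torch::kUInt8);
}
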
src/torchcodec/_core/CpuDeviceInterface.h

Lines changed: 12 additions & 7 deletions
@@ -26,6 +26,11 @@ class CpuDeviceInterface : public DeviceInterface {
   void initializeContext(
       [[maybe_unused]] AVCodecContext* codecContext) override {}
 
+  std::unique_ptr<FiltersContext> initializeFiltersContext(
+      const VideoStreamOptions& videoStreamOptions,
+      const UniqueAVFrame& avFrame,
+      const AVRational& timeBase) override;
+
   void convertAVFrameToFrameOutput(
       const VideoStreamOptions& videoStreamOptions,
       const AVRational& timeBase,
@@ -39,21 +44,21 @@ class CpuDeviceInterface : public DeviceInterface {
       const UniqueAVFrame& avFrame,
       torch::Tensor& outputTensor);
 
-  torch::Tensor convertAVFrameToTensorUsingFilterGraph(
-      const UniqueAVFrame& avFrame);
+  std::unique_ptr<FiltersContext> initializeFiltersContextInternal(
+      const VideoStreamOptions& videoStreamOptions,
+      const UniqueAVFrame& avFrame,
+      const AVRational& timeBase);
 
   void createSwsContext(
       const FiltersContext& filtersContext,
       const enum AVColorSpace colorspace);
 
-  // color-conversion fields. Only one of FilterGraphContext and
-  // UniqueSwsContext should be non-null.
-  std::unique_ptr<FilterGraph> filterGraphContext_;
+  // SWS color conversion context
   UniqueSwsContext swsContext_;
 
-  // Used to know whether a new FilterGraphContext or UniqueSwsContext should
+  // Used to know whether a new UniqueSwsContext should
   // be created before decoding a new frame.
-  FiltersContext prevFiltersContext_;
+  std::unique_ptr<FiltersContext> prevFiltersContext_;
 };
 
 } // namespace facebook::torchcodec
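
On the state tracked in this header: prevFiltersContext_ now holds the last frame's FiltersContext behind a unique_ptr, and the .cpp consults it so colorspace-conversion objects are rebuilt only when the frame context actually changes (for example, a mid-stream resolution change). A self-contained sketch of that caching idea, using hypothetical Ctx and Converter types in place of FiltersContext and the sws context; this sketch compares cached and current contexts by value, since a fresh context object is allocated for every frame:

#include <iostream>
#include <memory>

struct Ctx {
  int width = 0;
  int height = 0;
  bool operator!=(const Ctx& other) const {
    return width != other.width || height != other.height;
  }
};

// Stand-in for an expensive conversion object (e.g. an sws context).
struct Converter {
  explicit Converter(const Ctx& c) {
    std::cout << "rebuild for " << c.width << "x" << c.height << "\n";
  }
};

int main() {
  std::unique_ptr<Ctx> prev;
  std::unique_ptr<Converter> conv;
  for (int w : {1920, 1920, 1280}) {
    auto cur = std::make_unique<Ctx>(Ctx{w, 1080});
    // Compare the pointees, not the pointers: each frame gets a freshly
    // allocated context, so pointer comparison would always rebuild.
    if (!conv || !prev || *prev != *cur) {
      conv = std::make_unique<Converter>(*cur);
      prev = std::move(cur);
    }
  }
  // Prints "rebuild" twice: once for 1920x1080, once for 1280x1080.
}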
