Skip to content

Commit 84b01ce

Browse files
Dmitry Rogozhkin (dvrogozh)
authored and committed
Generalize FilterGraph class to support HW backends
Signed-off-by: Dmitry Rogozhkin <[email protected]>
1 parent c7848ec commit 84b01ce

File tree

4 files changed

+98
-77
lines changed

4 files changed

+98
-77
lines changed

src/torchcodec/_core/CpuDeviceInterface.cpp

Lines changed: 30 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -65,13 +65,22 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
6565
// conversion objects as much as possible for performance reasons.
6666
enum AVPixelFormat frameFormat =
6767
static_cast<enum AVPixelFormat>(avFrame->format);
68-
auto frameContext = DecodedFrameContext{
69-
avFrame->width,
70-
avFrame->height,
71-
frameFormat,
72-
avFrame->sample_aspect_ratio,
73-
expectedOutputWidth,
74-
expectedOutputHeight};
68+
FiltersContext filtersContext;
69+
70+
filtersContext.inputWidth = avFrame->width;
71+
filtersContext.inputHeight = avFrame->height;
72+
filtersContext.inputFormat = frameFormat;
73+
filtersContext.inputAspectRatio = avFrame->sample_aspect_ratio;
74+
filtersContext.outputWidth = expectedOutputWidth;
75+
filtersContext.outputHeight = expectedOutputHeight;
76+
filtersContext.outputFormat = AV_PIX_FMT_RGB24;
77+
filtersContext.timeBase = timeBase;
78+
79+
std::stringstream filters;
80+
filters << "scale=" << expectedOutputWidth << ":" << expectedOutputHeight;
81+
filters << ":sws_flags=bilinear";
82+
83+
filtersContext.filters = filters.str();
7584

7685
// By default, we want to use swscale for color conversion because it is
7786
// faster. However, it has width requirements, so we may need to fall back
@@ -95,9 +104,9 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
95104
outputTensor = preAllocatedOutputTensor.value_or(allocateEmptyHWCTensor(
96105
expectedOutputHeight, expectedOutputWidth, torch::kCPU));
97106

98-
if (!swsContext_ || prevFrameContext_ != frameContext) {
99-
createSwsContext(frameContext, avFrame->colorspace);
100-
prevFrameContext_ = frameContext;
107+
if (!swsContext_ || prevFiltersContext_ != filtersContext) {
108+
createSwsContext(filtersContext, avFrame->colorspace);
109+
prevFiltersContext_ = std::move(filtersContext);
101110
}
102111
int resultHeight =
103112
convertAVFrameToTensorUsingSwsScale(avFrame, outputTensor);
@@ -113,10 +122,10 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
113122

114123
frameOutput.data = outputTensor;
115124
} else if (colorConversionLibrary == ColorConversionLibrary::FILTERGRAPH) {
116-
if (!filterGraphContext_ || prevFrameContext_ != frameContext) {
117-
filterGraphContext_ = std::make_unique<FilterGraph>(
118-
frameContext, videoStreamOptions, timeBase);
119-
prevFrameContext_ = frameContext;
125+
if (!filterGraphContext_ || prevFiltersContext_ != filtersContext) {
126+
filterGraphContext_ =
127+
std::make_unique<FilterGraph>(filtersContext, videoStreamOptions);
128+
prevFiltersContext_ = std::move(filtersContext);
120129
}
121130
outputTensor = convertAVFrameToTensorUsingFilterGraph(avFrame);
122131

@@ -187,15 +196,15 @@ torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph(
187196
}
188197

189198
void CpuDeviceInterface::createSwsContext(
190-
const DecodedFrameContext& frameContext,
199+
const FiltersContext& filtersContext,
191200
const enum AVColorSpace colorspace) {
192201
SwsContext* swsContext = sws_getContext(
193-
frameContext.decodedWidth,
194-
frameContext.decodedHeight,
195-
frameContext.decodedFormat,
196-
frameContext.expectedWidth,
197-
frameContext.expectedHeight,
198-
AV_PIX_FMT_RGB24,
202+
filtersContext.inputWidth,
203+
filtersContext.inputHeight,
204+
filtersContext.inputFormat,
205+
filtersContext.outputWidth,
206+
filtersContext.outputHeight,
207+
filtersContext.outputFormat,
199208
SWS_BILINEAR,
200209
nullptr,
201210
nullptr,

src/torchcodec/_core/CpuDeviceInterface.h

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,22 +43,17 @@ class CpuDeviceInterface : public DeviceInterface {
4343
const UniqueAVFrame& avFrame);
4444

4545
void createSwsContext(
46-
const DecodedFrameContext& frameContext,
46+
const FiltersContext& filtersContext,
4747
const enum AVColorSpace colorspace);
4848

49-
void createFilterGraph(
50-
const DecodedFrameContext& frameContext,
51-
const VideoStreamOptions& videoStreamOptions,
52-
const AVRational& timeBase);
53-
5449
// color-conversion fields. Only one of FilterGraphContext and
5550
// UniqueSwsContext should be non-null.
5651
std::unique_ptr<FilterGraph> filterGraphContext_;
5752
UniqueSwsContext swsContext_;
5853

5954
// Used to know whether a new FilterGraphContext or UniqueSwsContext should
6055
// be created before decoding a new frame.
61-
DecodedFrameContext prevFrameContext_;
56+
FiltersContext prevFiltersContext_;
6257
};
6358

6459
} // namespace facebook::torchcodec

src/torchcodec/_core/FilterGraph.cpp

Lines changed: 49 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -13,22 +13,26 @@ extern "C" {
1313

1414
namespace facebook::torchcodec {
1515

16-
bool DecodedFrameContext::operator==(const DecodedFrameContext& other) {
17-
return decodedWidth == other.decodedWidth &&
18-
decodedHeight == other.decodedHeight &&
19-
decodedFormat == other.decodedFormat &&
20-
expectedWidth == other.expectedWidth &&
21-
expectedHeight == other.expectedHeight;
16+
bool operator==(const AVRational& lhs, const AVRational& rhs) {
17+
return lhs.num == rhs.num && lhs.den == rhs.den;
2218
}
2319

24-
bool DecodedFrameContext::operator!=(const DecodedFrameContext& other) {
20+
bool FiltersContext::operator==(const FiltersContext& other) {
21+
return inputWidth == other.inputWidth && inputHeight == other.inputHeight &&
22+
inputFormat == other.inputFormat && outputWidth == other.outputWidth &&
23+
outputHeight == other.outputHeight &&
24+
outputFormat == other.outputFormat && filters == other.filters &&
25+
timeBase == other.timeBase &&
26+
hwFramesCtx.get() == other.hwFramesCtx.get();
27+
}
28+
29+
bool FiltersContext::operator!=(const FiltersContext& other) {
2530
return !(*this == other);
2631
}
2732

2833
FilterGraph::FilterGraph(
29-
const DecodedFrameContext& frameContext,
30-
const VideoStreamOptions& videoStreamOptions,
31-
const AVRational& timeBase) {
34+
const FiltersContext& filtersContext,
35+
const VideoStreamOptions& videoStreamOptions) {
3236
filterGraph_.reset(avfilter_graph_alloc());
3337
TORCH_CHECK(filterGraph_.get() != nullptr);
3438

@@ -39,26 +43,40 @@ FilterGraph::FilterGraph(
3943
const AVFilter* buffersrc = avfilter_get_by_name("buffer");
4044
const AVFilter* buffersink = avfilter_get_by_name("buffersink");
4145

42-
std::stringstream filterArgs;
43-
filterArgs << "video_size=" << frameContext.decodedWidth << "x"
44-
<< frameContext.decodedHeight;
45-
filterArgs << ":pix_fmt=" << frameContext.decodedFormat;
46-
filterArgs << ":time_base=" << timeBase.num << "/" << timeBase.den;
47-
filterArgs << ":pixel_aspect=" << frameContext.decodedAspectRatio.num << "/"
48-
<< frameContext.decodedAspectRatio.den;
49-
50-
int status = avfilter_graph_create_filter(
51-
&sourceContext_,
52-
buffersrc,
53-
"in",
54-
filterArgs.str().c_str(),
55-
nullptr,
56-
filterGraph_.get());
46+
auto deleter = [](AVBufferSrcParameters* p) {
47+
if (p) {
48+
av_freep(&p);
49+
}
50+
};
51+
std::unique_ptr<AVBufferSrcParameters, decltype(deleter)> srcParams(
52+
nullptr, deleter);
53+
54+
srcParams.reset(av_buffersrc_parameters_alloc());
55+
TORCH_CHECK(srcParams, "Failed to allocate buffersrc params");
56+
57+
srcParams->format = filtersContext.inputFormat;
58+
srcParams->width = filtersContext.inputWidth;
59+
srcParams->height = filtersContext.inputHeight;
60+
srcParams->sample_aspect_ratio = filtersContext.inputAspectRatio;
61+
srcParams->time_base = filtersContext.timeBase;
62+
if (filtersContext.hwFramesCtx) {
63+
srcParams->hw_frames_ctx = av_buffer_ref(filtersContext.hwFramesCtx.get());
64+
}
65+
66+
sourceContext_ =
67+
avfilter_graph_alloc_filter(filterGraph_.get(), buffersrc, "in");
68+
TORCH_CHECK(sourceContext_, "Failed to allocate filter graph");
69+
70+
int status = av_buffersrc_parameters_set(sourceContext_, srcParams.get());
5771
TORCH_CHECK(
5872
status >= 0,
5973
"Failed to create filter graph: ",
60-
filterArgs.str(),
61-
": ",
74+
getFFMPEGErrorStringFromErrorCode(status));
75+
76+
status = avfilter_init_str(sourceContext_, nullptr);
77+
TORCH_CHECK(
78+
status >= 0,
79+
"Failed to create filter graph : ",
6280
getFFMPEGErrorStringFromErrorCode(status));
6381

6482
status = avfilter_graph_create_filter(
@@ -68,7 +86,8 @@ FilterGraph::FilterGraph(
6886
"Failed to create filter graph: ",
6987
getFFMPEGErrorStringFromErrorCode(status));
7088

71-
enum AVPixelFormat pix_fmts[] = {AV_PIX_FMT_RGB24, AV_PIX_FMT_NONE};
89+
enum AVPixelFormat pix_fmts[] = {
90+
filtersContext.outputFormat, AV_PIX_FMT_NONE};
7291

7392
status = av_opt_set_int_list(
7493
sinkContext_,
@@ -93,16 +112,11 @@ FilterGraph::FilterGraph(
93112
inputs->pad_idx = 0;
94113
inputs->next = nullptr;
95114

96-
std::stringstream description;
97-
description << "scale=" << frameContext.expectedWidth << ":"
98-
<< frameContext.expectedHeight;
99-
description << ":sws_flags=bilinear";
100-
101115
AVFilterInOut* outputsTmp = outputs.release();
102116
AVFilterInOut* inputsTmp = inputs.release();
103117
status = avfilter_graph_parse_ptr(
104118
filterGraph_.get(),
105-
description.str().c_str(),
119+
filtersContext.filters.c_str(),
106120
&inputsTmp,
107121
&outputsTmp,
108122
nullptr);
@@ -128,8 +142,7 @@ UniqueAVFrame FilterGraph::convert(const UniqueAVFrame& avFrame) {
128142
UniqueAVFrame filteredAVFrame(av_frame_alloc());
129143
status = av_buffersink_get_frame(sinkContext_, filteredAVFrame.get());
130144
TORCH_CHECK(
131-
status >= AVSUCCESS, "Failed to fet frame from buffer sink context");
132-
TORCH_CHECK_EQ(filteredAVFrame->format, AV_PIX_FMT_RGB24);
145+
status >= AVSUCCESS, "Failed to get frame from buffer sink context");
133146

134147
return filteredAVFrame;
135148
}

src/torchcodec/_core/FilterGraph.h

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,24 +11,28 @@
1111

1212
namespace facebook::torchcodec {
1313

14-
struct DecodedFrameContext {
15-
int decodedWidth;
16-
int decodedHeight;
17-
AVPixelFormat decodedFormat;
18-
AVRational decodedAspectRatio;
19-
int expectedWidth;
20-
int expectedHeight;
21-
22-
bool operator==(const DecodedFrameContext&);
23-
bool operator!=(const DecodedFrameContext&);
14+
struct FiltersContext {
15+
int inputWidth = 0;
16+
int inputHeight = 0;
17+
AVPixelFormat inputFormat = AV_PIX_FMT_NONE;
18+
AVRational inputAspectRatio = {0, 0};
19+
int outputWidth = 0;
20+
int outputHeight = 0;
21+
AVPixelFormat outputFormat = AV_PIX_FMT_NONE;
22+
23+
std::string filters;
24+
AVRational timeBase = {0, 0};
25+
UniqueAVBufferRef hwFramesCtx;
26+
27+
bool operator==(const FiltersContext&);
28+
bool operator!=(const FiltersContext&);
2429
};
2530

2631
class FilterGraph {
2732
public:
2833
FilterGraph(
29-
const DecodedFrameContext& frameContext,
30-
const VideoStreamOptions& videoStreamOptions,
31-
const AVRational& timeBase);
34+
const FiltersContext& filtersContext,
35+
const VideoStreamOptions& videoStreamOptions);
3236

3337
UniqueAVFrame convert(const UniqueAVFrame& avFrame);
3438

0 commit comments

Comments (0)