Skip to content

Commit 904b99a

Browse files
author
pytorchbot
committed
2025-12-09 nightly release (64373bf)
1 parent ebec3d5 commit 904b99a

File tree

11 files changed

+332
-12
lines changed

11 files changed

+332
-12
lines changed

src/torchcodec/_core/CudaDeviceInterface.cpp

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include "Cache.h"
77
#include "CudaDeviceInterface.h"
88
#include "FFMPEGCommon.h"
9+
#include "ValidationUtils.h"
910

1011
extern "C" {
1112
#include <libavutil/hwcontext_cuda.h>
@@ -362,4 +363,123 @@ std::string CudaDeviceInterface::getDetails() {
362363
(usingCPUFallback_ ? "CPU fallback." : "NVDEC.");
363364
}
364365

366+
// --------------------------------------------------------------------------
367+
// Below are methods exclusive to video encoding:
368+
// --------------------------------------------------------------------------
369+
namespace {
370+
// RGB to NV12 color conversion matrix for BT.601 limited range.
371+
// NPP ColorTwist function used below expects the limited range
372+
// color conversion matrix, and this matches FFmpeg's default behavior.
373+
const Npp32f defaultLimitedRangeRgbToNv12[3][4] = {
374+
// Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B)
375+
{0.257f, 0.504f, 0.098f, 16.0f},
376+
// U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients)
377+
{-0.148f, -0.291f, 0.439f, 128.0f},
378+
// V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients)
379+
{0.439f, -0.368f, -0.071f, 128.0f}};
380+
} // namespace
381+
382+
// Converts one RGB frame living on a CUDA device into an NV12 hardware
// AVFrame (format AV_PIX_FMT_CUDA) suitable for a CUDA-backed encoder.
//
// Args:
//   tensor: 3D uint8 tensor in CHW layout with exactly 3 channels, on a CUDA
//     device.
//   frameIndex: index of the frame; it is used as the frame's pts.
//   codecContext: encoder context whose hw_frames_ctx has already been set
//     up (see setupHardwareFrameContextForEncoding()). The frame's GPU
//     memory is allocated from that frames context.
//
// Returns the newly allocated frame, tagged as BT.601 / limited range.
//
// Raises (through TORCH_CHECK) when the input is malformed, when the frame
// cannot be allocated, or when the NPP color conversion fails.
UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
    const torch::Tensor& tensor,
    int frameIndex,
    AVCodecContext* codecContext) {
  TORCH_CHECK(
      tensor.dim() == 3 && tensor.size(0) == 3,
      "Expected 3D RGB tensor (CHW format), got shape: ",
      tensor.sizes());
  TORCH_CHECK(
      tensor.device().type() == torch::kCUDA,
      "Expected tensor on CUDA device, got: ",
      tensor.device().str());
  // The data pointer is reinterpreted as Npp8u below, so any other dtype
  // would be silently misread.
  TORCH_CHECK(
      tensor.scalar_type() == torch::kUInt8,
      "Expected uint8 tensor, got: ",
      tensor.scalar_type());
  TORCH_CHECK(codecContext != nullptr, "codecContext is null");
  TORCH_CHECK(
      codecContext->hw_frames_ctx != nullptr,
      "Hardware frames context has not been set up on the codec context");

  int height = static_cast<int>(tensor.size(1));
  int width = static_cast<int>(tensor.size(2));

  // The destination buffer comes from the frames context's pool, so its size
  // is dictated by the frames context and not by the tensor. Refuse
  // mismatching sizes rather than letting NPP write with a wrong ROI.
  const AVHWFramesContext* hwFramesCtx = reinterpret_cast<AVHWFramesContext*>(
      codecContext->hw_frames_ctx->data);
  TORCH_CHECK(
      hwFramesCtx->width == width && hwFramesCtx->height == height,
      "Frame dimensions (",
      height,
      "x",
      width,
      ") do not match the hardware frames context dimensions (",
      hwFramesCtx->height,
      "x",
      hwFramesCtx->width,
      ")");

  UniqueAVFrame avFrame(av_frame_alloc());
  TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");

  // TODO-VideoEncoder: Unify AVFrame creation with CPU version of this method
  avFrame->format = AV_PIX_FMT_CUDA;
  avFrame->height = height;
  avFrame->width = width;
  avFrame->pts = frameIndex;

  // FFmpeg's av_hwframe_get_buffer is used to allocate memory on CUDA device.
  // TODO-VideoEncoder: Consider using pytorch to allocate CUDA memory for
  // efficiency
  int ret =
      av_hwframe_get_buffer(codecContext->hw_frames_ctx, avFrame.get(), 0);
  TORCH_CHECK(
      ret >= 0,
      "Failed to allocate hardware frame: ",
      getFFMPEGErrorStringFromErrorCode(ret));

  TORCH_CHECK(
      avFrame->data[0] != nullptr,
      "avFrame must be pre-allocated with CUDA memory");

  // The NPP call below reads the three channels of a pixel from one source
  // buffer with a single row step, so the CHW input is rearranged into a
  // contiguous HWC tensor first.
  // TODO VideoEncoder: Investigate ways to avoid this copy
  torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();

  NppiSize oSizeROI = {width, height};
  NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
      static_cast<const Npp8u*>(hwcFrame.data_ptr()),
      // Source row step in bytes.
      validateInt64ToInt(
          hwcFrame.stride(0) * hwcFrame.element_size(), "nSrcStep"),
      avFrame->data,
      avFrame->linesize,
      oSizeROI,
      defaultLimitedRangeRgbToNv12,
      *nppCtx_);

  TORCH_CHECK(
      status == NPP_SUCCESS,
      "Failed to convert RGB to NV12: NPP error code ",
      status);

  // TODO-VideoEncoder: Enable configuration of color properties, similar to
  // FFmpeg. Below are the default color properties used by FFmpeg.
  avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
  avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range

  return avFrame;
}
446+
447+
// Allocates and initializes AVHWFramesContext, and sets pixel format fields
448+
// to enable encoding with CUDA device. The hw_frames_ctx field is needed by
449+
// FFmpeg to allocate frames on GPU's memory.
450+
void CudaDeviceInterface::setupHardwareFrameContextForEncoding(
451+
AVCodecContext* codecContext) {
452+
TORCH_CHECK(codecContext != nullptr, "codecContext is null");
453+
TORCH_CHECK(
454+
hardwareDeviceCtx_, "Hardware device context has not been initialized");
455+
456+
AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get());
457+
TORCH_CHECK(
458+
hwFramesCtxRef != nullptr,
459+
"Failed to allocate hardware frames context for codec");
460+
461+
// TODO-VideoEncoder: Enable user set pixel formats to be set
462+
// (outPixelFormat_) and handled with the appropriate NPP function
463+
codecContext->sw_pix_fmt = AV_PIX_FMT_NV12;
464+
// Always set pixel format to support CUDA encoding.
465+
codecContext->pix_fmt = AV_PIX_FMT_CUDA;
466+
467+
AVHWFramesContext* hwFramesCtx =
468+
reinterpret_cast<AVHWFramesContext*>(hwFramesCtxRef->data);
469+
hwFramesCtx->format = codecContext->pix_fmt;
470+
hwFramesCtx->sw_format = codecContext->sw_pix_fmt;
471+
hwFramesCtx->width = codecContext->width;
472+
hwFramesCtx->height = codecContext->height;
473+
474+
int ret = av_hwframe_ctx_init(hwFramesCtxRef);
475+
if (ret < 0) {
476+
av_buffer_unref(&hwFramesCtxRef);
477+
TORCH_CHECK(
478+
false,
479+
"Failed to initialize CUDA frames context for codec: ",
480+
getFFMPEGErrorStringFromErrorCode(ret));
481+
}
482+
codecContext->hw_frames_ctx = hwFramesCtxRef;
483+
}
484+
365485
} // namespace facebook::torchcodec

src/torchcodec/_core/CudaDeviceInterface.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,14 @@ class CudaDeviceInterface : public DeviceInterface {
4141

4242
std::string getDetails() override;
4343

44+
UniqueAVFrame convertCUDATensorToAVFrameForEncoding(
45+
const torch::Tensor& tensor,
46+
int frameIndex,
47+
AVCodecContext* codecContext) override;
48+
49+
void setupHardwareFrameContextForEncoding(
50+
AVCodecContext* codecContext) override;
51+
4452
private:
4553
// Our CUDA decoding code assumes NV12 format. In order to handle other
4654
// kinds of input, we need to convert them to NV12. Our current implementation

src/torchcodec/_core/DeviceInterface.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,24 @@ class DeviceInterface {
138138
return "";
139139
}
140140

141+
// Function used for video encoding, only implemented in CudaDeviceInterface.
142+
// It is here to isolate CUDA dependencies from CPU builds
143+
// TODO Video-Encoder: Reconsider using video encoding functions in device
144+
// interface
145+
virtual UniqueAVFrame convertCUDATensorToAVFrameForEncoding(
146+
[[maybe_unused]] const torch::Tensor& tensor,
147+
[[maybe_unused]] int frameIndex,
148+
[[maybe_unused]] AVCodecContext* codecContext) {
149+
TORCH_CHECK(false);
150+
}
151+
152+
// Function used for video encoding, only implemented in CudaDeviceInterface.
153+
// It is here to isolate CUDA dependencies from CPU builds
154+
virtual void setupHardwareFrameContextForEncoding(
155+
[[maybe_unused]] AVCodecContext* codecContext) {
156+
TORCH_CHECK(false);
157+
}
158+
141159
protected:
142160
torch::Device device_;
143161
SharedAVCodecContext codecContext_;

src/torchcodec/_core/Encoder.cpp

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "torch/types.h"
66

77
extern "C" {
8+
#include <libavutil/hwcontext.h>
89
#include <libavutil/opt.h>
910
#include <libavutil/pixdesc.h>
1011
}
@@ -724,6 +725,11 @@ VideoEncoder::VideoEncoder(
724725

725726
void VideoEncoder::initializeEncoder(
726727
const VideoStreamOptions& videoStreamOptions) {
728+
// Only create device interface when frames are on a CUDA device.
729+
// Encoding on CPU is implemented in this file.
730+
if (frames_.device().is_cuda()) {
731+
deviceInterface_ = createDeviceInterface(frames_.device());
732+
}
727733
const AVCodec* avCodec = nullptr;
728734
// If codec arg is provided, find codec using logic similar to FFmpeg:
729735
// https://github.com/FFmpeg/FFmpeg/blob/master/fftools/ffmpeg_opt.c#L804-L835
@@ -769,6 +775,12 @@ void VideoEncoder::initializeEncoder(
769775
outHeight_ = inHeight_;
770776

771777
if (videoStreamOptions.pixelFormat.has_value()) {
778+
if (frames_.device().is_cuda()) {
779+
TORCH_CHECK(
780+
false,
781+
"GPU Video encoding currently only supports the NV12 pixel format. "
782+
"Do not set pixel_format to use NV12.");
783+
}
772784
outPixelFormat_ =
773785
validatePixelFormat(*avCodec, videoStreamOptions.pixelFormat.value());
774786
} else {
@@ -820,6 +832,14 @@ void VideoEncoder::initializeEncoder(
820832
videoStreamOptions.preset.value().c_str(),
821833
0);
822834
}
835+
836+
// When frames are on a CUDA device, deviceInterface_ will be defined.
837+
if (frames_.device().is_cuda() && deviceInterface_) {
838+
deviceInterface_->registerHardwareDeviceWithCodec(avCodecContext_.get());
839+
deviceInterface_->setupHardwareFrameContextForEncoding(
840+
avCodecContext_.get());
841+
}
842+
823843
int status = avcodec_open2(avCodecContext_.get(), avCodec, &avCodecOptions);
824844
av_dict_free(&avCodecOptions);
825845

@@ -860,7 +880,20 @@ void VideoEncoder::encode() {
860880
int numFrames = static_cast<int>(frames_.sizes()[0]);
861881
for (int i = 0; i < numFrames; ++i) {
862882
torch::Tensor currFrame = frames_[i];
863-
UniqueAVFrame avFrame = convertTensorToAVFrame(currFrame, i);
883+
UniqueAVFrame avFrame;
884+
if (frames_.device().is_cuda() && deviceInterface_) {
885+
auto cudaFrame = deviceInterface_->convertCUDATensorToAVFrameForEncoding(
886+
currFrame, i, avCodecContext_.get());
887+
TORCH_CHECK(
888+
cudaFrame != nullptr,
889+
"convertCUDATensorToAVFrameForEncoding failed for frame ",
890+
i,
891+
" on device: ",
892+
frames_.device());
893+
avFrame = std::move(cudaFrame);
894+
} else {
895+
avFrame = convertTensorToAVFrame(currFrame, i);
896+
}
864897
encodeFrame(autoAVPacket, avFrame);
865898
}
866899

src/torchcodec/_core/Encoder.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <map>
44
#include <string>
55
#include "AVIOContextHolder.h"
6+
#include "DeviceInterface.h"
67
#include "FFMPEGCommon.h"
78
#include "StreamOptions.h"
89

@@ -183,6 +184,7 @@ class VideoEncoder {
183184
AVPixelFormat outPixelFormat_ = AV_PIX_FMT_NONE;
184185

185186
std::unique_ptr<AVIOContextHolder> avioContextHolder_;
187+
std::unique_ptr<DeviceInterface> deviceInterface_;
186188

187189
bool encodeWasCalled_ = false;
188190
AVDictionary* avFormatOptions_ = nullptr;

src/torchcodec/_core/StreamOptions.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ struct VideoStreamOptions {
4141
ColorConversionLibrary::FILTERGRAPH;
4242

4343
// By default we use CPU for decoding for both C++ and python users.
44+
// Note: This is not used for video encoding, because device is determined by
45+
// the device of the input frame tensor.
4446
torch::Device device = torch::kCPU;
4547
// Device variant (e.g., "ffmpeg", "beta", etc.)
4648
std::string_view deviceVariant = "ffmpeg";

src/torchcodec/_core/Transform.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ class ResizeTransform : public Transform {
4343
public:
4444
enum class InterpolationMode { BILINEAR };
4545

46-
ResizeTransform(const FrameDims& dims)
46+
explicit ResizeTransform(const FrameDims& dims)
4747
: outputDims_(dims), interpolationMode_(InterpolationMode::BILINEAR) {}
4848

4949
ResizeTransform(const FrameDims& dims, InterpolationMode interpolationMode)
@@ -62,7 +62,7 @@ class CropTransform : public Transform {
6262
CropTransform(const FrameDims& dims, int x, int y);
6363

6464
// Becomes a center crop if x and y are not specified.
65-
CropTransform(const FrameDims& dims);
65+
explicit CropTransform(const FrameDims& dims);
6666

6767
std::string getFilterGraphCpu() const override;
6868
std::optional<FrameDims> getOutputFrameDims() const override;

src/torchcodec/_core/custom_ops.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1051,6 +1051,9 @@ TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
10511051
m.impl("_create_from_file_like", &_create_from_file_like);
10521052
m.impl(
10531053
"_get_json_ffmpeg_library_versions", &_get_json_ffmpeg_library_versions);
1054+
m.impl("encode_video_to_file", &encode_video_to_file);
1055+
m.impl("encode_video_to_tensor", &encode_video_to_tensor);
1056+
m.impl("_encode_video_to_file_like", &_encode_video_to_file_like);
10541057
}
10551058

10561059
TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {

src/torchcodec/_core/ops.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ def encode_video_to_file_like(
220220
"""Encode video frames to a file-like object.
221221
222222
Args:
223-
frames: Video frames tensor
223+
frames: Video frames tensor. The device of the frames tensor will be used for encoding.
224224
frame_rate: Frame rate in frames per second
225225
format: Video format (e.g., "mp4", "mov", "mkv")
226226
file_like: File-like object that supports write() and seek() methods

src/torchcodec/encoders/_video_encoder.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ class VideoEncoder:
1515
tensor of shape ``(N, C, H, W)`` where N is the number of frames,
1616
C is 3 channels (RGB), H is height, and W is width.
1717
Values must be uint8 in the range ``[0, 255]``.
18+
The device of the frames tensor will be used for encoding.
1819
frame_rate (float): The frame rate of the **input** ``frames``. Also defines the encoded **output** frame rate.
1920
"""
2021

0 commit comments

Comments
 (0)