|
6 | 6 | #include "Cache.h" |
7 | 7 | #include "CudaDeviceInterface.h" |
8 | 8 | #include "FFMPEGCommon.h" |
| 9 | +#include "ValidationUtils.h" |
9 | 10 |
|
10 | 11 | extern "C" { |
11 | 12 | #include <libavutil/hwcontext_cuda.h> |
@@ -362,4 +363,123 @@ std::string CudaDeviceInterface::getDetails() { |
362 | 363 | (usingCPUFallback_ ? "CPU fallback." : "NVDEC."); |
363 | 364 | } |
364 | 365 |
|
| 366 | +// -------------------------------------------------------------------------- |
| 367 | +// Below are methods exclusive to video encoding: |
| 368 | +// -------------------------------------------------------------------------- |
| 369 | +namespace { |
| 370 | +// RGB to NV12 color conversion matrix for BT.601 limited range. |
| 371 | +// NPP ColorTwist function used below expects the limited range |
| 372 | +// color conversion matrix, and this matches FFmpeg's default behavior. |
| 373 | +const Npp32f defaultLimitedRangeRgbToNv12[3][4] = { |
| 374 | + // Y = 16 + 0.859 * (0.299*R + 0.587*G + 0.114*B) |
| 375 | + {0.257f, 0.504f, 0.098f, 16.0f}, |
| 376 | + // U = -0.148*R - 0.291*G + 0.439*B + 128 (BT.601 coefficients) |
| 377 | + {-0.148f, -0.291f, 0.439f, 128.0f}, |
| 378 | + // V = 0.439*R - 0.368*G - 0.071*B + 128 (BT.601 coefficients) |
| 379 | + {0.439f, -0.368f, -0.071f, 128.0f}}; |
| 380 | +} // namespace |
| 381 | + |
| 382 | +UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding( |
| 383 | + const torch::Tensor& tensor, |
| 384 | + int frameIndex, |
| 385 | + AVCodecContext* codecContext) { |
| 386 | + TORCH_CHECK( |
| 387 | + tensor.dim() == 3 && tensor.size(0) == 3, |
| 388 | + "Expected 3D RGB tensor (CHW format), got shape: ", |
| 389 | + tensor.sizes()); |
| 390 | + TORCH_CHECK( |
| 391 | + tensor.device().type() == torch::kCUDA, |
| 392 | + "Expected tensor on CUDA device, got: ", |
| 393 | + tensor.device().str()); |
| 394 | + |
| 395 | + UniqueAVFrame avFrame(av_frame_alloc()); |
| 396 | + TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame"); |
| 397 | + int height = static_cast<int>(tensor.size(1)); |
| 398 | + int width = static_cast<int>(tensor.size(2)); |
| 399 | + |
| 400 | + // TODO-VideoEncoder: Unify AVFrame creation with CPU version of this method |
| 401 | + avFrame->format = AV_PIX_FMT_CUDA; |
| 402 | + avFrame->height = height; |
| 403 | + avFrame->width = width; |
| 404 | + avFrame->pts = frameIndex; |
| 405 | + |
| 406 | + // FFmpeg's av_hwframe_get_buffer is used to allocate memory on CUDA device. |
| 407 | + // TODO-VideoEncoder: Consider using pytorch to allocate CUDA memory for |
| 408 | + // efficiency |
| 409 | + int ret = |
| 410 | + av_hwframe_get_buffer(codecContext->hw_frames_ctx, avFrame.get(), 0); |
| 411 | + TORCH_CHECK( |
| 412 | + ret >= 0, |
| 413 | + "Failed to allocate hardware frame: ", |
| 414 | + getFFMPEGErrorStringFromErrorCode(ret)); |
| 415 | + |
| 416 | + TORCH_CHECK( |
| 417 | + avFrame != nullptr && avFrame->data[0] != nullptr, |
| 418 | + "avFrame must be pre-allocated with CUDA memory"); |
| 419 | + |
| 420 | + // TODO VideoEncoder: Investigate ways to avoid this copy |
| 421 | + torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous(); |
| 422 | + |
| 423 | + NppiSize oSizeROI = {width, height}; |
| 424 | + NppStatus status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx( |
| 425 | + static_cast<const Npp8u*>(hwcFrame.data_ptr()), |
| 426 | + validateInt64ToInt( |
| 427 | + hwcFrame.stride(0) * hwcFrame.element_size(), "nSrcStep"), |
| 428 | + avFrame->data, |
| 429 | + avFrame->linesize, |
| 430 | + oSizeROI, |
| 431 | + defaultLimitedRangeRgbToNv12, |
| 432 | + *nppCtx_); |
| 433 | + |
| 434 | + TORCH_CHECK( |
| 435 | + status == NPP_SUCCESS, |
| 436 | + "Failed to convert RGB to NV12: NPP error code ", |
| 437 | + status); |
| 438 | + |
| 439 | + // TODO-VideoEncoder: Enable configuration of color properties, similar to |
| 440 | + // FFmpeg. Below are the default color properties used by FFmpeg. |
| 441 | + avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601 |
| 442 | + avFrame->color_range = AVCOL_RANGE_MPEG; // Limited range |
| 443 | + |
| 444 | + return avFrame; |
| 445 | +} |
| 446 | + |
| 447 | +// Allocates and initializes AVHWFramesContext, and sets pixel format fields |
| 448 | +// to enable encoding with CUDA device. The hw_frames_ctx field is needed by |
| 449 | +// FFmpeg to allocate frames on GPU's memory. |
| 450 | +void CudaDeviceInterface::setupHardwareFrameContextForEncoding( |
| 451 | + AVCodecContext* codecContext) { |
| 452 | + TORCH_CHECK(codecContext != nullptr, "codecContext is null"); |
| 453 | + TORCH_CHECK( |
| 454 | + hardwareDeviceCtx_, "Hardware device context has not been initialized"); |
| 455 | + |
| 456 | + AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get()); |
| 457 | + TORCH_CHECK( |
| 458 | + hwFramesCtxRef != nullptr, |
| 459 | + "Failed to allocate hardware frames context for codec"); |
| 460 | + |
| 461 | + // TODO-VideoEncoder: Enable user set pixel formats to be set |
| 462 | + // (outPixelFormat_) and handled with the appropriate NPP function |
| 463 | + codecContext->sw_pix_fmt = AV_PIX_FMT_NV12; |
| 464 | + // Always set pixel format to support CUDA encoding. |
| 465 | + codecContext->pix_fmt = AV_PIX_FMT_CUDA; |
| 466 | + |
| 467 | + AVHWFramesContext* hwFramesCtx = |
| 468 | + reinterpret_cast<AVHWFramesContext*>(hwFramesCtxRef->data); |
| 469 | + hwFramesCtx->format = codecContext->pix_fmt; |
| 470 | + hwFramesCtx->sw_format = codecContext->sw_pix_fmt; |
| 471 | + hwFramesCtx->width = codecContext->width; |
| 472 | + hwFramesCtx->height = codecContext->height; |
| 473 | + |
| 474 | + int ret = av_hwframe_ctx_init(hwFramesCtxRef); |
| 475 | + if (ret < 0) { |
| 476 | + av_buffer_unref(&hwFramesCtxRef); |
| 477 | + TORCH_CHECK( |
| 478 | + false, |
| 479 | + "Failed to initialize CUDA frames context for codec: ", |
| 480 | + getFFMPEGErrorStringFromErrorCode(ret)); |
| 481 | + } |
| 482 | + codecContext->hw_frames_ctx = hwFramesCtxRef; |
| 483 | +} |
| 484 | + |
365 | 485 | } // namespace facebook::torchcodec |
0 commit comments