|
6 | 6 | * LICENSE file in the root directory of this source tree. |
7 | 7 | */ |
8 | 8 |
|
9 | | -#include <libspdl/core/demuxing.h> |
10 | | -#include <libspdl/cuda/nvjpeg/decoding.h> |
| 9 | +#include <libspdl/cuda/buffer.h> |
| 10 | +#include <libspdl/cuda/types.h> |
| 11 | +#include "libspdl/cuda/detail/utils.h" |
11 | 12 |
|
12 | 13 | #include "libspdl/core/detail/logging.h" |
13 | 14 | #include "libspdl/core/detail/tracing.h" |
| 15 | +#include "libspdl/cuda/nvjpeg/detail/utils.h" |
14 | 16 |
|
15 | | -#include <fmt/core.h> |
| 17 | +#ifdef SPDL_USE_NPPI |
| 18 | +#include "libspdl/cuda/npp/detail/resize.h" |
| 19 | +#endif |
| 20 | + |
| 21 | +#include <fmt/format.h> |
| 22 | +#include <glog/logging.h> |
16 | 23 |
|
17 | 24 | namespace spdl::cuda { |
| 25 | +namespace { |
18 | 26 |
|
19 | | -#ifdef SPDL_USE_NVJPEG |
20 | | -namespace detail { |
21 | | -CUDABufferPtr decode_image_nvjpeg( |
22 | | - const std::string_view& data, |
23 | | - const CUDAConfig& cuda_config, |
24 | | - int scale_width, |
25 | | - int scale_height, |
26 | | - const std::string& pix_fmt); |
27 | | -CUDABufferPtr decode_image_nvjpeg( |
28 | | - const std::vector<std::string_view>& data, |
| 27 | +std::tuple<size_t, bool> get_shape(nvjpegOutputFormat_t out_fmt) { |
| 28 | + switch (out_fmt) { |
| 29 | + // TODO: Support NVJPEG_OUTPUT_YUV? |
| 30 | + case NVJPEG_OUTPUT_RGB: |
| 31 | + [[fallthrough]]; |
| 32 | + case NVJPEG_OUTPUT_BGR: |
| 33 | + return {3, false}; |
| 34 | + case NVJPEG_OUTPUT_RGBI: |
| 35 | + [[fallthrough]]; |
| 36 | + case NVJPEG_OUTPUT_BGRI: |
| 37 | + return {3, true}; |
| 38 | + case NVJPEG_OUTPUT_Y: |
| 39 | + return {1, false}; |
| 40 | + default: |
| 41 | + // It should be already handled by `get_nvjpeg_output_format` |
| 42 | + SPDL_FAIL_INTERNAL(fmt::format( |
| 43 | + "Unexpected output format: {}", detail::to_string(out_fmt))); |
| 44 | + } |
| 45 | +} |
| 46 | + |
| 47 | +struct SizeMeta { // Dimensions and channel layout describing one image stored in a CUDA buffer.
| 48 | + size_t width; // Image width.
| 49 | + size_t height; // Image height.
| 50 | + size_t num_channels; // Channel count as reported by `get_shape`.
| 51 | + bool interleaved; // Interleaved flag as reported by `get_shape`.
| 52 | +};
| 53 | + |
| 54 | +std::tuple<CUDABufferPtr, SizeMeta> get_output( |
| 55 | + nvjpegOutputFormat_t out_fmt, |
| 56 | + size_t height, |
| 57 | + size_t width, |
29 | 58 | const CUDAConfig& cuda_config, |
30 | | - int scale_width, |
31 | | - int scale_height, |
32 | | - const std::string& pix_fmt); |
33 | | -} // namespace detail |
34 | | -#endif |
| 59 | + std::optional<size_t> batch_size = std::nullopt) { |
| 60 | + auto [num_channels, interleaved] = get_shape(out_fmt); |
| 61 | + |
| 62 | + auto buffer = [&](const size_t ch, bool interleaved_2) { |
| 63 | + return batch_size |
| 64 | + ? (interleaved_2 |
| 65 | + ? cuda_buffer({*batch_size, height, width, ch}, cuda_config) |
| 66 | + : cuda_buffer({*batch_size, ch, height, width}, cuda_config)) |
| 67 | + : (interleaved_2 ? cuda_buffer({height, width, ch}, cuda_config) |
| 68 | + : cuda_buffer({ch, height, width}, cuda_config)); |
| 69 | + }(num_channels, interleaved); |
| 70 | + |
| 71 | + return { |
| 72 | + std::move(buffer), |
| 73 | + SizeMeta{ |
| 74 | + .width = width, |
| 75 | + .height = height, |
| 76 | + .num_channels = num_channels, |
| 77 | + .interleaved = interleaved}}; |
| 78 | +} |
| 79 | + |
| 80 | +void wrap_buffer( |
| 81 | + CUDABufferPtr& buffer, |
| 82 | + SizeMeta meta, |
| 83 | + nvjpegImage_t& image, |
| 84 | + size_t batch = 0) { |
| 85 | + auto ptr = static_cast<uint8_t*>(buffer->data()); |
| 86 | + ptr += batch * meta.height * meta.width * meta.num_channels; |
| 87 | + auto pitch = meta.interleaved ? meta.width * meta.num_channels : meta.width; |
| 88 | + for (int c = 0; c < meta.num_channels; c++) { |
| 89 | + image.channel[c] = ptr; |
| 90 | + image.pitch[c] = pitch; |
| 91 | + ptr += pitch * meta.height; |
| 92 | + } |
| 93 | +} |
| 94 | + |
| 95 | +std::tuple<CUDABufferPtr, SizeMeta, nvjpegImage_t> decode( |
| 96 | + std::string_view data, |
| 97 | + nvjpegOutputFormat_t fmt, |
| 98 | + const CUDAConfig& cuda_config) { |
| 99 | + auto nvjpeg = detail::get_nvjpeg(); |
| 100 | + |
| 101 | + // Note: Creation/destruction of nvjpegJpegState_t is thread-safe, however, |
| 102 | + // looking at the trace, it appears that they have internal locking mechanism |
| 103 | + // which make these operations as slow as several hudreds milliseconds in |
| 104 | + // multithread situation. So we use thread local. |
| 105 | + thread_local auto jpeg_state = detail::get_nvjpeg_jpeg_state(nvjpeg); |
| 106 | + |
| 107 | + int num_components; |
| 108 | + nvjpegChromaSubsampling_t subsampling; |
| 109 | + thread_local int widths[NVJPEG_MAX_COMPONENT]; |
| 110 | + thread_local int heights[NVJPEG_MAX_COMPONENT]; |
| 111 | + { |
| 112 | + TRACE_EVENT("decoding", "nvjpegGetImageInfo"); |
| 113 | + CHECK_NVJPEG( |
| 114 | + nvjpegGetImageInfo( |
| 115 | + nvjpeg, |
| 116 | + (const unsigned char*)data.data(), |
| 117 | + data.size(), |
| 118 | + &num_components, |
| 119 | + &subsampling, |
| 120 | + widths, |
| 121 | + heights), |
| 122 | + "Failed to fetch image information."); |
| 123 | + } |
| 124 | + |
| 125 | + auto [buffer, meta] = get_output(fmt, heights[0], widths[0], cuda_config); |
| 126 | + nvjpegImage_t image; |
| 127 | + wrap_buffer(buffer, meta, image); |
| 128 | + |
| 129 | + // Note: backend is not used by NVJPEG API when using nvjpegDecode(). |
| 130 | + // |
| 131 | + // https://docs.nvidia.com/cuda/nvjpeg/index.html#decode-apisingle-phase |
| 132 | + // >> From CUDA 11 onwards, nvjpegDecode() picks the best available back-end |
| 133 | + // >> for a given image, user no longer has control on this. If there is a |
| 134 | + // >> need to select the back-end, then consider using nvjpegDecodeJpeg. |
| 135 | + // >> This is a new API added in CUDA 11 which allows user to control the |
| 136 | + // >> back-end. |
| 137 | + { |
| 138 | + TRACE_EVENT("decoding", "nvjpegDecode"); |
| 139 | + CHECK_NVJPEG( |
| 140 | + nvjpegDecode( |
| 141 | + nvjpeg, |
| 142 | + jpeg_state.get(), |
| 143 | + (const unsigned char*)data.data(), |
| 144 | + data.size(), |
| 145 | + fmt, |
| 146 | + &image, |
| 147 | + (CUstream_st*)cuda_config.stream), |
| 148 | + "Failed to decode an image."); |
| 149 | + } |
| 150 | + return {std::move(buffer), meta, image}; |
| 151 | +} |
| 152 | + |
| 153 | +} // namespace |
35 | 154 |
|
36 | 155 | CUDABufferPtr decode_image_nvjpeg( |
37 | 156 | const std::string_view& data, |
38 | 157 | const CUDAConfig& cuda_config, |
39 | 158 | int scale_width, |
40 | 159 | int scale_height, |
41 | 160 | const std::string& pix_fmt) { |
42 | | -#ifndef SPDL_USE_NVJPEG |
43 | | - SPDL_FAIL("SPDL is not compiled with NVJPEG support."); |
| 161 | + auto fmt = detail::get_nvjpeg_output_format(pix_fmt); |
| 162 | + |
| 163 | + detail::set_cuda_primary_context(cuda_config.device_index); |
| 164 | + |
| 165 | + auto [buffer, src_meta, decoded] = decode(data, fmt, cuda_config); |
| 166 | + |
| 167 | + if (scale_width > 0 && scale_height > 0) { |
| 168 | +#ifndef SPDL_USE_NPPI |
| 169 | + SPDL_FAIL( |
| 170 | + "Image resizing while decoding with NVJPEG reqreuires SPDL to be compiled with NPPI support."); |
44 | 171 | #else |
45 | | - return detail::decode_image_nvjpeg( |
46 | | - data, cuda_config, scale_width, scale_height, pix_fmt); |
| 172 | + auto [buffer2, meta2] = |
| 173 | + get_output(fmt, scale_height, scale_width, cuda_config); |
| 174 | + nvjpegImage_t resized; |
| 175 | + wrap_buffer(buffer2, meta2, resized); |
| 176 | + |
| 177 | + detail::resize_npp( |
| 178 | + fmt, |
| 179 | + decoded, |
| 180 | + src_meta.width, |
| 181 | + src_meta.height, |
| 182 | + resized, |
| 183 | + scale_width, |
| 184 | + scale_height); |
| 185 | + |
| 186 | + return std::move(buffer2); |
47 | 187 | #endif |
| 188 | + } |
| 189 | + |
| 190 | + return std::move(buffer); |
48 | 191 | } |
49 | 192 |
|
50 | 193 | CUDABufferPtr decode_image_nvjpeg( |
51 | | - const std::vector<std::string_view>& data, |
| 194 | + const std::vector<std::string_view>& dataset, |
52 | 195 | const CUDAConfig& cuda_config, |
53 | 196 | int scale_width, |
54 | 197 | int scale_height, |
55 | 198 | const std::string& pix_fmt) { |
56 | | -#ifndef SPDL_USE_NVJPEG |
57 | | - SPDL_FAIL("SPDL is not compiled with NVJPEG support."); |
| 199 | +#ifndef SPDL_USE_NPPI |
| 200 | + SPDL_FAIL( |
| 201 | + "Image resizing while decoding with NVJPEG reqreuires SPDL to be compiled with NPPI support."); |
58 | 202 | #else |
59 | | - return detail::decode_image_nvjpeg( |
60 | | - data, cuda_config, scale_width, scale_height, pix_fmt); |
| 203 | + auto batch_size = dataset.size(); |
| 204 | + if (batch_size == 0) { |
| 205 | + SPDL_FAIL("No input is provided."); |
| 206 | + } |
| 207 | + if (scale_width <= 0 && scale_height <= 0) { |
| 208 | + SPDL_FAIL("Both `scale_width` and `scale_height` must be specified."); |
| 209 | + } |
| 210 | + |
| 211 | + auto fmt = detail::get_nvjpeg_output_format(pix_fmt); |
| 212 | + |
| 213 | + detail::set_cuda_primary_context(cuda_config.device_index); |
| 214 | + |
| 215 | + auto [out_buffer, out_meta] = |
| 216 | + get_output(fmt, scale_height, scale_width, cuda_config, batch_size); |
| 217 | + nvjpegImage_t out_wrapper; |
| 218 | + |
| 219 | + for (size_t i = 0; i < batch_size; ++i) { |
| 220 | + auto [src_buffer, src_meta, decoded] = decode(dataset[i], fmt, cuda_config); |
| 221 | + |
| 222 | + wrap_buffer(out_buffer, out_meta, out_wrapper, i); |
| 223 | + detail::resize_npp( |
| 224 | + fmt, |
| 225 | + decoded, |
| 226 | + src_meta.width, |
| 227 | + src_meta.height, |
| 228 | + out_wrapper, |
| 229 | + scale_width, |
| 230 | + scale_height); |
| 231 | + } |
| 232 | + return std::move(out_buffer); |
61 | 233 | #endif |
62 | 234 | } |
63 | 235 |
|
|
0 commit comments