Skip to content

Commit b65e175

Browse files
authored
Clean up macros (#539)
1 parent 5556aa6 commit b65e175

File tree

15 files changed

+256
-351
lines changed

15 files changed

+256
-351
lines changed

src/libspdl/core/storage.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@
1212

1313
#include <cstddef>
1414
#include <cstdint>
15-
#include <functional>
16-
#include <optional>
1715

1816
namespace spdl::core {
1917
struct Storage {

src/libspdl/cuda/CMakeLists.txt

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,17 +27,12 @@ set(srcs
2727
buffer.cpp
2828
storage.cpp
2929
transfer.cpp
30+
color_conversion.cpp
31+
utils.cpp
32+
detail/color_conversion.cu
33+
detail/utils.cpp
3034
)
3135

32-
if (SPDL_USE_CUDA)
33-
list(APPEND srcs
34-
color_conversion.cpp
35-
utils.cpp
36-
detail/color_conversion.cu
37-
detail/utils.cpp
38-
)
39-
endif()
40-
4136
if (SPDL_USE_NVCODEC)
4237
list(APPEND srcs
4338
nvdec/decoder.cpp

src/libspdl/cuda/buffer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ inline size_t prod(const std::vector<size_t>& shape) {
4141
CUDABufferPtr cuda_buffer(
4242
const std::vector<size_t>& shape,
4343
const CUDAConfig& cfg,
44-
core::ElemClass elem_class,
44+
spdl::core::ElemClass elem_class,
4545
size_t depth) {
4646
return std::make_unique<CUDABuffer>(
4747
cfg.device_index,

src/libspdl/cuda/buffer.h

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ namespace spdl::cuda {
2323
///
2424
/// Contiguous array data on a CUDA device.
2525
struct CUDABuffer {
26-
#ifdef SPDL_USE_CUDA
2726
int device_index;
2827
CUDAStoragePtr storage;
2928

@@ -41,8 +40,6 @@ struct CUDABuffer {
4140
void* data() const;
4241

4342
uintptr_t get_cuda_stream() const;
44-
45-
#endif
4643
};
4744

4845
using CUDABufferPtr = std::unique_ptr<CUDABuffer>;
@@ -51,15 +48,12 @@ using CUDABufferPtr = std::unique_ptr<CUDABuffer>;
5148
// Factory functions
5249
////////////////////////////////////////////////////////////////////////////////
5350

54-
// TODO: Remove this conditional
55-
#ifdef SPDL_USE_CUDA
5651
///
5752
/// Create ``CUDABuffer``.
5853
CUDABufferPtr cuda_buffer(
5954
const std::vector<size_t>& shape,
6055
const CUDAConfig& cfg,
61-
core::ElemClass elem_class = core::ElemClass::UInt,
56+
spdl::core::ElemClass elem_class = spdl::core::ElemClass::UInt,
6257
size_t depth = sizeof(uint8_t));
63-
#endif
6458

6559
} // namespace spdl::cuda

src/libspdl/cuda/npp/detail/resize.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,6 @@ void resize_npp(
6868
nvjpegImage_t dst,
6969
int dst_width,
7070
int dst_height) {
71-
#ifndef SPDL_USE_NPPI
72-
SPDL_FAIL(
73-
"Image resizing while decoding with NVJPEG reqreuires SPDL to be compiled with NPPI support.");
74-
#else
7571
NppStreamContext stream;
7672
stream.hStream = nullptr; // default stream
7773

@@ -108,7 +104,6 @@ void resize_npp(
108104
SPDL_FAIL_INTERNAL(
109105
fmt::format("Unexpected output format: {}", to_string(fmt)));
110106
}
111-
#endif
112107
}
113108

114109
} // namespace spdl::cuda::detail

src/libspdl/cuda/nvdec/decoder.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,12 @@
1111
#include "libspdl/core/detail/logging.h"
1212
#include "libspdl/core/detail/tracing.h"
1313

14-
#ifdef SPDL_USE_NVCODEC
1514
#include "libspdl/cuda/nvdec/detail/decoder.h"
16-
#endif
1715

1816
#include <fmt/core.h>
1917

2018
namespace spdl::cuda {
2119

22-
#ifdef SPDL_USE_NVCODEC
2320
namespace {
2421
void validate_nvdec_params(
2522
int cuda_device_index,
@@ -94,6 +91,4 @@ std::vector<CUDABuffer> NvDecDecoder::flush() {
9491
core->flush(&ret);
9592
return ret;
9693
}
97-
98-
#endif
9994
} // namespace spdl::cuda

src/libspdl/cuda/nvdec/decoder.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ class NvDecDecoderCore;
3535
// decoder.reset();
3636
// decoder.init();
3737
class NvDecDecoder {
38-
#ifdef SPDL_USE_NVCODEC
3938
detail::NvDecDecoderCore* core;
4039

4140
public:
@@ -67,7 +66,6 @@ class NvDecDecoder {
6766

6867
// Call this method at the end of video stream.
6968
std::vector<CUDABuffer> flush();
70-
#endif
7169
};
7270

7371
} // namespace spdl::cuda

src/libspdl/cuda/nvjpeg/decoding.cpp

Lines changed: 199 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -6,58 +6,230 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9-
#include <libspdl/core/demuxing.h>
10-
#include <libspdl/cuda/nvjpeg/decoding.h>
9+
#include <libspdl/cuda/buffer.h>
10+
#include <libspdl/cuda/types.h>
11+
#include "libspdl/cuda/detail/utils.h"
1112

1213
#include "libspdl/core/detail/logging.h"
1314
#include "libspdl/core/detail/tracing.h"
15+
#include "libspdl/cuda/nvjpeg/detail/utils.h"
1416

15-
#include <fmt/core.h>
17+
#ifdef SPDL_USE_NPPI
18+
#include "libspdl/cuda/npp/detail/resize.h"
19+
#endif
20+
21+
#include <fmt/format.h>
22+
#include <glog/logging.h>
1623

1724
namespace spdl::cuda {
25+
namespace {
1826

19-
#ifdef SPDL_USE_NVJPEG
20-
namespace detail {
21-
CUDABufferPtr decode_image_nvjpeg(
22-
const std::string_view& data,
23-
const CUDAConfig& cuda_config,
24-
int scale_width,
25-
int scale_height,
26-
const std::string& pix_fmt);
27-
CUDABufferPtr decode_image_nvjpeg(
28-
const std::vector<std::string_view>& data,
27+
///
/// Translate an NVJPEG output format into the layout of the decoded pixels.
///
/// @param out_fmt Output format that the decoder is asked to produce.
/// @return A pair of ``(number of channels, interleaved)``. ``interleaved``
/// selects between the ``{height, width, channel}`` and
/// ``{channel, height, width}`` buffer shapes used by ``get_output``.
///
/// Formats other than RGB/BGR/RGBI/BGRI/Y are reported through
/// ``SPDL_FAIL_INTERNAL``.
std::tuple<size_t, bool> get_shape(nvjpegOutputFormat_t out_fmt) {
  // TODO: Support NVJPEG_OUTPUT_YUV?
  const bool three_ch_separate =
      out_fmt == NVJPEG_OUTPUT_RGB || out_fmt == NVJPEG_OUTPUT_BGR;
  if (three_ch_separate) {
    return {3, false};
  }

  const bool three_ch_interleaved =
      out_fmt == NVJPEG_OUTPUT_RGBI || out_fmt == NVJPEG_OUTPUT_BGRI;
  if (three_ch_interleaved) {
    return {3, true};
  }

  if (out_fmt == NVJPEG_OUTPUT_Y) {
    return {1, false};
  }

  // It should be already handled by `get_nvjpeg_output_format`
  SPDL_FAIL_INTERNAL(fmt::format(
      "Unexpected output format: {}", detail::to_string(out_fmt)));
}
46+
47+
///
/// Geometry of a single decoded image.
///
/// Produced by ``get_output`` and consumed by ``wrap_buffer`` to compute
/// per-channel pointers and pitches.
struct SizeMeta {
  /// Image width.
  size_t width;
  /// Image height.
  size_t height;
  /// Number of channels in the output format (1 or 3, see ``get_shape``).
  size_t num_channels;
  /// True when the channels of one pixel are stored next to each other.
  bool interleaved;
};
53+
54+
std::tuple<CUDABufferPtr, SizeMeta> get_output(
55+
nvjpegOutputFormat_t out_fmt,
56+
size_t height,
57+
size_t width,
2958
const CUDAConfig& cuda_config,
30-
int scale_width,
31-
int scale_height,
32-
const std::string& pix_fmt);
33-
} // namespace detail
34-
#endif
59+
std::optional<size_t> batch_size = std::nullopt) {
60+
auto [num_channels, interleaved] = get_shape(out_fmt);
61+
62+
auto buffer = [&](const size_t ch, bool interleaved_2) {
63+
return batch_size
64+
? (interleaved_2
65+
? cuda_buffer({*batch_size, height, width, ch}, cuda_config)
66+
: cuda_buffer({*batch_size, ch, height, width}, cuda_config))
67+
: (interleaved_2 ? cuda_buffer({height, width, ch}, cuda_config)
68+
: cuda_buffer({ch, height, width}, cuda_config));
69+
}(num_channels, interleaved);
70+
71+
return {
72+
std::move(buffer),
73+
SizeMeta{
74+
.width = width,
75+
.height = height,
76+
.num_channels = num_channels,
77+
.interleaved = interleaved}};
78+
}
79+
80+
///
/// Point an ``nvjpegImage_t`` at the storage of a CUDA buffer.
///
/// Fills ``image.channel`` / ``image.pitch`` for the first
/// ``meta.num_channels`` entries; the remaining entries are left untouched.
///
/// @param buffer Buffer whose device memory is wrapped. Not modified.
/// @param meta Geometry of one image stored in ``buffer``.
/// @param image Descriptor to fill in.
/// @param batch Index of the image inside a batched buffer.
void wrap_buffer(
    CUDABufferPtr& buffer,
    SizeMeta meta,
    nvjpegImage_t& image,
    size_t batch = 0) {
  // One row holds all channels of every pixel when interleaved, otherwise one
  // channel only.
  const size_t row_pitch =
      meta.interleaved ? meta.width * meta.num_channels : meta.width;
  const size_t plane_size = row_pitch * meta.height;
  const size_t image_size = meta.height * meta.width * meta.num_channels;

  // Skip the images that precede the requested batch index.
  uint8_t* cursor = static_cast<uint8_t*>(buffer->data()) + batch * image_size;

  // NOTE(review): for interleaved layouts `plane_size` equals `image_size`, so
  // entries after index 0 point past this image's slot. Presumably consumers
  // only read `channel[0]` in that case - confirm.
  for (size_t ch = 0; ch < meta.num_channels; ++ch, cursor += plane_size) {
    image.channel[ch] = cursor;
    image.pitch[ch] = row_pitch;
  }
}
94+
95+
std::tuple<CUDABufferPtr, SizeMeta, nvjpegImage_t> decode(
96+
std::string_view data,
97+
nvjpegOutputFormat_t fmt,
98+
const CUDAConfig& cuda_config) {
99+
auto nvjpeg = detail::get_nvjpeg();
100+
101+
// Note: Creation/destruction of nvjpegJpegState_t is thread-safe, however,
102+
// looking at the trace, it appears that they have internal locking mechanism
103+
// which make these operations as slow as several hudreds milliseconds in
104+
// multithread situation. So we use thread local.
105+
thread_local auto jpeg_state = detail::get_nvjpeg_jpeg_state(nvjpeg);
106+
107+
int num_components;
108+
nvjpegChromaSubsampling_t subsampling;
109+
thread_local int widths[NVJPEG_MAX_COMPONENT];
110+
thread_local int heights[NVJPEG_MAX_COMPONENT];
111+
{
112+
TRACE_EVENT("decoding", "nvjpegGetImageInfo");
113+
CHECK_NVJPEG(
114+
nvjpegGetImageInfo(
115+
nvjpeg,
116+
(const unsigned char*)data.data(),
117+
data.size(),
118+
&num_components,
119+
&subsampling,
120+
widths,
121+
heights),
122+
"Failed to fetch image information.");
123+
}
124+
125+
auto [buffer, meta] = get_output(fmt, heights[0], widths[0], cuda_config);
126+
nvjpegImage_t image;
127+
wrap_buffer(buffer, meta, image);
128+
129+
// Note: backend is not used by NVJPEG API when using nvjpegDecode().
130+
//
131+
// https://docs.nvidia.com/cuda/nvjpeg/index.html#decode-apisingle-phase
132+
// >> From CUDA 11 onwards, nvjpegDecode() picks the best available back-end
133+
// >> for a given image, user no longer has control on this. If there is a
134+
// >> need to select the back-end, then consider using nvjpegDecodeJpeg.
135+
// >> This is a new API added in CUDA 11 which allows user to control the
136+
// >> back-end.
137+
{
138+
TRACE_EVENT("decoding", "nvjpegDecode");
139+
CHECK_NVJPEG(
140+
nvjpegDecode(
141+
nvjpeg,
142+
jpeg_state.get(),
143+
(const unsigned char*)data.data(),
144+
data.size(),
145+
fmt,
146+
&image,
147+
(CUstream_st*)cuda_config.stream),
148+
"Failed to decode an image.");
149+
}
150+
return {std::move(buffer), meta, image};
151+
}
152+
153+
} // namespace
35154

36155
///
/// Decode a single JPEG image with NVJPEG, optionally resizing it.
///
/// Resizing is performed only when both ``scale_width`` and ``scale_height``
/// are positive; otherwise the image is returned at its decoded size.
///
/// @param data Encoded JPEG bytes.
/// @param cuda_config Device index, stream and allocation configuration.
/// @param scale_width Target width, or a non-positive value for no resizing.
/// @param scale_height Target height, or a non-positive value for no resizing.
/// @param pix_fmt Name of the output pixel format, resolved by
/// ``detail::get_nvjpeg_output_format``.
/// @return Buffer holding the decoded (and possibly resized) image.
///
/// Fails via ``SPDL_FAIL`` when resizing is requested but SPDL was built
/// without NPPI support.
CUDABufferPtr decode_image_nvjpeg(
    const std::string_view& data,
    const CUDAConfig& cuda_config,
    int scale_width,
    int scale_height,
    const std::string& pix_fmt) {
  // Named `out_fmt` (not `fmt`) so the local does not shadow the `fmt`
  // library namespace.
  const auto out_fmt = detail::get_nvjpeg_output_format(pix_fmt);

  detail::set_cuda_primary_context(cuda_config.device_index);

  auto [buffer, src_meta, decoded] = decode(data, out_fmt, cuda_config);

  if (scale_width > 0 && scale_height > 0) {
#ifndef SPDL_USE_NPPI
    SPDL_FAIL(
        "Image resizing while decoding with NVJPEG requires SPDL to be compiled with NPPI support.");
#else
    auto [buffer2, meta2] =
        get_output(out_fmt, scale_height, scale_width, cuda_config);
    nvjpegImage_t resized;
    wrap_buffer(buffer2, meta2, resized);

    detail::resize_npp(
        out_fmt,
        decoded,
        src_meta.width,
        src_meta.height,
        resized,
        scale_width,
        scale_height);

    // `buffer2` is a structured binding, which is not implicitly moved on
    // return; the explicit move is required for the move-only pointer.
    return std::move(buffer2);
#endif
  }

  // Same as above: structured bindings need an explicit move.
  return std::move(buffer);
}
49192

50193
///
/// Decode a batch of JPEG images with NVJPEG and resize them to one size.
///
/// Every image is decoded individually and resized into its slot of a single
/// batched output buffer, so both target dimensions are mandatory.
///
/// @param dataset Encoded JPEG images. Must not be empty.
/// @param cuda_config Device index, stream and allocation configuration.
/// @param scale_width Width of every output image. Must be positive.
/// @param scale_height Height of every output image. Must be positive.
/// @param pix_fmt Name of the output pixel format, resolved by
/// ``detail::get_nvjpeg_output_format``.
/// @return Buffer holding all resized images, batch dimension first.
///
/// Fails via ``SPDL_FAIL`` when SPDL was built without NPPI support, when
/// ``dataset`` is empty, or when either target dimension is not positive.
CUDABufferPtr decode_image_nvjpeg(
    const std::vector<std::string_view>& dataset,
    const CUDAConfig& cuda_config,
    int scale_width,
    int scale_height,
    const std::string& pix_fmt) {
#ifndef SPDL_USE_NPPI
  SPDL_FAIL(
      "Image resizing while decoding with NVJPEG requires SPDL to be compiled with NPPI support.");
#else
  const auto batch_size = dataset.size();
  if (batch_size == 0) {
    SPDL_FAIL("No input is provided.");
  }
  // Reject the request when EITHER dimension is missing. A non-positive value
  // would otherwise be converted to a huge `size_t` when sizing the output.
  if (scale_width <= 0 || scale_height <= 0) {
    SPDL_FAIL("Both `scale_width` and `scale_height` must be specified.");
  }

  // Named `out_fmt` (not `fmt`) so the local does not shadow the `fmt`
  // library namespace.
  const auto out_fmt = detail::get_nvjpeg_output_format(pix_fmt);

  detail::set_cuda_primary_context(cuda_config.device_index);

  auto [out_buffer, out_meta] =
      get_output(out_fmt, scale_height, scale_width, cuda_config, batch_size);
  nvjpegImage_t out_wrapper;

  for (size_t i = 0; i < batch_size; ++i) {
    auto [src_buffer, src_meta, decoded] =
        decode(dataset[i], out_fmt, cuda_config);

    // Re-point the wrapper at the i-th slot of the batched output.
    wrap_buffer(out_buffer, out_meta, out_wrapper, i);
    detail::resize_npp(
        out_fmt,
        decoded,
        src_meta.width,
        src_meta.height,
        out_wrapper,
        scale_width,
        scale_height);
  }

  // `out_buffer` is a structured binding, which is not implicitly moved on
  // return; the explicit move is required for the move-only pointer.
  return std::move(out_buffer);
#endif
}
63235

0 commit comments

Comments
 (0)