Skip to content

Commit 76e3a67

Browse files
committed
graph: backend: dnnl: executables: add exec profile verbose for gen_index
1 parent 61359b6 commit 76e3a67

File tree

2 files changed

+67
-3
lines changed

2 files changed

+67
-3
lines changed

src/graph/backend/dnnl/executables/gen_index.cpp

Lines changed: 58 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ genindex_executable_t::genindex_executable_t(std::shared_ptr<op_t> &op,
3838
output_dims_[i] = output_lt.dims[i];
3939
output_strides_[i] = output_lt.layout.strides[i];
4040
}
41+
info_ = std::string(dnnl_engine_kind2str(
42+
static_cast<dnnl_engine_kind_t>(p_engine.get_kind())))
43+
+ "," + op->str();
4144
#if DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE \
4245
&& DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
4346
if (p_engine.get_kind() == engine::kind::gpu) {
@@ -59,7 +62,7 @@ genindex_executable_t::genindex_executable_t(std::shared_ptr<op_t> &op,
5962
#endif
6063
}
6164

62-
void genindex_executable_t::execute(const stream &stream,
65+
void genindex_executable_t::execute_impl(const stream &stream,
6366
const std::unordered_map<int, memory> &args) const {
6467
const auto &it_dst = args.find(DNNL_ARG_DST);
6568
if (it_dst == args.end()) return;
@@ -79,8 +82,24 @@ void genindex_executable_t::execute(const stream &stream,
7982
stream.get()->after_exec_hook();
8083
}
8184

85+
void genindex_executable_t::execute(const stream &stream,
86+
const std::unordered_map<int, memory> &args) const {
87+
if (get_verbose(dnnl::impl::verbose_t::exec_profile,
88+
dnnl::impl::component_t::graph)) {
89+
stream.get()->wait();
90+
double start_ms = dnnl::impl::get_msec();
91+
execute_impl(stream, args);
92+
stream.get()->wait();
93+
double duration_ms = dnnl::impl::get_msec() - start_ms;
94+
VPROF(start_ms, graph, exec, VERBOSE_profile, info_.c_str(),
95+
duration_ms);
96+
} else {
97+
execute_impl(stream, args);
98+
}
99+
}
100+
82101
#ifdef DNNL_WITH_SYCL
83-
::sycl::event genindex_executable_t::execute_sycl(const stream &stream,
102+
::sycl::event genindex_executable_t::execute_sycl_impl(const stream &stream,
84103
const std::unordered_map<int, memory> &args,
85104
const std::vector<::sycl::event> &deps) const {
86105
if (stream.get_engine().get_kind() == engine::kind::cpu) {
@@ -127,10 +146,28 @@ ::sycl::event genindex_executable_t::execute_sycl(const stream &stream,
127146
throw std::runtime_error("Unimplement");
128147
#endif
129148
}
149+
150+
// SYCL execution entry point for the gen_index executable.
//
// Without exec_profile verbose output this simply returns whatever
// execute_sycl_impl() returns. With profiling enabled the call is bracketed by
// stream waits, timed, reported through VPROF, and a default-constructed
// event is returned instead of the one produced by the implementation.
::sycl::event genindex_executable_t::execute_sycl(const stream &stream,
        const std::unordered_map<int, memory> &args,
        const std::vector<::sycl::event> &deps) const {
    // Regular path: hand the implementation's event straight to the caller.
    if (!get_verbose(dnnl::impl::verbose_t::exec_profile,
                dnnl::impl::component_t::graph)) {
        return execute_sycl_impl(stream, args, deps);
    }

    // Wait on the stream before reading the start time, presumably so that
    // previously submitted work is excluded from the measurement.
    stream.get()->wait();
    const double begin_ms = dnnl::impl::get_msec();

    // The returned event is intentionally ignored: the stream is waited on
    // right below, before the function returns.
    execute_sycl_impl(stream, args, deps);

    stream.get()->wait();
    const double elapsed_ms = dnnl::impl::get_msec() - begin_ms;

    VPROF(begin_ms, graph, exec, VERBOSE_profile, info_.c_str(), elapsed_ms);

    return {}; // no event returned in profiling mode
}
130167
#endif
131168

132169
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
133-
cl_event genindex_executable_t::execute_ocl(const stream &stream,
170+
cl_event genindex_executable_t::execute_ocl_impl(const stream &stream,
134171
const std::unordered_map<int, memory> &args,
135172
const std::vector<cl_event> &deps) const {
136173
#if DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL
@@ -174,6 +211,24 @@ cl_event genindex_executable_t::execute_ocl(const stream &stream,
174211
throw std::runtime_error("Unimplement");
175212
#endif
176213
}
214+
215+
// OpenCL execution entry point for the gen_index executable.
//
// Without exec_profile verbose output this simply returns whatever
// execute_ocl_impl() returns. With profiling enabled the call is bracketed by
// stream waits, timed, reported through VPROF, and nullptr is returned
// instead of the event produced by the implementation.
cl_event genindex_executable_t::execute_ocl(const stream &stream,
        const std::unordered_map<int, memory> &args,
        const std::vector<cl_event> &deps) const {
    // Regular path: hand the implementation's event straight to the caller.
    if (!get_verbose(dnnl::impl::verbose_t::exec_profile,
                dnnl::impl::component_t::graph)) {
        return execute_ocl_impl(stream, args, deps);
    }

    // Wait on the stream before reading the start time, presumably so that
    // previously submitted work is excluded from the measurement.
    stream.get()->wait();
    const double begin_ms = dnnl::impl::get_msec();

    // NOTE(review): the cl_event returned by execute_ocl_impl() is dropped
    // here. Confirm the implementation does not transfer ownership of that
    // event to its caller, otherwise it would need to be released.
    execute_ocl_impl(stream, args, deps);

    stream.get()->wait();
    const double elapsed_ms = dnnl::impl::get_msec() - begin_ms;

    VPROF(begin_ms, graph, exec, VERBOSE_profile, info_.c_str(), elapsed_ms);

    return nullptr; // no event returned in profiling mode
}
177232
#endif
178233

179234
arg_indices_t genindex_executable_t::get_arg_indices(const op_t *op) {

src/graph/backend/dnnl/executables/gen_index.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,22 +38,31 @@ struct genindex_executable_t : public op_executable_t {
3838

3939
void execute(const stream &stream,
4040
const std::unordered_map<int, memory> &args) const override;
41+
void execute_impl(const stream &stream,
42+
const std::unordered_map<int, memory> &args) const;
4143

4244
#ifdef DNNL_WITH_SYCL
4345
::sycl::event execute_sycl(const stream &stream,
4446
const std::unordered_map<int, memory> &args,
4547
const std::vector<::sycl::event> &deps) const override;
48+
::sycl::event execute_sycl_impl(const stream &stream,
49+
const std::unordered_map<int, memory> &args,
50+
const std::vector<::sycl::event> &deps) const;
4651
#endif
4752

4853
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
4954
cl_event execute_ocl(const stream &stream,
5055
const std::unordered_map<int, memory> &args,
5156
const std::vector<cl_event> &deps) const override;
57+
cl_event execute_ocl_impl(const stream &stream,
58+
const std::unordered_map<int, memory> &args,
59+
const std::vector<cl_event> &deps) const;
5260
#endif
5361

5462
private:
5563
int axis_, nelems_, ndims_;
5664
dims_t output_dims_, output_strides_;
65+
std::string info_;
5766

5867
#if (DNNL_GPU_RUNTIME != DNNL_RUNTIME_NONE) \
5968
&& (DNNL_GPU_VENDOR == DNNL_VENDOR_INTEL)

0 commit comments

Comments
 (0)