Skip to content

Commit 892f1ee

Browse files
authored
Quality enhancement: Immediately interrupt execution when memory OOM (#3932)
### What this PR does / why we need it? Preserve the point of first failure: execution should be interrupted as soon as a device memory allocation fails, rather than continuing until an illegal address is accessed. ### Does this PR introduce _any_ user-facing change? NA ### How was this patch tested? NA - vLLM version: v0.11.0 - vLLM main: vllm-project/vllm@83f478b Signed-off-by: leo-pony <[email protected]>
1 parent 5453033 commit 892f1ee

File tree

1 file changed

+35
-26
lines changed

1 file changed

+35
-26
lines changed

csrc/camem_allocator.cpp

Lines changed: 35 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
*/
1616

1717
#include <iostream>
18+
#include <stdexcept>
19+
#include <string>
1820

1921
extern "C" {
2022

@@ -49,7 +51,7 @@ void create_and_map(unsigned long long device, ssize_t size, void* d_mem,
4951
ensure_context(device);
5052
// Define memory allocation properties
5153
aclrtPhysicalMemProp prop = {};
52-
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ;
54+
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE;
5355
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
5456
prop.memAttr = ACL_HBM_MEM_HUGE;
5557
prop.location.id = device;
@@ -59,15 +61,21 @@ void create_and_map(unsigned long long device, ssize_t size, void* d_mem,
5961
// Allocate memory using aclrtMallocPhysical
6062
aclError error_code = aclrtMallocPhysical(p_memHandle, size, &prop, 0);
6163
if (error_code != 0) {
62-
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
63-
<< __LINE__ << std::endl;
64-
return;
64+
if (error_code == ACL_ERROR_RT_MEMORY_ALLOCATION) {
65+
throw std::runtime_error("aclrtMallocPhysical failed with acl error code: " +
66+
std::to_string(error_code) + "(OOM: Out of Memory, allocation failed) " +
67+
__FILE__ + ":" + std::to_string(__LINE__));
68+
} else {
69+
throw std::runtime_error("aclrtMallocPhysical failed with acl error code: " +
70+
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
71+
}
6572
}
73+
74+
// Map memory
6675
error_code = aclrtMapMem(d_mem, size, 0, *p_memHandle, 0);
6776
if (error_code != 0) {
68-
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
69-
<< __LINE__ << std::endl;
70-
return;
77+
throw std::runtime_error("aclrtMapMem failed with acl error code: " +
78+
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
7179
}
7280
}
7381

@@ -79,15 +87,13 @@ void unmap_and_release(unsigned long long device, ssize_t size,
7987
ensure_context(device);
8088
aclError error_code = aclrtUnmapMem(d_mem);
8189
if (error_code != 0) {
82-
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
83-
<< __LINE__ << std::endl;
84-
return;
90+
throw std::runtime_error("aclrtUnmapMem failed with acl error code: " +
91+
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
8592
}
8693
error_code = aclrtFreePhysical(*p_memHandle);
8794
if (error_code != 0) {
88-
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
89-
<< __LINE__ << std::endl;
90-
return;
95+
throw std::runtime_error("aclrtFreePhysical failed with acl error code: " +
96+
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
9197
}
9298
}
9399

@@ -139,25 +145,29 @@ __attribute__ ((visibility("default"))) void* my_malloc(ssize_t size, int device
139145
ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM,
140146
&granularity);
141147
if (error_code != 0) {
142-
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
143-
<< __LINE__ << std::endl;
144-
return nullptr;
148+
throw std::runtime_error("aclrtMemGetAllocationGranularity failed with acl error code: " +
149+
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
145150
}
146151
size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
147152
void *d_mem;
148153
error_code = aclrtReserveMemAddress(&d_mem, alignedSize, 0, nullptr, 0);
149154
if (error_code != 0) {
150-
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
151-
<< __LINE__ << std::endl;
152-
return nullptr;
155+
if (error_code == ACL_ERROR_RT_MEMORY_ALLOCATION) {
156+
throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
157+
std::to_string(error_code) + "(OOM: Out of Memory, allocation failed) " +
158+
__FILE__ + ":" + std::to_string(__LINE__));
159+
} else {
160+
throw std::runtime_error("aclrtReserveMemAddress failed with acl error code: " +
161+
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
162+
}
153163
}
154164
// allocate the aclrtDrvMemHandle
155165
aclrtDrvMemHandle* p_memHandle =
156166
(aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
157167

158168
if (!g_python_malloc_callback) {
159-
std::cerr << "ERROR: g_python_malloc_callback not set.\n";
160-
return nullptr;
169+
throw std::runtime_error("my_malloc ERROR: g_python_malloc_callback not set." +
170+
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
161171
}
162172

163173
// Acquire GIL (not in stable ABI officially, but often works)
@@ -189,8 +199,8 @@ __attribute__ ((visibility("default"))) void* my_malloc(ssize_t size, int device
189199
__attribute__ ((visibility("default"))) void my_free(void* ptr, ssize_t size, int device, aclrtStream stream) {
190200
// get memory handle from the pointer
191201
if (!g_python_free_callback) {
192-
std::cerr << "ERROR: g_python_free_callback not set.\n";
193-
return;
202+
throw std::runtime_error("aclrtDrvMemHandle ERROR: g_python_malloc_callback not set." +
203+
std::string(" ") + __FILE__ + ":" + std::to_string(__LINE__));
194204
}
195205

196206
// Acquire GIL (not in stable ABI officially, but often works)
@@ -232,9 +242,8 @@ __attribute__ ((visibility("default"))) void my_free(void* ptr, ssize_t size, in
232242
// free address and the handle
233243
aclError error_code = aclrtReleaseMemAddress(d_mem);
234244
if (error_code != 0) {
235-
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
236-
<< __LINE__ << std::endl;
237-
return;
245+
throw std::runtime_error("aclrtReleaseMemAddress failed with acl error code: " +
246+
std::to_string(error_code) + " " + __FILE__ + ":" + std::to_string(__LINE__));
238247
}
239248
free(p_memHandle);
240249
}

0 commit comments

Comments
 (0)