Replace CpuBoundWork
#10572
base: master
@@ -16,63 +16,6 @@
 using namespace icinga;

-CpuBoundWork::CpuBoundWork(boost::asio::yield_context yc)
-	: m_Done(false)
-{
-	auto& ioEngine (IoEngine::Get());
-
-	for (;;) {
-		auto availableSlots (ioEngine.m_CpuBoundSemaphore.fetch_sub(1));
-
-		if (availableSlots < 1) {
-			ioEngine.m_CpuBoundSemaphore.fetch_add(1);
-			IoEngine::YieldCurrentCoroutine(yc);
-			continue;
-		}
-
-		break;
-	}
-}
-
-CpuBoundWork::~CpuBoundWork()
-{
-	if (!m_Done) {
-		IoEngine::Get().m_CpuBoundSemaphore.fetch_add(1);
-	}
-}
-
-void CpuBoundWork::Done()
-{
-	if (!m_Done) {
-		IoEngine::Get().m_CpuBoundSemaphore.fetch_add(1);
-
-		m_Done = true;
-	}
-}
-
-IoBoundWorkSlot::IoBoundWorkSlot(boost::asio::yield_context yc)
-	: yc(yc)
-{
-	IoEngine::Get().m_CpuBoundSemaphore.fetch_add(1);
-}
-
-IoBoundWorkSlot::~IoBoundWorkSlot()
-{
-	auto& ioEngine (IoEngine::Get());
-
-	for (;;) {
-		auto availableSlots (ioEngine.m_CpuBoundSemaphore.fetch_sub(1));
-
-		if (availableSlots < 1) {
-			ioEngine.m_CpuBoundSemaphore.fetch_add(1);
-			IoEngine::YieldCurrentCoroutine(yc);
-			continue;
-		}
-
-		break;
-	}
-}
-
 LazyInit<std::unique_ptr<IoEngine>> IoEngine::m_Instance ([]() { return std::unique_ptr<IoEngine>(new IoEngine()); });

 IoEngine& IoEngine::Get()
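For orientation, here is a minimal usage sketch of the removed API (not taken from this diff; the handler and helper names are invented): CpuBoundWork blocks, yielding the coroutine, until one of the CPU slots is free, and IoBoundWorkSlot temporarily hands that slot back while the coroutine is only doing I/O.

```cpp
// Hypothetical usage of the removed classes (sketch only; helper names invented).
void SomeConnectionHandler(boost::asio::yield_context yc)
{
	CpuBoundWork cpuLock (yc); // yields until one of the CPU slots is free

	ParseAndProcessMessage(); // CPU-heavy part, limited by the semaphore

	{
		IoBoundWorkSlot ioSlot (yc); // gives the slot back while we only do I/O
		WriteResponse(yc);
	} // ~IoBoundWorkSlot(): waits for a slot again before resuming CPU-bound work

	// ~CpuBoundWork(): releases the slot unless Done() was already called
}
```

The PR replaces this pair with the single TryAcquireSlowSlot() added below, which never waits.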
@@ -85,10 +28,14 @@ boost::asio::io_context& IoEngine::GetIoContext()
 	return m_IoContext;
 }

-IoEngine::IoEngine() : m_IoContext(), m_KeepAlive(boost::asio::make_work_guard(m_IoContext)), m_Threads(decltype(m_Threads)::size_type(Configuration::Concurrency * 2u)), m_AlreadyExpiredTimer(m_IoContext)
+IoEngine::IoEngine()
+	: m_IoContext(),
+	  m_KeepAlive(boost::asio::make_work_guard(m_IoContext)),
+	  m_Threads(decltype(m_Threads)::size_type(Configuration::Concurrency * 2u)),
+	  m_AlreadyExpiredTimer(m_IoContext),
+	  m_SlowSlotsAvailable(Configuration::Concurrency)
 {
 	m_AlreadyExpiredTimer.expires_at(boost::posix_time::neg_infin);
-	m_CpuBoundSemaphore.store(Configuration::Concurrency * 3u / 2u);

 	for (auto& thread : m_Threads) {
 		thread = std::thread(&IoEngine::RunEventLoop, this);
@@ -124,6 +71,32 @@ void IoEngine::RunEventLoop()
 	}
 }

+/**
+ * Try to acquire a slot for a slow operation. This is intended to limit the number of concurrent slow operations. In
+ * case no slot is returned, the caller should reject the operation (for example by sending an HTTP error) to prevent an
+ * overload situation.
+ *
+ * @return A RAII-style object representing the slot. operator bool() can be used to check if the operation was
+ * successful and the caller now owns a slot. Its destructor automatically releases the slot.
+ */
+IoEngine::SlowSlot IoEngine::TryAcquireSlowSlot()
+{
+	// This is basically an ad-hoc (partial) semaphore implementation.
+	// TODO(C++20): Use std::counting_semaphore instead.
+
+	std::unique_lock lock(m_SlowSlotsMutex);
+	if (m_SlowSlotsAvailable > 0) {
+		m_SlowSlotsAvailable--;
+		lock.unlock();
+
+		return std::make_unique<Defer>([this] {
+			std::unique_lock lock(m_SlowSlotsMutex);
+			m_SlowSlotsAvailable++;
+		});
+	}
+	return {};
+}
+
 AsioEvent::AsioEvent(boost::asio::io_context& io, bool init)
 	: m_Timer(io)
 {

Review comment on the `return std::make_unique<Defer>(...)` line: Seems a good reason to make Defer movable.

Review comment on the slot release inside the Defer: Again, my semaphore from #9990 seems pretty fast. In particular, a release could be just an atomic subtraction.
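The TODO in the new function points at std::counting_semaphore. A rough sketch (an assumption, not part of this PR) of how the same try-acquire/auto-release could look if m_SlowSlotsAvailable were a `std::counting_semaphore<>` member initialized with Configuration::Concurrency; this would also make the release the plain atomic operation the comment above asks for:

```cpp
// Sketch only (C++20): assumes a member
//     std::counting_semaphore<> m_SlowSlotsAvailable{Configuration::Concurrency};
// instead of the mutex-protected counter above.
IoEngine::SlowSlot IoEngine::TryAcquireSlowSlot()
{
	// try_acquire() never blocks, it simply fails if no slot is currently free.
	if (m_SlowSlotsAvailable.try_acquire()) {
		// The returned Defer releases the slot again when the SlowSlot is destroyed.
		return std::make_unique<Defer>([this] { m_SlowSlotsAvailable.release(); });
	}

	return {};
}
```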
@@ -416,15 +416,37 @@ void ProcessRequest(
 	HttpRequest& request,
 	HttpResponse& response,
 	const WaitGroup::Ptr& waitGroup,
-	std::chrono::steady_clock::duration& cpuBoundWorkTime,
 	boost::asio::yield_context& yc
 )
 {
 	try {
-		// Cache the elapsed time to acquire a CPU semaphore used to detect extremely heavy workloads.
-		auto start (std::chrono::steady_clock::now());
-		CpuBoundWork handlingRequest (yc);
-		cpuBoundWorkTime = std::chrono::steady_clock::now() - start;
+		/* Place some restrictions on the total number of HTTP requests handled concurrently to prevent HTTP requests
+		 * from hogging the entire coroutine thread pool by running too many request handlers at once that don't
+		 * regularly yield, starving other coroutines.
+		 *
+		 * We need to consider two types of handlers here:
+		 *
+		 * 1. Those performing a more or less expensive operation and then returning the whole response at once.
+		 *    Not too many such handlers should run concurrently.
+		 * 2. Those already streaming the response while they are running, for example using chunked transfer encoding.
+		 *    For these, we assume that they will frequently yield to other coroutines, in particular when writing parts
+		 *    of the response to the client, or in case of EventsHandler also when waiting for new events.
+		 *
+		 * The following approach handles both of these automatically: we acquire one of a limited number of slots for
+		 * each request and release it automatically the first time anything (either the full response after the handler
+		 * finished or the first chunk from within the handler) is written using the response object. This means that
+		 * we don't have to handle acquiring or releasing that slot inside individual handlers.
+		 *
+		 * Overall, this is more or less a safeguard preventing long-running HTTP handlers from taking down the entire
+		 * Icinga 2 process by blocking the execution of JSON-RPC message handlers. In general, (new) HTTP handlers
+		 * shouldn't rely on this behavior but rather ensure that they are quick or at least yield regularly.
+		 */
+		if (!response.TryAcquireSlowSlot()) {
+			HttpUtility::SendJsonError(response, request.Params(), 503,
+				"Too many requests already in progress, please try again later.");
+			response.Flush(yc);
+			return;
+		}

 		HttpHandler::ProcessRequest(waitGroup, request, response, yc);
 		response.body().Finish();

Review thread on the 503 error response:

Review comment: This indeed seems clever. But at least waiting some time before giving up looks clever AND smart to me: https://www.haproxy.com/blog/protect-servers-with-haproxy-connection-limits-and-queues#:~:text=a%20client%20will%20wait%20for%20up%20to%2030%20seconds%20in%20the%20queue,%20after%20which%20HAProxy%20returns%20a%20503%20Service%20Unavailable%20response%20to%20them I consider even the current behavior (accumulating requests and increased response times) better than the 503 approach; it handles rush hours more smoothly. While we're on HTTP status codes: this PR now lets one ApiUser DoS the cluster; I'd expect a 429 before even running into a 503 and/or waiting.

Review comment: 💡 At least I'd introduce a new permission making an ApiUser wait as long as necessary, preventing 503. This is especially useful for critical components which consume the API.

Reply: This PR doesn't increase the time an ApiUser can block a slot, so wasn't this the same before? It's just hidden on the server side by waiting instead of retrying on the client side. If you send another request, you get another chance at obtaining that slot, just like you get when the server waits for you.

That's a whole other issue. Regardless of what we do to
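To illustrate the queueing alternative discussed in the thread above, here is a rough sketch that waits for a bounded time before giving up, reusing only names that already appear in this diff. The 30-second deadline is an assumption borrowed from the linked HAProxy article, not something this PR implements.

```cpp
// Sketch only: retry acquiring a slot until a deadline instead of rejecting
// the request immediately; only then fall back to the 503 response.
auto deadline (std::chrono::steady_clock::now() + std::chrono::seconds(30));

while (!response.TryAcquireSlowSlot()) {
	if (std::chrono::steady_clock::now() >= deadline) {
		HttpUtility::SendJsonError(response, request.Params(), 503,
			"Too many requests already in progress, please try again later.");
		response.Flush(yc);
		return;
	}

	// Let other coroutines run (and hopefully free a slot) before retrying.
	IoEngine::YieldCurrentCoroutine(yc);
}
```

Whether such waiting should be bounded, reserved for privileged ApiUsers via a new permission, or replaced by a 429 is exactly what the thread above debates.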
@@ -497,14 +519,9 @@ void HttpServerConnection::ProcessMessages(boost::asio::yield_context yc)
 		<< ", user: " << (request.User() ? request.User()->GetName() : "<unauthenticated>")
 		<< ", agent: " << request[http::field::user_agent]; //operator[] - Returns the value for a field, or "" if it does not exist.

-	ch::steady_clock::duration cpuBoundWorkTime(0);
-	Defer addRespCode ([&response, start, &logMsg, &cpuBoundWorkTime]() {
-		logMsg << ", status: " << response.result() << ")";
-		if (cpuBoundWorkTime >= ch::seconds(1)) {
-			logMsg << " waited " << ch::duration_cast<ch::milliseconds>(cpuBoundWorkTime).count() << "ms on semaphore and";
-		}
-
-		logMsg << " took total " << ch::duration_cast<ch::milliseconds>(ch::steady_clock::now() - start).count() << "ms.";
+	Defer addRespCode ([&response, start, &logMsg]() {
+		logMsg << ", status: " << response.result() << ")" << " took total "
+			<< ch::duration_cast<ch::milliseconds>(ch::steady_clock::now() - start).count() << "ms.";
 	});

 	if (!HandleAccessControl(request, response, yc)) {
@@ -525,7 +542,7 @@ void HttpServerConnection::ProcessMessages(boost::asio::yield_context yc)
 		m_Seen = ch::steady_clock::time_point::max();

-		ProcessRequest(request, response, m_WaitGroup, cpuBoundWorkTime, yc);
+		ProcessRequest(request, response, m_WaitGroup, yc);

 		if (!request.keep_alive() || !m_ConnectionReusable) {
 			break;
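The automatic release described in the ProcessRequest comment above (the slot is given back the first time anything is written via the response object) lives in HttpResponse and is not part of the hunks shown here. A minimal sketch of the idea follows; the method names TryAcquireSlowSlot() and Flush() appear in this diff, but the member and the bodies are assumptions, not the actual implementation.

```cpp
// Hypothetical sketch of the release-on-first-write behavior; not the real HttpResponse.
class HttpResponse
{
public:
	bool TryAcquireSlowSlot()
	{
		m_SlowSlot = IoEngine::Get().TryAcquireSlowSlot();
		return static_cast<bool>(m_SlowSlot);
	}

	void Flush(boost::asio::yield_context yc)
	{
		// First write to the client: this request no longer counts against the limit,
		// so streaming handlers only occupy a slot until their first chunk is sent.
		m_SlowSlot = {};

		// ... serialize and send the buffered response data to the client ...
	}

private:
	IoEngine::SlowSlot m_SlowSlot; // empty until TryAcquireSlowSlot() succeeds
};
```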