Skip to content

Commit 3684391

Browse files
committed
[Clang][Driver] Add jobserver support for --offload-jobs
This patch introduces support for the jobserver protocol to control parallelism for device offloading tasks. When running a parallel build with a modern build system like `make -jN` or `ninja -jN`, each Clang process might also be configured to use multiple threads for its own tasks (e.g., via `--offload-jobs=4`). This can lead to an explosion of threads (N * 4), causing heavy system load, CPU contention, and ultimately slowing down the entire build. This patch allows Clang to act as a cooperative client of the build system's jobserver. It extends the `--offload-jobs` option to accept the value 'jobserver'. With the recent addition of jobserver support to the Ninja build system, this functionality now benefits users of both Make and Ninja. When `--offload-jobs=jobserver` is specified, Clang's thread pool will: 1. Parse the MAKEFLAGS environment variable to find the jobserver details. 2. Before dispatching a task, acquire a job slot from the jobserver. If none are available, the worker thread will block. 3. Release the job slot once the task is complete. This ensures that the total number of active offload tasks across all Clang processes does not exceed the limit defined by the parent build system, leading to more efficient and controlled parallel builds. Implementation: - A new library, `llvm/Support/Jobserver`, is added to provide a platform-agnostic client for the jobserver protocol, with backends for Unix (FIFO) and Windows (semaphores). - `llvm/Support/ThreadPool` and `llvm/Support/Parallel` are updated with a `jobserver_concurrency` strategy to integrate this logic. - The Clang driver and linker-wrapper are modified to recognize the 'jobserver' argument and enable the new thread pool strategy. - New unit and integration tests are added to validate the feature.
1 parent 13f7786 commit 3684391

File tree

18 files changed

+1348
-23
lines changed

18 files changed

+1348
-23
lines changed

clang/include/clang/Driver/Options.td

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1258,8 +1258,9 @@ def offload_compression_level_EQ : Joined<["--"], "offload-compression-level=">,
12581258
HelpText<"Compression level for offload device binaries (HIP only)">;
12591259

12601260
def offload_jobs_EQ : Joined<["--"], "offload-jobs=">,
1261-
HelpText<"Specify the number of threads to use for device offloading tasks"
1262-
" during compilation.">;
1261+
HelpText<"Specify the number of threads to use for device offloading tasks "
1262+
"during compilation. Can be a positive integer or the string "
1263+
"'jobserver' to use the make-style jobserver from the environment.">;
12631264

12641265
defm offload_via_llvm : BoolFOption<"offload-via-llvm",
12651266
LangOpts<"OffloadViaLLVM">, DefaultFalse,

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9298,14 +9298,20 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
92989298
addOffloadCompressArgs(Args, CmdArgs);
92999299

93009300
if (Arg *A = Args.getLastArg(options::OPT_offload_jobs_EQ)) {
9301-
int NumThreads;
9302-
if (StringRef(A->getValue()).getAsInteger(10, NumThreads) ||
9303-
NumThreads <= 0)
9304-
C.getDriver().Diag(diag::err_drv_invalid_int_value)
9305-
<< A->getAsString(Args) << A->getValue();
9306-
else
9307-
CmdArgs.push_back(
9308-
Args.MakeArgString("--wrapper-jobs=" + Twine(NumThreads)));
9301+
StringRef Val = A->getValue();
9302+
9303+
if (Val.equals_insensitive("jobserver"))
9304+
CmdArgs.push_back(Args.MakeArgString("--wrapper-jobs=jobserver"));
9305+
else {
9306+
int NumThreads;
9307+
if (Val.getAsInteger(10, NumThreads) || NumThreads <= 0) {
9308+
C.getDriver().Diag(diag::err_drv_invalid_int_value)
9309+
<< A->getAsString(Args) << Val;
9310+
} else {
9311+
CmdArgs.push_back(
9312+
Args.MakeArgString("--wrapper-jobs=" + Twine(NumThreads)));
9313+
}
9314+
}
93099315
}
93109316

93119317
const char *Exec =

clang/test/Driver/hip-options.hip

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,3 +259,9 @@
259259
// RUN: --offload-arch=gfx1100 --offload-new-driver --offload-jobs=0x4 %s 2>&1 | \
260260
// RUN: FileCheck -check-prefix=INVJOBS %s
261261
// INVJOBS: clang: error: invalid integral value '0x4' in '--offload-jobs=0x4'
262+
263+
// RUN: %clang -### -Werror --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib \
264+
// RUN: --offload-arch=gfx1100 --offload-new-driver --offload-jobs=jobserver %s 2>&1 | \
265+
// RUN: FileCheck -check-prefix=JOBSV %s
266+
// JOBSV: clang-linker-wrapper{{.*}} "--wrapper-jobs=jobserver"
267+

clang/test/Driver/linker-wrapper.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@ __attribute__((visibility("protected"), used)) int x;
114114
// RUN: -fembed-offload-object=%t.out
115115
// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu --wrapper-jobs=4 \
116116
// RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CUDA-PAR
117+
// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu --wrapper-jobs=jobserver \
118+
// RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CUDA-PAR
117119

118120
// CUDA-PAR: fatbinary{{.*}}-64 --create {{.*}}.fatbin
119121

clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1424,12 +1424,18 @@ int main(int Argc, char **Argv) {
14241424

14251425
parallel::strategy = hardware_concurrency(1);
14261426
if (auto *Arg = Args.getLastArg(OPT_wrapper_jobs)) {
1427-
unsigned Threads = 0;
1428-
if (!llvm::to_integer(Arg->getValue(), Threads) || Threads == 0)
1429-
reportError(createStringError("%s: expected a positive integer, got '%s'",
1430-
Arg->getSpelling().data(),
1431-
Arg->getValue()));
1432-
parallel::strategy = hardware_concurrency(Threads);
1427+
StringRef Val = Arg->getValue();
1428+
if (Val.equals_insensitive("jobserver"))
1429+
parallel::strategy = jobserver_concurrency();
1430+
else {
1431+
unsigned Threads = 0;
1432+
if (!llvm::to_integer(Val, Threads) || Threads == 0)
1433+
reportError(createStringError(
1434+
"%s: expected a positive integer or 'jobserver', got '%s'",
1435+
Arg->getSpelling().data(), Val.data()));
1436+
else
1437+
parallel::strategy = hardware_concurrency(Threads);
1438+
}
14331439
}
14341440

14351441
if (Args.hasArg(OPT_wrapper_time_trace_eq)) {

clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ def wrapper_time_trace_granularity : Joined<["--"], "wrapper-time-trace-granular
5353

5454
def wrapper_jobs : Joined<["--"], "wrapper-jobs=">,
5555
Flags<[WrapperOnlyOption]>, MetaVarName<"<number>">,
56-
HelpText<"Sets the number of parallel jobs to use for device linking">;
56+
HelpText<"Sets the number of parallel jobs for device linking. Can be a "
57+
"positive integer or 'jobserver'.">;
5758

5859
def override_image : Joined<["--"], "override-image=">,
5960
Flags<[WrapperOnlyOption]>, MetaVarName<"<kind=file>">,
Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
//===- llvm/Support/Jobserver.h - Jobserver Client --------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This file defines a client for the GNU Make jobserver protocol. This allows
10+
// LLVM tools to coordinate parallel execution with a parent `make` process.
11+
//
12+
// The jobserver protocol is a mechanism for GNU Make to share its pool of
13+
// available "job slots" with the subprocesses it invokes. This is particularly
14+
// useful for tools that can perform parallel operations themselves (e.g., a
15+
// multi-threaded linker or compiler). By participating in this protocol, a
16+
// tool can ensure the total number of concurrent jobs does not exceed the
17+
// limit specified by the user (e.g., `make -j8`).
18+
//
19+
// How it works:
20+
//
21+
// 1. Establishment:
22+
// A child process discovers the jobserver by inspecting the `MAKEFLAGS`
23+
// environment variable. If a jobserver is active, this variable will
24+
// contain a `--jobserver-auth=<value>` argument. The format of `<value>`
25+
// determines how to communicate with the server.
26+
//
27+
// 2. The Implicit Slot:
28+
// Every command invoked by `make` is granted one "implicit" job slot. This
29+
// means a tool can always perform at least one unit of work without needing
30+
// to communicate with the jobserver. This implicit slot should NEVER be
31+
// released back to the jobserver.
32+
//
33+
// 3. Acquiring and Releasing Slots:
34+
// On POSIX systems, the jobserver is implemented as a pipe. The
35+
// `--jobserver-auth` value specifies either a path to a named pipe
36+
// (`fifo:PATH`) or a pair of file descriptors (`R,W`). The pipe is
37+
// pre-loaded with single-character tokens, one for each available job slot.
38+
//
39+
// - To acquire an additional slot, a client reads a single-character token
40+
// from the pipe.
41+
// - To release a slot, the client must write the *exact same* character
42+
// token back to the pipe.
43+
//
44+
// It is critical that a client releases all acquired slots before it exits,
45+
// even in cases of error, to avoid deadlocking the build.
46+
//
47+
// Example:
48+
// A multi-threaded linker invoked by `make -j8` wants to use multiple
49+
// threads. It first checks for the jobserver. It knows it has one implicit
50+
// slot, so it can use one thread. It then tries to acquire 7 more slots by
51+
// reading 7 tokens from the jobserver pipe. If it only receives 3 tokens,
52+
// it knows it can use a total of 1 (implicit) + 3 (acquired) = 4 threads.
53+
// Before exiting, it must write the 3 tokens it read back to the pipe.
54+
//
55+
// For more context, see:
56+
// - LLVM RFC discussion on jobserver support:
57+
// https://discourse.llvm.org/t/rfc-adding-gnu-make-jobserver-
58+
// support-to-llvm-for-coordinated-parallelism/87034
59+
// - Ninja’s jobserver support PR:
60+
// https://github.com/ninja-build/ninja/pull/2260
61+
//
62+
//===----------------------------------------------------------------------===//
63+
64+
#ifndef LLVM_SUPPORT_JOBSERVER_H
65+
#define LLVM_SUPPORT_JOBSERVER_H
66+
67+
#include "llvm/ADT/StringRef.h"
68+
#include <memory>
69+
#include <string>
70+
71+
namespace llvm {
72+
73+
/// A JobSlot is a move-only handle for a single job slot that can be acquired
74+
/// from or released to a jobserver pool. It is invalid, implicit, or explicit.
75+
class JobSlot {
76+
public:
77+
/// Default constructor creates an invalid instance (Value == kInvalidValue).
78+
JobSlot() = default;
79+
80+
// Move operations are allowed. The moved-from object is reset to
// kInvalidValue, so the slot is never duplicated by a move.
81+
JobSlot(JobSlot &&Other) noexcept : Value(Other.Value) {
82+
Other.Value = kInvalidValue;
83+
}
84+
// NOTE(review): move assignment overwrites this->Value without releasing any
// slot already held here; presumably callers only assign into invalid
// instances -- confirm.
JobSlot &operator=(JobSlot &&Other) noexcept {
85+
if (this != &Other) {
86+
this->Value = Other.Value;
87+
Other.Value = kInvalidValue;
88+
}
89+
return *this;
90+
}
91+
92+
// Copy operations are disallowed.
93+
JobSlot(const JobSlot &) = delete;
94+
JobSlot &operator=(const JobSlot &) = delete;
95+
96+
/// Returns true if this instance is valid (either implicit or explicit).
/// Both kinds are stored as non-negative values; only kInvalidValue is
/// negative.
97+
bool isValid() const { return Value >= 0; }
98+
99+
/// Returns true if this instance represents the implicit job slot.
100+
bool isImplicit() const { return Value == kImplicitValue; }
101+
102+
/// Creates an explicit slot holding the token value \p V, widened to int16_t.
static JobSlot createExplicit(uint8_t V) {
103+
return JobSlot(static_cast<int16_t>(V));
104+
}
105+
106+
/// Creates a slot marked with the kImplicitValue sentinel.
static JobSlot createImplicit() { return JobSlot(kImplicitValue); }
107+
108+
/// Returns the explicit token as a uint8_t. Defined out of line; presumably
/// only meaningful when isExplicit() is true -- confirm in the implementation.
uint8_t getExplicitValue() const;
109+
/// Returns true if this instance is valid and is not the implicit slot.
bool isExplicit() const { return isValid() && !isImplicit(); }
110+
111+
private:
112+
friend class JobserverClient;
113+
friend class JobserverClientImpl;
114+
115+
// Raw constructor used by createExplicit() and createImplicit().
JobSlot(int16_t V) : Value(V) {}
116+
117+
/// The jobserver pipe carries explicit tokens (bytes 0–255). We reserve two
118+
/// sentinels in Value for special cases:
119+
/// kInvalidValue (-1): no slot held
120+
/// kImplicitValue (INT16_MAX): implicit slot granted at startup (no pipe
121+
/// I/O)
122+
///
123+
/// Value is an int16_t so that it can hold the 0–255 explicit tokens and both
124+
/// sentinels without overflow, has a fixed 16-bit width, and avoids
125+
/// unsigned/signed mix-ups.
// NOTE(review): uint8_t, int16_t and INT16_MAX come from <cstdint>, which the
// includes shown for this header do not list directly; presumably it arrives
// transitively -- consider including it explicitly.
126+
static constexpr int16_t kInvalidValue = -1;
127+
static constexpr int16_t kImplicitValue = INT16_MAX;
128+
int16_t Value = kInvalidValue;
129+
};
130+
131+
/// The public interface for a jobserver client: an abstract class whose
132+
/// lazily-initialized singleton is obtained through getInstance().
133+
class JobserverClient {
134+
public:
135+
// Virtual destructor, defined out of line.
virtual ~JobserverClient();
136+
137+
/// Tries to acquire a job slot from the pool. On failure (e.g., if the pool
138+
/// is empty), this returns an invalid JobSlot instance. The first successful
139+
/// call will always return the implicit slot.
140+
virtual JobSlot tryAcquire() = 0;
141+
142+
/// Releases a job slot back to the pool. \p Slot is taken by value and
/// JobSlot has no copy constructor, so callers have to move the slot in.
/// NOTE(review): presumably an implicit slot is handled without writing to
/// the jobserver -- confirm in the implementation.
143+
virtual void release(JobSlot Slot) = 0;
144+
145+
/// Returns the number of job slots available, as determined on first use.
146+
/// This value is cached. Returns 0 if no jobserver is active.
147+
virtual unsigned getNumJobs() const = 0;
148+
149+
/// Returns the singleton instance of the JobserverClient.
150+
/// The instance is created on the first call to this function.
151+
/// Returns nullptr if no jobserver is configured or an error occurs.
/// The result is a raw pointer; presumably the caller does not own it.
152+
static JobserverClient *getInstance();
153+
154+
/// Resets the singleton instance. For testing purposes only.
155+
static void resetForTesting();
156+
};
157+
158+
} // end namespace llvm
159+
160+
#endif // LLVM_SUPPORT_JOBSERVER_H

llvm/include/llvm/Support/ThreadPool.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "llvm/ADT/DenseMap.h"
1717
#include "llvm/Config/llvm-config.h"
1818
#include "llvm/Support/Compiler.h"
19+
#include "llvm/Support/Jobserver.h"
1920
#include "llvm/Support/RWMutex.h"
2021
#include "llvm/Support/Threading.h"
2122
#include "llvm/Support/thread.h"
@@ -184,6 +185,7 @@ class LLVM_ABI StdThreadPool : public ThreadPoolInterface {
184185
void grow(int requested);
185186

186187
void processTasks(ThreadPoolTaskGroup *WaitingForGroup);
188+
void processTasksWithJobserver();
187189

188190
/// Threads in flight
189191
std::vector<llvm::thread> Threads;
@@ -212,6 +214,8 @@ class LLVM_ABI StdThreadPool : public ThreadPoolInterface {
212214

213215
/// Maximum number of threads to potentially grow this pool to.
214216
const unsigned MaxThreadCount;
217+
218+
JobserverClient *TheJobserver = nullptr;
215219
};
216220
#endif // LLVM_ENABLE_THREADS
217221

llvm/include/llvm/Support/Threading.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,11 @@ constexpr bool llvm_is_multithreaded() { return LLVM_ENABLE_THREADS; }
142142
/// the thread shall remain on the actual CPU socket.
143143
LLVM_ABI std::optional<unsigned>
144144
compute_cpu_socket(unsigned ThreadPoolNum) const;
145+
146+
/// If true, the thread pool will attempt to coordinate with a GNU Make
147+
/// jobserver, acquiring a job slot before processing a task. If no
148+
/// jobserver is found in the environment, this is ignored.
149+
bool UseJobserver = false;
145150
};
146151

147152
/// Build a strategy from a number of threads as a string provided in \p Num.
@@ -210,6 +215,19 @@ constexpr bool llvm_is_multithreaded() { return LLVM_ENABLE_THREADS; }
210215
return S;
211216
}
212217

218+
/// Returns a thread strategy that attempts to coordinate with a GNU Make
219+
/// jobserver. The number of active threads will be limited by the number of
220+
/// available job slots. If no jobserver is detected in the environment, this
221+
/// strategy falls back to the default hardware_concurrency() behavior.
/// The returned strategy has UseJobserver set and ThreadsRequested == 0.
222+
inline ThreadPoolStrategy jobserver_concurrency() {
223+
ThreadPoolStrategy S;
224+
// Ask the thread pool to coordinate with the jobserver.
S.UseJobserver = true;
225+
// We can still request all threads be created, as they will simply
226+
// block waiting for a job slot if the jobserver is the limiting factor.
227+
S.ThreadsRequested = 0; // 0 means 'use all available'
228+
return S;
229+
}
230+
213231
/// Return the current thread id, as used in various OS system calls.
214232
/// Note that not all platforms guarantee that the value returned will be
215233
/// unique across the entire system, so portable code should not assume

llvm/lib/Support/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@ add_llvm_component_library(LLVMSupport
205205
InstructionCost.cpp
206206
IntEqClasses.cpp
207207
IntervalMap.cpp
208+
Jobserver.cpp
208209
JSON.cpp
209210
KnownBits.cpp
210211
KnownFPClass.cpp

0 commit comments

Comments
 (0)