diff --git a/.bazelci/run_server_test.sh b/.bazelci/run_server_test.sh index 21b6d70389..54e4658b16 100755 --- a/.bazelci/run_server_test.sh +++ b/.bazelci/run_server_test.sh @@ -8,11 +8,11 @@ bazel build //src/main/java/build/buildfarm:buildfarm-shard-worker bazel build //src/main/java/build/buildfarm:buildfarm-server # Start a single worker -bazel run //src/main/java/build/buildfarm:buildfarm-shard-worker $(pwd)/examples/config.minimal.yml > server.log 2>&1 & +bazel run //src/main/java/build/buildfarm:buildfarm-shard-worker $(pwd)/examples/config.minimal.yml > worker.log 2>&1 & echo "Started buildfarm-shard-worker..." # Start a single server -bazel run //src/main/java/build/buildfarm:buildfarm-server $(pwd)/examples/config.minimal.yml > worker.log 2>&1 & +bazel run //src/main/java/build/buildfarm:buildfarm-server $(pwd)/examples/config.minimal.yml > server.log 2>&1 & echo "Started buildfarm-server..." echo "Wait for startup to finish..." diff --git a/.bazelrc b/.bazelrc index cfea58ba6a..c3caf4d8fb 100644 --- a/.bazelrc +++ b/.bazelrc @@ -1,3 +1,9 @@ +build --java_language_version=17 +build --java_runtime_version=remotejdk_17 + +build --tool_java_language_version=17 +build --tool_java_runtime_version=remotejdk_17 + common --enable_platform_specific_config build:fuse --define=fuse=true @@ -14,3 +20,7 @@ test --test_tag_filters=-redis,-integration # Ensure buildfarm is compatible with future versions of bazel. # https://buildkite.com/bazel/bazelisk-plus-incompatible-flags common --incompatible_disallow_empty_glob + +# TODO: migrate all dependencies from WORKSPACE to MODULE.bazel +# https://github.com/bazelbuild/bazel-buildfarm/issues/1479 +common --noenable_bzlmod diff --git a/.bazelversion b/.bazelversion index 5e3254243a..19b860c187 100644 --- a/.bazelversion +++ b/.bazelversion @@ -1 +1 @@ -6.1.2 +6.4.0 diff --git a/BUILD b/BUILD index fc380f9846..f1cf17c796 100644 --- a/BUILD +++ b/BUILD @@ -121,7 +121,7 @@ sh_binary( java_image( name = "buildfarm-server", args = ["/app/build_buildfarm/examples/config.minimal.yml"], - base = "@amazon_corretto_java_image_base//image", + base = "@ubuntu-mantic//image", classpath_resources = [ "//src/main/java/build/buildfarm:configs", ], @@ -149,14 +149,14 @@ oss_audit( # Download cgroup-tools so that the worker is able to restrict actions via control groups. download_pkgs( name = "worker_pkgs", - image_tar = "@ubuntu-jammy//image", + image_tar = "@ubuntu-mantic//image", packages = ["cgroup-tools"], tags = ["container"], ) install_pkgs( name = "worker_pkgs_image", - image_tar = "@ubuntu-jammy//image", + image_tar = "@ubuntu-mantic//image", installables_tar = ":worker_pkgs.tar", installation_cleanup_commands = "rm -rf /var/lib/apt/lists/*", output_image_name = "worker_pkgs_image", diff --git a/MODULE.bazel b/MODULE.bazel new file mode 100644 index 0000000000..cabbbe697c --- /dev/null +++ b/MODULE.bazel @@ -0,0 +1,2 @@ +# TODO: migrate all dependencies from WORKSPACE to MODULE.bazel +# https://github.com/bazelbuild/bazel-buildfarm/issues/1479 diff --git a/README.md b/README.md index 8dc99874b3..cd9caf8ee3 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,8 @@ All commandline options override corresponding config settings. 
Run via

-```
-docker run -d --rm --name buildfarm-redis -p 6379:6379 redis:5.0.9
+```shell
+$ docker run -d --rm --name buildfarm-redis -p 6379:6379 redis:5.0.9
 redis-cli config set stop-writes-on-bgsave-error no
 ```

@@ -28,10 +28,10 @@ redis-cli config set stop-writes-on-bgsave-error no

 Run via

-```
-bazelisk run //src/main/java/build/buildfarm:buildfarm-server --
+```shell
+$ bazelisk run //src/main/java/build/buildfarm:buildfarm-server --

-Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag=-Dlogging.config=file:$PWD/examples/logging.properties $PWD/examples/config.minimal.yml
+Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml
 ```

 **`logfile`** has to be in the [standard java util logging format](https://docs.oracle.com/cd/E57471_01/bigData.100/data_processing_bdd/src/rdp_logging_config.html) and passed as a --jvm_flag=-Dlogging.config=file: **`configfile`** has to be in [yaml format](https://bazelbuild.github.io/bazel-buildfarm/docs/configuration).

@@ -40,10 +40,10 @@ Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag=

 Run via

-```
-bazelisk run //src/main/java/build/buildfarm:buildfarm-shard-worker --
+```shell
+$ bazelisk run //src/main/java/build/buildfarm:buildfarm-shard-worker --

-Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm_flag=-Dlogging.config=file:$PWD/examples/logging.properties $PWD/examples/config.minimal.yml
+Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml
 ```

 **`logfile`** has to be in the [standard java util logging format](https://docs.oracle.com/cd/E57471_01/bigData.100/data_processing_bdd/src/rdp_logging_config.html) and passed as a --jvm_flag=-Dlogging.config=file:

@@ -53,9 +53,9 @@ Ex: bazelisk run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm

 To use the example configured buildfarm with bazel (version 1.0 or higher), you can configure your `.bazelrc` as follows:

-```
+```shell
 $ cat .bazelrc
 build --remote_executor=grpc://localhost:8980
 ```

 Then run your build as you would normally do.

@@ -67,20 +67,20 @@ Buildfarm uses [Java's Logging framework](https://docs.oracle.com/javase/10/core

 You can use typical Java logging configuration to filter these results and observe the flow of executions through your running services.
 An example `logging.properties` file has been provided at [examples/logging.properties](examples/logging.properties) for use as follows:

-```
-bazel run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag=-Dlogging.config=file:$PWD/examples/logging.properties $PWD/examples/config.minimal.yml
+```shell
+$ bazel run //src/main/java/build/buildfarm:buildfarm-server -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml
 ```

 and

-```
-bazel run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm_flag=-Dlogging.config=file:$PWD/examples/logging.properties $PWD/examples/config.minimal.yml
+```shell
+$ bazel run //src/main/java/build/buildfarm:buildfarm-shard-worker -- --jvm_flag=-Djava.util.logging.config.file=$PWD/examples/logging.properties $PWD/examples/config.minimal.yml
 ```

 To attach a remote debugger, run the executable with the `--debug=` flag.
For example:

-```
-bazel run //src/main/java/build/buildfarm:buildfarm-server -- --debug=5005 $PWD/examples/config.minimal.yml
+```shell
+$ bazel run //src/main/java/build/buildfarm:buildfarm-server -- --debug=5005 $PWD/examples/config.minimal.yml
 ```

diff --git a/_site/docs/architecture/content_addressable_storage.md b/_site/docs/architecture/content_addressable_storage.md
index a8955b4ce3..41b50d9948 100644
--- a/_site/docs/architecture/content_addressable_storage.md
+++ b/_site/docs/architecture/content_addressable_storage.md
@@ -38,9 +38,9 @@ This is the example presentation of a CAS in the memory instance available [here

 ```
 worker:
-  cas:
-    type: MEMORY
-    maxSizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024
+  storages:
+    - type: MEMORY
+      maxSizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024
 ```

 ## GRPC
@@ -53,9 +53,11 @@ A grpc config example is available in the alternate instance specification in th
 server:
   name: shard
 worker:
-  cas:
-    type: GRPC
-    target:
+  storages:
+    - type: FILESYSTEM
+      path: "cache"
+    - type: GRPC
+      target:
 ```

 ## HTTP/1
@@ -89,11 +91,10 @@ The CASFileCache is also available on MemoryInstance servers, where it can repre

 ```
 worker:
-  cas:
-    type: FILESYSTEM
-    path: "cache"
-    maxSizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024
-    maxEntrySizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024
+  storages:
+    - type: FILESYSTEM
+      path: "cache"
+      maxSizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024
 ```

 CASTest is a standalone tool to load the cache and print status information about it.
diff --git a/_site/docs/configuration/configuration.md b/_site/docs/configuration/configuration.md
index c6df9b58d3..466375373f 100644
--- a/_site/docs/configuration/configuration.md
+++ b/_site/docs/configuration/configuration.md
@@ -7,7 +7,7 @@ has_children: true

 Minimal required:

-```
+```yaml
 backplane:
   redisUri: "redis://localhost:6379"
   queues:
@@ -21,24 +21,25 @@ worker:
   publicName: "localhost:8981"
 ```

-The configuration can be provided to the server and worker as a CLI argument or through the env variable `CONFIG_PATH`
-For an example config containing all of the configuration values, see `examples/config.yml`.
+The configuration can be provided to the server and worker as a CLI argument or through the environment variable `CONFIG_PATH`.
+For an example configuration containing all of the configuration values, see `examples/config.yml`.
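+
+For illustration, either invocation style below points a service at the same configuration (a sketch; the paths are only examples):
+
+```shell
+# configuration file passed as a CLI argument
+$ bazel run //src/main/java/build/buildfarm:buildfarm-server -- $PWD/examples/config.minimal.yml
+# configuration file passed through the CONFIG_PATH environment variable
+$ CONFIG_PATH=$PWD/examples/config.minimal.yml bazel run //src/main/java/build/buildfarm:buildfarm-server
+```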
## All Configurations ### Common -| Configuration | Accepted and _Default_ Values | Description | -|----------------------|-------------------------------|---------------------------------------------------| -| digestFunction | _SHA256_, SHA1 | Digest function for this implementation | -| defaultActionTimeout | Integer, _600_ | Default timeout value for an action (seconds) | -| maximumActionTimeout | Integer, _3600_ | Maximum allowed action timeout (seconds) | -| maxEntrySizeBytes | Long, _2147483648_ | Maximum size of a single blob accepted (bytes) | -| prometheusPort | Integer, _9090_ | Listening port of the Prometheus metrics endpoint | +| Configuration | Accepted and _Default_ Values | Command Line Argument | Description | +|------------------------------|-------------------------------|-----------------------|--------------------------------------------------------------| +| digestFunction | _SHA256_, SHA1 | | Digest function for this implementation | +| defaultActionTimeout | Integer, _600_ | | Default timeout value for an action (seconds) | +| maximumActionTimeout | Integer, _3600_ | | Maximum allowed action timeout (seconds) | +| maxEntrySizeBytes | Long, _2147483648_ | | Maximum size of a single blob accepted (bytes) | +| prometheusPort | Integer, _9090_ | --prometheus_port | Listening port of the Prometheus metrics endpoint | +| allowSymlinkTargetAbsolute | boolean, _false_ | | Permit inputs to contain symlinks with absolute path targets | Example: -``` +```yaml digestFunction: SHA1 defaultActionTimeout: 1800 maximumActionTimeout: 1800 @@ -51,33 +52,35 @@ worker: ### Server -| Configuration | Accepted and _Default_ Values | Environment Var | Description | -|----------------------------------|-------------------------------|-------------------------|-----------------------------------------------------------------------------------------------------------------------------| -| instanceType | _SHARD_ | | Type of implementation (SHARD is the only one supported) | -| name | String, _shard_ | | Implementation name | -| publicName | String, _DERIVED:port_ | INSTANCE_NAME | Host:port of the GRPC server, required to be accessible by all servers | -| actionCacheReadOnly | boolean, _false_ | | Allow/Deny writing to action cache | -| port | Integer, _8980_ | | Listening port of the GRPC server | -| casWriteTimeout | Integer, _3600_ | | CAS write timeout (seconds) | -| bytestreamTimeout | Integer, _3600_ | | Byte Stream write timeout (seconds) | -| sslCertificatePath | String, _null_ | | Absolute path of the SSL certificate (if TLS used) | -| sslPrivateKeyPath | String, _null_ | | Absolute path of the SSL private key (if TLS used) | -| runDispatchedMonitor | boolean, _true_ | | Enable an agent to monitor the operation store to ensure that dispatched operations with expired worker leases are requeued | -| dispatchedMonitorIntervalSeconds | Integer, _1_ | | Dispatched monitor's lease expiration check interval (seconds) | -| runOperationQueuer | boolean, _true_ | | Aquire execute request entries cooperatively from an arrival queue on the backplane | -| ensureOutputsPresent | boolean, _false_ | | Decide if all outputs are also present in the CAS. 
If any outputs are missing a cache miss is returned | -| maxCpu | Integer, _0_ | | Maximum number of CPU cores that any min/max-cores property may request (0 = unlimited) | -| maxRequeueAttempts | Integer, _5_ | | Maximum number of requeue attempts for an operation | -| useDenyList | boolean, _true_ | | Allow usage of a deny list when looking up actions and invocations (for cache only it is recommended to disable this check) | -| grpcTimeout | Integer, _3600_ | | GRPC request timeout (seconds) | -| executeKeepaliveAfterSeconds | Integer, _60_ | | Execute keep alive (seconds) | -| recordBesEvents | boolean, _false_ | | Allow recording of BES events | -| clusterId | String, _local_ | | Buildfarm cluster ID | -| cloudRegion | String, _us-east_1_ | | Deployment region in the cloud | +| Configuration | Accepted and _Default_ Values | Environment Var | Description | +|----------------------------------|-------------------------------|-----------------|-----------------------------------------------------------------------------------------------------------------------------| +| instanceType | _SHARD_ | | Type of implementation (SHARD is the only one supported) | +| name | String, _shard_ | | Implementation name | +| publicName | String, _DERIVED:port_ | INSTANCE_NAME | Host:port of the GRPC server, required to be accessible by all servers | +| actionCacheReadOnly | boolean, _false_ | | Allow/Deny writing to action cache | +| port | Integer, _8980_ | | Listening port of the GRPC server | +| casWriteTimeout | Integer, _3600_ | | CAS write timeout (seconds) | +| bytestreamTimeout | Integer, _3600_ | | Byte Stream write timeout (seconds) | +| sslCertificatePath | String, _null_ | | Absolute path of the SSL certificate (if TLS used) | +| sslPrivateKeyPath | String, _null_ | | Absolute path of the SSL private key (if TLS used) | +| runDispatchedMonitor | boolean, _true_ | | Enable an agent to monitor the operation store to ensure that dispatched operations with expired worker leases are requeued | +| dispatchedMonitorIntervalSeconds | Integer, _1_ | | Dispatched monitor's lease expiration check interval (seconds) | +| runOperationQueuer | boolean, _true_ | | Acquire execute request entries cooperatively from an arrival queue on the backplane | +| ensureOutputsPresent | boolean, _false_ | | Decide if all outputs are also present in the CAS. 
If any outputs are missing a cache miss is returned |
+| maxCpu | Integer, _0_ | | Maximum number of CPU cores that any min/max-cores property may request (0 = unlimited) |
+| maxRequeueAttempts | Integer, _5_ | | Maximum number of requeue attempts for an operation |
+| useDenyList | boolean, _true_ | | Allow usage of a deny list when looking up actions and invocations (for cache only it is recommended to disable this check) |
+| grpcTimeout | Integer, _3600_ | | GRPC request timeout (seconds) |
+| executeKeepaliveAfterSeconds | Integer, _60_ | | Execute keep alive (seconds) |
+| recordBesEvents | boolean, _false_ | | Allow recording of BES events |
+| clusterId | String, _local_ | | Buildfarm cluster ID |
+| cloudRegion | String, _us-east-1_ | | Deployment region in the cloud |
+| gracefulShutdownSeconds | Integer, _0_ | | Time in seconds to allow for connections in flight to finish when shutdown signal is received |
+

 Example:

-```
+```yaml
 server:
   instanceType: SHARD
   name: shard
@@ -94,7 +97,7 @@ server:

 Example:

-```
+```yaml
 server:
   grpcMetrics:
     enabled: false
@@ -103,16 +106,16 @@

 ### Server Caches

-| Configuration | Accepted and _Default_ Values | Description |
-|--------------------------|-------------------------------|--------------------------------------------------------|
-| directoryCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the directory cache will hold. |
-| commandCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the command cache will hold. |
-| digestToActionCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the digest-to-action cache will hold. |
-| recentServedExecutionsCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the executions cache will hold. |
+| Configuration | Accepted and _Default_ Values | Description |
+|---------------------------------------|-------------------------------|----------------------------------------------------------------------|
+| directoryCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the directory cache will hold. |
+| commandCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the command cache will hold. |
+| digestToActionCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the digest-to-action cache will hold. |
+| recentServedExecutionsCacheMaxEntries | Long, _64 * 1024_ | The max number of entries that the executions cache will hold. |

 Example:

-```
+```yaml
 server:
   caches:
     directoryCacheMaxEntries: 10000
@@ -130,7 +133,7 @@ server:

 Example:

-```
+```yaml
 server:
   admin:
     deploymentEnvironment: AWS
@@ -149,14 +152,14 @@ server:

 Example:

-```
+```yaml
 server:
   metrics:
     publisher: log
     logLevel: INFO
 ```

-```
+```yaml
 server:
   metrics:
     publisher: aws
@@ -167,45 +170,45 @@

 ### Redis Backplane

-| Configuration | Accepted and _Default_ Values | Environment Var | Description |
-|------------------------------|------------------------------------------|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| type | _SHARD_ | | Type of backplane. Currently, the only implemntation is SHARD utilizing Redis |
-| redisUri | String, redis://localhost:6379 | REDIS_URI | Redis cluster endpoint.
This must be a single URI | -| redisPassword | String, _null_ | | Redis password, if applicable | -| redisNodes | List of Strings, _null_ | | List of individual Redis nodes, if applicable | -| jedisPoolMaxTotal | Integer, _4000_ | | The size of the Redis connection pool | -| workersHashName | String, _Workers_ | | Redis key used to store a hash of registered workers | -| workerChannel | String, _WorkerChannel_ | | Redis pubsub channel key where changes of the cluster membership are announced | -| actionCachePrefix | String, _ActionCache_ | | Redis key prefix for all ActionCache entries | -| actionCacheExpire | Integer, _2419200_ | | The TTL maintained for ActionCache entries, not refreshed on getActionResult hit | -| actionBlacklistPrefix | String, _ActionBlacklist_ | | Redis key prefix for all blacklisted actions, which are rejected | -| actionBlacklistExpire | Integer, _3600_ | | The TTL maintained for action blacklist entries | -| invocationBlacklistPrefix | String, _InvocationBlacklist_ | | Redis key prefix for blacklisted invocations, suffixed with a a tool invocation ID | -| operationPrefix | String, _Operation_ | | Redis key prefix for all operations, suffixed wtih the operation's name | -| operationExpire | Integer, _604800_ | | The TTL maintained for all operations, updated on each modification | -| preQueuedOperationsListName | String, _{Arrival}:PreQueuedOperations_ | | Redis key used to store a list of ExecuteEntry awaiting transformation into QueryEntry | -| processingListName | String, _{Arrival}:ProcessingOperations_ | | Redis key of a list used to ensure reliable processing of arrival queue etries with operation watch monitoring | -| processingPrefix | String, _Processing_ | | Redis key prefix for operations which are being dequeued from the arrival queue | -| processingTimeoutMillis | Integer, _20000_ | | Delay (in ms) used to populate processing operation entries | -| queuedOperationsListName | String, _{Execution}:QueuedOperations_ | | Redis key used to store a list of QueueEntry awaiting execution by workers | -| dispatchingPrefix | String, _Dispatching_ | | Redis key prefix for operations which are being dequeued from the ready to run queue | -| dispatchingTimeoutMillis | Integer, _10000_ | | Delay (in ms) used to populate dispatching operation entries | -| dispatchedOperationsHashName | String, _DispatchedOperations_ | | Redis key of a hash of operation names to the worker lease for its execution, which are monitored by the dispatched monitor | -| operationChannelPrefix | String, _OperationChannel_ | | Redis pubsub channel prefix suffixed by an operation name | -| casPrefix | String, _ContentAddressableStorage_ | | Redis key prefix suffixed with a blob digest that maps to a set of workers with that blob's availability | -| casExpire | Integer, _604800_ | | The TTL maintained for CAS entries, which is not refreshed on any read access of the blob | -| subscribeToBackplane | boolean, _true_ | | Enable an agent of the backplane client which subscribes to worker channel and operation channel events. 
If disabled, responsiveness of watchers and CAS are reduced |
-| runFailsafeOperation | boolean, _true_ | | Enable an agent in the backplane client which monitors watched operations and ensures they are in a known maintained, or expirable state |
-| maxQueueDepth | Integer, _100000_ | | Maximum length that the ready to run queue is allowed to reach to control an arrival flow for execution |
-| maxPreQueueDepth | Integer, _1000000_ | | Maximum lengh that the arrival queue is allowed to reach to control load on the Redis cluster |
-| priorityQueue | boolean, _false_ | | Priority queue type allows prioritizing operations based on Bazel's --remote_execution_priority= flag |
-| timeout | Integer, _10000_ | | Default timeout |
-| maxAttempts | Integer, _20_ | | Maximum number of execution attempts |
-| cacheCas | boolean, _false_ | | |
+| Configuration | Accepted and _Default_ Values | Environment Var | Command Line Argument | Description |
+|------------------------------|------------------------------------------|-----------------|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| type | _SHARD_ | | | Type of backplane. Currently, the only implementation is SHARD utilizing Redis |
+| redisUri | String, redis://localhost:6379 | REDIS_URI | --redis_uri | Redis cluster endpoint. This must be a single URI |
+| redisPassword | String, _null_ | | | Redis password, if applicable |
+| redisNodes | List of Strings, _null_ | | | List of individual Redis nodes, if applicable |
+| jedisPoolMaxTotal | Integer, _4000_ | | | The size of the Redis connection pool |
+| workersHashName | String, _Workers_ | | | Redis key used to store a hash of registered workers |
+| workerChannel | String, _WorkerChannel_ | | | Redis pubsub channel key where changes of the cluster membership are announced |
+| actionCachePrefix | String, _ActionCache_ | | | Redis key prefix for all ActionCache entries |
+| actionCacheExpire | Integer, _2419200_ | | | The TTL maintained for ActionCache entries, not refreshed on getActionResult hit |
+| actionBlacklistPrefix | String, _ActionBlacklist_ | | | Redis key prefix for all blacklisted actions, which are rejected |
+| actionBlacklistExpire | Integer, _3600_ | | | The TTL maintained for action blacklist entries |
+| invocationBlacklistPrefix | String, _InvocationBlacklist_ | | | Redis key prefix for blacklisted invocations, suffixed with a tool invocation ID |
+| operationPrefix | String, _Operation_ | | | Redis key prefix for all operations, suffixed with the operation's name |
+| operationExpire | Integer, _604800_ | | | The TTL maintained for all operations, updated on each modification |
+| preQueuedOperationsListName | String, _{Arrival}:PreQueuedOperations_ | | | Redis key used to store a list of ExecuteEntry awaiting transformation into QueryEntry |
+| processingListName | String, _{Arrival}:ProcessingOperations_ | | | Redis key of a list used to ensure reliable processing of arrival queue entries with operation watch monitoring |
+| processingPrefix | String, _Processing_ | | | Redis key prefix for operations which are being dequeued from the arrival queue |
+| processingTimeoutMillis | Integer, _20000_ | | | Delay (in ms) used to populate processing operation entries |
+| queuedOperationsListName | String, _{Execution}:QueuedOperations_ | | | Redis key used to store a list of QueueEntry awaiting execution by workers |
+|
dispatchingPrefix | String, _Dispatching_ | | | Redis key prefix for operations which are being dequeued from the ready to run queue |
+| dispatchingTimeoutMillis | Integer, _10000_ | | | Delay (in ms) used to populate dispatching operation entries |
+| dispatchedOperationsHashName | String, _DispatchedOperations_ | | | Redis key of a hash of operation names to the worker lease for its execution, which are monitored by the dispatched monitor |
+| operationChannelPrefix | String, _OperationChannel_ | | | Redis pubsub channel prefix suffixed by an operation name |
+| casPrefix | String, _ContentAddressableStorage_ | | | Redis key prefix suffixed with a blob digest that maps to a set of workers with that blob's availability |
+| casExpire | Integer, _604800_ | | | The TTL maintained for CAS entries, which is not refreshed on any read access of the blob |
+| subscribeToBackplane | boolean, _true_ | | | Enable an agent of the backplane client which subscribes to worker channel and operation channel events. If disabled, responsiveness of watchers and CAS is reduced |
+| runFailsafeOperation | boolean, _true_ | | | Enable an agent in the backplane client which monitors watched operations and ensures they are in a known maintained, or expirable state |
+| maxQueueDepth | Integer, _100000_ | | | Maximum length that the ready to run queue is allowed to reach to control an arrival flow for execution |
+| maxPreQueueDepth | Integer, _1000000_ | | | Maximum length that the arrival queue is allowed to reach to control load on the Redis cluster |
+| priorityQueue | boolean, _false_ | | | Priority queue type allows prioritizing operations based on Bazel's --remote_execution_priority= flag |
+| timeout | Integer, _10000_ | | | Default timeout |
+| maxAttempts | Integer, _20_ | | | Maximum number of execution attempts |
+| cacheCas | boolean, _false_ | | | |

 Example:

-```
+```yaml
 backplane:
   type: SHARD
   redisUri: "redis://localhost:6379"
@@ -222,7 +225,7 @@ backplane:

 Example:

-```
+```yaml
 backplane:
   type: SHARD
   redisUri: "redis://localhost:6379"
@@ -238,28 +241,30 @@

 ### Worker

-| Configuration | Accepted and _Default_ Values | Environment Var | Description |
-|----------------------------------|-------------------------------|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| port | Integer, _8981_ | | Listening port of the worker |
-| publicName | String, _DERIVED:port_ | INSTANCE_NAME | Host:port of the GRPC server, required to be accessible by all servers |
-| root | String, _/tmp/worker_ | | Path for all operation content storage |
-| inlineContentLimit | Integer, _1048567_ | | Total size in bytes of inline content for action results, output files, stdout, stderr content |
-| operationPollPeriod | Integer, _1_ | | Period between poll operations at any stage |
-| executeStageWidth | Integer, _0_ | EXECUTION_STAGE_WIDTH | Number of CPU cores available for execution (0 = system available cores) |
-| executeStageWidthOffset | Integer, _0_ | | Offset number of CPU cores available for execution (to allow for use by other processes) |
-| inputFetchStageWidth | Integer, _0_ | | Number of concurrently available slots to fetch inputs (0 = system calculated based on CPU cores) |
-| inputFetchDeadline | Integer, _60_ | | Limit on time
(seconds) for input fetch stage to fetch inputs |
-| linkInputDirectories | boolean, _true_ | | Use an input directory creation strategy which creates a single directory tree at the highest level containing no output paths of any kind, and symlinks that directory into an action's execroot, saving large amounts of time spent manufacturing the same read-only input hierirchy over multiple actions' executions |
-| execOwner | String, _null_ | | Create exec trees containing directories that are owned by this user |
-| hexBucketLevels | Integer, _0_ | | Number of levels to create for directory storage by leading byte of the hash (problematic, not recommended) |
-| defaultMaxCores | Integer, _0_ | | Constrain all executions to this logical core count unless otherwise specified via min/max-cores (0 = no limit) |
-| limitGlobalExecution | boolean, _false_ | | Constrain all executions to a pool of logical cores specified in executeStageWidth |
-| onlyMulticoreTests | boolean, _false_ | | Only permit tests to exceed the default coresvalue for their min/max-cores range specification (only works with non-zero defaultMaxCores) |
-| allowBringYourOwnContainer | boolean, _false_ | | Enable execution in a custom Docker container |
-| errorOperationRemainingResources | boolean, _false_ | | |
-| realInputDirectories | List of Strings, _external_ | | A list of paths that will not be subject to the effects of linkInputDirectories setting, may also be used to provide writable directories as input roots for actions which expect to be able to write to an input location and will fail if they cannot |
-
-```
+| Configuration | Accepted and _Default_ Values | Environment Var | Description |
+|----------------------------------|-------------------------------|-----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| port | Integer, _8981_ | | Listening port of the worker |
+| publicName | String, _DERIVED:port_ | INSTANCE_NAME | Host:port of the GRPC server, required to be accessible by all servers |
+| root | String, _/tmp/worker_ | | Path for all operation content storage |
+| inlineContentLimit | Integer, _1048567_ | | Total size in bytes of inline content for action results, output files, stdout, stderr content |
+| operationPollPeriod | Integer, _1_ | | Period between poll operations at any stage |
+| executeStageWidth | Integer, _0_ | EXECUTION_STAGE_WIDTH | Number of CPU cores available for execution (0 = system available cores) |
+| executeStageWidthOffset | Integer, _0_ | | Offset number of CPU cores available for execution (to allow for use by other processes) |
+| inputFetchStageWidth | Integer, _0_ | | Number of concurrently available slots to fetch inputs (0 = system calculated based on CPU cores) |
+| inputFetchDeadline | Integer, _60_ | | Limit on time (seconds) for input fetch stage to fetch inputs |
+| linkInputDirectories | boolean, _true_ | | Use an input directory creation strategy which creates a single directory tree at the highest level containing no output paths of any kind, and symlinks that directory into an action's execroot, saving large amounts of time spent manufacturing the same read-only input hierarchy over multiple actions' executions |
+| execOwner | String, _null_ | | Create exec trees containing
directories that are owned by this user |
+| hexBucketLevels | Integer, _0_ | | Number of levels to create for directory storage by leading byte of the hash (problematic, not recommended) |
+| defaultMaxCores | Integer, _0_ | | Constrain all executions to this logical core count unless otherwise specified via min/max-cores (0 = no limit) |
+| limitGlobalExecution | boolean, _false_ | | Constrain all executions to a pool of logical cores specified in executeStageWidth |
+| onlyMulticoreTests | boolean, _false_ | | Only permit tests to exceed the default cores value for their min/max-cores range specification (only works with non-zero defaultMaxCores) |
+| allowBringYourOwnContainer | boolean, _false_ | | Enable execution in a custom Docker container |
+| errorOperationRemainingResources | boolean, _false_ | | |
+| realInputDirectories | List of Strings, _external_ | | A list of paths that will not be subject to the effects of the linkInputDirectories setting; it may also be used to provide writable directories as input roots for actions which expect to be able to write to an input location and will fail if they cannot |
+| gracefulShutdownSeconds | Integer, _0_ | | Time in seconds to allow for operations in flight to finish when shutdown signal is received |
+| createSymlinkOutputs | boolean, _false_ | | Creates SymlinkNodes for symbolic links discovered in output paths for actions. No verification of the symlink target path occurs. Buildstream, for example, requires this. |
+
+```yaml
 worker:
   port: 8981
   publicName: "localhost:8981"
@@ -276,7 +281,7 @@ worker:

 Example:

-```
+```yaml
 worker:
   capabilities:
     cas: true
@@ -293,7 +298,7 @@ worker:

 Example:

-```
+```yaml
 worker:
   sandboxSettings:
     alwaysUse: true
@@ -310,7 +315,7 @@ worker:

 Example:

-```
+```yaml
 worker:
   dequeueMatchSettings:
     acceptEverything: true
@@ -330,22 +335,23 @@ worker:

 Example:

-```
+```yaml
 worker:
-  cas:
-    type: FILESYSTEM
-    path: "cache"
-    maxSizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024
-    maxEntrySizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024
-    target:
+  storages:
+    - type: FILESYSTEM
+      path: "cache"
+      maxSizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024
+      maxEntrySizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024
+      target:
 ```

 ```
 worker:
-  cas:
-    type: GRPC
-    instanceName: external-cas
-    target: "cas.external.com:1234"
+  storages:
+    - type: FILESYSTEM
+      path: "cache"
+    - type: GRPC
+      target: "cas.external.com:1234"
 ```

 ### Execution Policies
@@ -357,7 +363,7 @@ worker:

 Example:

-```
+```yaml
 worker:
   executionPolicies:
     - name: test
diff --git a/_site/docs/contribute/design-documents.md b/_site/docs/contribute/design-documents.md
index f7b1c0b80d..ea608696f3 100644
--- a/_site/docs/contribute/design-documents.md
+++ b/_site/docs/contribute/design-documents.md
@@ -5,4 +5,5 @@ parent: Contribute
 nav_order: 2
 ---

-[Infinite Cache (Storage Workers)](https://docs.google.com/document/d/1IQQbWPzjSluDL25FZ9ADtNIOT90PLijQGIAC4RbwMjY/edit?usp=sharing)
\ No newline at end of file
+[Infinite Cache (Storage Workers)](https://docs.google.com/document/d/1IQQbWPzjSluDL25FZ9ADtNIOT90PLijQGIAC4RbwMjY/edit?usp=sharing)
+[Local and Global Resources](https://docs.google.com/document/d/1u0TkmVmdMS53PWR1hgh-a_cj3NmQYE0Favv9aGFfQZs/edit?usp=sharing)
\ No newline at end of file
diff --git a/_site/docs/execution/builds-without-the-bytes.md b/_site/docs/execution/builds-without-the-bytes.md
index abda8c86b5..5c49025877 100644
--- a/_site/docs/execution/builds-without-the-bytes.md
+++ b/_site/docs/execution/builds-without-the-bytes.md
@@ -7,7 +7,7 @@ nav_order: 4
# Builds Without The Bytes

-tl;dr: add `--build_request_id=https://host?ENSURE_OUTPUTS_PRESENT=true#$(uuidgen)` to your BWOB bazel invocations.
+tl;dr: add `--build_request_id=https://host?ENSURE_OUTPUTS_PRESENT=true#$(uuidgen)` to your BWOB bazel invocations, or enable `ensureOutputsPresent` in your config to set it globally.

 As proposed in this [issue](https://github.com/bazelbuild/bazel/issues/6862) and the accompanying document, bazel endeavors to provide a mechanism to be 'content-light' for remote execution, using only content reference addresses to request action execution and construct successively dependent action definitions.
@@ -17,4 +17,4 @@ This puts BuildFarm in the uncomfortable position of never being able to expire

 To combat this, you can provide some metadata to buildfarm that will help to limit (but will not remove the possibility of) failed builds.

-Bazel presents a 'correlated_invocations_id' on every request to BuildFarm, including the GetActionResult request, which it uses to retrieve cached results. Since ActionResults are the long tail survivor of actions, being retained for much longer after one executes and produces its content, this represents the most likely position where content may have been removed, and a stale reference might be provided. BuildFarm recognizes this correlated_invocations_id and if it is a URI, can parse its query parameters for behavior control. One such control is ENSURE_OUTPUTS_PRESENT for the GetActionResult request - if this query value is the string "true", BuildFarm will make a silent FindMissingBlobs check for all of the outputs of an ActionResult before responding with it. If any are missing, BuildFarm will instead return code NOT_FOUND, inspiring the client to see a cache miss, and attempt a [remote] execution.
\ No newline at end of file
+Bazel presents a 'correlated_invocations_id' on every request to BuildFarm, including the GetActionResult request, which it uses to retrieve cached results. Since ActionResults are the long tail survivor of actions, being retained for much longer after one executes and produces its content, this represents the most likely position where content may have been removed, and a stale reference might be provided. BuildFarm recognizes this correlated_invocations_id and if it is a URI, can parse its query parameters for behavior control. One such control is ENSURE_OUTPUTS_PRESENT for the GetActionResult request - if this query value is the string "true", BuildFarm will make a silent FindMissingBlobs check for all of the outputs of an ActionResult before responding with it. If any are missing, BuildFarm will instead return code NOT_FOUND, inspiring the client to see a cache miss, and attempt a [remote] execution.
diff --git a/_site/docs/execution/environment.md b/_site/docs/execution/environment.md
index 4a17aabe6b..d2cbca0b51 100644
--- a/_site/docs/execution/environment.md
+++ b/_site/docs/execution/environment.md
@@ -116,7 +116,7 @@ Next we will create a BUILD file to create our target image.
We will use the sha ``` load("@io_bazel_rules_docker//container:container.bzl", "container_image") -java_image( +container_image( name = "buildfarm-shard-worker-ubuntu20-java14", base = "@ubuntu20_java14_image_base//image", files = [ diff --git a/_site/docs/execution/execution_policies.md b/_site/docs/execution/execution_policies.md index 19698a9925..f0eaf295d9 100644 --- a/_site/docs/execution/execution_policies.md +++ b/_site/docs/execution/execution_policies.md @@ -17,7 +17,7 @@ This policy type specifies that a worker should prepend a single path, and a num This example will use the buildfarm-provided executable `as-nobody`, which will upon execution demote itself to a `nobody` effective process owner uid, and perform an `execvp(2)` with the remaining provided program arguments, which will subsequently execute as a user that no longer matches the worker process. -``` +```yaml # default wrapper policy application worker: executionPolicies: @@ -50,7 +50,8 @@ These wrappers are used for detecting actions that rely on time. Below is a dem This addresses two problems in regards to an action's dependence on time. The 1st problem is when an action takes longer than it should because it's sleeping unnecessarily. The 2nd problem is when an action relies on time which causes it to eventually be broken on master despite the code not changing. Both problems are expressed below as unit tests. We demonstrate a time-spoofing mechanism (the re-writing of syscalls) which allows us to detect these problems generically over any action. The objective is to analyze builds for performance inefficiency and discover future instabilities before they occur. ### Issue 1 (slow test) -``` + +```bash #!/bin/bash set -euo pipefail @@ -58,16 +59,19 @@ echo -n "testing... " sleep 10; echo "done" ``` + The test takes 10 seconds to run on average. -``` -bazel test --runs_per_test=10 --config=remote //cloud/buildfarm:sleep_test + +```shell +$ bazel test --runs_per_test=10 --config=remote //cloud/buildfarm:sleep_test //cloud/buildfarm:sleep_test PASSED in 10.2s Stats over 10 runs: max = 10.2s, min = 10.1s, avg = 10.2s, dev = 0.0s ``` We can check for performance improvements by using the `skip-sleep` option. -``` -bazel test --runs_per_test=10 --config=remote --remote_default_exec_properties='skip-sleep=true' //cloud/buildfarm:sleep_test + +```shell +$ bazel test --runs_per_test=10 --config=remote --remote_default_exec_properties='skip-sleep=true' //cloud/buildfarm:sleep_test //cloud/buildfarm:sleep_test PASSED in 1.0s Stats over 10 runs: max = 1.0s, min = 0.9s, avg = 1.0s, dev = 0.0s ``` @@ -75,7 +79,8 @@ bazel test --runs_per_test=10 --config=remote --remote_default_exec_properties=' Now the test is 10x faster. If skipping sleep makes an action perform significantly faster without affecting its success rate, that would warrant further investigation into the action's implementation. ### Issue 2 (future failing test) -``` + +```bash #!/bin/bash set -euo pipefail @@ -89,12 +94,15 @@ echo "Times change." date exit -1; ``` + The test passes today, but will it pass tomorrow? Will it pass a year from now? We can find out by using the `time-shift` option. -``` -bazel test --test_output=streamed --remote_default_exec_properties='time-shift=31556952' --config=remote //cloud/buildfarm:future_fail + +```shell +$ bazel test --test_output=streamed --remote_default_exec_properties='time-shift=31556952' --config=remote //cloud/buildfarm:future_fail INFO: Found 1 test target... Times change. 
Mon Sep 25 18:31:09 UTC 2023
 //cloud/buildfarm:future_fail FAILED in 18.0s
 ```
+
 Time is shifted to the year 2023 and the test now fails. We can fix the problem before others see it.
diff --git a/_site/docs/execution/execution_properties.md b/_site/docs/execution/execution_properties.md
index 85579ed099..7966c70dd2 100644
--- a/_site/docs/execution/execution_properties.md
+++ b/_site/docs/execution/execution_properties.md
@@ -76,37 +76,42 @@ Despite being given 1 core, they see all of the cpus and decide to spawn that ma

 **Standard Example:**
 This test will succeed when env var TESTVAR is foobar, and fail otherwise.
-```
+
+```bash
 #!/bin/bash
 [ "$TESTVAR" = "foobar" ]
 ```
+
-```
-./bazel test \
+
+```shell
+$ ./bazel test \
   --remote_executor=grpc://127.0.0.1:8980 --noremote_accept_cached --nocache_test_results \
   //env_test:main
 FAIL
 ```
-```
-./bazel test --remote_default_exec_properties='env-vars={"TESTVAR": "foobar"}' \
+```shell
+$ ./bazel test --remote_default_exec_properties='env-vars={"TESTVAR": "foobar"}' \
   --remote_executor=grpc://127.0.0.1:8980 --noremote_accept_cached --nocache_test_results \
   //env_test:main
 PASS
 ```
+
 **Template Example:**
 If you give a range of cores, buildfarm has the authority to decide how many your operation actually claims. You can let buildfarm resolve this value for you (via [mustache](https://mustache.github.io/)).
-```
+```bash
 #!/bin/bash
 [ "$MKL_NUM_THREADS" = "1" ]
 ```
+
-```
-./bazel test \
+
+```shell
+$ ./bazel test \
 --remote_executor=grpc://127.0.0.1:8980 --noremote_accept_cached --nocache_test_results \
 //env_test:main
 FAIL
 ```
-```
-./bazel test \
+
+```shell
+$ ./bazel test \
 --remote_default_exec_properties='env-vars="MKL_NUM_THREADS": "{{limits.cpu.claimed}}"' \
 --remote_executor=grpc://127.0.0.1:8980 --noremote_accept_cached --nocache_test_results \
 //env_test:main
diff --git a/_site/docs/metrics/metrics.md b/_site/docs/metrics/metrics.md
index 5aa96fe56f..6079b7e9b7 100644
--- a/_site/docs/metrics/metrics.md
+++ b/_site/docs/metrics/metrics.md
@@ -124,6 +124,10 @@ Gauge for the number of operations in each stage (using a stage_name for each in

 Gauge for the completed operations status (using a status_code label for each individual GRPC code)

+**operation_exit_code**
+
+Gauge for the completed operations exit code (using an exit_code label for each individual execution exit code)
+
 **operation_worker**

 Gauge for the number of operations executed on each worker (using a worker_name label for each individual worker)
diff --git a/_site/docs/quick_start.md b/_site/docs/quick_start.md
index 5383c31199..8a9a9234db 100644
--- a/_site/docs/quick_start.md
+++ b/_site/docs/quick_start.md
@@ -10,7 +10,7 @@ Here we describe how to use bazel remote caching or remote execution with buildf

 ## Setup

-You can run this quick start on a single computer running nearly any flavor of linux. This computer is the localhost for the rest of the description.
+You can run this quick start on a single computer running any flavor of linux that bazel supports. A C++ compiler is used here to demonstrate action execution. This computer is the localhost for the rest of the description.
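+
+As a quick sanity check of those prerequisites (a sketch; the tool names are the usual defaults and may differ on your system):
+
+```shell
+$ bazel --version         # used for both your client build and buildfarm itself
+$ gcc --version           # any C++ compiler that bazel can find will do
+$ redis-server --version  # the backplane below relies on redis
+```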
### Backplane

@@ -25,7 +25,8 @@ Let's start with a bazel workspace with a single file to compile into an executa
 Create a new directory for our workspace and add the following files:

 `main.cc`:
-```
+
+```cpp
 #include <iostream>
 int main( int argc, char *argv[] )
 {
@@ -35,7 +36,8 @@ int main( int argc, char *argv[] )
 ```

 `BUILD`:
-```
+
+```starlark
 cc_binary(
     name = "main",
     srcs = ["main.cc"],
@@ -44,7 +46,7 @@
 And an empty WORKSPACE file.

-As a test, verify that `bazel run :main` builds your main program and runs it, and prints `Hello, World!`. This will ensure that you have properly installed bazel and a C++ compiler, and have a working target before moving on to remote execution.
+As a test, verify that `bazel run :main` builds your main program and runs it, and prints `Hello, World!`. This will ensure that you have properly installed `bazel` and a C++ compiler, and have a working target before moving on to remote caching or remote execution.

 Download and extract the buildfarm repository. Each command sequence below will have the intended working directory indicated, between the client (workspace running bazel), and buildfarm.

@@ -52,25 +54,35 @@ This tutorial assumes that you have a bazel binary in your path and you are in t

 ## Remote Caching

-A Buildfarm server with an instance can be used strictly as an ActionCache and ContentAddressableStorage to improve build performance. This is an example of running a bazel client that will retrieve results if available, and store them if the cache is missed and the execution needs to run locally.
+A Buildfarm cluster can be used strictly as an ActionCache (AC) and ContentAddressableStorage (CAS) to improve build performance. This is an example of running a bazel client that will retrieve results if available, otherwise store them on a cache miss after executing locally.

 Download the buildfarm repository and change into its directory, then:

-run `bazelisk run src/main/java/build/buildfarm:buildfarm-server $PWD/examples/config.minimal.yml`
+ * run `bazel run src/main/java/build/buildfarm:buildfarm-server $PWD/examples/config.minimal.yml`

 This will wait while the server runs, indicating that it is ready for requests.

-From another prompt (i.e. a separate terminal) in your newly created workspace directory from above:
+A server alone does not itself store the content of action results. It acts as an endpoint for any number of workers that present storage, so we must also start a single worker.
+
+From another prompt (i.e. a separate terminal) in the buildfarm repository directory:

-run `bazel clean`
-run `bazel run --remote_cache=grpc://localhost:8980 :main`
+ * run `bazel run src/main/java/build/buildfarm:buildfarm-shard-worker -- --prometheus_port=9091 $PWD/examples/config.minimal.yml`
+
+The `--` option is a bazel convention to treat all subsequent arguments as parameters to the running app, like our `--prometheus_port`, instead of interpreting them with `run`.
+The `--prometheus_port=9091` option allows this worker to run alongside our server, which will have started and logged that it has started a service on port `9090`. You can also turn this option off for either server or worker with `--prometheus_port=0` (after the `--` separator).
+This will also wait while the worker runs, indicating it will be available to store cache content.
+
+From another prompt in your newly created workspace directory from above:
+
+ * run `bazel clean`
+ * run `bazel run --remote_cache=grpc://localhost:8980 :main`

 Why do we clean here?
Since we're verifying re-execution and caching, this ensures that we will execute any actions in the `run` step and interact with the remote cache.
 We should be attempting to retrieve cached results, and then when we miss - since we just started this memory resident server - bazel will upload the results of the execution for later use.
 There will be no change in the output of this bazel run if everything worked, since bazel does not provide output each time it uploads results.

 To prove that we have placed something in the action cache, we need to do the following:

-run `bazel clean`
-run `bazel run --remote_cache=localhost:8980 :main`
+ * run `bazel clean`
+ * run `bazel run --remote_cache=localhost:8980 :main`

 This should now print statistics on the `processes` line that indicate that you've retrieved results from the cache for your actions:

@@ -80,20 +92,22 @@
 INFO: 2 processes: 2 remote cache hit.
 ```

 ## Remote Execution (and caching)

-Now we will use buildfarm for remote execution with a minimal configuration - a single memory instance, with a worker on the localhost that can execute a single process at a time - via a bazel invocation on our workspace.
+Now we will use buildfarm for remote execution with a minimal configuration with a worker on the localhost that can execute a single process at a time, via a bazel invocation on our workspace.
+
+First, to clean out the results from the previous cached actions, flush your local redis database:

-First, we should restart the buildfarm server to ensure that we get remote execution (this can also be forced from the client by using `--noremote_accept_cached`). From the buildfarm server prompt and directory:
+ * run `redis-cli flushdb`

-interrupt a running `buildfarm-server`
-run `bazelisk run src/main/java/build/buildfarm:buildfarm-server $PWD/examples/config.minimal.yml`
+Next, we should restart the buildfarm server, and delete the worker's cas storage to ensure that we get remote execution (this can also be forced from the client by using `--noremote_accept_cached`). From the buildfarm server prompt and directory:

-From another prompt in the buildfarm repository directory:
+ * interrupt the running `buildfarm-server` (i.e. Ctrl-C)
+ * run `bazel run src/main/java/build/buildfarm:buildfarm-server $PWD/examples/config.minimal.yml`

-run `bazelisk run src/main/java/build/buildfarm:buildfarm-shard-worker $PWD/examples/config.minimal.yml`
+You can leave the worker running from the Remote Caching step; it will not require a restart.

 From another prompt, in your client workspace:

-run `bazel run --remote_executor=grpc://localhost:8980 :main`
+ * run `bazel run --remote_executor=grpc://localhost:8980 :main`

 Your build should now print out the following on its `processes` summary line:

@@ -106,23 +120,30 @@ That `2 remote` indicates that your compile and link ran remotely. Congratulatio

 ## Container Quick Start

 To bring up a minimal buildfarm cluster, you can run:
+
+```shell
+$ ./examples/bf-run start
 ```
-./examples/bf-run start
-```
+
 This will start all of the necessary containers at the latest version.
 Once the containers are up, you can build with `bazel run --remote_executor=grpc://localhost:8980 :main`.

 To stop the containers, run:
+
+```shell
+$ ./examples/bf-run stop
 ```
-./examples/bf-run stop
-```
+
+## Next Steps
+
+We've started our worker on the same host as our server, and also the same host on which we built with bazel, but these services can be spread across many machines, per 'remote'.
Typical deployments pair a large number of workers with a relatively small number of servers (10:1 and 100:1 ratios have been used in practice), consolidating large disks and beefy multicore cpus/gpus on the workers, specializing the work each performs for bazel builds (or other client work), and giving the servers hefty network connections to funnel content traffic. A buildfarm deployment can service hundreds or thousands of developers or CI processes, enabling them to benefit from each others' shared context in the AC/CAS, and the pooled execution of a fleet of worker hosts eager to consume operations and deliver results.

 ## Buildfarm Manager

 You can now easily launch a new Buildfarm cluster locally or in AWS using an open sourced [Buildfarm Manager](https://github.com/80degreeswest/bfmgr).

-```
-wget https://github.com/80degreeswest/bfmgr/releases/download/1.0.7/bfmgr-1.0.7.jar
-java -jar bfmgr-1.0.7.jar
-Navigate to http://localhost
+```shell
+$ wget https://github.com/80degreeswest/bfmgr/releases/download/1.0.7/bfmgr-1.0.7.jar
+$ java -jar bfmgr-1.0.7.jar
+$ open http://localhost
 ```
diff --git a/admin/main/pom.xml b/admin/main/pom.xml
index 006217ccb4..8e128ee64c 100644
--- a/admin/main/pom.xml
+++ b/admin/main/pom.xml
@@ -94,7 +94,7 @@
     <dependency>
       <groupId>org.json</groupId>
       <artifactId>json</artifactId>
-      <version>20230227</version>
+      <version>20231013</version>
     </dependency>
     <dependency>
       <groupId>org.projectlombok</groupId>
diff --git a/admin/main/src/main/resources/proto/buildfarm.proto b/admin/main/src/main/resources/proto/buildfarm.proto
index f99feb3ac9..4349babc16 100644
--- a/admin/main/src/main/resources/proto/buildfarm.proto
+++ b/admin/main/src/main/resources/proto/buildfarm.proto
@@ -532,7 +532,7 @@ message RedisShardBackplaneConfig {
   int32 max_attempts = 33;
 }

-message ShardInstanceConfig {
+message ServerInstanceConfig {
   bool run_dispatched_monitor = 1;

   int32 dispatched_monitor_interval_seconds = 2;
@@ -556,7 +556,7 @@ message ShardInstanceConfig {
   google.protobuf.Duration grpc_timeout = 8;
 }

-message ShardWorkerInstanceConfig {
+message WorkerInstanceConfig {
   // whether to stream stdout from processes
   bool stream_stdout = 6;
@@ -580,7 +580,7 @@
 }

 message ShardWorkerConfig {
-  ShardWorkerInstanceConfig shard_worker_instance_config = 1;
+  WorkerInstanceConfig shard_worker_instance_config = 1;

   int32 port = 2;
@@ -850,7 +850,7 @@ message InstanceConfig {

   oneof type {
     MemoryInstanceConfig memory_instance_config = 3;
-    ShardInstanceConfig shard_instance_config = 4;
+    ServerInstanceConfig shard_instance_config = 4;
   }
 }
diff --git a/defs.bzl b/defs.bzl
index 08f9a5c561..b0a4ececa2 100644
--- a/defs.bzl
+++ b/defs.bzl
@@ -8,7 +8,7 @@ load(
     "@io_bazel_rules_docker//repositories:repositories.bzl",
     container_repositories = "repositories",
 )
-load("@io_grpc_grpc_java//:repositories.bzl", "grpc_java_repositories")
+load("@io_grpc_grpc_java//:repositories.bzl", "IO_GRPC_GRPC_JAVA_OVERRIDE_TARGETS", "grpc_java_repositories")
 load("@com_google_protobuf//:protobuf_deps.bzl", "protobuf_deps")
 load("@com_grail_bazel_toolchain//toolchain:rules.bzl", "llvm_toolchain")
 load("@io_bazel_rules_k8s//k8s:k8s.bzl", "k8s_repositories")
@@ -44,27 +44,8 @@ IO_GRPC_MODULES = [
 ]

 COM_AWS_MODULES = [
-    "autoscaling",
-    "core",
-    "ec2",
-    "secretsmanager",
-    "sns",
-    "ssm",
     "s3",
-]
-
-ORG_SPRING_MODULES = [
-    "spring-beans",
-    "spring-core",
-    "spring-context",
-    "spring-web",
-]
-
-ORG_SPRING_BOOT_MODULES = [
-    "spring-boot-autoconfigure",
-    "spring-boot",
-    "spring-boot-starter-web",
-    "spring-boot-starter-thymeleaf",
+    "secretsmanager",
 ]

 def buildfarm_init(name = "buildfarm"):
@@ -80,6 +61,7 @@ def
buildfarm_init(name = "buildfarm"): "com.fasterxml.jackson.core:jackson-databind:2.15.0", "com.github.ben-manes.caffeine:caffeine:2.9.0", "com.github.docker-java:docker-java:3.2.11", + "com.github.fppt:jedis-mock:1.0.10", "com.github.jnr:jffi:1.2.16", "com.github.jnr:jffi:jar:native:1.2.16", "com.github.jnr:jnr-constants:0.9.9", @@ -96,20 +78,19 @@ def buildfarm_init(name = "buildfarm"): "com.google.errorprone:error_prone_annotations:2.9.0", "com.google.errorprone:error_prone_core:0.92", "com.google.guava:failureaccess:1.0.1", - "com.google.guava:guava:31.1-jre", + "com.google.guava:guava:32.1.1-jre", "com.google.j2objc:j2objc-annotations:1.1", "com.google.jimfs:jimfs:1.1", - "com.google.protobuf:protobuf-java-util:3.10.0", - "com.google.protobuf:protobuf-java:3.10.0", + "com.google.protobuf:protobuf-java-util:3.19.1", + "com.google.protobuf:protobuf-java:3.19.1", "com.google.truth:truth:0.44", "org.slf4j:slf4j-simple:1.7.35", "com.googlecode.json-simple:json-simple:1.1.1", "com.jayway.jsonpath:json-path:2.4.0", - "io.github.lognet:grpc-spring-boot-starter:4.5.4", "org.bouncycastle:bcprov-jdk15on:1.70", "net.jcip:jcip-annotations:1.0", - ] + ["io.netty:netty-%s:4.1.90.Final" % module for module in IO_NETTY_MODULES] + - ["io.grpc:grpc-%s:1.53.0" % module for module in IO_GRPC_MODULES] + + ] + ["io.netty:netty-%s:4.1.94.Final" % module for module in IO_NETTY_MODULES] + + ["io.grpc:grpc-%s:1.56.1" % module for module in IO_GRPC_MODULES] + [ "io.prometheus:simpleclient:0.10.0", "io.prometheus:simpleclient_hotspot:0.10.0", @@ -128,9 +109,6 @@ def buildfarm_init(name = "buildfarm"): "org.openjdk.jmh:jmh-core:1.23", "org.openjdk.jmh:jmh-generator-annprocess:1.23", "org.redisson:redisson:3.13.1", - ] + ["org.springframework.boot:%s:2.7.4" % module for module in ORG_SPRING_BOOT_MODULES] + - ["org.springframework:%s:5.3.23" % module for module in ORG_SPRING_MODULES] + - [ "org.threeten:threetenbp:1.3.3", "org.xerial:sqlite-jdbc:3.34.0", "org.jetbrains:annotations:16.0.2", @@ -138,9 +116,10 @@ def buildfarm_init(name = "buildfarm"): "org.projectlombok:lombok:1.18.24", ], generate_compat_repositories = True, + override_targets = IO_GRPC_GRPC_JAVA_OVERRIDE_TARGETS, repositories = [ - "https://repo.maven.apache.org/maven2", - "https://jcenter.bintray.com", + "https://repo1.maven.org/maven2", + "https://mirrors.ibiblio.org/pub/mirrors/maven2", ], ) diff --git a/deps.bzl b/deps.bzl index 5f5f0073b2..525bcff752 100644 --- a/deps.bzl +++ b/deps.bzl @@ -5,24 +5,24 @@ buildfarm dependencies that can be imported into other WORKSPACE files load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive", "http_file", "http_jar") load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe") -RULES_JVM_EXTERNAL_TAG = "4.2" -RULES_JVM_EXTERNAL_SHA = "cd1a77b7b02e8e008439ca76fd34f5b07aecb8c752961f9640dea15e9e5ba1ca" +RULES_JVM_EXTERNAL_TAG = "5.3" +RULES_JVM_EXTERNAL_SHA = "d31e369b854322ca5098ea12c69d7175ded971435e55c18dd9dd5f29cc5249ac" def archive_dependencies(third_party): return [ { "name": "platforms", "urls": [ - "https://mirror.bazel.build/github.com/bazelbuild/platforms/releases/download/0.0.6/platforms-0.0.6.tar.gz", - "https://github.com/bazelbuild/platforms/releases/download/0.0.6/platforms-0.0.6.tar.gz", + "https://mirror.bazel.build/github.com/bazelbuild/platforms/releases/download/0.0.7/platforms-0.0.7.tar.gz", + "https://github.com/bazelbuild/platforms/releases/download/0.0.7/platforms-0.0.7.tar.gz", ], - "sha256": "5308fc1d8865406a49427ba24a9ab53087f17f5266a7aabbfc28823f3916e1ca", + "sha256": 
"3a561c99e7bdbe9173aa653fd579fe849f1d8d67395780ab4770b1f381431d51", }, { "name": "rules_jvm_external", "strip_prefix": "rules_jvm_external-%s" % RULES_JVM_EXTERNAL_TAG, "sha256": RULES_JVM_EXTERNAL_SHA, - "url": "https://github.com/bazelbuild/rules_jvm_external/archive/%s.zip" % RULES_JVM_EXTERNAL_TAG, + "url": "https://github.com/bazelbuild/rules_jvm_external/releases/download/%s/rules_jvm_external-%s.tar.gz" % (RULES_JVM_EXTERNAL_TAG, RULES_JVM_EXTERNAL_TAG), }, { "name": "rules_pkg", @@ -41,9 +41,9 @@ def archive_dependencies(third_party): # Needed for "well-known protos" and @com_google_protobuf//:protoc. { "name": "com_google_protobuf", - "sha256": "dd513a79c7d7e45cbaeaf7655289f78fd6b806e52dbbd7018ef4e3cf5cff697a", - "strip_prefix": "protobuf-3.15.8", - "urls": ["https://github.com/protocolbuffers/protobuf/archive/v3.15.8.zip"], + "sha256": "25f1292d4ea6666f460a2a30038eef121e6c3937ae0f61d610611dfb14b0bd32", + "strip_prefix": "protobuf-3.19.1", + "urls": ["https://github.com/protocolbuffers/protobuf/archive/v3.19.1.zip"], }, { "name": "com_github_bazelbuild_buildtools", @@ -55,9 +55,9 @@ def archive_dependencies(third_party): # Needed for @grpc_java//compiler:grpc_java_plugin. { "name": "io_grpc_grpc_java", - "sha256": "78bf175f9a8fa23cda724bbef52ad9d0d555cdd1122bcb06484b91174f931239", - "strip_prefix": "grpc-java-1.54.1", - "urls": ["https://github.com/grpc/grpc-java/archive/v1.54.1.zip"], + "sha256": "b8fb7ae4824fb5a5ae6e6fa26ffe2ad7ab48406fdeee54e8965a3b5948dd957e", + "strip_prefix": "grpc-java-1.56.1", + "urls": ["https://github.com/grpc/grpc-java/archive/v1.56.1.zip"], }, { "name": "rules_pkg", @@ -97,9 +97,9 @@ def archive_dependencies(third_party): }, { "name": "rules_cc", - "sha256": "3d9e271e2876ba42e114c9b9bc51454e379cbf0ec9ef9d40e2ae4cec61a31b40", - "strip_prefix": "rules_cc-0.0.6", - "url": "https://github.com/bazelbuild/rules_cc/releases/download/0.0.6/rules_cc-0.0.6.tar.gz", + "sha256": "2037875b9a4456dce4a79d112a8ae885bbc4aad968e6587dca6e64f3a0900cdf", + "strip_prefix": "rules_cc-0.0.9", + "url": "https://github.com/bazelbuild/rules_cc/releases/download/0.0.9/rules_cc-0.0.9.tar.gz", }, # Used to format proto files @@ -111,10 +111,29 @@ def archive_dependencies(third_party): "patch_args": ["-p1"], "patches": ["%s:clang_toolchain.patch" % third_party], }, + + # Used to build release container images { "name": "io_bazel_rules_docker", "sha256": "b1e80761a8a8243d03ebca8845e9cc1ba6c82ce7c5179ce2b295cd36f7e394bf", "urls": ["https://github.com/bazelbuild/rules_docker/releases/download/v0.25.0/rules_docker-v0.25.0.tar.gz"], + "patch_args": ["-p0"], + "patches": ["%s:docker_go_toolchain.patch" % third_party], + }, + + # Updated versions of io_bazel_rules_docker dependencies for bazel compatibility + { + "name": "io_bazel_rules_go", + "sha256": "278b7ff5a826f3dc10f04feaf0b70d48b68748ccd512d7f98bf442077f043fe3", + "urls": [ + "https://mirror.bazel.build/github.com/bazelbuild/rules_go/releases/download/v0.41.0/rules_go-v0.41.0.zip", + "https://github.com/bazelbuild/rules_go/releases/download/v0.41.0/rules_go-v0.41.0.zip", + ], + }, + { + "name": "bazel_gazelle", + "sha256": "d3fa66a39028e97d76f9e2db8f1b0c11c099e8e01bf363a923074784e451f809", + "urls": ["https://github.com/bazelbuild/bazel-gazelle/releases/download/v0.33.0/bazel-gazelle-v0.33.0.tar.gz"], }, # Bazel is referenced as a dependency so that buildfarm can access the linux-sandbox as a potential execution wrapper. 
@@ -188,9 +207,9 @@ def buildfarm_dependencies(repository_name = "build_buildfarm"): maybe( http_jar, "opentelemetry", - sha256 = "0523287984978c091be0d22a5c61f0bce8267eeafbbae58c98abaf99c9396832", + sha256 = "eccd069da36031667e5698705a6838d173d527a5affce6cc514a14da9dbf57d7", urls = [ - "https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.11.0/opentelemetry-javaagent.jar", + "https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.28.0/opentelemetry-javaagent.jar", ], ) diff --git a/examples/config.yml b/examples/config.yml index 3c435fccc7..008bf2e1ca 100644 --- a/examples/config.yml +++ b/examples/config.yml @@ -3,6 +3,7 @@ defaultActionTimeout: 600 maximumActionTimeout: 3600 maxEntrySizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024 prometheusPort: 9090 +allowSymlinkTargetAbsolute: false server: instanceType: SHARD name: shard @@ -22,6 +23,7 @@ server: dispatchedMonitorIntervalSeconds: 1 runOperationQueuer: true ensureOutputsPresent: false + runFailsafeOperation: true maxCpu: 0 maxRequeueAttempts: 5 useDenyList: true @@ -30,6 +32,7 @@ server: recordBesEvents: false clusterId: local cloudRegion: us-east-1 + gracefulShutdownSeconds: 0 caches: directoryCacheMaxEntries: 10000 commandCacheMaxEntries: 10000 @@ -38,7 +41,6 @@ server: admin: deploymentEnvironment: AWS clusterEndpoint: "grpc://localhost" - enableGracefulShutdown: false metrics: publisher: LOG logLevel: FINEST @@ -71,8 +73,6 @@ backplane: operationChannelPrefix: "OperationChannel" casPrefix: "ContentAddressableStorage" casExpire: 604800 # 1 week - subscribeToBackplane: true - runFailsafeOperation: true maxQueueDepth: 100000 maxPreQueueDepth: 1000000 priorityQueue: false @@ -112,24 +112,26 @@ worker: skipLoad: false hexBucketLevels: 0 execRootCopyFallback: false - target: - publishTtlMetric: false + #- type: GRPC + # target: "grpc://host:port" executeStageWidth: 1 inputFetchStageWidth: 1 inputFetchDeadline: 60 linkInputDirectories: true - realInputDirectories: - - "external" + linkedInputDirectories: + - "(?!external)[^/]+" execOwner: defaultMaxCores: 0 limitGlobalExecution: false onlyMulticoreTests: false allowBringYourOwnContainer: false errorOperationRemainingResources: false + gracefulShutdownSeconds: 0 sandboxSettings: alwaysUse: false selectForBlockNetwork: false selectForTmpFs: false + createSymlinkOutputs: false executionPolicies: - name: test executionWrapper: diff --git a/images.bzl b/images.bzl index 9ab0a8b0b7..939a752ed5 100644 --- a/images.bzl +++ b/images.bzl @@ -27,25 +27,9 @@ def buildfarm_images(): ) container_pull( - name = "ubuntu-bionic", - digest = "sha256:4bc527c7a288da405f2041928c63d0a6479a120ad63461c2f124c944def54be2", + name = "ubuntu-mantic", + digest = "sha256:1419bba15470a95242e917b3abcd8981ae36707b99df5ac705e1eee4d920c51c", registry = "index.docker.io", repository = "bazelbuild/buildfarm-worker-base", - tag = "bionic-java11-gcc", - ) - - container_pull( - name = "ubuntu-jammy", - digest = "sha256:da847ee259ebe7f00631a2f0146d9add60ff0f94b031a2e522ce94c78b1335c2", - registry = "index.docker.io", - repository = "bazelbuild/buildfarm-worker-base", - tag = "jammy-java11-gcc", - ) - - container_pull( - name = "amazon_corretto_java_image_base", - registry = "index.docker.io", - repository = "amazoncorretto", - tag = "19", - digest = "sha256:81d0df4412140416b27211c999e1f3c4565ae89a5cd92889475d20af422ba507", + tag = "mantic-java17-gcc", ) diff --git a/jvm_flags.bzl b/jvm_flags.bzl index 363f161465..440e0718aa 100644 --- a/jvm_flags.bzl +++ 
b/jvm_flags.bzl @@ -54,6 +54,12 @@ def ensure_accurate_metadata(): "//config:windows": ["-Dsun.nio.fs.ensureAccurateMetadata=true"], }) +def add_opens_sun_nio_fs(): + return select({ + "//conditions:default": [], + "//config:windows": ["--add-opens java.base/sun.nio.fs=ALL-UNNAMED"], + }) + def server_telemetry(): return select({ "//config:open_telemetry": SERVER_TELEMETRY_JVM_FLAGS, @@ -67,7 +73,7 @@ def worker_telemetry(): }) def server_jvm_flags(): - return RECOMMENDED_JVM_FLAGS + DEFAULT_LOGGING_CONFIG + ensure_accurate_metadata() + server_telemetry() + return RECOMMENDED_JVM_FLAGS + DEFAULT_LOGGING_CONFIG + ensure_accurate_metadata() + add_opens_sun_nio_fs() + server_telemetry() def worker_jvm_flags(): - return RECOMMENDED_JVM_FLAGS + DEFAULT_LOGGING_CONFIG + ensure_accurate_metadata() + worker_telemetry() + return RECOMMENDED_JVM_FLAGS + DEFAULT_LOGGING_CONFIG + ensure_accurate_metadata() + add_opens_sun_nio_fs() + worker_telemetry() diff --git a/kubernetes/helm-charts/buildfarm/.gitignore b/kubernetes/helm-charts/buildfarm/.gitignore new file mode 100644 index 0000000000..8d8946152c --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/.gitignore @@ -0,0 +1,2 @@ +charts +Chart.lock diff --git a/kubernetes/helm-charts/buildfarm/.helmignore b/kubernetes/helm-charts/buildfarm/.helmignore new file mode 100644 index 0000000000..0e8a0eb36f --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/kubernetes/helm-charts/buildfarm/Chart.yaml b/kubernetes/helm-charts/buildfarm/Chart.yaml new file mode 100644 index 0000000000..ce443957dd --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/Chart.yaml @@ -0,0 +1,30 @@ +apiVersion: v2 +name: buildfarm +description: A Helm chart for bazel buildfarm + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "v2.5.0" + +dependencies: + - condition: redis.enabled + name: redis + repository: https://charts.helm.sh/stable + version: 10.5.7 \ No newline at end of file diff --git a/kubernetes/helm-charts/buildfarm/templates/NOTES.txt b/kubernetes/helm-charts/buildfarm/templates/NOTES.txt new file mode 100644 index 0000000000..92421375fb --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/NOTES.txt @@ -0,0 +1,22 @@ +1. Get the application URL by running these commands: +{{- if .Values.server.ingress.enabled }} +{{- range $host := .Values.server.ingress.hosts }} + {{- range .paths }} + http{{ if $.Values.server.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} + {{- end }} +{{- end }} +{{- else if contains "NodePort" .Values.server.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "buildfarm.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.server.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "buildfarm.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "buildfarm.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.server.service.port }} +{{- else if contains "ClusterIP" .Values.server.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "buildfarm.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT +{{- end }} diff --git a/kubernetes/helm-charts/buildfarm/templates/_helpers.tpl b/kubernetes/helm-charts/buildfarm/templates/_helpers.tpl new file mode 100644 index 0000000000..dffd8587bd --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/_helpers.tpl @@ -0,0 +1,73 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "buildfarm.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "buildfarm.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. 
+*/}} +{{- define "buildfarm.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "buildfarm.labels" -}} +helm.sh/chart: {{ include "buildfarm.chart" . }} +{{ include "buildfarm.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "buildfarm.selectorLabels" -}} +app.kubernetes.io/name: {{ include "buildfarm.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "buildfarm.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "buildfarm.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} + + +{{/* Checks for `externalRedis` */}} +{{- if .Values.externalRedis.host }} + {{/* check if they are using externalRedis (the default value for `externalRedis.host` is "localhost") */}} + {{- if not (eq .Values.externalRedis.host "localhost") }} + {{- if .Values.redis.enabled }} + {{ required "If `externalRedis.host` is set, then `redis.enabled` should be `false`!" nil }} + {{- end }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/kubernetes/helm-charts/buildfarm/templates/configmap.yaml b/kubernetes/helm-charts/buildfarm/templates/configmap.yaml new file mode 100644 index 0000000000..809d83e049 --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/configmap.yaml @@ -0,0 +1,28 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "buildfarm.fullname" . }}-config +data: + config.yml: |- + {{- range $key, $value := .Values.config }} + {{- if kindIs "map" $value }} + {{- else }} + {{ $key }}: {{ $value }}{{- end }} + {{- end }} + backplane: + {{- if .Values.redis.enabled }} + redisUri: "{{ .Values.redis.scheme }}://{{ printf "%s-redis-master.%s" (include "redis.fullname" .) (.Release.Namespace) }}:{{ "6379" }}" + {{- else }} + redisUri: "{{ .Values.externalRedis.uri }}" + {{- end }} + {{- with .Values.config.backplane }} + {{- toYaml . | nindent 6 }} + {{- end }} + {{- with .Values.config.server }} + server: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- with .Values.config.worker }} + worker: + {{- toYaml . | nindent 6 }} + {{- end }} diff --git a/kubernetes/helm-charts/buildfarm/templates/server/deployment.yaml b/kubernetes/helm-charts/buildfarm/templates/server/deployment.yaml new file mode 100644 index 0000000000..e56261f1fd --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/server/deployment.yaml @@ -0,0 +1,78 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "buildfarm.fullname" . }}-server + labels: + name: {{ include "buildfarm.fullname" . }}-server + {{- include "buildfarm.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.server.replicaCount }} + selector: + matchLabels: + name: {{ include "buildfarm.fullname" . }}-server + {{- include "buildfarm.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + checksum/server-config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} + {{- with .Values.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + name: {{ include "buildfarm.fullname" . }}-server + {{- include "buildfarm.selectorLabels" . 
| nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "buildfarm.serviceAccountName" . }} + containers: + - name: buildfarm-server + image: "{{ .Values.server.image.repository }}:{{ .Values.server.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.server.image.pullPolicy }} + command: + - bash + - /app/build_buildfarm/buildfarm-server.binary + args: + - /config/config.yml + env: + {{- if .Values.server.extraEnv }} + {{- toYaml .Values.server.extraEnv | nindent 12 }} + {{- end }} + ports: + - containerPort: 8980 + name: "server-comm" + - containerPort: 9090 + name: "metrics" + livenessProbe: + httpGet: + path: / + port: metrics + readinessProbe: + httpGet: + path: / + port: metrics + resources: + {{- toYaml .Values.server.resources | nindent 12 }} + volumeMounts: + - mountPath: /config + name: config + readOnly: true + {{- with .Values.server.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.server.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.server.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + volumes: + - configMap: + defaultMode: 420 + name: {{ include "buildfarm.fullname" . }}-config + name: config diff --git a/kubernetes/helm-charts/buildfarm/templates/server/service.yaml b/kubernetes/helm-charts/buildfarm/templates/server/service.yaml new file mode 100644 index 0000000000..6079f92dc0 --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/server/service.yaml @@ -0,0 +1,25 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "buildfarm.fullname" . }}-server + labels: + name: {{ include "buildfarm.fullname" . }}-server + {{- include "buildfarm.labels" . | nindent 4 }} + {{- with .Values.server.service.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + type: {{ .Values.server.service.type }} + ports: + - port: {{ .Values.server.service.port }} + targetPort: server-comm + protocol: TCP + name: grpc + - port: 9090 + targetPort: metrics + protocol: TCP + name: metrics + selector: + name: {{ include "buildfarm.fullname" . }}-server + {{- include "buildfarm.selectorLabels" . | nindent 4 }} diff --git a/kubernetes/helm-charts/buildfarm/templates/server/servicemonitor.yaml b/kubernetes/helm-charts/buildfarm/templates/server/servicemonitor.yaml new file mode 100644 index 0000000000..fe8a12b649 --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/server/servicemonitor.yaml @@ -0,0 +1,37 @@ +{{- if .Values.server.serviceMonitor.enabled }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "buildfarm.fullname" . }}-server + labels: + {{- include "buildfarm.labels" . | nindent 4 }} +spec: + endpoints: + - port: "metrics" + {{- with .Values.server.serviceMonitor.interval }} + interval: {{ . }} + {{- end }} + {{- with .Values.server.serviceMonitor.scrapeTimeout }} + scrapeTimeout: {{ . }} + {{- end }} + honorLabels: true + path: {{ .Values.server.serviceMonitor.path }} + scheme: {{ .Values.server.serviceMonitor.scheme }} + {{- with .Values.server.serviceMonitor.relabelings }} + relabelings: + {{- toYaml . | nindent 6 }} + {{- end }} + jobLabel: "{{ .Release.Name }}" + selector: + matchLabels: + name: {{ include "buildfarm.fullname" . }}-server + {{- include "buildfarm.labels" . 
| nindent 6 }} + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + {{- with .Values.server.serviceMonitor.targetLabels }} + targetLabels: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/kubernetes/helm-charts/buildfarm/templates/serviceaccount.yaml b/kubernetes/helm-charts/buildfarm/templates/serviceaccount.yaml new file mode 100644 index 0000000000..f28779e3e4 --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/serviceaccount.yaml @@ -0,0 +1,12 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "buildfarm.serviceAccountName" . }} + labels: + {{- include "buildfarm.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/kubernetes/helm-charts/buildfarm/templates/shard-worker/autoscaler.yaml b/kubernetes/helm-charts/buildfarm/templates/shard-worker/autoscaler.yaml new file mode 100644 index 0000000000..4389c793b6 --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/shard-worker/autoscaler.yaml @@ -0,0 +1,21 @@ +{{- if .Values.shardWorker.autoscaling.enabled -}} +apiVersion: autoscaling/v1 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "buildfarm.fullname" . }}-shard-worker + labels: + name: {{ include "buildfarm.fullname" . }}-shard-worker + {{- include "buildfarm.labels" . | nindent 4 }} + {{- with .Values.shardWorker.service.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + maxReplicas: {{ .Values.shardWorker.autoscaling.maxReplicas }} + minReplicas: {{ .Values.shardWorker.autoscaling.minReplicas }} + scaleTargetRef: + apiVersion: apps/v1 + kind: StatefulSet + name: {{ include "buildfarm.fullname" . }}-shard-worker + targetCPUUtilizationPercentage: {{ .Values.shardWorker.autoscaling.targetCPUUtilizationPercentage }} +{{- end }} diff --git a/kubernetes/helm-charts/buildfarm/templates/shard-worker/service.yaml b/kubernetes/helm-charts/buildfarm/templates/shard-worker/service.yaml new file mode 100644 index 0000000000..135756bd5f --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/shard-worker/service.yaml @@ -0,0 +1,25 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "buildfarm.fullname" . }}-shard-worker + labels: + name: {{ include "buildfarm.fullname" . }}-shard-worker + {{- include "buildfarm.labels" . | nindent 4 }} + {{- with .Values.shardWorker.service.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + type: {{ .Values.shardWorker.service.type }} + ports: + - port: {{ .Values.shardWorker.service.port }} + targetPort: worker-comm + protocol: TCP + name: grpc + - port: 9090 + targetPort: metrics + protocol: TCP + name: metrics + selector: + name: {{ include "buildfarm.fullname" . }}-shard-worker + {{- include "buildfarm.selectorLabels" . | nindent 4 }} diff --git a/kubernetes/helm-charts/buildfarm/templates/shard-worker/servicemonitor.yaml b/kubernetes/helm-charts/buildfarm/templates/shard-worker/servicemonitor.yaml new file mode 100644 index 0000000000..8ff1a59a56 --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/shard-worker/servicemonitor.yaml @@ -0,0 +1,37 @@ +{{- if .Values.shardWorker.serviceMonitor.enabled }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "buildfarm.fullname" . }}-shard-worker + labels: + {{- include "buildfarm.labels" . 
| nindent 4 }} +spec: + endpoints: + - port: "metrics" + {{- with .Values.shardWorker.serviceMonitor.interval }} + interval: {{ . }} + {{- end }} + {{- with .Values.shardWorker.serviceMonitor.scrapeTimeout }} + scrapeTimeout: {{ . }} + {{- end }} + honorLabels: true + path: {{ .Values.shardWorker.serviceMonitor.path }} + scheme: {{ .Values.shardWorker.serviceMonitor.scheme }} + {{- with .Values.shardWorker.serviceMonitor.relabelings }} + relabelings: + {{- toYaml . | nindent 6 }} + {{- end }} + jobLabel: "{{ .Release.Name }}" + selector: + matchLabels: + name: {{ include "buildfarm.fullname" . }}-shard-worker + {{- include "buildfarm.labels" . | nindent 6 }} + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + {{- with .Values.shardWorker.serviceMonitor.targetLabels }} + targetLabels: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- end }} diff --git a/kubernetes/helm-charts/buildfarm/templates/shard-worker/statefulsets.yaml b/kubernetes/helm-charts/buildfarm/templates/shard-worker/statefulsets.yaml new file mode 100644 index 0000000000..62706a61ac --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/shard-worker/statefulsets.yaml @@ -0,0 +1,110 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: {{ include "buildfarm.fullname" . }}-shard-worker + labels: + name: {{ include "buildfarm.fullname" . }}-shard-worker + {{- include "buildfarm.labels" . | nindent 4 }} +spec: + serviceName: {{ include "buildfarm.fullname" . }}-shard-worker + {{- if .Values.shardWorker.autoscaling.enabled }} + replicas: {{ .Values.shardWorker.autoscaling.minReplicas }} + {{- else }} + replicas: {{ .Values.shardWorker.replicaCount }} + {{- end }} + selector: + matchLabels: + name: {{ include "buildfarm.fullname" . }}-shard-worker + {{- include "buildfarm.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + checksum/worker-config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} + {{- with .Values.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + name: {{ include "buildfarm.fullname" . }}-shard-worker + {{- include "buildfarm.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "buildfarm.serviceAccountName" . }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: buildfarm-worker + image: "{{ .Values.shardWorker.image.repository }}:{{ .Values.shardWorker.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.shardWorker.image.pullPolicy }} + args: + - /config/config.yml + - --public_name=$(POD_IP):8982 + command: + - bash + - /app/build_buildfarm/buildfarm-shard-worker.binary + env: + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + {{- if .Values.shardWorker.extraEnv }} + {{- toYaml .Values.shardWorker.extraEnv | nindent 12 }} + {{- end }} + ports: + - containerPort: 8982 + name: "worker-comm" + - containerPort: 9090 + name: "metrics" + livenessProbe: + httpGet: + path: / + port: metrics + readinessProbe: + httpGet: + path: / + port: metrics + resources: + {{- toYaml .Values.shardWorker.resources | nindent 12 }} + volumeMounts: + - mountPath: /config + name: config + readOnly: true + - mountPath: /tmp/worker + name: {{ include "buildfarm.fullname" . }}-shard-worker-data + {{- with .Values.extraVolumeMounts }} + {{- tpl (toYaml .) 
$ | nindent 12 -}} + {{- end }} + {{- with .Values.shardWorker.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.shardWorker.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.shardWorker.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + volumes: + - configMap: + defaultMode: 420 + name: {{ include "buildfarm.fullname" . }}-config + name: config + {{- with .Values.shardWorker.extraVolumes }} + {{- tpl (toYaml .) $ | nindent 8 }} + {{- end }} + volumeClaimTemplates: + - metadata: + name: {{ include "buildfarm.fullname" . }}-shard-worker-data + spec: + accessModes: ["ReadWriteOnce"] + {{- with .Values.shardWorker.storage.class }} + storageClassName: "{{ . }}" + {{- end }} + resources: + requests: + storage: "{{ .Values.shardWorker.storage.size }}" diff --git a/kubernetes/helm-charts/buildfarm/templates/tests/test-connection.yaml b/kubernetes/helm-charts/buildfarm/templates/tests/test-connection.yaml new file mode 100644 index 0000000000..7aea6f1cfe --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/tests/test-connection.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "buildfarm.fullname" . }}-test-connection" + labels: + {{- include "buildfarm.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: curl + image: appropriate/curl:latest + command: ['curl'] + args: ['--output', '/dev/null', '{{ include "buildfarm.fullname" . }}-server:{{ .Values.server.service.port }}'] + restartPolicy: Never diff --git a/kubernetes/helm-charts/buildfarm/values.yaml b/kubernetes/helm-charts/buildfarm/values.yaml new file mode 100644 index 0000000000..3bb2994b7a --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/values.yaml @@ -0,0 +1,206 @@ +# Default values for buildfarm. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. +nameOverride: "" +fullnameOverride: "" + +imagePullSecrets: [] + +config: + # see: https://github.com/bazelbuild/bazel-buildfarm/blob/main/examples/config.yml + digestFunction: SHA256 + defaultActionTimeout: 600 + maximumActionTimeout: 3600 + maxEntrySizeBytes: "2147483648" # 2 * 1024 * 1024 * 1024 + prometheusPort: 9090 + backplane: + queues: + - name: "cpu" + allowUnmatched: true + properties: + - name: "min-cores" + value: "*" + - name: "max-cores" + value: "*" + server: + name: "shard" + recordBesEvents: true + worker: + port: 8982 + publicName: "localhost:8982" + executeStageWidth: 80 + inputFetchStageWidth: 8 + realInputDirectories: + - "external" + root: "/tmp/worker" + storages: + - type: FILESYSTEM + path: "cache" + maxSizeBytes: 536870912000 # 500 * 1024 * 1024 * 1024 + +server: + image: + repository: bazelbuild/buildfarm-server + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "" + replicaCount: 1 + resources: { } + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
+  # limits:
 +  #   cpu: 100m
 +  #   memory: 128Mi
 +  # requests:
 +  #   cpu: 100m
 +  #   memory: 128Mi
 +  service:
 +    type: ClusterIP
 +    port: 8980
 +
 +  ingress:
 +    enabled: false
 +    className: ""
 +    annotations: { }
 +    # kubernetes.io/ingress.class: nginx
 +    # kubernetes.io/tls-acme: "true"
 +    hosts:
 +      - host: chart-example.local
 +        paths:
 +          - path: /
 +            pathType: ImplementationSpecific
 +    tls: [ ]
 +    #  - secretName: chart-example-tls
 +    #    hosts:
 +    #      - chart-example.local
 +
 +  nodeSelector: {}
 +  tolerations: []
 +  affinity: {}
 +  extraVolumes: []
 +  #  - name: additionalSecret
 +  #    secret:
 +  #      secretName: my-secret
 +  #      defaultMode: 0600
 +
 +  extraVolumeMounts: []
 +  #  - name: customConfig
 +  #    mountPath: /mnt/config
 +  #    readOnly: true
 +  extraEnv:
 +    - name: JAVABIN
 +      value: "/usr/bin/java"
 +
 +  serviceMonitor:
 +    ## If true, a ServiceMonitor CRD is created for a prometheus operator
 +    ## https://github.com/coreos/prometheus-operator
 +    ##
 +    enabled: false
 +    path: /metrics
 +    # namespace: monitoring (defaults to use the namespace this chart is deployed to)
 +    labels: {}
 +    interval: 1m
 +    scheme: http
 +    tlsConfig: {}
 +    scrapeTimeout: 30s
 +    relabelings: []
 +    targetLabels: []
 +
 +shardWorker:
 +  image:
 +    repository: bazelbuild/buildfarm-worker
 +    pullPolicy: IfNotPresent
 +    # Overrides the image tag whose default is the chart appVersion.
 +    tag: ""
 +  replicaCount: 2
 +  autoscaling:
 +    enabled: true
 +    minReplicas: 2
 +    maxReplicas: 4
 +    targetCPUUtilizationPercentage: 50
 +
 +  resources: { }
 +  # We usually recommend not to specify default resources and to leave this as a conscious
 +  # choice for the user. This also increases chances charts run on environments with little
 +  # resources, such as Minikube. If you do want to specify resources, uncomment the following
 +  # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
 +  # limits:
 +  #   cpu: 100m
 +  #   memory: 128Mi
 +  # requests:
 +  #   cpu: 100m
 +  #   memory: 128Mi
 +  storage:
 +    # the storage class for the PV; leave empty to use the default
 +    class: ""
 +    size: 50Gi
 +
 +  service:
 +    type: ClusterIP
 +    port: 8982
 +
 +  nodeSelector: {}
 +  tolerations: []
 +  affinity: {}
 +  extraVolumes: []
 +  #  - name: additionalSecret
 +  #    secret:
 +  #      secretName: my-secret
 +  #      defaultMode: 0600
 +
 +  extraVolumeMounts: []
 +  #  - name: customConfig
 +  #    mountPath: /mnt/config
 +  #    readOnly: true
 +  extraEnv:
 +    - name: JAVABIN
 +      value: "/usr/bin/java"
 +  serviceMonitor:
 +    ## If true, a ServiceMonitor CRD is created for a prometheus operator
 +    ## https://github.com/coreos/prometheus-operator
 +    ##
 +    enabled: false
 +    path: /metrics
 +    # namespace: monitoring (defaults to use the namespace this chart is deployed to)
 +    labels: {}
 +    interval: 1m
 +    scheme: http
 +    tlsConfig: {}
 +    scrapeTimeout: 30s
 +    relabelings: []
 +    targetLabels: []
 +
 +###################################
 +## DATABASE | Embedded Redis
 +###################################
 +redis:
 +  ## - set to `false` if using `externalRedis.*`
 +  ##
 +  enabled: true
 +  scheme: "redis"
 +  ## See more redis configs: https://github.com/bitnami/charts/blob/main/bitnami/redis/README.md
 +  usePassword: false
 +  ## configs for redis cluster mode
 +  ##
 +  cluster:
 +    ## if redis runs in cluster mode
 +    ##
 +    enabled: false
 +
 +    ## the number of redis slaves
 +    ##
 +    slaveCount: 1
 +
 +externalRedis:
 +  uri: "redis://localhost:6379"
 +
 +serviceAccount:
 +  # Specifies whether a service account should be created
 +  create: true
 +  # Annotations to add to the service account
 +  annotations: {}
 +  # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: "" diff --git a/persistentworkers/src/main/java/persistent/common/processes/JavaProcessWrapper.java b/persistentworkers/src/main/java/persistent/common/processes/JavaProcessWrapper.java index a27b6a9e99..89f2e6a5be 100644 --- a/persistentworkers/src/main/java/persistent/common/processes/JavaProcessWrapper.java +++ b/persistentworkers/src/main/java/persistent/common/processes/JavaProcessWrapper.java @@ -10,12 +10,16 @@ public class JavaProcessWrapper extends ProcessWrapper { + // Get the path of the JVM from the current process to avoid breaking the Bazel sandbox + public static final String CURRENT_JVM_COMMAND = + ProcessHandle.current().info().command().orElseThrow(() -> new RuntimeException("Unable to retrieve the path of the running JVM")); + public JavaProcessWrapper( Path workDir, String classPath, String fullClassName, String[] args ) throws IOException { super(workDir, cmdArgs( new String[]{ - "java", + CURRENT_JVM_COMMAND, "-cp", classPath, fullClassName diff --git a/persistentworkers/src/test/java/persistent/bazel/processes/PersistentWorkerTest.java b/persistentworkers/src/test/java/persistent/bazel/processes/PersistentWorkerTest.java index 0cdc68a7ff..9712394203 100644 --- a/persistentworkers/src/test/java/persistent/bazel/processes/PersistentWorkerTest.java +++ b/persistentworkers/src/test/java/persistent/bazel/processes/PersistentWorkerTest.java @@ -16,6 +16,7 @@ import persistent.bazel.client.PersistentWorker; import persistent.bazel.client.WorkerKey; +import persistent.common.processes.JavaProcessWrapper; import persistent.testutil.ProcessUtils; import persistent.testutil.WorkerUtils; @@ -55,7 +56,7 @@ public void endToEndAdder() throws Exception { ); ImmutableList initCmd = ImmutableList.of( - "java", + JavaProcessWrapper.CURRENT_JVM_COMMAND, "-cp", jarPath.toString(), "adder.Adder", diff --git a/src/main/java/build/buildfarm/BUILD b/src/main/java/build/buildfarm/BUILD index 601aa38eb4..3cbdeb5231 100644 --- a/src/main/java/build/buildfarm/BUILD +++ b/src/main/java/build/buildfarm/BUILD @@ -1,4 +1,4 @@ -load("//:jvm_flags.bzl", "ensure_accurate_metadata") +load("//:jvm_flags.bzl", "add_opens_sun_nio_fs", "ensure_accurate_metadata") package( default_visibility = ["//src:__subpackages__"], @@ -15,7 +15,7 @@ java_binary( classpath_resources = [ ":configs", ], - jvm_flags = ensure_accurate_metadata(), + jvm_flags = ensure_accurate_metadata() + add_opens_sun_nio_fs(), main_class = "build.buildfarm.server.BuildFarmServer", visibility = ["//visibility:public"], runtime_deps = [ @@ -29,7 +29,7 @@ java_binary( classpath_resources = [ ":configs", ], - jvm_flags = ensure_accurate_metadata(), + jvm_flags = ensure_accurate_metadata() + add_opens_sun_nio_fs(), main_class = "build.buildfarm.worker.shard.Worker", visibility = ["//visibility:public"], runtime_deps = [ diff --git a/src/main/java/build/buildfarm/admin/Admin.java b/src/main/java/build/buildfarm/admin/Admin.java deleted file mode 100644 index ca739f2347..0000000000 --- a/src/main/java/build/buildfarm/admin/Admin.java +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build.buildfarm.admin; - -import build.buildfarm.v1test.GetHostsResult; - -public interface Admin { - void terminateHost(String hostId); - - void stopContainer(String hostId, String containerName); - - GetHostsResult getHosts(String filter, int ageInMinutes, String status); - - void scaleCluster( - String scaleGroupName, - Integer minHosts, - Integer maxHosts, - Integer targetHosts, - Integer targetReservedHostsPercent); - - void disableHostScaleInProtection(String instanceName); - - void disableHostScaleInProtection(String clusterEndpoint, String instanceIp); -} diff --git a/src/main/java/build/buildfarm/admin/BUILD b/src/main/java/build/buildfarm/admin/BUILD deleted file mode 100644 index a5e481399e..0000000000 --- a/src/main/java/build/buildfarm/admin/BUILD +++ /dev/null @@ -1,15 +0,0 @@ -java_library( - name = "admin", - srcs = glob(["*.java"]), - visibility = ["//visibility:public"], - deps = [ - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_status_java_proto", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java", - "@maven//:com_google_protobuf_protobuf_java_util", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/admin/aws/AwsAdmin.java b/src/main/java/build/buildfarm/admin/aws/AwsAdmin.java deleted file mode 100644 index 5b971405cd..0000000000 --- a/src/main/java/build/buildfarm/admin/aws/AwsAdmin.java +++ /dev/null @@ -1,245 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package build.buildfarm.admin.aws; - -import build.buildfarm.admin.Admin; -import build.buildfarm.common.config.BuildfarmConfigs; -import build.buildfarm.v1test.AdminGrpc; -import build.buildfarm.v1test.DisableScaleInProtectionRequest; -import build.buildfarm.v1test.GetHostsResult; -import build.buildfarm.v1test.Host; -import com.amazonaws.services.autoscaling.AmazonAutoScaling; -import com.amazonaws.services.autoscaling.AmazonAutoScalingClientBuilder; -import com.amazonaws.services.autoscaling.model.InstancesDistribution; -import com.amazonaws.services.autoscaling.model.MixedInstancesPolicy; -import com.amazonaws.services.autoscaling.model.SetInstanceProtectionRequest; -import com.amazonaws.services.autoscaling.model.SetInstanceProtectionResult; -import com.amazonaws.services.autoscaling.model.UpdateAutoScalingGroupRequest; -import com.amazonaws.services.ec2.AmazonEC2; -import com.amazonaws.services.ec2.AmazonEC2ClientBuilder; -import com.amazonaws.services.ec2.model.DescribeInstancesRequest; -import com.amazonaws.services.ec2.model.DescribeInstancesResult; -import com.amazonaws.services.ec2.model.Filter; -import com.amazonaws.services.ec2.model.Instance; -import com.amazonaws.services.ec2.model.Reservation; -import com.amazonaws.services.ec2.model.Tag; -import com.amazonaws.services.ec2.model.TerminateInstancesRequest; -import com.amazonaws.services.simplesystemsmanagement.AWSSimpleSystemsManagement; -import com.amazonaws.services.simplesystemsmanagement.AWSSimpleSystemsManagementClientBuilder; -import com.amazonaws.services.simplesystemsmanagement.model.SendCommandRequest; -import com.google.protobuf.util.Timestamps; -import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.Collections; -import java.util.Date; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.TimeZone; -import java.util.logging.Level; -import lombok.extern.java.Log; -import org.springframework.stereotype.Component; - -@Log -@Component -public class AwsAdmin implements Admin { - private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); - private AmazonAutoScaling scale; - private AmazonEC2 ec2; - private AWSSimpleSystemsManagement ssm; - - public AwsAdmin() { - String region = configs.getServer().getCloudRegion(); - if (region != null) { - scale = AmazonAutoScalingClientBuilder.standard().withRegion(region).build(); - ec2 = AmazonEC2ClientBuilder.standard().withRegion(region).build(); - ssm = AWSSimpleSystemsManagementClientBuilder.standard().withRegion(region).build(); - } else { - log.warning("Missing cloudRegion configuration. 
AWS Admin will not be enabled."); - } - } - - @Override - public void terminateHost(String hostId) { - ec2.terminateInstances(new TerminateInstancesRequest().withInstanceIds(hostId)); - log.log(Level.INFO, String.format("Terminated host: %s", hostId)); - } - - @Override - public void stopContainer(String hostId, String containerName) { - String stopContainerCmd = - "docker ps | grep " + containerName + " | awk '{print $1 }' | xargs -I {} docker stop {}"; - Map> parameters = new HashMap<>(); - parameters.put("commands", Collections.singletonList(stopContainerCmd)); - ssm.sendCommand( - new SendCommandRequest() - .withDocumentName("AWS-RunShellScript") - .withInstanceIds(hostId) - .withParameters(parameters)); - log.log(Level.INFO, String.format("Stopped container: %s on host: %s", containerName, hostId)); - } - - @Override - public GetHostsResult getHosts(String filter, int ageInMinutes, String status) { - GetHostsResult.Builder resultBuilder = GetHostsResult.newBuilder(); - List hosts = new ArrayList<>(); - DescribeInstancesResult instancesResult = - ec2.describeInstances( - new DescribeInstancesRequest() - .withFilters(new Filter().withName("tag-value").withValues(filter))); - long hostNum = 1L; - for (Reservation r : instancesResult.getReservations()) { - for (Instance e : r.getInstances()) { - long uptime = getHostUptimeInMinutes(e.getLaunchTime()); - if (e.getPrivateIpAddress() != null - && uptime > ageInMinutes - && status.equalsIgnoreCase(e.getState().getName())) { - Host.Builder hostBuilder = Host.newBuilder(); - hostBuilder.setHostNum(hostNum++); - hostBuilder.setDnsName(e.getPrivateDnsName()); - hostBuilder.setHostId(e.getInstanceId()); - hostBuilder.setIpAddress(e.getPrivateIpAddress()); - hostBuilder.setLaunchTime(Timestamps.fromMillis(e.getLaunchTime().getTime())); - hostBuilder.setLifecycle( - e.getInstanceLifecycle() != null ? e.getInstanceLifecycle() : "on demand"); - hostBuilder.setNumCores(e.getCpuOptions().getCoreCount()); - hostBuilder.setState(e.getState().getName()); - hostBuilder.setType(e.getInstanceType()); - hostBuilder.setUptimeMinutes(uptime); - hosts.add(hostBuilder.build()); - } - } - } - resultBuilder.addAllHosts(hosts); - resultBuilder.setNumHosts(hosts.size()); - log.log(Level.FINE, String.format("Got %d hosts for filter: %s", hosts.size(), filter)); - return resultBuilder.build(); - } - - @Override - public void scaleCluster( - String scaleGroupName, - Integer minHosts, - Integer maxHosts, - Integer targetHosts, - Integer targetReservedHostsPercent) { - UpdateAutoScalingGroupRequest request = - new UpdateAutoScalingGroupRequest().withAutoScalingGroupName(scaleGroupName); - if (minHosts != null) { - request.setMinSize(minHosts); - } - if (maxHosts != null) { - request.setMaxSize(maxHosts); - } - if (targetHosts != null) { - request.setMaxSize(targetHosts); - } - if (targetReservedHostsPercent != null) { - request.setMixedInstancesPolicy( - new MixedInstancesPolicy() - .withInstancesDistribution( - new InstancesDistribution() - .withOnDemandPercentageAboveBaseCapacity(targetReservedHostsPercent))); - } - scale.updateAutoScalingGroup(request); - log.log(Level.INFO, String.format("Scaled: %s", scaleGroupName)); - } - - private long getHostUptimeInMinutes(Date launchTime) { - Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("UTC")); - return (cal.getTime().getTime() - launchTime.getTime()) / 60000; - } - - /** - * Disable instance scale in protection so that auto scaler can shutdown the instance. 
- * - * @param privateDnsName the private Dns name of instance (i.e. ip-xx-xxx-xx-xx.ec2.internal) - */ - @Override - public void disableHostScaleInProtection(String privateDnsName) { - // 1 get AutoScalingGroup and InstanceId - Instance workerInstance = getInstanceId(privateDnsName); - if (workerInstance == null) { - String errorMessage = "Cannot find instance with private DNS name " + privateDnsName; - log.log(Level.SEVERE, errorMessage); - throw new RuntimeException(errorMessage); - } - String instanceId = workerInstance.getInstanceId(); - String autoScalingGroup = getTagValue(workerInstance.getTags()); - if (autoScalingGroup == null || autoScalingGroup.length() == 0) { - String errorMessage = - "Cannot find AutoScalingGroup name of worker with private DNS name " + privateDnsName; - log.log(Level.SEVERE, errorMessage); - throw new RuntimeException(errorMessage); - } - - // 2 disable scale in protection of the worker - SetInstanceProtectionRequest disableProtectionRequest = - new SetInstanceProtectionRequest() - .withInstanceIds(instanceId) - .withAutoScalingGroupName(autoScalingGroup) - .withProtectedFromScaleIn(false); - SetInstanceProtectionResult result = scale.setInstanceProtection(disableProtectionRequest); - log.log( - Level.INFO, - String.format( - "Disable protection of host: %s in AutoScalingGroup: %s and get result: %s", - instanceId, autoScalingGroup, result.toString())); - } - - @Override - public void disableHostScaleInProtection(String clusterEndpoint, String instanceIp) { - ManagedChannel channel = null; - try { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(clusterEndpoint).negotiationType(NegotiationType.PLAINTEXT); - channel = builder.build(); - AdminGrpc.AdminBlockingStub adminBlockingStub = AdminGrpc.newBlockingStub(channel); - adminBlockingStub.disableScaleInProtection( - DisableScaleInProtectionRequest.newBuilder().setInstanceName(instanceIp).build()); - } finally { - if (channel != null) { - channel.shutdown(); - } - } - } - - private String getTagValue(List tags) { - for (Tag tag : tags) { - if ("aws:autoscaling:groupName".equalsIgnoreCase(tag.getKey())) { - return tag.getValue(); - } - } - return null; - } - - private Instance getInstanceId(String privateDnsName) { - DescribeInstancesRequest describeInstancesRequest = - new DescribeInstancesRequest() - .withFilters(new Filter().withName("private-dns-name").withValues(privateDnsName)); - DescribeInstancesResult instancesResult = ec2.describeInstances(describeInstancesRequest); - for (Reservation r : instancesResult.getReservations()) { - for (Instance e : r.getInstances()) { - if (e.getPrivateDnsName() != null && e.getPrivateDnsName().equals(privateDnsName)) { - return e; - } - } - } - return null; - } -} diff --git a/src/main/java/build/buildfarm/admin/aws/BUILD b/src/main/java/build/buildfarm/admin/aws/BUILD deleted file mode 100644 index ea544e0b68..0000000000 --- a/src/main/java/build/buildfarm/admin/aws/BUILD +++ /dev/null @@ -1,29 +0,0 @@ -java_library( - name = "aws", - srcs = glob(["*.java"]), - plugins = ["//src/main/java/build/buildfarm/common:lombok"], - visibility = ["//visibility:public"], - deps = [ - "//src/main/java/build/buildfarm/admin", - "//src/main/java/build/buildfarm/common/config", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_grpc", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_status_java_proto", - 
"@maven//:com_amazonaws_aws_java_sdk_autoscaling", - "@maven//:com_amazonaws_aws_java_sdk_core", - "@maven//:com_amazonaws_aws_java_sdk_ec2", - "@maven//:com_amazonaws_aws_java_sdk_secretsmanager", - "@maven//:com_amazonaws_aws_java_sdk_ssm", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java_util", - "@maven//:io_grpc_grpc_api", - "@maven//:io_grpc_grpc_netty", - "@maven//:org_projectlombok_lombok", - "@maven//:org_springframework_spring_beans", - "@maven//:org_springframework_spring_context", - "@maven//:org_springframework_spring_core", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/admin/gcp/BUILD b/src/main/java/build/buildfarm/admin/gcp/BUILD deleted file mode 100644 index 3d94b91f3f..0000000000 --- a/src/main/java/build/buildfarm/admin/gcp/BUILD +++ /dev/null @@ -1,18 +0,0 @@ -java_library( - name = "gcp", - srcs = glob(["*.java"]), - visibility = ["//visibility:public"], - deps = [ - "//src/main/java/build/buildfarm/admin", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_status_java_proto", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java_util", - "@maven//:org_springframework_spring_beans", - "@maven//:org_springframework_spring_context", - "@maven//:org_springframework_spring_core", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/admin/gcp/GcpAdmin.java b/src/main/java/build/buildfarm/admin/gcp/GcpAdmin.java deleted file mode 100644 index 34ee9a0163..0000000000 --- a/src/main/java/build/buildfarm/admin/gcp/GcpAdmin.java +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package build.buildfarm.admin.gcp; - -import build.buildfarm.admin.Admin; -import build.buildfarm.v1test.GetHostsResult; -import org.springframework.stereotype.Component; - -@Component -public class GcpAdmin implements Admin { - @Override - public void terminateHost(String hostId) { - throw new UnsupportedOperationException("Not Implemented."); - } - - @Override - public void stopContainer(String hostId, String containerName) { - throw new UnsupportedOperationException("Not Implemented."); - } - - @Override - public GetHostsResult getHosts(String filter, int ageInMinutes, String status) { - throw new UnsupportedOperationException("Not Implemented."); - } - - @Override - public void scaleCluster( - String scaleGroupName, - Integer minHosts, - Integer maxHosts, - Integer targetHosts, - Integer targetReservedHostsPercent) { - throw new UnsupportedOperationException("Not Implemented."); - } - - @Override - public void disableHostScaleInProtection(String instanceName) { - throw new UnsupportedOperationException("Not Implemented."); - } - - @Override - public void disableHostScaleInProtection(String clusterEndpoint, String instanceIp) { - throw new UnsupportedOperationException("Not Implemented"); - } -} diff --git a/src/main/java/build/buildfarm/backplane/Backplane.java b/src/main/java/build/buildfarm/backplane/Backplane.java index 513f3451ac..b1b05d5c42 100644 --- a/src/main/java/build/buildfarm/backplane/Backplane.java +++ b/src/main/java/build/buildfarm/backplane/Backplane.java @@ -282,4 +282,7 @@ boolean pollOperation(QueueEntry queueEntry, ExecutionStage.Value stage, long re Boolean propertiesEligibleForQueue(List<Platform.Property> provisions); GetClientStartTimeResult getClientStartTime(GetClientStartTimeRequest request) throws IOException; + + /** Set expiry time for digests */ + void updateDigestsExpiry(Iterable<Digest> digests) throws IOException; } diff --git a/src/main/java/build/buildfarm/cas/BUILD b/src/main/java/build/buildfarm/cas/BUILD index 301e922e13..146206caee 100644 --- a/src/main/java/build/buildfarm/cas/BUILD +++ b/src/main/java/build/buildfarm/cas/BUILD @@ -25,7 +25,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@maven//:io_prometheus_simpleclient", diff --git a/src/main/java/build/buildfarm/cas/ContentAddressableStorages.java b/src/main/java/build/buildfarm/cas/ContentAddressableStorages.java index ade381ff50..e1df38fae4 100644 --- a/src/main/java/build/buildfarm/cas/ContentAddressableStorages.java +++ b/src/main/java/build/buildfarm/cas/ContentAddressableStorages.java @@ -14,6 +14,7 @@ package build.buildfarm.cas; +import static build.buildfarm.common.grpc.Channels.createChannel; import static build.buildfarm.common.grpc.Retrier.NO_RETRIES; import static com.google.common.collect.Multimaps.synchronizedListMultimap; import static com.google.common.util.concurrent.MoreExecutors.directExecutor; @@ -29,8 +30,6 @@ import com.google.common.collect.ListMultimap; import com.google.common.collect.MultimapBuilder; import io.grpc.Channel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; import java.io.IOException; import java.io.InputStream; import java.nio.file.NoSuchFileException; @@ -40,12 +39,6 @@ public final class ContentAddressableStorages { private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); - private static Channel createChannel(String target) { - NettyChannelBuilder builder = - 
NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - public static ContentAddressableStorage createGrpcCAS(Cas cas) { Channel channel = createChannel(cas.getTarget()); ByteStreamUploader byteStreamUploader = diff --git a/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java b/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java index 6cd8c572c5..d7fa82ba42 100644 --- a/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java +++ b/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java @@ -146,7 +146,19 @@ public abstract class CASFileCache implements ContentAddressableStorage { Gauge.build().name("cas_size").help("CAS size.").register(); private static final Gauge casEntryCountMetric = Gauge.build().name("cas_entry_count").help("Number of entries in the CAS.").register(); - private static Histogram casTtl; + private static Histogram casTtl = + Histogram.build() + .name("cas_ttl_s") + .buckets( + 3600, // 1 hour + 21600, // 6 hours + 86400, // 1 day + 345600, // 4 days + 604800, // 1 week + 1210000 // 2 weeks + ) + .help("The amount of time CAS entries live on L1 storage before expiration (seconds)") + .register(); private static final Gauge casCopyFallbackMetric = Gauge.build() @@ -161,7 +173,6 @@ public abstract class CASFileCache implements ContentAddressableStorage { private final EntryPathStrategy entryPathStrategy; private final long maxSizeInBytes; private final long maxEntrySizeInBytes; - private final boolean publishTtlMetric; private final boolean execRootFallback; private final DigestUtil digestUtil; private final ConcurrentMap keyReferences; @@ -170,7 +181,7 @@ public abstract class CASFileCache implements ContentAddressableStorage { private final Consumer> onExpire; private final Executor accessRecorder; private final ExecutorService expireService; - private Thread prometheusMetricsThread; // TODO make this final, stop on shutdown + private Thread prometheusMetricsThread; private final Map directoryStorage = Maps.newConcurrentMap(); private final DirectoriesIndex directoriesIndex; @@ -308,7 +319,6 @@ public CASFileCache( maxEntrySizeInBytes, config.getHexBucketLevels(), config.isFileDirectoriesIndexInMemory(), - config.isPublishTtlMetric(), config.isExecRootCopyFallback(), digestUtil, expireService, @@ -327,7 +337,6 @@ public CASFileCache( long maxEntrySizeInBytes, int hexBucketLevels, boolean storeFileDirsIndexInMemory, - boolean publishTtlMetric, boolean execRootFallback, DigestUtil digestUtil, ExecutorService expireService, @@ -341,7 +350,6 @@ public CASFileCache( this.root = root; this.maxSizeInBytes = maxSizeInBytes; this.maxEntrySizeInBytes = maxEntrySizeInBytes; - this.publishTtlMetric = publishTtlMetric; this.execRootFallback = execRootFallback; this.digestUtil = digestUtil; this.expireService = expireService; @@ -353,23 +361,8 @@ public CASFileCache( this.delegateSkipLoad = delegateSkipLoad; this.directoriesIndexDbName = directoriesIndexDbName; this.keyReferences = Maps.newConcurrentMap(); - if (publishTtlMetric) { - casTtl = - Histogram.build() - .name("cas_ttl_s") - .buckets( - 3600, // 1 hour - 21600, // 6 hours - 86400, // 1 day - 345600, // 4 days - 604800, // 1 week - 1210000 // 2 weeks - ) - .help("The amount of time CAS entries live on L1 storage before expiration (seconds)") - .register(); - } - - entryPathStrategy = new HexBucketEntryPathStrategy(root, hexBucketLevels); + + entryPathStrategy = new HexBucketEntryPathStrategy(root, hexBucketLevels); String directoriesIndexUrl = "jdbc:sqlite:"; if 
(directoriesIndexDbName.equals(DIRECTORIES_INDEX_NAME_MEMORY)) { @@ -552,7 +545,7 @@ private InputStream compressorInputStream(Compressor.Value compressor, InputStre @SuppressWarnings("ResultOfMethodCallIgnored") InputStream newLocalInput(Compressor.Value compressor, Digest digest, long offset) throws IOException { - log.log(Level.FINE, format("getting input stream for %s", DigestUtil.toString(digest))); + log.log(Level.FINER, format("getting input stream for %s", DigestUtil.toString(digest))); boolean isExecutable = false; do { String key = getKey(digest, isExecutable); @@ -726,7 +719,7 @@ void invalidateWrite(Digest digest) { public void put(Blob blob, Runnable onExpiration) throws InterruptedException { String key = getKey(blob.getDigest(), false); try { - log.log(Level.FINE, format("put: %s", key)); + log.log(Level.FINER, format("put: %s", key)); OutputStream out = putImpl( Compressor.Value.IDENTITY, @@ -863,8 +856,14 @@ Write newWrite(BlobWriteKey key, ListenableFuture future) { Write write = new Write() { CancellableOutputStream out = null; + + @GuardedBy("this") boolean isReset = false; + + @GuardedBy("this") SettableFuture closedFuture = null; + + @GuardedBy("this") long fileCommittedSize = -1; @Override @@ -882,6 +881,9 @@ public synchronized void reset() { + key.getIdentifier(), e); } finally { + if (closedFuture != null) { + closedFuture.set(null); + } isReset = true; } } @@ -945,6 +947,11 @@ public synchronized ListenableFuture getOutputFuture( directExecutor()); } + private synchronized void syncCancelled() { + out = null; + isReset = true; + } + @Override public synchronized FeedbackOutputStream getOutput( long deadlineAfter, TimeUnit deadlineAfterUnits, Runnable onReadyHandler) @@ -953,6 +960,9 @@ public synchronized FeedbackOutputStream getOutput( // will block until it is returned via a close. if (closedFuture != null) { try { + while (!closedFuture.isDone()) { + wait(); + } closedFuture.get(); } catch (ExecutionException e) { throw new IOException(e.getCause()); @@ -969,8 +979,7 @@ public synchronized FeedbackOutputStream getOutput( UUID.fromString(key.getIdentifier()), cancelled -> { if (cancelled) { - out = null; - isReset = true; + syncCancelled(); } outClosedFuture.set(null); }, @@ -980,7 +989,11 @@ public synchronized FeedbackOutputStream getOutput( return uniqueOut; } - private void commitOpenState( + private synchronized void syncNotify() { + notify(); + } + + private synchronized void commitOpenState( CancellableOutputStream out, SettableFuture closedFuture) { // transition the Write to an open state, and modify all internal state required // atomically @@ -988,6 +1001,7 @@ private void commitOpenState( this.out = out; this.closedFuture = closedFuture; + closedFuture.addListener(this::syncNotify, directExecutor()); // they will likely write to this, so we can no longer assume isReset. 
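The getOutput()/commitOpenState() changes above pair a monitor wait with a future listener: getOutput() now blocks in wait() until closedFuture completes, and commitOpenState() registers syncNotify() so completion of the future wakes the waiter. A minimal sketch of that handshake, using a stand-in class and hypothetical method names (open/awaitClosed) rather than buildfarm's actual Write:

```java
import static com.google.common.util.concurrent.MoreExecutors.directExecutor;

import com.google.common.util.concurrent.SettableFuture;

class WriteSketch {
  private SettableFuture<Void> closedFuture;

  synchronized void open(SettableFuture<Void> future) {
    closedFuture = future;
    // completion of the future re-enters the monitor and wakes any waiter
    closedFuture.addListener(this::syncNotify, directExecutor());
  }

  private synchronized void syncNotify() {
    notify();
  }

  synchronized void awaitClosed() throws InterruptedException {
    // wait() releases the monitor so syncNotify() can run
    while (closedFuture != null && !closedFuture.isDone()) {
      wait();
    }
  }
}
```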
// might want to subscribe to a write event on the stream isReset = false; @@ -1052,7 +1066,7 @@ CancellableOutputStream newOutput( String key = getKey(digest, false); final CancellableOutputStream cancellableOut; try { - log.log(Level.FINE, format("getWrite: %s", key)); + log.log(Level.FINER, format("getWrite: %s", key)); cancellableOut = putImpl( compressor, @@ -1260,6 +1274,13 @@ public void initializeRootDirectory() throws IOException { fileStore = Files.getFileStore(root); } + public void stop() throws InterruptedException { + if (prometheusMetricsThread != null) { + prometheusMetricsThread.interrupt(); + prometheusMetricsThread.join(); + } + } + public StartupCacheResults start(boolean skipLoad) throws IOException, InterruptedException { return start(newDirectExecutorService(), skipLoad); } @@ -2159,7 +2180,7 @@ private void removeFilePath(Path path) throws IOException { } if (Files.isDirectory(temp)) { - log.log(Level.FINE, "removing existing directory " + path + " for fetch"); + log.log(Level.FINER, "removing existing directory " + path + " for fetch"); Directories.remove(temp, fileStore); } else { Files.delete(temp); @@ -2296,23 +2317,23 @@ private void getDirectoryKeys( } } - public ListenableFuture putDirectory( + public ListenableFuture putDirectory( Digest digest, Map directoriesIndex, ExecutorService service) { // Claim lock. // Claim the directory path so no other threads try to create/delete it. Path path = getDirectoryPath(digest); Lock l = locks.acquire(path); - log.log(Level.FINE, format("locking directory %s", path.getFileName())); + log.log(Level.FINER, format("locking directory %s", path.getFileName())); try { l.lockInterruptibly(); } catch (InterruptedException e) { Thread.currentThread().interrupt(); return immediateFailedFuture(e); } - log.log(Level.FINE, format("locked directory %s", path.getFileName())); + log.log(Level.FINER, format("locked directory %s", path.getFileName())); // Now that a lock has been claimed, we can proceed to create the directory. 
- ListenableFuture putFuture; + ListenableFuture putFuture; try { putFuture = putDirectorySynchronized(path, digest, directoriesIndex, service); } catch (IOException e) { @@ -2323,7 +2344,7 @@ public ListenableFuture putDirectory( putFuture.addListener( () -> { l.unlock(); - log.log(Level.FINE, format("directory %s has been unlocked", path.getFileName())); + log.log(Level.FINER, format("directory %s has been unlocked", path.getFileName())); }, service); return putFuture; @@ -2367,11 +2388,29 @@ private boolean directoryEntryExists( return false; } + public static class PathResult { + private final Path path; + private final boolean missed; + + public PathResult(Path path, boolean missed) { + this.path = path; + this.missed = missed; + } + + public Path getPath() { + return path; + } + + public boolean getMissed() { + return missed; + } + } + @SuppressWarnings("ConstantConditions") - private ListenableFuture putDirectorySynchronized( + private ListenableFuture putDirectorySynchronized( Path path, Digest digest, Map directoriesByDigest, ExecutorService service) throws IOException { - log.log(Level.FINE, format("directory %s has been locked", path.getFileName())); + log.log(Level.FINER, format("directory %s has been locked", path.getFileName())); ListenableFuture expireFuture; synchronized (this) { DirectoryEntry e = directoryStorage.get(digest); @@ -2398,9 +2437,9 @@ private ListenableFuture putDirectorySynchronized( } if (e != null) { - log.log(Level.FINE, format("found existing entry for %s", path.getFileName())); + log.log(Level.FINER, format("found existing entry for %s", path.getFileName())); if (directoryEntryExists(path, e, directoriesByDigest)) { - return immediateFuture(path); + return immediateFuture(new PathResult(path, /* missed=*/ false)); } log.log( Level.SEVERE, @@ -2411,7 +2450,7 @@ private ListenableFuture putDirectorySynchronized( decrementReferencesSynchronized(inputsBuilder.build(), ImmutableList.of()); expireFuture = expireDirectory(digest, service); - log.log(Level.FINE, format("expiring existing entry for %s", path.getFileName())); + log.log(Level.FINER, format("expiring existing entry for %s", path.getFileName())); } } @@ -2433,7 +2472,7 @@ private ListenableFuture putDirectorySynchronized( transformAsync( deindexFuture, result -> { - log.log(Level.FINE, format("expiry complete, fetching %s", path.getFileName())); + log.log(Level.FINER, format("expiry complete, fetching %s", path.getFileName())); ImmutableList.Builder> putFuturesBuilder = ImmutableList.builder(); fetchDirectory( @@ -2507,7 +2546,7 @@ private ListenableFuture putDirectorySynchronized( } } try { - log.log(Level.FINE, "removing directory to roll back " + path); + log.log(Level.FINER, "removing directory to roll back " + path); Directories.remove(path, fileStore); } catch (IOException removeException) { log.log( @@ -2523,7 +2562,8 @@ private ListenableFuture putDirectorySynchronized( return transform( rollbackFuture, (results) -> { - log.log(Level.FINE, format("directory fetch complete, inserting %s", path.getFileName())); + log.log( + Level.FINER, format("directory fetch complete, inserting %s", path.getFileName())); DirectoryEntry e = new DirectoryEntry( // might want to have this treatment ahead of this @@ -2532,7 +2572,7 @@ private ListenableFuture putDirectorySynchronized( : directoriesByDigest.get(digest), Deadline.after(10, SECONDS)); directoryStorage.put(digest, e); - return path; + return new PathResult(path, /* missed=*/ true); }, service); } @@ -2575,13 +2615,13 @@ Path putAndCopy(Digest 
digest, boolean isExecutable) throws IOException, Interru complete = true; } finally { try { - log.log(Level.FINE, format("closing output stream for %s", DigestUtil.toString(digest))); + log.log(Level.FINER, format("closing output stream for %s", DigestUtil.toString(digest))); if (complete) { out.close(); } else { out.cancel(); } - log.log(Level.FINE, format("output stream closed for %s", DigestUtil.toString(digest))); + log.log(Level.FINER, format("output stream closed for %s", DigestUtil.toString(digest))); } catch (IOException e) { if (Thread.interrupted()) { log.log( @@ -2593,7 +2633,7 @@ Path putAndCopy(Digest digest, boolean isExecutable) throws IOException, Interru throw new InterruptedException(); } else { log.log( - Level.FINE, + Level.FINER, format("failed output stream close for %s", DigestUtil.toString(digest)), e); } @@ -2625,7 +2665,7 @@ private static Exception extractStatusException(IOException e) { private void copyExternalInput(Digest digest, CancellableOutputStream out) throws IOException, InterruptedException { Retrier retrier = new Retrier(Backoff.sequential(5), Retrier.DEFAULT_IS_RETRIABLE); - log.log(Level.FINE, format("downloading %s", DigestUtil.toString(digest))); + log.log(Level.FINER, format("downloading %s", DigestUtil.toString(digest))); try { retrier.execute( () -> { @@ -2646,7 +2686,7 @@ private void copyExternalInput(Digest digest, CancellableOutputStream out) e); // prevent burial by early end of stream during close throw e; } - log.log(Level.FINE, format("download of %s complete", DigestUtil.toString(digest))); + log.log(Level.FINER, format("download of %s complete", DigestUtil.toString(digest))); } @FunctionalInterface @@ -2700,7 +2740,7 @@ private CancellableOutputStream putImpl( if (out == DUPLICATE_OUTPUT_STREAM) { return null; } - log.log(Level.FINE, format("entry %s is missing, downloading and populating", key)); + log.log(Level.FINER, format("entry %s is missing, downloading and populating", key)); return newCancellableOutputStream(out); } @@ -2860,25 +2900,19 @@ private final void renamePath(Path a, Path b) throws IOException, FileAlreadyExi } private void deleteExpiredKey(String key) throws IOException { - // We don't want publishing the metric to delay the deletion of the file. - // We publish the metric only after the file has been deleted. 
- long createdTime = 0; Path path = getRemovingPath(key); - if (publishTtlMetric) { - createdTime = path.toFile().lastModified(); - } + long createdTimeMs = Files.getLastModifiedTime(path).to(MILLISECONDS); deleteFilePath(path); - if (publishTtlMetric) { - publishExpirationMetric(createdTime); - } + publishExpirationMetric(createdTimeMs); } - private void publishExpirationMetric(long createdTime) { - long currentTime = new Date().getTime(); - long ttl = currentTime - createdTime; - casTtl.observe(Time.millisecondsToSeconds(ttl)); + private void publishExpirationMetric(long createdTimeMs) { + // TODO introduce ttl clock + long currentTimeMs = new Date().getTime(); + long ttlMs = currentTimeMs - createdTimeMs; + casTtl.observe(Time.millisecondsToSeconds(ttlMs)); } @SuppressWarnings({"ConstantConditions", "ResultOfMethodCallIgnored"}) @@ -2930,7 +2964,7 @@ private boolean charge(String key, long blobSizeInBytes, AtomicBoolean requiresD return immediateFuture(null); } expiredKeyCounter.inc(); - log.log(Level.INFO, format("expired key %s", expiredKey)); + log.log(Level.FINE, format("expired key %s", expiredKey)); return immediateFuture(fileEntryKey.getDigest()); }, expireService)); @@ -3136,7 +3170,7 @@ void commit() throws IOException { existingEntry = safeStorageInsertion(key, entry); inserted = existingEntry == null; } catch (FileAlreadyExistsException e) { - log.log(Level.FINE, "file already exists for " + key + ", nonexistent entry will fail"); + log.log(Level.FINER, "file already exists for " + key + ", nonexistent entry will fail"); } finally { if (Files.exists(writePath)) { Files.delete(writePath); @@ -3163,20 +3197,20 @@ void commit() throws IOException { } if (existingEntry != null) { - log.log(Level.FINE, "lost the race to insert " + key); + log.log(Level.FINER, "lost the race to insert " + key); if (!referenceIfExists(key)) { // we would lose our accountability and have a presumed reference if we returned throw new IllegalStateException("storage conflict with existing key for " + key); } } else if (writeWinner.get()) { - log.log(Level.FINE, "won the race to insert " + key); + log.log(Level.FINER, "won the race to insert " + key); try { onInsert.run(); } catch (RuntimeException e) { throw new IOException(e); } } else { - log.log(Level.FINE, "did not win the race to insert " + key); + log.log(Level.FINER, "did not win the race to insert " + key); } } }; @@ -3230,7 +3264,7 @@ public boolean incrementReference() { "entry " + key + " has " + referenceCount + " references and is being incremented..."); } log.log( - Level.FINER, + Level.FINEST, "incrementing references to " + key + " from " @@ -3260,7 +3294,7 @@ public boolean decrementReference(Entry header) { "entry " + key + " has 0 references and is being decremented..."); } log.log( - Level.FINER, + Level.FINEST, "decrementing references to " + key + " from " diff --git a/src/main/java/build/buildfarm/common/LoggingMain.java b/src/main/java/build/buildfarm/common/LoggingMain.java index 9a4d9bd2b0..4e60a6286a 100644 --- a/src/main/java/build/buildfarm/common/LoggingMain.java +++ b/src/main/java/build/buildfarm/common/LoggingMain.java @@ -7,24 +7,22 @@ public abstract class LoggingMain { protected abstract void onShutdown() throws InterruptedException; - class ShutdownThread extends Thread { - ShutdownThread(String applicationName) { - super(null, null, applicationName + "-Shutdown", 0); - } - - @Override - public void run() { - try { - LoggingMain.this.onShutdown(); - } catch (InterruptedException e) { - 
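With the publishTtlMetric flag gone, deleteExpiredKey always computes a TTL as the wall-clock delta between now and the file's last-modified time, observed in seconds on the cas_ttl_s histogram registered above. A minimal sketch of that computation (the class wrapper and observeTtl name are illustrative only):

```java
import io.prometheus.client.Histogram;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

class TtlMetricSketch {
  static final Histogram casTtl =
      Histogram.build()
          .name("cas_ttl_s")
          .buckets(3600, 21600, 86400, 345600, 604800, 1210000)
          .help("The amount of time CAS entries live on L1 storage before expiration (seconds)")
          .register();

  static void observeTtl(Path path) throws IOException {
    // equivalent to Files.getLastModifiedTime(path).to(MILLISECONDS) in the patch
    long createdTimeMs = Files.getLastModifiedTime(path).toMillis();
    long ttlMs = System.currentTimeMillis() - createdTimeMs;
    casTtl.observe(ttlMs / 1000.0); // histogram buckets are in seconds
  }
}
```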
Thread.currentThread().interrupt(); - } finally { - WaitingLogManager.release(); - } + private void shutdown() { + try { + onShutdown(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } finally { + WaitingLogManager.release(); } } protected LoggingMain(String applicationName) { - Runtime.getRuntime().addShutdownHook(new ShutdownThread(applicationName)); + Runtime.getRuntime() + .addShutdownHook( + new Thread( + /* group=*/ null, + /* target=*/ this::shutdown, + /* name=*/ applicationName + "-Shutdown")); } } diff --git a/src/main/java/build/buildfarm/common/config/Admin.java b/src/main/java/build/buildfarm/common/config/Admin.java index 07deb4ce70..f4f8168225 100644 --- a/src/main/java/build/buildfarm/common/config/Admin.java +++ b/src/main/java/build/buildfarm/common/config/Admin.java @@ -11,5 +11,7 @@ public enum DEPLOYMENT_ENVIRONMENT { private DEPLOYMENT_ENVIRONMENT deploymentEnvironment; private String clusterEndpoint; + // This configuration is deprecated but is left here for backwards compatibility. Use + // worker:gracefulShutdownSeconds instead. private boolean enableGracefulShutdown; } diff --git a/src/main/java/build/buildfarm/common/config/Backplane.java b/src/main/java/build/buildfarm/common/config/Backplane.java index d8e12de592..8cf6f41e8c 100644 --- a/src/main/java/build/buildfarm/common/config/Backplane.java +++ b/src/main/java/build/buildfarm/common/config/Backplane.java @@ -1,7 +1,10 @@ package build.buildfarm.common.config; -import com.google.common.base.Strings; +import java.util.ArrayList; +import java.util.List; +import lombok.AccessLevel; import lombok.Data; +import lombok.Getter; @Data public class Backplane { @@ -32,8 +35,13 @@ public enum BACKPLANE_TYPE { private String operationChannelPrefix = "OperationChannel"; private String casPrefix = "ContentAddressableStorage"; private int casExpire = 604800; // 1 Week - private boolean subscribeToBackplane = true; - private boolean runFailsafeOperation = true; + + @Getter(AccessLevel.NONE) + private boolean subscribeToBackplane = true; // deprecated + + @Getter(AccessLevel.NONE) + private boolean runFailsafeOperation = true; // deprecated + private int maxQueueDepth = 100000; private int maxPreQueueDepth = 1000000; private boolean priorityQueue = false; @@ -47,13 +55,7 @@ public enum BACKPLANE_TYPE { private boolean cacheCas = false; private long priorityPollIntervalMillis = 100; - public String getRedisUri() { - // use environment override (useful for containerized deployment) - if (!Strings.isNullOrEmpty(System.getenv("REDIS_URI"))) { - return System.getenv("REDIS_URI"); - } - - // use configured value - return redisUri; - } + // These limited resources are shared across all workers. + // An example would be a limited number of seats to a license server. 
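The LoggingMain change above swaps the ShutdownThread inner class for a Runnable-based hook with the same behavior: on JVM exit, onShutdown() runs and WaitingLogManager is released afterward. A hypothetical subclass showing how the hook is intended to be used:

```java
// ExampleService is illustrative; any long-lived buildfarm entry point
// extending LoggingMain gets an "<applicationName>-Shutdown" hook registered.
class ExampleService extends LoggingMain {
  ExampleService() {
    super("ExampleService");
  }

  @Override
  protected void onShutdown() throws InterruptedException {
    // stop servers, join background threads, flush any pending state
  }
}
```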
+ private List resources = new ArrayList<>(); } diff --git a/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java b/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java index 432a11a90a..d6ebbe9e14 100644 --- a/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java +++ b/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java @@ -36,10 +36,10 @@ public final class BuildfarmConfigs { private long maximumActionTimeout = 3600; private long maxEntrySizeBytes = 2147483648L; // 2 * 1024 * 1024 * 1024 private int prometheusPort = 9090; + private boolean allowSymlinkTargetAbsolute = false; private Server server = new Server(); private Backplane backplane = new Backplane(); private Worker worker = new Worker(); - private WebUI ui = new WebUI(); private ExecutionWrappers executionWrappers = new ExecutionWrappers(); private BuildfarmConfigs() {} @@ -78,6 +78,12 @@ public static BuildfarmConfigs loadServerConfigs(String[] args) throws Configura if (options.port > 0) { buildfarmConfigs.getServer().setPort(options.port); } + if (options.prometheusPort >= 0) { + buildfarmConfigs.setPrometheusPort(options.prometheusPort); + } + if (!Strings.isNullOrEmpty(options.redisUri)) { + buildfarmConfigs.getBackplane().setRedisUri(options.redisUri); + } adjustServerConfigs(buildfarmConfigs); return buildfarmConfigs; } @@ -94,6 +100,12 @@ public static BuildfarmConfigs loadWorkerConfigs(String[] args) throws Configura if (!Strings.isNullOrEmpty(options.publicName)) { buildfarmConfigs.getWorker().setPublicName(options.publicName); } + if (options.prometheusPort >= 0) { + buildfarmConfigs.setPrometheusPort(options.prometheusPort); + } + if (!Strings.isNullOrEmpty(options.redisUri)) { + buildfarmConfigs.getBackplane().setRedisUri(options.redisUri); + } adjustWorkerConfigs(buildfarmConfigs); return buildfarmConfigs; } diff --git a/src/main/java/build/buildfarm/common/config/BuildfarmOptions.java b/src/main/java/build/buildfarm/common/config/BuildfarmOptions.java new file mode 100644 index 0000000000..1dbe0c7f98 --- /dev/null +++ b/src/main/java/build/buildfarm/common/config/BuildfarmOptions.java @@ -0,0 +1,36 @@ +// Copyright 2023 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package build.buildfarm.common.config; + +import com.google.devtools.common.options.Option; +import com.google.devtools.common.options.OptionsBase; + +/** Command-line options shared by the Buildfarm server and worker. */ +public class BuildfarmOptions extends OptionsBase { + @Option(name = "help", abbrev = 'h', help = "Prints usage info.", defaultValue = "true") + public boolean help; + + @Option( + name = "prometheus_port", + help = "Port for the prometheus service. '0' will disable prometheus hosting", + defaultValue = "-1") + public int prometheusPort; + + @Option( + name = "redis_uri", + help = "URI for Redis connection.
Use 'redis://' or 'rediss://' for the scheme", + defaultValue = "") + public String redisUri; +} diff --git a/src/main/java/build/buildfarm/common/config/Cas.java b/src/main/java/build/buildfarm/common/config/Cas.java index 5b16fb2168..9c671d4958 100644 --- a/src/main/java/build/buildfarm/common/config/Cas.java +++ b/src/main/java/build/buildfarm/common/config/Cas.java @@ -3,7 +3,9 @@ import com.google.common.base.Strings; import java.nio.file.Path; import javax.naming.ConfigurationException; +import lombok.AccessLevel; import lombok.Data; +import lombok.Getter; @Data public class Cas { @@ -30,8 +32,8 @@ public enum TYPE { private String target; private boolean readonly = false; - // Metrics - private boolean publishTtlMetric = false; + @Getter(AccessLevel.NONE) + private boolean publishTtlMetric = false; // deprecated public Path getValidPath(Path root) throws ConfigurationException { if (Strings.isNullOrEmpty(path)) { diff --git a/src/main/java/build/buildfarm/common/config/GrpcMetrics.java b/src/main/java/build/buildfarm/common/config/GrpcMetrics.java index a028035673..8855ae5fdb 100644 --- a/src/main/java/build/buildfarm/common/config/GrpcMetrics.java +++ b/src/main/java/build/buildfarm/common/config/GrpcMetrics.java @@ -27,7 +27,7 @@ public static void handleGrpcMetricIntercepts( // provide custom latency buckets if (grpcMetrics.getLatencyBuckets() != null) { - grpcConfig.withLatencyBuckets(grpcMetrics.getLatencyBuckets()); + grpcConfig = grpcConfig.withLatencyBuckets(grpcMetrics.getLatencyBuckets()); } // Apply config to create an interceptor and apply it to the GRPC server. diff --git a/src/main/java/build/buildfarm/common/config/LimitedResource.java b/src/main/java/build/buildfarm/common/config/LimitedResource.java new file mode 100644 index 0000000000..f3b09ff621 --- /dev/null +++ b/src/main/java/build/buildfarm/common/config/LimitedResource.java @@ -0,0 +1,43 @@ +// Copyright 2023 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package build.buildfarm.common.config; + +import lombok.Data; + +/** + * @class Limited Resource + * @brief A fixed amount of a specific resource. + * @details We define a limited resource as a counting semaphore whose configuration contains a name + * and a count representing a physical or logical group of units obtained by executors as a + * precondition to fulfill a long running operation. These units are released upon the + * operation's completion. The resource is requested by the action's platform properties. + */ +@Data +public class LimitedResource { + /** + * @field name + * @brief The name of the resource. + * @details This should correspond to the platform property's key name: + * resources:: + */ + public String name; + + /** + * @field amount + * @brief The total amount of the resource that's available for use during execution. + * @details As a counting semaphore, this amount becomes the limit. 
+ */ + public int amount = 1; +} diff --git a/src/main/java/build/buildfarm/common/config/Metrics.java b/src/main/java/build/buildfarm/common/config/Metrics.java index 73a9113d7a..28e065ddbf 100644 --- a/src/main/java/build/buildfarm/common/config/Metrics.java +++ b/src/main/java/build/buildfarm/common/config/Metrics.java @@ -1,6 +1,8 @@ package build.buildfarm.common.config; +import lombok.AccessLevel; import lombok.Data; +import lombok.Getter; @Data public class Metrics { @@ -19,7 +21,9 @@ public enum LOG_LEVEL { FINEST, } - private PUBLISHER publisher = PUBLISHER.LOG; + @Getter(AccessLevel.NONE) + private PUBLISHER publisher = PUBLISHER.LOG; // deprecated + private LOG_LEVEL logLevel = LOG_LEVEL.FINEST; private String topic; private int topicMaxConnections; diff --git a/src/main/java/build/buildfarm/common/config/Server.java b/src/main/java/build/buildfarm/common/config/Server.java index e169c2575b..0487154c2f 100644 --- a/src/main/java/build/buildfarm/common/config/Server.java +++ b/src/main/java/build/buildfarm/common/config/Server.java @@ -24,6 +24,7 @@ public enum INSTANCE_TYPE { private String sslPrivateKeyPath = null; private boolean runDispatchedMonitor = true; private int dispatchedMonitorIntervalSeconds = 1; + private boolean runFailsafeOperation = true; private boolean runOperationQueuer = true; private boolean ensureOutputsPresent = false; private int maxRequeueAttempts = 5; @@ -41,6 +42,7 @@ public enum INSTANCE_TYPE { private int maxInboundMetadataSize = 0; private ServerCacheConfigs caches = new ServerCacheConfigs(); private boolean findMissingBlobsViaBackplane = false; + private int gracefulShutdownSeconds = 0; public String getSession() { return String.format("buildfarm-server-%s-%s", getPublicName(), sessionGuid); diff --git a/src/main/java/build/buildfarm/common/config/ServerOptions.java b/src/main/java/build/buildfarm/common/config/ServerOptions.java index 5c1e00a0d5..35f47d6d13 100644 --- a/src/main/java/build/buildfarm/common/config/ServerOptions.java +++ b/src/main/java/build/buildfarm/common/config/ServerOptions.java @@ -15,13 +15,9 @@ package build.buildfarm.common.config; import com.google.devtools.common.options.Option; -import com.google.devtools.common.options.OptionsBase; /** Command-line options definition for example server. */ -public class ServerOptions extends OptionsBase { - @Option(name = "help", abbrev = 'h', help = "Prints usage info.", defaultValue = "true") - public boolean help; - +public class ServerOptions extends BuildfarmOptions { @Option(name = "port", abbrev = 'p', help = "Port to use.", defaultValue = "-1") public int port; diff --git a/src/main/java/build/buildfarm/common/config/ShardWorkerOptions.java b/src/main/java/build/buildfarm/common/config/ShardWorkerOptions.java index 7671ae0a4f..c116d31673 100644 --- a/src/main/java/build/buildfarm/common/config/ShardWorkerOptions.java +++ b/src/main/java/build/buildfarm/common/config/ShardWorkerOptions.java @@ -15,13 +15,9 @@ package build.buildfarm.common.config; import com.google.devtools.common.options.Option; -import com.google.devtools.common.options.OptionsBase; /** Command-line options definition for Worker. 
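The LimitedResource described above is a named counting semaphore: both the backplane-level resources list above (shared across workers) and the per-worker list that follows declare a name and an amount, and executors take a unit before running an action that requests the resource via the "resources:" platform property and return it on completion. A hypothetical sketch of that gating using java.util.concurrent.Semaphore; the ResourceLimiter class and its method names are illustrative, not buildfarm's executor code:

```java
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;

class ResourceLimiter {
  private final Map<String, Semaphore> semaphores = new ConcurrentHashMap<>();

  // amount corresponds to LimitedResource.amount
  void register(String name, int amount) {
    semaphores.put(name, new Semaphore(amount));
  }

  // block until a unit is free, e.g. before dispatching an action whose
  // platform requests one unit of the named resource
  void acquire(String name) throws InterruptedException {
    semaphores.get(name).acquire();
  }

  // return the unit once the long running operation completes
  void release(String name) {
    semaphores.get(name).release();
  }
}
```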
*/ -public class ShardWorkerOptions extends OptionsBase { - @Option(name = "help", abbrev = 'h', help = "Prints usage info.", defaultValue = "true") - public boolean help; - +public class ShardWorkerOptions extends BuildfarmOptions { @Option( name = "root", help = "Root base directory for all work being performed.", diff --git a/src/main/java/build/buildfarm/common/config/WebUI.java b/src/main/java/build/buildfarm/common/config/WebUI.java deleted file mode 100644 index 443d210691..0000000000 --- a/src/main/java/build/buildfarm/common/config/WebUI.java +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build.buildfarm.common.config; - -import lombok.Data; - -/** - * @class WebUI - * @brief Settings for buildfarm's web UI. - * @details Buildfarm provides a web frontend for developers to introspect builds. - */ -@Data -public class WebUI { - /** - * @field enable - * @brief Whether to enable the web frontend. - * @details When disabled there will be no ports opened or routes available. - */ - public boolean enable = false; - - /** - * @field port - * @brief HTTP port for the web frontend. - * @details 8080 is useful for local testing since port 80 requires sudo. We choose the following - * default since many ports are blocked in upstream CI. - */ - public String port = "8982"; -} diff --git a/src/main/java/build/buildfarm/common/config/Worker.java b/src/main/java/build/buildfarm/common/config/Worker.java index 294b70dc2f..c446134fe2 100644 --- a/src/main/java/build/buildfarm/common/config/Worker.java +++ b/src/main/java/build/buildfarm/common/config/Worker.java @@ -5,6 +5,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; import javax.naming.ConfigurationException; @@ -28,15 +29,22 @@ public class Worker { private int inputFetchStageWidth = 0; private int inputFetchDeadline = 60; private boolean linkInputDirectories = true; - private List realInputDirectories = Arrays.asList("external"); + private List linkedInputDirectories = Arrays.asList("(?!external)[^/]+"); private String execOwner; private int defaultMaxCores = 0; private boolean limitGlobalExecution = false; private boolean onlyMulticoreTests = false; private boolean allowBringYourOwnContainer = false; private boolean errorOperationRemainingResources = false; + private int gracefulShutdownSeconds = 0; private ExecutionPolicy[] executionPolicies = {}; private SandboxSettings sandboxSettings = new SandboxSettings(); + private boolean createSymlinkOutputs = false; + + // These limited resources are only for the individual worker. + // An example would be hardware resources such as GPUs. + // If you want GPU actions to run exclusively, define a single GPU resource. 
+ private List resources = new ArrayList<>(); public ExecutionPolicy[] getExecutionPolicies() { if (executionPolicies != null) { diff --git a/src/main/java/build/buildfarm/common/grpc/BUILD b/src/main/java/build/buildfarm/common/grpc/BUILD index 7bff874ad8..af68ba7d63 100644 --- a/src/main/java/build/buildfarm/common/grpc/BUILD +++ b/src/main/java/build/buildfarm/common/grpc/BUILD @@ -13,6 +13,7 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@maven//:org_projectlombok_lombok", diff --git a/src/main/java/build/buildfarm/common/grpc/Channels.java b/src/main/java/build/buildfarm/common/grpc/Channels.java new file mode 100644 index 0000000000..0531218f23 --- /dev/null +++ b/src/main/java/build/buildfarm/common/grpc/Channels.java @@ -0,0 +1,40 @@ +// Copyright 2023 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package build.buildfarm.common.grpc; + +import io.grpc.ManagedChannel; +import io.grpc.netty.NegotiationType; +import io.grpc.netty.NettyChannelBuilder; + +public final class Channels { + private static final String GRPCS_URL_PREFIX = "grpcs://"; + private static final String GRPC_URL_PREFIX = "grpc://"; + + private Channels() {} + + public static ManagedChannel createChannel(String target) { + NegotiationType negotiationType = NegotiationType.PLAINTEXT; + if (target.startsWith(GRPCS_URL_PREFIX)) { + target = target.substring(GRPCS_URL_PREFIX.length()); + negotiationType = NegotiationType.TLS; + } else if (target.startsWith(GRPC_URL_PREFIX)) { + target = target.substring(GRPC_URL_PREFIX.length()); + negotiationType = NegotiationType.PLAINTEXT; + } + NettyChannelBuilder builder = + NettyChannelBuilder.forTarget(target).negotiationType(negotiationType); + return builder.build(); + } +} diff --git a/src/main/java/build/buildfarm/common/grpc/StubWriteOutputStream.java b/src/main/java/build/buildfarm/common/grpc/StubWriteOutputStream.java index 26b67f5c6c..4e500398b2 100644 --- a/src/main/java/build/buildfarm/common/grpc/StubWriteOutputStream.java +++ b/src/main/java/build/buildfarm/common/grpc/StubWriteOutputStream.java @@ -227,6 +227,14 @@ public void onNext(WriteResponse response) { @Override public void onError(Throwable t) { + log.log( + WARNING, + format( + "%s: write(%s) on worker %s after %d bytes of content", + Status.fromThrowable(t).getCode().name(), + resourceName, + bsStub.get().getChannel().authority(), + writtenBytes)); writeFuture.setException(exceptionTranslator.apply(t)); } diff --git a/src/main/java/build/buildfarm/common/redis/BalancedRedisQueue.java b/src/main/java/build/buildfarm/common/redis/BalancedRedisQueue.java index 52f937c8ed..9123f5eaf9 100644 --- a/src/main/java/build/buildfarm/common/redis/BalancedRedisQueue.java +++ b/src/main/java/build/buildfarm/common/redis/BalancedRedisQueue.java @@ -236,6 +236,17 @@ public String dequeue(JedisCluster jedis) throws 
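Channels.createChannel above keys negotiation off the target's scheme prefix and strips it before handing the bare authority to NettyChannelBuilder. A short usage sketch with hypothetical endpoints:

```java
import build.buildfarm.common.grpc.Channels;
import io.grpc.ManagedChannel;

class ChannelsExample {
  static void demo() {
    ManagedChannel tls = Channels.createChannel("grpcs://cas.example.com:8980"); // TLS
    ManagedChannel plain = Channels.createChannel("grpc://localhost:8980"); // PLAINTEXT
    ManagedChannel bare = Channels.createChannel("localhost:8980"); // PLAINTEXT by default
  }
}
```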
InterruptedException { } } + /** + * @brief Pop element into internal dequeue and return value. + * @details Null is returned if the queue is empty. + * @return The value of the transferred element. null if queue is empty or thread was interrupted. + * @note Suggested return identifier: val. + */ + public String nonBlockingDequeue(JedisCluster jedis) throws InterruptedException { + QueueInterface queue = queues.get(roundRobinPopIndex()); + return queue.nonBlockingDequeue(jedis); + } + /** * @brief Get the current pop queue. * @details Get the queue that the balanced queue intends to pop from next. diff --git a/src/main/java/build/buildfarm/common/services/BUILD b/src/main/java/build/buildfarm/common/services/BUILD index f2f643001e..9a885cbafe 100644 --- a/src/main/java/build/buildfarm/common/services/BUILD +++ b/src/main/java/build/buildfarm/common/services/BUILD @@ -23,7 +23,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_services", "@maven//:io_grpc_grpc_stub", diff --git a/src/main/java/build/buildfarm/common/services/ByteStreamService.java b/src/main/java/build/buildfarm/common/services/ByteStreamService.java index f457eee7fe..d2003c32e3 100644 --- a/src/main/java/build/buildfarm/common/services/ByteStreamService.java +++ b/src/main/java/build/buildfarm/common/services/ByteStreamService.java @@ -341,7 +341,7 @@ public void read(ReadRequest request, StreamObserver responseObser long offset = request.getReadOffset(); long limit = request.getReadLimit(); log.log( - Level.FINER, + Level.FINEST, format("read resource_name=%s offset=%d limit=%d", resourceName, offset, limit)); try { @@ -356,7 +356,7 @@ public void queryWriteStatus( QueryWriteStatusRequest request, StreamObserver responseObserver) { String resourceName = request.getResourceName(); try { - log.log(Level.FINE, format("queryWriteStatus(%s)", resourceName)); + log.log(Level.FINER, format("queryWriteStatus(%s)", resourceName)); Write write = getWrite(resourceName); responseObserver.onNext( QueryWriteStatusResponse.newBuilder() @@ -365,7 +365,7 @@ public void queryWriteStatus( .build()); responseObserver.onCompleted(); log.log( - Level.FINE, + Level.FINER, format( "queryWriteStatus(%s) => committed_size = %d, complete = %s", resourceName, write.getCommittedSize(), write.isComplete())); diff --git a/src/main/java/build/buildfarm/common/services/ContentAddressableStorageService.java b/src/main/java/build/buildfarm/common/services/ContentAddressableStorageService.java index 9395d63fea..6e00e39f67 100644 --- a/src/main/java/build/buildfarm/common/services/ContentAddressableStorageService.java +++ b/src/main/java/build/buildfarm/common/services/ContentAddressableStorageService.java @@ -109,7 +109,7 @@ public void onSuccess(FindMissingBlobsResponse.Builder builder) { long elapsedMicros = stopwatch.elapsed(MICROSECONDS); missingBlobs.observe(request.getBlobDigestsList().size()); log.log( - Level.FINE, + Level.FINER, "FindMissingBlobs(" + instance.getName() + ") for " diff --git a/src/main/java/build/buildfarm/common/services/WriteStreamObserver.java b/src/main/java/build/buildfarm/common/services/WriteStreamObserver.java index 4e0d28d68b..5bb88bd010 100644 --- a/src/main/java/build/buildfarm/common/services/WriteStreamObserver.java +++ b/src/main/java/build/buildfarm/common/services/WriteStreamObserver.java @@ -111,7 +111,7 @@ public synchronized void onNext(WriteRequest request) { Status
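Unlike the blocking dequeue above it, BalancedRedisQueue.nonBlockingDequeue returns null immediately when the selected internal queue is empty, which is what lets a caller rotate across several queues (as OperationQueue does further below) instead of parking on one. A hedged usage sketch:

```java
import build.buildfarm.common.redis.BalancedRedisQueue;
import redis.clients.jedis.JedisCluster;

class DequeueExample {
  // returns null rather than waiting when nothing is queued
  static String tryDequeue(BalancedRedisQueue queue, JedisCluster jedis)
      throws InterruptedException {
    String value = queue.nonBlockingDequeue(jedis);
    if (value == null) {
      // empty here; the caller is free to try another eligible queue
    }
    return value;
  }
}
```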
status = Status.fromThrowable(e); if (errorResponse(status.asException())) { log.log( - status.getCode() == Status.Code.CANCELLED ? Level.FINE : Level.SEVERE, + status.getCode() == Status.Code.CANCELLED ? Level.FINER : Level.SEVERE, format("error writing %s", (name == null ? request.getResourceName() : name)), e); } @@ -156,7 +156,7 @@ synchronized void commitSynchronized(long committedSize) { if (Context.current().isCancelled()) { log.log( - Level.FINER, + Level.FINEST, format("skipped delivering committed_size to %s for cancelled context", name)); } else { try { @@ -178,7 +178,7 @@ synchronized void commitSynchronized(long committedSize) { Status status = Status.fromThrowable(e); if (errorResponse(status.asException())) { log.log( - status.getCode() == Status.Code.CANCELLED ? Level.FINE : Level.SEVERE, + status.getCode() == Status.Code.CANCELLED ? Level.FINER : Level.SEVERE, format( "%s-%s: %s -> %s -> %s: error committing %s", requestMetadata.getToolDetails().getToolName(), @@ -198,7 +198,8 @@ void commitActive(long committedSize) { if (exception.compareAndSet(null, null)) { try { - log.log(Level.FINER, format("delivering committed_size for %s of %d", name, committedSize)); + log.log( + Level.FINEST, format("delivering committed_size for %s of %d", name, committedSize)); responseObserver.onNext(response); responseObserver.onCompleted(); } catch (Exception e) { @@ -218,9 +219,9 @@ private void initialize(WriteRequest request) throws InvalidResourceNameExceptio name = resourceName; try { write = getWrite(resourceName); - if (log.isLoggable(Level.FINER)) { + if (log.isLoggable(Level.FINEST)) { log.log( - Level.FINER, + Level.FINEST, format( "registering callback for %s: committed_size = %d (transient), complete = %s", resourceName, write.getCommittedSize(), write.isComplete())); @@ -287,6 +288,13 @@ private boolean errorResponse(Throwable t) { requestMetadata.getToolInvocationId(), requestMetadata.getActionId(), name)); + } else { + log.log( + Level.WARNING, + format( + "error %s after %d requests and %d bytes at offset %d", + name, requestCount, requestBytes, earliestOffset), + t); } return true; } @@ -380,7 +388,7 @@ private void handleWrite(String resourceName, long offset, ByteString data, bool data = data.substring(skipBytes); } log.log( - Level.FINER, + Level.FINEST, format( "writing %d to %s at %d%s", bytesToWrite, name, offset, finishWrite ? 
" with finish_write" : "")); @@ -396,7 +404,7 @@ private void handleWrite(String resourceName, long offset, ByteString data, bool @GuardedBy("this") private void close() { - log.log(Level.FINER, format("closing stream due to finishWrite for %s", name)); + log.log(Level.FINEST, format("closing stream due to finishWrite for %s", name)); try { getOutput().close(); } catch (DigestMismatchException e) { @@ -484,11 +492,11 @@ private FeedbackOutputStream getOutput() throws IOException { @Override public void onError(Throwable t) { - log.log(Level.FINE, format("write error for %s", name), t); + log.log(Level.FINER, format("write error for %s", name), t); } @Override public void onCompleted() { - log.log(Level.FINE, format("write completed for %s", name)); + log.log(Level.FINER, format("write completed for %s", name)); } } diff --git a/src/main/java/build/buildfarm/instance/MatchListener.java b/src/main/java/build/buildfarm/instance/MatchListener.java index 46f6dfc000..3da7df90eb 100644 --- a/src/main/java/build/buildfarm/instance/MatchListener.java +++ b/src/main/java/build/buildfarm/instance/MatchListener.java @@ -27,7 +27,4 @@ public interface MatchListener { boolean onEntry(@Nullable QueueEntry queueEntry) throws InterruptedException; void onError(Throwable t); - - // method that should be called when this match is cancelled and no longer valid - void setOnCancelHandler(Runnable onCancelHandler); } diff --git a/src/main/java/build/buildfarm/instance/server/BUILD b/src/main/java/build/buildfarm/instance/server/BUILD index 601519c867..1c63e9896c 100644 --- a/src/main/java/build/buildfarm/instance/server/BUILD +++ b/src/main/java/build/buildfarm/instance/server/BUILD @@ -1,8 +1,8 @@ java_library( name = "server", srcs = [ - "AbstractServerInstance.java", "GetDirectoryFunction.java", + "NodeInstance.java", "OperationsMap.java", "WatchFuture.java", ], @@ -27,7 +27,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@maven//:io_netty_netty_codec_http", diff --git a/src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java b/src/main/java/build/buildfarm/instance/server/NodeInstance.java similarity index 98% rename from src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java rename to src/main/java/build/buildfarm/instance/server/NodeInstance.java index 611a23e9b9..c3cfe4d97e 100644 --- a/src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java +++ b/src/main/java/build/buildfarm/instance/server/NodeInstance.java @@ -145,7 +145,7 @@ import lombok.extern.java.Log; @Log -public abstract class AbstractServerInstance implements Instance { +public abstract class NodeInstance implements Instance { private final String name; protected final ContentAddressableStorage contentAddressableStorage; protected final ActionCache actionCache; @@ -179,6 +179,8 @@ public abstract class AbstractServerInstance implements Instance { public static final String ENVIRONMENT_VARIABLES_NOT_SORTED = "The `Command`'s `environment_variables` are not correctly sorted by `name`."; + public static final String SYMLINK_TARGET_ABSOLUTE = "A symlink target is absolute."; + public static final String MISSING_ACTION = "The action was not found in the CAS."; public static final String MISSING_COMMAND = "The command was not found in the CAS."; @@ -228,7 +230,7 @@ public abstract class AbstractServerInstance implements Instance { public 
static final String NO_REQUEUE_COMPLETE_MESSAGE = "Operation %s not requeued. Operation has already completed."; - public AbstractServerInstance( + public NodeInstance( String name, DigestUtil digestUtil, ContentAddressableStorage contentAddressableStorage, @@ -802,6 +804,7 @@ public static void validateActionInputDirectory( Stack pathDigests, Set visited, Map directoriesIndex, + boolean allowSymlinkTargetAbsolute, Consumer onInputFile, Consumer onInputDirectory, Consumer onInputDigest, @@ -850,6 +853,14 @@ public static void validateActionInputDirectory( .setSubject("/" + directoryPath + ": " + lastSymlinkName + " > " + symlinkName) .setDescription(DIRECTORY_NOT_SORTED); } + String symlinkTarget = symlinkNode.getTarget(); + if (!allowSymlinkTargetAbsolute && symlinkTarget.charAt(0) == '/') { + preconditionFailure + .addViolationsBuilder() + .setType(VIOLATION_TYPE_INVALID) + .setSubject("/" + directoryPath + ": " + symlinkName + " -> " + symlinkTarget) + .setDescription(SYMLINK_TARGET_ABSOLUTE); + } /* FIXME serverside validity check? regex? Preconditions.checkState( isValidFilename(symlinkName), @@ -919,6 +930,7 @@ public static void validateActionInputDirectory( pathDigests, visited, directoriesIndex, + allowSymlinkTargetAbsolute, onInputFile, onInputDirectory, onInputDigest, @@ -934,6 +946,7 @@ private static void validateActionInputDirectoryDigest( Stack pathDigests, Set visited, Map directoriesIndex, + boolean allowSymlinkTargetAbsolute, Consumer onInputFile, Consumer onInputDirectory, Consumer onInputDigest, @@ -958,6 +971,7 @@ private static void validateActionInputDirectoryDigest( pathDigests, visited, directoriesIndex, + allowSymlinkTargetAbsolute, onInputFile, onInputDirectory, onInputDigest, @@ -1180,6 +1194,9 @@ void validateCommand( } else { Directory directory = directoriesIndex.get(inputRootDigest); for (String segment : workingDirectory.split("/")) { + if (segment.equals(".")) { + continue; + } Directory nextDirectory = directory; // linear for now for (DirectoryNode dirNode : directory.getDirectoriesList()) { @@ -1212,12 +1229,16 @@ protected void validateAction( ImmutableSet.Builder inputFilesBuilder = ImmutableSet.builder(); inputDirectoriesBuilder.add(ACTION_INPUT_ROOT_DIRECTORY_PATH); + boolean allowSymlinkTargetAbsolute = + getCacheCapabilities().getSymlinkAbsolutePathStrategy() + == SymlinkAbsolutePathStrategy.Value.ALLOWED; validateActionInputDirectoryDigest( ACTION_INPUT_ROOT_DIRECTORY_PATH, action.getInputRootDigest(), new Stack<>(), new HashSet<>(), directoriesIndex, + allowSymlinkTargetAbsolute, inputFilesBuilder::add, inputDirectoriesBuilder::add, onInputDigest, @@ -1958,19 +1979,18 @@ public ServerCapabilities getCapabilities() { @Override public WorkerProfileMessage getWorkerProfile() { throw new UnsupportedOperationException( - "AbstractServerInstance doesn't support getWorkerProfile() method."); + "NodeInstance doesn't support getWorkerProfile() method."); } @Override public WorkerListMessage getWorkerList() { - throw new UnsupportedOperationException( - "AbstractServerInstance doesn't support getWorkerList() method."); + throw new UnsupportedOperationException("NodeInstance doesn't support getWorkerList() method."); } @Override public PrepareWorkerForGracefulShutDownRequestResults shutDownWorkerGracefully() { throw new UnsupportedOperationException( - "AbstractServerInstance doesn't support drainWorkerPipeline() method."); + "NodeInstance doesn't support drainWorkerPipeline() method."); } @Override diff --git 
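The validation added above rejects absolute symlink targets unless the cache capabilities report SymlinkAbsolutePathStrategy.ALLOWED, recording a VIOLATION_TYPE_INVALID violation with the new SYMLINK_TARGET_ABSOLUTE description. A simplified sketch of just the predicate; the empty-target guard is an extra safety added in this sketch, and the PreconditionFailure wiring is omitted:

```java
import build.bazel.remote.execution.v2.SymlinkNode;

class SymlinkCheckSketch {
  static boolean violatesAbsoluteTargetPolicy(
      SymlinkNode symlinkNode, boolean allowSymlinkTargetAbsolute) {
    String target = symlinkNode.getTarget();
    return !allowSymlinkTargetAbsolute && !target.isEmpty() && target.charAt(0) == '/';
  }
}
```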
a/src/main/java/build/buildfarm/instance/shard/BUILD b/src/main/java/build/buildfarm/instance/shard/BUILD index 9b11f1543e..38e6992aae 100644 --- a/src/main/java/build/buildfarm/instance/shard/BUILD +++ b/src/main/java/build/buildfarm/instance/shard/BUILD @@ -28,7 +28,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@maven//:io_prometheus_simpleclient", diff --git a/src/main/java/build/buildfarm/instance/shard/CasWorkerMap.java b/src/main/java/build/buildfarm/instance/shard/CasWorkerMap.java index 794b296a1f..55b2933e42 100644 --- a/src/main/java/build/buildfarm/instance/shard/CasWorkerMap.java +++ b/src/main/java/build/buildfarm/instance/shard/CasWorkerMap.java @@ -121,4 +121,11 @@ Map> getMap(RedisClient client, Iterable blobDigests * @note Suggested return identifier: mapSize. */ int size(RedisClient client) throws IOException; + + /** + * @brief Set the expiry duration for the digests. + * @param client Client used for interacting with redis when not using cacheMap. + * @param blobDigests The blob digests to set the new expiry duration for. + */ + void setExpire(RedisClient client, Iterable blobDigests) throws IOException; } diff --git a/src/main/java/build/buildfarm/instance/shard/JedisCasWorkerMap.java b/src/main/java/build/buildfarm/instance/shard/JedisCasWorkerMap.java index d035d10491..65d1f06b87 100644 --- a/src/main/java/build/buildfarm/instance/shard/JedisCasWorkerMap.java +++ b/src/main/java/build/buildfarm/instance/shard/JedisCasWorkerMap.java @@ -234,6 +234,17 @@ public int size(RedisClient client) throws IOException { return client.call(jedis -> ScanCount.get(jedis, name + ":*", 1000)); } + @Override + public void setExpire(RedisClient client, Iterable blobDigests) throws IOException { + client.run( + jedis -> { + for (Digest blobDigest : blobDigests) { + String key = redisCasKey(blobDigest); + jedis.expire(key, keyExpiration_s); + } + }); + } + /** * @brief Get the redis key name. * @details This is to be used for the direct redis implementation. diff --git a/src/main/java/build/buildfarm/instance/shard/JedisClusterFactory.java b/src/main/java/build/buildfarm/instance/shard/JedisClusterFactory.java index 922558b50b..2bfb48ac9e 100644 --- a/src/main/java/build/buildfarm/instance/shard/JedisClusterFactory.java +++ b/src/main/java/build/buildfarm/instance/shard/JedisClusterFactory.java @@ -30,6 +30,7 @@ import redis.clients.jedis.JedisPoolConfig; import redis.clients.jedis.ScanParams; import redis.clients.jedis.ScanResult; +import redis.clients.jedis.util.JedisURIHelper; /** * @class JedisClusterFactory * @brief Create a jedis cluster instance. * @details Use proto configuration to connect to a redis cluster server and provide a jedis * client. @@ -43,32 +44,31 @@ public class JedisClusterFactory { * @brief Create a jedis cluster instance. * @details Use proto configuration to connect to a redis cluster server and provide a jedis * client. - * @param config Configuration for connecting to a redis cluster server. + * @param identifier Redis Client name. * @return An established jedis client used to operate on the redis cluster. * @note Suggested return identifier: jedis.
+ * @link Redis Client name */ - public static Supplier create() throws ConfigurationException { + public static Supplier create(String identifier) throws ConfigurationException { // null password is required to elicit no auth in jedis String[] redisNodes = configs.getBackplane().getRedisNodes(); if (redisNodes != null && redisNodes.length > 0) { return createJedisClusterFactory( + identifier, list2Set(redisNodes), configs.getBackplane().getTimeout(), configs.getBackplane().getMaxAttempts(), - Strings.isNullOrEmpty(configs.getBackplane().getRedisPassword()) - ? null - : configs.getBackplane().getRedisPassword(), + Strings.emptyToNull(configs.getBackplane().getRedisPassword()), createJedisPoolConfig()); } // support "" as redis password. return createJedisClusterFactory( + identifier, parseUri(configs.getBackplane().getRedisUri()), configs.getBackplane().getTimeout(), configs.getBackplane().getMaxAttempts(), - Strings.isNullOrEmpty(configs.getBackplane().getRedisPassword()) - ? null - : configs.getBackplane().getRedisPassword(), + Strings.emptyToNull(configs.getBackplane().getRedisPassword()), createJedisPoolConfig()); } @@ -80,7 +80,7 @@ public static Supplier create() throws ConfigurationException { * @note Suggested return identifier: jedis. */ public static JedisCluster createTest() throws Exception { - JedisCluster redis = JedisClusterFactory.create().get(); + JedisCluster redis = JedisClusterFactory.create("test").get(); // use the client to create an empty redis cluster // this will prevent any persistent data across test runs @@ -151,7 +151,12 @@ private static void deleteExistingKeys(Jedis node) { * @note Suggested return identifier: jedis. */ private static Supplier createJedisClusterFactory( - URI redisUri, int timeout, int maxAttempts, String password, JedisPoolConfig poolConfig) { + String identifier, + URI redisUri, + int timeout, + int maxAttempts, + String password, + JedisPoolConfig poolConfig) { return () -> new JedisCluster( new HostAndPort(redisUri.getHost(), redisUri.getPort()), @@ -159,7 +164,9 @@ private static Supplier createJedisClusterFactory( /* soTimeout=*/ Integer.max(2000, timeout), Integer.max(5, maxAttempts), password, - poolConfig); + identifier, + poolConfig, + /* ssl=*/ JedisURIHelper.isRedisSSLScheme(redisUri)); } /** @@ -174,6 +181,7 @@ private static Supplier createJedisClusterFactory( * @note Suggested return identifier: jedis. */ private static Supplier createJedisClusterFactory( + String identifier, Set redisUrisNodes, int timeout, int maxAttempts, @@ -186,12 +194,13 @@ private static Supplier createJedisClusterFactory( /* soTimeout=*/ Integer.max(2000, timeout), Integer.max(5, maxAttempts), password, - poolConfig); + identifier, + poolConfig, + /* ssl=*/ false); } /** * @brief Create a jedis pool config. * @details Use configuration to build the appropriate jedis pool configuration. - * @param config Configuration for connecting to a redis cluster server. * @return A created jedis pool config. * @note Suggested return identifier: poolConfig. 
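createJedisClusterFactory above now derives the ssl flag from the configured URI via JedisURIHelper.isRedisSSLScheme, so the scheme alone selects TLS, matching the redis:// / rediss:// guidance in the new --redis_uri option. A small sketch with hypothetical hosts:

```java
import java.net.URI;
import redis.clients.jedis.util.JedisURIHelper;

class RedisSchemeExample {
  public static void main(String[] args) {
    // false: plain connection
    System.out.println(JedisURIHelper.isRedisSSLScheme(URI.create("redis://redis.example.com:6379")));
    // true: SSL/TLS connection
    System.out.println(JedisURIHelper.isRedisSSLScheme(URI.create("rediss://redis.example.com:6379")));
  }
}
```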
*/ diff --git a/src/main/java/build/buildfarm/instance/shard/OperationQueue.java b/src/main/java/build/buildfarm/instance/shard/OperationQueue.java index 64f1f2befa..78dd77c769 100644 --- a/src/main/java/build/buildfarm/instance/shard/OperationQueue.java +++ b/src/main/java/build/buildfarm/instance/shard/OperationQueue.java @@ -24,6 +24,7 @@ import com.google.common.collect.SetMultimap; import java.util.ArrayList; import java.util.List; +import java.util.stream.Collectors; import redis.clients.jedis.JedisCluster; /** @@ -48,6 +49,14 @@ public class OperationQueue { */ private final List queues; + /** + * @field currentDequeueIndex + * @brief The current queue index to dequeue from. + * @details Used in a round-robin fashion to ensure an even distribution of dequeues across + * matched queues. + */ + private int currentDequeueIndex = 0; + /** * @brief Constructor. * @details Construct the operation queue with various provisioned redis queues. @@ -186,8 +195,18 @@ public void push( */ public String dequeue(JedisCluster jedis, List provisions) throws InterruptedException { - BalancedRedisQueue queue = chooseEligibleQueue(provisions); - return queue.dequeue(jedis); + // Select all matched queues, and attempt dequeuing via round-robin. + List queues = chooseEligibleQueues(provisions); + int index = roundRobinPopIndex(queues); + String value = queues.get(index).nonBlockingDequeue(jedis); + + // Keep iterating over matched queues until we find one that is non-empty and provides a + // dequeued value. + while (value == null) { + index = roundRobinPopIndex(queues); + value = queues.get(index).nonBlockingDequeue(jedis); + } + return value; } /** @@ -270,6 +289,39 @@ private BalancedRedisQueue chooseEligibleQueue(List provision } } + throwNoEligibleQueueException(provisions); + return null; + } + + /** + * @brief Choose eligible queues based on given properties. + * @details We use the platform execution properties of a queue entry to determine the appropriate + * queues. If there are no eligible queues, an exception is thrown. + * @param provisions Provisions to check that requirements are met. + * @return The chosen queues. + * @note Suggested return identifier: queues. + */ + private List chooseEligibleQueues(List provisions) { + List eligibleQueues = + queues.stream() + .filter(provisionedQueue -> provisionedQueue.isEligible(toMultimap(provisions))) + .map(provisionedQueue -> provisionedQueue.queue()) + .collect(Collectors.toList()); + + if (eligibleQueues.isEmpty()) { + throwNoEligibleQueueException(provisions); + } + + return eligibleQueues; + } + + /** + * @brief Throw an exception that explains why there are no eligible queues. + * @details This function should only be called when there were no matched queues. + * @param provisions Provisions to check that requirements are met. + * @return no return. + */ + private void throwNoEligibleQueueException(List provisions) { // At this point, we were unable to match an action to an eligible queue. // We will build an error explaining why the matching failed. This will help users properly // configure their queues or adjust the execution_properties of their actions. @@ -286,6 +338,34 @@ private BalancedRedisQueue chooseEligibleQueue(List provision + eligibilityResults); } + /** + * @brief Get the current queue index for round-robin dequeues. + * @details Adjusts the round-robin index for the next call. + * @param matchedQueues The queues to round robin. + * @return The current round-robin index. + * @note Suggested return identifier: queueIndex.
+ */ + private int roundRobinPopIndex(List matchedQueues) { + int currentIndex = currentDequeueIndex; + currentDequeueIndex = nextQueueInRoundRobin(currentDequeueIndex, matchedQueues); + return currentIndex; + } + + /** + * @brief Get the next queue in the round robin. + * @details If we are currently on the last queue, it wraps around to the first queue. + * @param index Current queue index. + * @param matchedQueues The queues to round robin. + * @return An adjusted value based on the current queue index. + * @note Suggested return identifier: adjustedCurrentQueue. + */ + private int nextQueueInRoundRobin(int index, List matchedQueues) { + if (index >= matchedQueues.size() - 1) { + return 0; + } + return index + 1; + } + /** * @brief Convert proto provisions into java multimap. * @details This conversion is done to more easily check if a key/value exists in the provisions. diff --git a/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java b/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java index cc780707f7..68da71f811 100644 --- a/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java +++ b/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java @@ -129,6 +129,8 @@ public class RedisShardBackplane implements Backplane { .build()); private final String source; // used in operation change publication + private final boolean subscribeToBackplane; + private final boolean runFailsafeOperation; private final Function onPublish; private final Function onComplete; private final Supplier jedisClusterFactory; @@ -149,18 +151,30 @@ public class RedisShardBackplane implements Backplane { public RedisShardBackplane( String source, + boolean subscribeToBackplane, + boolean runFailsafeOperation, Function onPublish, Function onComplete) throws ConfigurationException { - this(source, onPublish, onComplete, JedisClusterFactory.create()); + this( + source, + subscribeToBackplane, + runFailsafeOperation, + onPublish, + onComplete, + JedisClusterFactory.create(source)); } public RedisShardBackplane( String source, + boolean subscribeToBackplane, + boolean runFailsafeOperation, Function onPublish, Function onComplete, Supplier jedisClusterFactory) { this.source = source; + this.subscribeToBackplane = subscribeToBackplane; + this.runFailsafeOperation = runFailsafeOperation; this.onPublish = onPublish; this.onComplete = onComplete; this.jedisClusterFactory = jedisClusterFactory; @@ -206,7 +220,7 @@ public void visit(String entry) { JsonFormat.parser().merge(entry, executeEntry); visit(executeEntry.build(), entry); } catch (InvalidProtocolBufferException e) { - log.log(Level.FINE, "invalid ExecuteEntry json: " + entry, e); + log.log(Level.FINER, "invalid ExecuteEntry json: " + entry, e); } } } @@ -330,10 +344,10 @@ private void updateWatchers(JedisCluster jedis) { if (!expiringChannels.isEmpty()) { log.log( - Level.FINE, + Level.FINER, format("Scan %d watches, %s, expiresAt: %s", expiringChannels.size(), now, expiresAt)); - log.log(Level.FINE, "Scan prequeue"); + log.log(Level.FINER, "Scan prequeue"); // scan prequeue, pet watches scanPrequeue(jedis, resetChannel); } @@ -342,7 +356,7 @@ private void updateWatchers(JedisCluster jedis) { scanProcessing(jedis, resetChannel, now); if (!expiringChannels.isEmpty()) { - log.log(Level.FINE, "Scan queue"); + log.log(Level.FINER, "Scan queue"); // scan queue, pet watches scanQueue(jedis, resetChannel); } @@ -351,7 +365,7 @@ private void updateWatchers(JedisCluster jedis) { scanDispatching(jedis, resetChannel, now); if
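roundRobinPopIndex and nextQueueInRoundRobin above implement a wrap-around cursor over the matched queues; for three matched queues the sequence of indices is 0, 1, 2, 0, 1, 2, and so on. A standalone sketch of the same cursor (the class name is illustrative):

```java
import java.util.List;

class RoundRobinSketch {
  private int currentDequeueIndex = 0;

  int popIndex(List<?> matchedQueues) {
    int current = currentDequeueIndex;
    // wrap around to the first queue after the last one
    currentDequeueIndex =
        (currentDequeueIndex >= matchedQueues.size() - 1) ? 0 : currentDequeueIndex + 1;
    return current;
  }
}
```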
(!expiringChannels.isEmpty()) { - log.log(Level.FINE, "Scan dispatched"); + log.log(Level.FINER, "Scan dispatched"); // scan dispatched pet watches scanDispatched(jedis, resetChannel); } @@ -445,7 +459,7 @@ public void updateWatchedIfDone(JedisCluster jedis) { } subscriber.onOperation(operationChannel(operationName), operation, nextExpiresAt(now)); log.log( - Level.FINE, + Level.FINER, format( "operation %s done due to %s", operationName, operation == null ? "null" : "completed")); @@ -520,10 +534,10 @@ public void start(String clientPublicName) throws IOException { // Create containers that make up the backplane state = DistributedStateCreator.create(client); - if (configs.getBackplane().isSubscribeToBackplane()) { + if (subscribeToBackplane) { startSubscriptionThread(); } - if (configs.getBackplane().isRunFailsafeOperation()) { + if (runFailsafeOperation) { startFailsafeOperationThread(); } @@ -538,24 +552,24 @@ public synchronized void stop() throws InterruptedException { if (failsafeOperationThread != null) { failsafeOperationThread.interrupt(); failsafeOperationThread.join(); - log.log(Level.FINE, "failsafeOperationThread has been stopped"); + log.log(Level.FINER, "failsafeOperationThread has been stopped"); } if (operationSubscription != null) { operationSubscription.stop(); if (subscriptionThread != null) { subscriptionThread.join(); } - log.log(Level.FINE, "subscriptionThread has been stopped"); + log.log(Level.FINER, "subscriptionThread has been stopped"); } if (subscriberService != null) { subscriberService.shutdown(); subscriberService.awaitTermination(10, SECONDS); - log.log(Level.FINE, "subscriberService has been stopped"); + log.log(Level.FINER, "subscriberService has been stopped"); } if (client != null) { client.close(); client = null; - log.log(Level.FINE, "client has been closed"); + log.log(Level.FINER, "client has been closed"); } } @@ -1500,4 +1514,9 @@ public GetClientStartTimeResult getClientStartTime(GetClientStartTimeRequest req } return GetClientStartTimeResult.newBuilder().addAllClientStartTime(startTimes).build(); } + + @Override + public void updateDigestsExpiry(Iterable digests) throws IOException { + state.casWorkerMap.setExpire(client, digests); + } } diff --git a/src/main/java/build/buildfarm/instance/shard/RedisShardSubscriber.java b/src/main/java/build/buildfarm/instance/shard/RedisShardSubscriber.java index 50357bd30d..cd1d65972e 100644 --- a/src/main/java/build/buildfarm/instance/shard/RedisShardSubscriber.java +++ b/src/main/java/build/buildfarm/instance/shard/RedisShardSubscriber.java @@ -142,7 +142,7 @@ public ListenableFuture watch(String channel, TimedWatcher watcher) { new TimedWatchFuture(watcher) { @Override public void unwatch() { - log.log(Level.FINE, format("unwatching %s", channel)); + log.log(Level.FINER, format("unwatching %s", channel)); RedisShardSubscriber.this.unwatch(channel, this); } }; @@ -204,7 +204,7 @@ private void onOperation( @Nullable Instant expiresAt) { List operationWatchers = watchers.get(channel); boolean observe = operation == null || operation.hasMetadata() || operation.getDone(); - log.log(Level.FINE, format("onOperation %s: %s", channel, operation)); + log.log(Level.FINER, format("onOperation %s: %s", channel, operation)); synchronized (watchers) { ImmutableList.Builder> observers = ImmutableList.builder(); for (TimedWatchFuture watchFuture : operationWatchers) { @@ -220,7 +220,7 @@ private void onOperation( executor.execute( () -> { if (observe) { - log.log(Level.FINE, "observing " + operation); + 
log.log(Level.FINER, "observing " + operation); observer.accept(operation); } }); diff --git a/src/main/java/build/buildfarm/instance/shard/RedissonCasWorkerMap.java b/src/main/java/build/buildfarm/instance/shard/RedissonCasWorkerMap.java index 234e15fb15..52b010c31f 100644 --- a/src/main/java/build/buildfarm/instance/shard/RedissonCasWorkerMap.java +++ b/src/main/java/build/buildfarm/instance/shard/RedissonCasWorkerMap.java @@ -209,6 +209,14 @@ public int size(RedisClient client) { return cacheMap.size(); } + @Override + public void setExpire(RedisClient client, Iterable blobDigests) { + for (Digest blobDigest : blobDigests) { + String key = cacheMapCasKey(blobDigest); + cacheMap.expireKey(key, keyExpiration_s, TimeUnit.SECONDS); + } + } + /** * @brief Get a random element from the set. * @details Assumes the set is not empty. diff --git a/src/main/java/build/buildfarm/instance/shard/RemoteInputStreamFactory.java b/src/main/java/build/buildfarm/instance/shard/RemoteInputStreamFactory.java index 640878707f..8a00e9ae96 100644 --- a/src/main/java/build/buildfarm/instance/shard/RemoteInputStreamFactory.java +++ b/src/main/java/build/buildfarm/instance/shard/RemoteInputStreamFactory.java @@ -30,7 +30,7 @@ import build.buildfarm.common.DigestUtil; import build.buildfarm.common.InputStreamFactory; import build.buildfarm.instance.Instance; -import build.buildfarm.instance.shard.ShardInstance.WorkersCallback; +import build.buildfarm.instance.shard.ServerInstance.WorkersCallback; import com.google.common.base.Throwables; import com.google.common.cache.LoadingCache; import com.google.common.collect.Iterables; diff --git a/src/main/java/build/buildfarm/instance/shard/ShardInstance.java b/src/main/java/build/buildfarm/instance/shard/ServerInstance.java similarity index 91% rename from src/main/java/build/buildfarm/instance/shard/ShardInstance.java rename to src/main/java/build/buildfarm/instance/shard/ServerInstance.java index 1ed4c8b41b..29edc29fd4 100644 --- a/src/main/java/build/buildfarm/instance/shard/ShardInstance.java +++ b/src/main/java/build/buildfarm/instance/shard/ServerInstance.java @@ -47,6 +47,7 @@ import build.bazel.remote.execution.v2.Action; import build.bazel.remote.execution.v2.ActionResult; import build.bazel.remote.execution.v2.BatchReadBlobsResponse.Response; +import build.bazel.remote.execution.v2.CacheCapabilities; import build.bazel.remote.execution.v2.Command; import build.bazel.remote.execution.v2.Compressor; import build.bazel.remote.execution.v2.Digest; @@ -61,6 +62,7 @@ import build.bazel.remote.execution.v2.Platform.Property; import build.bazel.remote.execution.v2.RequestMetadata; import build.bazel.remote.execution.v2.ResultsCachePolicy; +import build.bazel.remote.execution.v2.SymlinkAbsolutePathStrategy; import build.buildfarm.actioncache.ActionCache; import build.buildfarm.actioncache.ShardActionCache; import build.buildfarm.backplane.Backplane; @@ -80,7 +82,7 @@ import build.buildfarm.common.grpc.UniformDelegateServerCallStreamObserver; import build.buildfarm.instance.Instance; import build.buildfarm.instance.MatchListener; -import build.buildfarm.instance.server.AbstractServerInstance; +import build.buildfarm.instance.server.NodeInstance; import build.buildfarm.operations.EnrichedOperation; import build.buildfarm.operations.FindOperationsResults; import build.buildfarm.v1test.BackplaneStatus; @@ -136,10 +138,12 @@ import java.io.InputStream; import java.io.OutputStream; import java.time.Instant; +import java.util.AbstractMap; import java.util.ArrayDeque; import 
java.util.ArrayList; import java.util.Collections; import java.util.Deque; +import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; @@ -168,7 +172,7 @@ import lombok.extern.java.Log; @Log -public class ShardInstance extends AbstractServerInstance { +public class ServerInstance extends NodeInstance { private static final ListenableFuture IMMEDIATE_VOID_FUTURE = Futures.immediateFuture(null); private static final String TIMEOUT_OUT_OF_BOUNDS = @@ -257,13 +261,17 @@ public class ShardInstance extends AbstractServerInstance { private static Backplane createBackplane(String identifier) throws ConfigurationException { if (configs.getBackplane().getType().equals(SHARD)) { return new RedisShardBackplane( - identifier, ShardInstance::stripOperation, ShardInstance::stripQueuedOperation); + identifier, + /* subscribeToBackplane=*/ true, + configs.getServer().isRunFailsafeOperation(), + ServerInstance::stripOperation, + ServerInstance::stripQueuedOperation); } else { throw new IllegalArgumentException("Shard Backplane not set in config"); } } - public ShardInstance(String name, String identifier, DigestUtil digestUtil, Runnable onStop) + public ServerInstance(String name, String identifier, DigestUtil digestUtil, Runnable onStop) throws InterruptedException, ConfigurationException { this( name, @@ -273,7 +281,7 @@ public ShardInstance(String name, String identifier, DigestUtil digestUtil, Runn /* actionCacheFetchService=*/ BuildfarmExecutors.getActionCacheFetchServicePool()); } - private ShardInstance( + private ServerInstance( String name, DigestUtil digestUtil, Backplane backplane, @@ -325,7 +333,7 @@ void initializeCaches() { .build(); } - public ShardInstance( + public ServerInstance( String name, DigestUtil digestUtil, Backplane backplane, @@ -416,14 +424,14 @@ ListenableFuture iterate() throws IOException, InterruptedException { () -> {}, Deadline.after(5, MINUTES)); try { - log.log(Level.FINE, "queueing " + operationName); + log.log(Level.FINER, "queueing " + operationName); ListenableFuture queueFuture = queue(executeEntry, poller, queueTimeout); addCallback( queueFuture, new FutureCallback() { @Override public void onSuccess(Void result) { - log.log(Level.FINE, "successfully queued " + operationName); + log.log(Level.FINER, "successfully queued " + operationName); // nothing } @@ -437,7 +445,7 @@ public void onFailure(Throwable t) { long operationTransformDispatchUSecs = stopwatch.elapsed(MICROSECONDS) - canQueueUSecs; log.log( - Level.FINE, + Level.FINER, format( "OperationQueuer: Dispatched To Transform %s: %dus in canQueue, %dus in transform dispatch", operationName, canQueueUSecs, operationTransformDispatchUSecs)); @@ -452,7 +460,7 @@ public void onFailure(Throwable t) { @Override public void run() { - log.log(Level.FINE, "OperationQueuer: Running"); + log.log(Level.FINER, "OperationQueuer: Running"); try { while (transformTokensQueue.offer(new Object(), 5, MINUTES)) { stopwatch.start(); @@ -485,7 +493,7 @@ public void run() { } catch (Exception t) { log.log(Level.SEVERE, "OperationQueuer: fatal exception during iteration", t); } finally { - log.log(Level.FINE, "OperationQueuer: Exiting"); + log.log(Level.FINER, "OperationQueuer: Exiting"); } operationQueuer = null; try { @@ -570,12 +578,14 @@ public void stop() throws InterruptedException { return; } stopping = true; - log.log(Level.FINE, format("Instance %s is stopping", getName())); + log.log(Level.FINER, format("Instance %s is stopping", getName())); if (operationQueuer != null) { - 
operationQueuer.stop(); + operationQueuer.interrupt(); + operationQueuer.join(); } if (dispatchedMonitor != null) { dispatchedMonitor.interrupt(); + dispatchedMonitor.join(); } if (prometheusMetricsThread != null) { prometheusMetricsThread.interrupt(); @@ -606,7 +616,7 @@ public void stop() throws InterruptedException { } actionCacheFetchService.shutdownNow(); workerStubs.invalidateAll(); - log.log(Level.FINE, format("Instance %s has been stopped", getName())); + log.log(Level.FINER, format("Instance %s has been stopped", getName())); stopping = false; stopped = true; } @@ -647,81 +657,42 @@ public ListenableFuture> findMissingBlobs( return immediateFailedFuture(Status.fromThrowable(e).asException()); } - // Empty blobs are an exceptional case. Filter them out. - // If the user only requested empty blobs we can immedaitely tell them we already have it. + // Empty blobs are an exceptional case. Filter them out. + // If the user only requested empty blobs we can immediately tell them we already have it. Iterable nonEmptyDigests = Iterables.filter(blobDigests, (digest) -> digest.getSizeBytes() != 0); if (Iterables.isEmpty(nonEmptyDigests)) { return immediateFuture(ImmutableList.of()); } - // This is a faster strategy to check missing blobs which does not require querying the CAS. - // With hundreds of worker machines, it may be too expensive to query all of them for "find - // missing blobs". - // Workers register themselves with the backplane for a 30-second window, and if they fail to - // re-register within this time frame, they are automatically removed from the backplane. While - // this alternative strategy for finding missing blobs is faster and more cost-effective than - // the exhaustive approach of querying each worker to find the digest, it comes with a higher - // risk of returning expired workers despite filtering by active workers below. This is because - // the strategy may return workers that have expired in the last 30 seconds. However, checking - // workers directly is not a guarantee either since workers could leave the cluster after being - // queried. Ultimately, it will come down to the client's resiliency if the backplane is - // out-of-date and the server lies about which blobs are actually present. We provide this - // alternative strategy for calculating missing blobs. 
- if (configs.getServer().isFindMissingBlobsViaBackplane()) { - try { - Set uniqueDigests = new HashSet<>(); - nonEmptyDigests.forEach(uniqueDigests::add); - Map> foundBlobs = backplane.getBlobDigestsWorkers(uniqueDigests); - Set workerSet = backplane.getStorageWorkers(); - Map workersStartTime = backplane.getWorkersStartTimeInEpochSecs(workerSet); - return immediateFuture( - uniqueDigests.stream() - .filter( // best effort to present digests only missing on active workers - digest -> { - try { - Set initialWorkers = - foundBlobs.getOrDefault(digest, Collections.emptySet()); - Set activeWorkers = Sets.intersection(initialWorkers, workerSet); - long insertTime = backplane.getDigestInsertTime(digest); - Set workersStartedBeforeDigestInsertion = - activeWorkers.stream() - .filter( - worker -> - workersStartTime.getOrDefault( - worker, Instant.now().getEpochSecond()) - < insertTime) - .collect(Collectors.toSet()); - Set workersToBeRemoved = - Sets.difference(initialWorkers, workersStartedBeforeDigestInsertion) - .immutableCopy(); - if (!workersToBeRemoved.isEmpty()) { - log.log( - Level.INFO, format("adjusting locations for the digest %s", digest)); - backplane.adjustBlobLocations( - digest, Collections.emptySet(), workersToBeRemoved); - } - return workersStartedBeforeDigestInsertion.isEmpty(); - } catch (IOException e) { - // Treat error as missing digest. - log.log( - Level.WARNING, - format("failed to get digest (%s) insertion time", digest)); - return true; - } - }) - .collect(Collectors.toList())); - } catch (Exception e) { - log.log(Level.SEVERE, "find missing blob via backplane failed", e); - return immediateFailedFuture(Status.fromThrowable(e).asException()); - } + return findMissingBlobsViaBackplane(nonEmptyDigests, requestMetadata); } - // A more accurate way to verify missing blobs is to ask the CAS participants directly if they - // have the blobs. To do this, we get all of the worker nodes that are particpating in the CAS - // as a random list to begin our search. If there are no workers avaiable, tell the client all - // blobs are missing. + return findMissingBlobsQueryingEachWorker(nonEmptyDigests, requestMetadata); + } + + class FindMissingResponseEntry { + final String worker; + final long elapsedMicros; + final Throwable exception; + final int stillMissingAfter; + + FindMissingResponseEntry( + String worker, long elapsedMicros, Throwable exception, int stillMissingAfter) { + this.worker = worker; + this.elapsedMicros = elapsedMicros; + this.exception = exception; + this.stillMissingAfter = stillMissingAfter; + } + } + + // A more accurate way to verify missing blobs is to ask the CAS participants directly if they + // have the blobs. To do this, we get all the worker nodes that are participating in the CAS + // as a random list to begin our search. If there are no workers available, tell the client all + // blobs are missing. + private ListenableFuture> findMissingBlobsQueryingEachWorker( + Iterable nonEmptyDigests, RequestMetadata requestMetadata) { Deque workers; try { List workersList = new ArrayList<>(backplane.getStorageWorkers()); @@ -734,7 +705,7 @@ public ListenableFuture> findMissingBlobs( return immediateFuture(nonEmptyDigests); } - // Search through all of the workers to decide how many CAS blobs are missing. + // Search through all of the workers to decide which CAS blobs are missing. 
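For orientation, here is a minimal standalone sketch of the per-worker narrowing this comment describes; the `Cas` interface, the `String` digest keys, and `findMissing` are hypothetical simplifications of the worker stubs and `Digest` plumbing used in this class, not part of this change:

```java
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;

class FindMissingSketch {
  // Hypothetical stand-in for a worker CAS stub: returns the subset of digests the worker lacks.
  interface Cas {
    Set<String> findMissing(String worker, Set<String> digests);
  }

  // Pop workers one at a time; each worker is asked only about the digests still unaccounted
  // for, so the working set shrinks monotonically until it is empty or no workers remain.
  static Set<String> findMissingIteratively(Cas cas, Deque<String> workers, Set<String> digests) {
    Set<String> missing = new HashSet<>(digests);
    while (!workers.isEmpty() && !missing.isEmpty()) {
      String worker = workers.removeFirst();
      missing = new HashSet<>(cas.findMissing(worker, missing));
    }
    return missing; // digests that no queried worker reported having
  }
}
```

The production path is asynchronous (`findMissingBlobsOnWorker` chains through the same deque and records a `FindMissingResponseEntry` per worker), but the narrowing of the missing set is the same idea.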
SettableFuture> missingDigestsFuture = SettableFuture.create(); findMissingBlobsOnWorker( UUID.randomUUID().toString(), @@ -748,18 +719,118 @@ public ListenableFuture> findMissingBlobs( return missingDigestsFuture; } - class FindMissingResponseEntry { - final String worker; - final long elapsedMicros; - final Throwable exception; - final int stillMissingAfter; + // This is a faster strategy to check missing blobs which does not require querying the CAS. + // With hundreds of worker machines, it may be too expensive to query all of them for "find + // missing blobs". + // Workers register themselves with the backplane for a 30-second window, and if they fail to + // re-register within this time frame, they are automatically removed from the backplane. While + // this alternative strategy for finding missing blobs is faster and more cost-effective than + // the exhaustive approach of querying each worker to find the digest, it comes with a higher + // risk of returning expired workers despite filtering by active workers below. This is because + // the strategy may return workers that have expired in the last 30 seconds. However, checking + // workers directly is not a guarantee either since workers could leave the cluster after being + // queried. Ultimately, it will come down to the client's resiliency if the backplane is + // out-of-date and the server lies about which blobs are actually present. We provide this + // alternative strategy for calculating missing blobs. + private ListenableFuture> findMissingBlobsViaBackplane( + Iterable nonEmptyDigests, RequestMetadata requestMetadata) { + try { + Set uniqueDigests = new HashSet<>(); + nonEmptyDigests.forEach(uniqueDigests::add); + Map> foundBlobs = backplane.getBlobDigestsWorkers(uniqueDigests); + Set workerSet = backplane.getStorageWorkers(); + Map workersStartTime = backplane.getWorkersStartTimeInEpochSecs(workerSet); + Map> digestAndWorkersMap = + uniqueDigests.stream() + .map( + digest -> { + Set initialWorkers = + foundBlobs.getOrDefault(digest, Collections.emptySet()); + return new AbstractMap.SimpleEntry<>( + digest, + filterAndAdjustWorkersForDigest( + digest, initialWorkers, workerSet, workersStartTime)); + }) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + + ListenableFuture> missingDigestFuture = + immediateFuture( + digestAndWorkersMap.entrySet().stream() + .filter(entry -> entry.getValue().isEmpty()) + .map(Map.Entry::getKey) + .collect(Collectors.toList())); + return transformAsync( + missingDigestFuture, + (missingDigest) -> { + extendLeaseForDigests(digestAndWorkersMap, requestMetadata); + return immediateFuture(missingDigest); + }, + // Propagate context values but don't cascade its cancellation for downstream calls. 
+ Context.current().fork().fixedContextExecutor(directExecutor())); + } catch (Exception e) { + log.log(Level.SEVERE, "find missing blob via backplane failed", e); + return immediateFailedFuture(Status.fromThrowable(e).asException()); + } + } - FindMissingResponseEntry( - String worker, long elapsedMicros, Throwable exception, int stillMissingAfter) { - this.worker = worker; - this.elapsedMicros = elapsedMicros; - this.exception = exception; - this.stillMissingAfter = stillMissingAfter + private Set filterAndAdjustWorkersForDigest( + Digest digest, + Set originalWorkerSetWithDigest, + Set activeWorkers, + Map workersStartTime) { + long insertTime; + try { + insertTime = backplane.getDigestInsertTime(digest); + } catch (IOException e) { + log.log(Level.WARNING, format("failed to get digest (%s) insertion time", digest)); + return Collections.emptySet(); + } + Set activeWorkersWithDigest = + Sets.intersection(originalWorkerSetWithDigest, activeWorkers); + Set workersStartedBeforeDigestInsertion = + activeWorkersWithDigest.stream() + .filter( + worker -> + workersStartTime.getOrDefault(worker, Instant.now().getEpochSecond()) + < insertTime) + .collect(Collectors.toSet()); + Set workersToBeRemoved = + Sets.difference(originalWorkerSetWithDigest, workersStartedBeforeDigestInsertion) + .immutableCopy(); + if (!workersToBeRemoved.isEmpty()) { + try { + log.log(Level.INFO, format("adjusting locations for the digest %s", digest)); + backplane.adjustBlobLocations(digest, Collections.emptySet(), workersToBeRemoved); + } catch (IOException e) { + log.log( + Level.WARNING, + format("error adjusting blob location for %s", DigestUtil.toString(digest)), + e); + } + } + return workersStartedBeforeDigestInsertion; + } + + private void extendLeaseForDigests( + Map> digestAndWorkersMap, RequestMetadata requestMetadata) { + Map> workerAndDigestMap = new HashMap<>(); + digestAndWorkersMap.forEach( + (digest, workers) -> + workers.forEach( + worker -> + workerAndDigestMap.computeIfAbsent(worker, w -> new HashSet<>()).add(digest))); + + workerAndDigestMap.forEach( + (worker, digests) -> workerStub(worker).findMissingBlobs(digests, requestMetadata)); + + try { + backplane.updateDigestsExpiry(digestAndWorkersMap.keySet()); + } catch (IOException e) { + log.log( + Level.WARNING, + format( + "Failed to update expiry time for digests (%s)", + digestAndWorkersMap.keySet())); } } @@ -877,7 +948,7 @@ private void fetchBlobFromWorker( public void onNext(ByteString nextChunk) { blobObserver.onNext(nextChunk); received += nextChunk.size(); - ioMetric.observe(received); + ioMetric.observe(nextChunk.size()); } @Override @@ -890,7 +961,7 @@ public void onError(Throwable t) { } else if (status.getCode() == Code.NOT_FOUND) { casMissCounter.inc(); log.log( - configs.getServer().isEnsureOutputsPresent() ?
Level.WARNING : Level.FINER, worker + " did not contain " + DigestUtil.toString(blobDigest)); // ignore this, the worker will update the backplane eventually } else if (status.getCode() != Code.DEADLINE_EXCEEDED @@ -901,8 +972,11 @@ public void onError(Throwable t) { log.log( Level.WARNING, format( - "DEADLINE_EXCEEDED: read(%s) on worker %s after %d bytes of content", - DigestUtil.toString(blobDigest), worker, received)); + "%s: read(%s) on worker %s after %d bytes of content", + status.getCode().name(), + DigestUtil.toString(blobDigest), + worker, + received)); blobObserver.onError(t); return; } @@ -1001,7 +1075,7 @@ public void getBlob( final ListenableFuture> populatedWorkerListFuture; if (emptyWorkerList) { log.log( - Level.FINE, + Level.FINER, format( "worker list was initially empty for %s, attempting to correct", DigestUtil.toString(blobDigest))); @@ -1017,7 +1091,7 @@ public void getBlob( RequestMetadata.getDefaultInstance()), (foundOnWorkers) -> { log.log( - Level.FINE, + Level.FINER, format( "worker list was corrected for %s to be %s", DigestUtil.toString(blobDigest), foundOnWorkers.toString())); @@ -1047,7 +1121,7 @@ public void onError(Throwable t) { workersList.clear(); final ListenableFuture> workersListFuture; log.log( - Level.FINE, + Level.FINER, format( "worker list was depleted for %s, attempting to correct", DigestUtil.toString(blobDigest))); @@ -1057,13 +1131,13 @@ public void onError(Throwable t) { backplane, workerSet, locationSet, - ShardInstance.this::workerStub, + ServerInstance.this::workerStub, blobDigest, directExecutor(), RequestMetadata.getDefaultInstance()), (foundOnWorkers) -> { log.log( - Level.FINE, + Level.FINER, format( "worker list was corrected after depletion for %s to be %s", DigestUtil.toString(blobDigest), foundOnWorkers.toString())); @@ -1395,7 +1469,7 @@ ListenableFuture expectDirectory( @Override public CompletableFuture apply(Digest digest, Executor executor) { log.log( - Level.FINE, + Level.FINER, format( "transformQueuedOperation(%s): fetching directory %s", reason, DigestUtil.toString(directoryBlobDigest))); @@ -1530,7 +1604,7 @@ private ListenableFuture transformQueuedOperation( expectCommand(commandDigest, requestMetadata), (command) -> { log.log( - Level.FINE, + Level.FINER, format("transformQueuedOperation(%s): fetched command", operationName)); if (command != null) { queuedOperationBuilder.setCommand(command); @@ -2020,7 +2094,7 @@ public ListenableFuture execute( executionSuccess.inc(); log.log( - Level.FINE, + Level.FINER, new StringBuilder() .append("ExecutionSuccess: ") .append(requestMetadata.getToolInvocationId()) @@ -2033,7 +2107,7 @@ public ListenableFuture execute( actionCache.invalidate(DigestUtil.asActionKey(actionDigest)); if (!skipCacheLookup && recentCacheServedExecutions.getIfPresent(requestMetadata) != null) { log.log( - Level.FINE, + Level.FINER, format("Operation %s will have skip_cache_lookup = true due to retry", operationName)); skipCacheLookup = true; } @@ -2258,9 +2332,9 @@ public ListenableFuture queue(ExecuteEntry executeEntry, Poller poller, Du poller.pause(); long checkCacheUSecs = stopwatch.elapsed(MICROSECONDS); log.log( - Level.FINE, + Level.FINER, format( - "ShardInstance(%s): checkCache(%s): %sus elapsed", + "ServerInstance(%s): checkCache(%s): %sus elapsed", getName(), operation.getName(), checkCacheUSecs)); return IMMEDIATE_VOID_FUTURE; } @@ -2285,9 +2359,9 @@ private ListenableFuture transformAndQueue( Digest actionDigest = metadata.getActionDigest(); SettableFuture queueFuture = 
SettableFuture.create(); log.log( - Level.FINE, + Level.FINER, format( - "ShardInstance(%s): queue(%s): fetching action %s", + "ServerInstance(%s): queue(%s): fetching action %s", getName(), operation.getName(), actionDigest.getHash())); RequestMetadata requestMetadata = executeEntry.getRequestMetadata(); ListenableFuture actionFuture = @@ -2328,9 +2402,9 @@ private ListenableFuture transformAndQueue( actionFuture, (action) -> { log.log( - Level.FINE, + Level.FINER, format( - "ShardInstance(%s): queue(%s): fetched action %s transforming queuedOperation", + "ServerInstance(%s): queue(%s): fetched action %s transforming queuedOperation", getName(), operation.getName(), actionDigest.getHash())); Stopwatch transformStopwatch = Stopwatch.createStarted(); return transform( @@ -2358,9 +2432,9 @@ private ListenableFuture transformAndQueue( queuedFuture, (profiledQueuedMetadata) -> { log.log( - Level.FINE, + Level.FINER, format( - "ShardInstance(%s): queue(%s): queuedOperation %s transformed, validating", + "ServerInstance(%s): queue(%s): queuedOperation %s transformed, validating", getName(), operation.getName(), DigestUtil.toString( @@ -2380,9 +2454,9 @@ private ListenableFuture transformAndQueue( validatedFuture, (profiledQueuedMetadata) -> { log.log( - Level.FINE, + Level.FINER, format( - "ShardInstance(%s): queue(%s): queuedOperation %s validated, uploading", + "ServerInstance(%s): queue(%s): queuedOperation %s validated, uploading", getName(), operation.getName(), DigestUtil.toString( @@ -2432,9 +2506,9 @@ public void onSuccess(ProfiledQueuedOperationMetadata profiledQueuedMetadata) { long elapsedUSecs = stopwatch.elapsed(MICROSECONDS); long queueUSecs = elapsedUSecs - startQueueUSecs; log.log( - Level.FINE, + Level.FINER, format( - "ShardInstance(%s): queue(%s): %dus checkCache, %dus transform, %dus validate, %dus upload, %dus queue, %dus elapsed", + "ServerInstance(%s): queue(%s): %dus checkCache, %dus transform, %dus validate, %dus upload, %dus queue, %dus elapsed", getName(), queueOperation.getName(), checkCacheUSecs, @@ -2737,4 +2811,16 @@ private boolean inDenyList(RequestMetadata requestMetadata) throws IOException { } return backplane.isBlacklisted(requestMetadata); } + + @Override + protected CacheCapabilities getCacheCapabilities() { + SymlinkAbsolutePathStrategy.Value symlinkAbsolutePathStrategy = + configs.isAllowSymlinkTargetAbsolute() + ? SymlinkAbsolutePathStrategy.Value.ALLOWED + : SymlinkAbsolutePathStrategy.Value.DISALLOWED; + return super.getCacheCapabilities() + .toBuilder() + .setSymlinkAbsolutePathStrategy(symlinkAbsolutePathStrategy) + .build(); + } } diff --git a/src/main/java/build/buildfarm/instance/shard/Util.java b/src/main/java/build/buildfarm/instance/shard/Util.java index 5e3493d0c6..7070097659 100644 --- a/src/main/java/build/buildfarm/instance/shard/Util.java +++ b/src/main/java/build/buildfarm/instance/shard/Util.java @@ -141,7 +141,7 @@ public void onFailure(Throwable t) { } }; log.log( - Level.FINE, + Level.FINER, format( "scanning through %d workers to find %s", workerSet.size(), DigestUtil.toString(digest))); @@ -184,7 +184,7 @@ static void checkMissingBlobOnInstance( public void onSuccess(Iterable missingDigests) { boolean found = Iterables.isEmpty(missingDigests); log.log( - Level.FINE, + Level.FINER, format( "check missing response for %s to %s was %sfound", DigestUtil.toString(digest), worker, found ? 
"" : "not ")); @@ -197,7 +197,7 @@ public void onFailure(Throwable t) { Status status = Status.fromThrowable(t); if (status.getCode() == Code.UNAVAILABLE) { log.log( - Level.FINE, + Level.FINER, format( "check missing response for %s to %s was not found for unavailable", DigestUtil.toString(digest), worker)); diff --git a/src/main/java/build/buildfarm/instance/shard/WorkerStubs.java b/src/main/java/build/buildfarm/instance/shard/WorkerStubs.java index 28a29afd7f..abaf5f71e1 100644 --- a/src/main/java/build/buildfarm/instance/shard/WorkerStubs.java +++ b/src/main/java/build/buildfarm/instance/shard/WorkerStubs.java @@ -14,6 +14,7 @@ package build.buildfarm.instance.shard; +import static build.buildfarm.common.grpc.Channels.createChannel; import static com.google.common.util.concurrent.MoreExecutors.listeningDecorator; import static java.util.concurrent.Executors.newSingleThreadScheduledExecutor; @@ -28,9 +29,6 @@ import com.google.common.cache.RemovalListener; import com.google.common.util.concurrent.ListeningScheduledExecutorService; import com.google.protobuf.Duration; -import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; import java.util.concurrent.TimeUnit; public final class WorkerStubs { @@ -59,17 +57,12 @@ private static Instance newStubInstance(String worker, DigestUtil digestUtil, Du worker, digestUtil, createChannel(worker), + createChannel(worker), // separate write channel timeout, newStubRetrier(), newStubRetryService()); } - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - private static Retrier newStubRetrier() { return new Retrier( Backoff.exponential( diff --git a/src/main/java/build/buildfarm/instance/stub/BUILD b/src/main/java/build/buildfarm/instance/stub/BUILD index c8b2b82e77..1f5a2b916d 100644 --- a/src/main/java/build/buildfarm/instance/stub/BUILD +++ b/src/main/java/build/buildfarm/instance/stub/BUILD @@ -23,7 +23,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@maven//:org_projectlombok_lombok", diff --git a/src/main/java/build/buildfarm/instance/stub/StubInstance.java b/src/main/java/build/buildfarm/instance/stub/StubInstance.java index a4dd880c5c..8045dc5db9 100644 --- a/src/main/java/build/buildfarm/instance/stub/StubInstance.java +++ b/src/main/java/build/buildfarm/instance/stub/StubInstance.java @@ -161,6 +161,7 @@ public class StubInstance implements Instance { private final String identifier; private final DigestUtil digestUtil; private final ManagedChannel channel; + private final ManagedChannel writeChannel; private final @Nullable Duration grpcTimeout; private final Retrier retrier; private final @Nullable ListeningScheduledExecutorService retryService; @@ -187,12 +188,24 @@ public StubInstance( this(name, identifier, digestUtil, channel, grpcTimeout, NO_RETRIES, /* retryService=*/ null); } + public StubInstance( + String name, + String identifier, + DigestUtil digestUtil, + ManagedChannel channel, + Duration grpcTimeout, + Retrier retrier, + @Nullable ListeningScheduledExecutorService retryService) { + this(name, identifier, digestUtil, channel, channel, grpcTimeout, retrier, retryService); + } + @SuppressWarnings("NullableProblems") public StubInstance( String name, String 
identifier, DigestUtil digestUtil, ManagedChannel channel, + ManagedChannel writeChannel, Duration grpcTimeout, Retrier retrier, @Nullable ListeningScheduledExecutorService retryService) { @@ -200,6 +213,7 @@ public StubInstance( this.identifier = identifier; this.digestUtil = digestUtil; this.channel = channel; + this.writeChannel = writeChannel; this.grpcTimeout = grpcTimeout; this.retrier = retrier; this.retryService = retryService; @@ -359,8 +373,14 @@ public void start(String publicName) {} @Override public void stop() throws InterruptedException { isStopped = true; - channel.shutdownNow(); - channel.awaitTermination(0, TimeUnit.SECONDS); + if (!channel.isShutdown()) { + channel.shutdownNow(); + channel.awaitTermination(0, TimeUnit.SECONDS); + } + if (!writeChannel.isShutdown()) { + writeChannel.shutdownNow(); + writeChannel.awaitTermination(0, TimeUnit.SECONDS); + } if (retryService != null && !shutdownAndAwaitTermination(retryService, 10, TimeUnit.SECONDS)) { log.log(Level.SEVERE, format("Could not shut down retry service for %s", identifier)); } @@ -662,7 +682,7 @@ Write getWrite( deadlined(bsBlockingStub).withInterceptors(attachMetadataInterceptor(requestMetadata)), Suppliers.memoize( () -> - ByteStreamGrpc.newStub(channel) + ByteStreamGrpc.newStub(writeChannel) .withInterceptors(attachMetadataInterceptor(requestMetadata))), resourceName, exceptionTranslator, diff --git a/src/main/java/build/buildfarm/metrics/AbstractMetricsPublisher.java b/src/main/java/build/buildfarm/metrics/AbstractMetricsPublisher.java index 5f9422a421..7dd09b6d0f 100644 --- a/src/main/java/build/buildfarm/metrics/AbstractMetricsPublisher.java +++ b/src/main/java/build/buildfarm/metrics/AbstractMetricsPublisher.java @@ -51,6 +51,13 @@ public abstract class AbstractMetricsPublisher implements MetricsPublisher { .labelNames("worker_name") .help("Operations per worker.") .register(); + + private static final Gauge operationExitCode = + Gauge.build() + .name("operation_exit_code") + .labelNames("exit_code") + .help("Operation execution exit code.") + .register(); private static final Histogram queuedTime = Histogram.build().name("queued_time_ms").help("Queued time in ms.").register(); private static final Histogram outputUploadTime = @@ -97,6 +104,11 @@ protected OperationRequestMetadata populateRequestMetadata( Integer.toString( operationRequestMetadata.getExecuteResponse().getStatus().getCode())) .inc(); + operationExitCode + .labels( + Integer.toString( + operationRequestMetadata.getExecuteResponse().getResult().getExitCode())) + .inc(); if (operationRequestMetadata.getExecuteResponse().hasResult() && operationRequestMetadata.getExecuteResponse().getResult().hasExecutionMetadata()) { operationsPerWorker @@ -172,7 +184,7 @@ protected static String formatRequestMetadataToJson( .usingTypeRegistry(typeRegistry) .omittingInsignificantWhitespace() .print(operationRequestMetadata); - log.log(Level.FINE, "{}", formattedRequestMetadata); + log.log(Level.FINER, "{}", formattedRequestMetadata); return formattedRequestMetadata; } } diff --git a/src/main/java/build/buildfarm/metrics/aws/AwsMetricsPublisher.java b/src/main/java/build/buildfarm/metrics/aws/AwsMetricsPublisher.java deleted file mode 100644 index e0ae2785e0..0000000000 --- a/src/main/java/build/buildfarm/metrics/aws/AwsMetricsPublisher.java +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build.buildfarm.metrics.aws; - -import build.bazel.remote.execution.v2.RequestMetadata; -import build.buildfarm.common.config.BuildfarmConfigs; -import build.buildfarm.metrics.AbstractMetricsPublisher; -import com.amazonaws.ClientConfiguration; -import com.amazonaws.auth.AWSCredentials; -import com.amazonaws.auth.AWSStaticCredentialsProvider; -import com.amazonaws.handlers.AsyncHandler; -import com.amazonaws.services.secretsmanager.AWSSecretsManager; -import com.amazonaws.services.secretsmanager.AWSSecretsManagerClientBuilder; -import com.amazonaws.services.secretsmanager.model.GetSecretValueRequest; -import com.amazonaws.services.secretsmanager.model.GetSecretValueResult; -import com.amazonaws.services.sns.AmazonSNSAsync; -import com.amazonaws.services.sns.AmazonSNSAsyncClientBuilder; -import com.amazonaws.services.sns.model.PublishRequest; -import com.amazonaws.services.sns.model.PublishResult; -import com.amazonaws.util.StringUtils; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.longrunning.Operation; -import java.io.IOException; -import java.util.Base64; -import java.util.HashMap; -import java.util.logging.Level; -import lombok.extern.java.Log; - -@Log -public class AwsMetricsPublisher extends AbstractMetricsPublisher { - private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); - private static AmazonSNSAsync snsClient; - - private final String snsTopicOperations; - private String accessKeyId = null; - private String secretKey = null; - private final String region; - private final int snsClientMaxConnections; - - public AwsMetricsPublisher() { - super(configs.getServer().getClusterId()); - snsTopicOperations = configs.getServer().getMetrics().getTopic(); - region = configs.getServer().getCloudRegion(); - getAwsSecret(configs.getServer().getMetrics().getSecretName()); - snsClientMaxConnections = configs.getServer().getMetrics().getTopicMaxConnections(); - if (!StringUtils.isNullOrEmpty(snsTopicOperations) - && snsClientMaxConnections > 0 - && !StringUtils.isNullOrEmpty(accessKeyId) - && !StringUtils.isNullOrEmpty(secretKey) - && !StringUtils.isNullOrEmpty(region)) { - snsClient = initSnsClient(); - } - } - - @Override - public void publishRequestMetadata(Operation operation, RequestMetadata requestMetadata) { - try { - if (snsClient != null) { - snsClient.publishAsync( - new PublishRequest( - snsTopicOperations, - formatRequestMetadataToJson(populateRequestMetadata(operation, requestMetadata))), - new AsyncHandler() { - @Override - public void onError(Exception e) { - log.log(Level.WARNING, "Could not publish metrics data to SNS.", e); - } - - @Override - public void onSuccess(PublishRequest request, PublishResult publishResult) {} - }); - } - } catch (Exception e) { - log.log( - Level.WARNING, - String.format("Could not publish request metadata to SNS for %s.", operation.getName()), - e); - } - } - - private AmazonSNSAsync initSnsClient() { - log.log(Level.INFO, "Initializing 
SNS Client."); - return AmazonSNSAsyncClientBuilder.standard() - .withRegion(region) - .withClientConfiguration( - new ClientConfiguration().withMaxConnections(snsClientMaxConnections)) - .withCredentials( - new AWSStaticCredentialsProvider( - new AWSCredentials() { - @Override - public String getAWSAccessKeyId() { - return accessKeyId; - } - - @Override - public String getAWSSecretKey() { - return secretKey; - } - })) - .build(); - } - - @Override - public void publishMetric(String metricName, Object metricValue) { - throw new UnsupportedOperationException(); - } - - @SuppressWarnings("unchecked") - private void getAwsSecret(String secretName) { - AWSSecretsManager client = AWSSecretsManagerClientBuilder.standard().withRegion(region).build(); - GetSecretValueRequest getSecretValueRequest = - new GetSecretValueRequest().withSecretId(secretName); - GetSecretValueResult getSecretValueResult; - try { - getSecretValueResult = client.getSecretValue(getSecretValueRequest); - } catch (Exception e) { - log.log(Level.SEVERE, String.format("Could not get secret %s from AWS.", secretName)); - return; - } - String secret; - if (getSecretValueResult.getSecretString() != null) { - secret = getSecretValueResult.getSecretString(); - } else { - secret = - new String(Base64.getDecoder().decode(getSecretValueResult.getSecretBinary()).array()); - } - - if (secret != null) { - try { - final ObjectMapper objectMapper = new ObjectMapper(); - final HashMap secretMap = objectMapper.readValue(secret, HashMap.class); - accessKeyId = secretMap.get("access_key"); - secretKey = secretMap.get("secret_key"); - } catch (IOException e) { - log.log(Level.SEVERE, String.format("Could not parse secret %s from AWS", secretName)); - } - } - } -} diff --git a/src/main/java/build/buildfarm/metrics/aws/BUILD b/src/main/java/build/buildfarm/metrics/aws/BUILD deleted file mode 100644 index 51d44ea8c9..0000000000 --- a/src/main/java/build/buildfarm/metrics/aws/BUILD +++ /dev/null @@ -1,22 +0,0 @@ -java_library( - name = "aws", - srcs = glob(["*.java"]), - plugins = ["//src/main/java/build/buildfarm/common:lombok"], - visibility = ["//visibility:public"], - deps = [ - "//src/main/java/build/buildfarm/common/config", - "//src/main/java/build/buildfarm/metrics", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_status_java_proto", - "@maven//:com_amazonaws_aws_java_sdk_core", - "@maven//:com_amazonaws_aws_java_sdk_secretsmanager", - "@maven//:com_amazonaws_aws_java_sdk_sns", - "@maven//:com_fasterxml_jackson_core_jackson_databind", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java_util", - "@maven//:org_projectlombok_lombok", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/metrics/gcp/BUILD b/src/main/java/build/buildfarm/metrics/gcp/BUILD deleted file mode 100644 index 765902b905..0000000000 --- a/src/main/java/build/buildfarm/metrics/gcp/BUILD +++ /dev/null @@ -1,16 +0,0 @@ -java_library( - name = "gcp", - srcs = glob(["*.java"]), - visibility = ["//visibility:public"], - deps = [ - "//src/main/java/build/buildfarm/common/config", - "//src/main/java/build/buildfarm/metrics", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - 
"@googleapis//:google_rpc_status_java_proto", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java_util", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/metrics/gcp/GcpMetricsPublisher.java b/src/main/java/build/buildfarm/metrics/gcp/GcpMetricsPublisher.java deleted file mode 100644 index 40a6ddb1f8..0000000000 --- a/src/main/java/build/buildfarm/metrics/gcp/GcpMetricsPublisher.java +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package build.buildfarm.metrics.gcp; - -import build.buildfarm.common.config.BuildfarmConfigs; -import build.buildfarm.metrics.AbstractMetricsPublisher; - -public class GcpMetricsPublisher extends AbstractMetricsPublisher { - private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); - - public GcpMetricsPublisher() { - super(configs.getServer().getClusterId()); - } - - @Override - public void publishMetric(String metricName, Object metricValue) { - throw new UnsupportedOperationException(); - } -} diff --git a/src/main/java/build/buildfarm/operations/finder/BUILD b/src/main/java/build/buildfarm/operations/finder/BUILD index 5c8342609b..8b6a2cc557 100644 --- a/src/main/java/build/buildfarm/operations/finder/BUILD +++ b/src/main/java/build/buildfarm/operations/finder/BUILD @@ -22,7 +22,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@maven//:org_apache_commons_commons_pool2", diff --git a/src/main/java/build/buildfarm/server/BUILD b/src/main/java/build/buildfarm/server/BUILD index b05d2e5ef7..2cd750a4ff 100644 --- a/src/main/java/build/buildfarm/server/BUILD +++ b/src/main/java/build/buildfarm/server/BUILD @@ -14,9 +14,10 @@ java_library( "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/shard", "//src/main/java/build/buildfarm/metrics/prometheus", - "//src/main/java/build/buildfarm/server/controllers:WebController", "//src/main/java/build/buildfarm/server/services", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "@io_grpc_grpc_proto//:health_java_proto", + "@io_grpc_grpc_proto//:health_proto", "@maven//:com_github_pcj_google_options", "@maven//:com_google_guava_guava", "@maven//:io_grpc_grpc_api", @@ -26,13 +27,5 @@ java_library( "@maven//:javax_annotation_javax_annotation_api", "@maven//:org_bouncycastle_bcprov_jdk15on", "@maven//:org_projectlombok_lombok", - "@maven//:org_springframework_boot_spring_boot", - "@maven//:org_springframework_boot_spring_boot_autoconfigure", - "@maven//:org_springframework_boot_spring_boot_starter_thymeleaf", - "@maven//:org_springframework_boot_spring_boot_starter_web", - "@maven//:org_springframework_spring_beans", - "@maven//:org_springframework_spring_context", - 
"@maven//:org_springframework_spring_core", - "@maven//:org_springframework_spring_web", ], ) diff --git a/src/main/java/build/buildfarm/server/BuildFarmServer.java b/src/main/java/build/buildfarm/server/BuildFarmServer.java index d0d26a6f52..c448963209 100644 --- a/src/main/java/build/buildfarm/server/BuildFarmServer.java +++ b/src/main/java/build/buildfarm/server/BuildFarmServer.java @@ -15,30 +15,29 @@ package build.buildfarm.server; import static build.buildfarm.common.io.Utils.formatIOError; -import static com.google.common.base.Preconditions.checkState; import static com.google.common.util.concurrent.MoreExecutors.shutdownAndAwaitTermination; import static java.util.concurrent.Executors.newSingleThreadScheduledExecutor; +import static java.util.concurrent.TimeUnit.SECONDS; import static java.util.logging.Level.SEVERE; +import static java.util.logging.Level.WARNING; import build.buildfarm.common.DigestUtil; +import build.buildfarm.common.LoggingMain; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.common.config.GrpcMetrics; import build.buildfarm.common.grpc.TracingMetadataUtils.ServerHeadersInterceptor; import build.buildfarm.common.services.ByteStreamService; import build.buildfarm.common.services.ContentAddressableStorageService; import build.buildfarm.instance.Instance; -import build.buildfarm.instance.shard.ShardInstance; +import build.buildfarm.instance.shard.ServerInstance; import build.buildfarm.metrics.prometheus.PrometheusPublisher; -import build.buildfarm.server.controllers.WebController; import build.buildfarm.server.services.ActionCacheService; -import build.buildfarm.server.services.AdminService; import build.buildfarm.server.services.CapabilitiesService; import build.buildfarm.server.services.ExecutionService; import build.buildfarm.server.services.FetchService; import build.buildfarm.server.services.OperationQueueService; import build.buildfarm.server.services.OperationsService; import build.buildfarm.server.services.PublishBuildEventService; -import com.google.devtools.common.options.OptionsParsingException; import io.grpc.ServerBuilder; import io.grpc.ServerInterceptor; import io.grpc.health.v1.HealthCheckResponse.ServingStatus; @@ -49,24 +48,16 @@ import java.io.File; import java.io.IOException; import java.security.Security; -import java.util.HashMap; -import java.util.Map; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; -import javax.annotation.PostConstruct; -import javax.annotation.PreDestroy; +import java.util.concurrent.atomic.AtomicBoolean; import javax.naming.ConfigurationException; import lombok.extern.java.Log; import org.bouncycastle.jce.provider.BouncyCastleProvider; -import org.springframework.boot.SpringApplication; -import org.springframework.boot.autoconfigure.SpringBootApplication; -import org.springframework.context.annotation.ComponentScan; @SuppressWarnings("deprecation") @Log -@SpringBootApplication -@ComponentScan("build.buildfarm") -public class BuildFarmServer { +public class BuildFarmServer extends LoggingMain { private static final java.util.logging.Logger nettyLogger = java.util.logging.Logger.getLogger("io.grpc.netty"); private static final Counter healthCheckMetric = @@ -80,20 +71,57 @@ public class BuildFarmServer { private Instance instance; private HealthStatusManager healthStatusManager; private io.grpc.Server server; - private boolean stopping = false; private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); + private AtomicBoolean 
shutdownInitiated = new AtomicBoolean(true); + private AtomicBoolean released = new AtomicBoolean(true); - private ShardInstance createInstance() + BuildFarmServer() { + super("BuildFarmServer"); + } + + /** + * Prepare the server for graceful shutdown once the server is ready. The current implementation + * waits a specified period of time; a future improvement could be to keep track of open + * connections and shut down once none are left. + */ + public void prepareServerForGracefulShutdown() { + if (configs.getServer().getGracefulShutdownSeconds() == 0) { + log.info("Graceful Shutdown is not enabled. Server is shutting down immediately."); + } else { + try { + log.info( + String.format( + "Graceful Shutdown - Waiting %d seconds to allow connections to drain.", + configs.getServer().getGracefulShutdownSeconds())); + SECONDS.sleep(configs.getServer().getGracefulShutdownSeconds()); + } catch (InterruptedException e) { + log.info( + "Graceful Shutdown - The server graceful shutdown was interrupted: " + e.getMessage()); + } finally { + log.info( + String.format( + "Graceful Shutdown - It took the server %d seconds to shut down", + configs.getServer().getGracefulShutdownSeconds())); + } + } + } + + private ServerInstance createInstance() throws IOException, ConfigurationException, InterruptedException { - return new ShardInstance( + return new ServerInstance( configs.getServer().getName(), configs.getServer().getSession() + "-" + configs.getServer().getName(), new DigestUtil(configs.getDigestFunction()), - this::stop); + this::initiateShutdown); } public synchronized void start(ServerBuilder serverBuilder, String publicName) throws IOException, ConfigurationException, InterruptedException { + shutdownInitiated.set(false); + released.set(false); instance = createInstance(); healthStatusManager = new HealthStatusManager(); @@ -123,7 +151,6 @@ public synchronized void start(ServerBuilder serverBuilder, String publicName .addService(new ExecutionService(instance, keepaliveScheduler)) .addService(new OperationQueueService(instance)) .addService(new OperationsService(instance)) - .addService(new AdminService(instance)) .addService(new FetchService(instance)) .addService(ProtoReflectionService.newInstance()) .addService(new PublishBuildEventService()) @@ -141,9 +168,7 @@ public synchronized void start(ServerBuilder serverBuilder, String publicName log.info(String.format("%s initialized", configs.getServer().getSession())); - checkState(!stopping, "must not call start after stop"); instance.start(publicName); - WebController.setInstance((ShardInstance) instance); server.start(); healthStatusManager.setStatus( @@ -152,38 +177,70 @@ public synchronized void start(ServerBuilder serverBuilder, String publicName healthCheckMetric.labels("start").inc(); } - @PreDestroy - public void stop() { - synchronized (this) { - if (stopping) { - return; - } - stopping = true; + private synchronized void awaitRelease() throws InterruptedException { + while (!released.get()) { + wait(); + } + } + + synchronized void stop() throws InterruptedException { + try { + shutdown(); + } finally { + released.set(true); + notify(); } - System.err.println("*** shutting down gRPC server since JVM is shutting down"); + } + + private void shutdown() throws InterruptedException { + log.info("*** shutting down gRPC server since JVM is shutting down"); +
prepareServerForGracefulShutdown(); healthStatusManager.setStatus( HealthStatusManager.SERVICE_NAME_ALL_SERVICES, ServingStatus.NOT_SERVING); PrometheusPublisher.stopHttpServer(); healthCheckMetric.labels("stop").inc(); try { - if (server != null) { - server.shutdown(); - } + initiateShutdown(); instance.stop(); - server.awaitTermination(10, TimeUnit.SECONDS); + if (server != null && server.awaitTermination(10, TimeUnit.SECONDS)) { + server = null; + } } catch (InterruptedException e) { if (server != null) { server.shutdownNow(); + server = null; } + throw e; } if (!shutdownAndAwaitTermination(keepaliveScheduler, 10, TimeUnit.SECONDS)) { log.warning("could not shut down keepalive scheduler"); } - System.err.println("*** server shut down"); + log.info("*** server shut down"); + } + + @Override + protected void onShutdown() throws InterruptedException { + initiateShutdown(); + awaitRelease(); } - @PostConstruct - public void init() throws OptionsParsingException { + private void initiateShutdown() { + shutdownInitiated.set(true); + if (server != null) { + server.shutdown(); + } + } + + private void awaitTermination() throws InterruptedException { + while (!shutdownInitiated.get()) { + if (server != null && server.awaitTermination(1, TimeUnit.SECONDS)) { + server = null; + shutdownInitiated.set(true); + } + } + } + + public static void main(String[] args) throws Exception { // Only log severe log messages from Netty. Otherwise it logs warnings that look like this: // // 170714 08:16:28.552:WT 18 [io.grpc.netty.NettyServerHandler.onStreamError] Stream Error @@ -191,37 +248,24 @@ public void init() throws OptionsParsingException { // unknown stream 11369 nettyLogger.setLevel(SEVERE); - try { - start( - ServerBuilder.forPort(configs.getServer().getPort()), - configs.getServer().getPublicName()); - } catch (IOException e) { - System.err.println("error: " + formatIOError(e)); - } catch (InterruptedException e) { - System.err.println("error: interrupted"); - } catch (ConfigurationException e) { - throw new RuntimeException(e); - } - } - - public static void main(String[] args) throws ConfigurationException { configs = BuildfarmConfigs.loadServerConfigs(args); // Configure Spring - SpringApplication app = new SpringApplication(BuildFarmServer.class); - Map springConfig = new HashMap<>(); - - // Disable Logback - System.setProperty("org.springframework.boot.logging.LoggingSystem", "none"); - - springConfig.put("ui.frontend.enable", configs.getUi().isEnable()); - springConfig.put("server.port", configs.getUi().getPort()); - app.setDefaultProperties(springConfig); + BuildFarmServer server = new BuildFarmServer(); try { - app.run(args); - } catch (Throwable t) { - log.log(SEVERE, "Error running application", t); + server.start( + ServerBuilder.forPort(configs.getServer().getPort()), + configs.getServer().getPublicName()); + server.awaitTermination(); + } catch (IOException e) { + log.severe("error: " + formatIOError(e)); + } catch (InterruptedException e) { + log.log(WARNING, "interrupted", e); + } catch (Exception e) { + log.log(SEVERE, "Error running application", e); + } finally { + server.stop(); } } } diff --git a/src/main/java/build/buildfarm/server/controllers/BUILD b/src/main/java/build/buildfarm/server/controllers/BUILD deleted file mode 100644 index 1e523bcef5..0000000000 --- a/src/main/java/build/buildfarm/server/controllers/BUILD +++ /dev/null @@ -1,46 +0,0 @@ -java_library( - name = "WebController", - srcs = ["WebController.java"], - plugins = 
["//src/main/java/build/buildfarm/common:lombok"], - visibility = ["//visibility:public"], - deps = [ - "//src/main/java/build/buildfarm/common", - "//src/main/java/build/buildfarm/common/config", - "//src/main/java/build/buildfarm/common/grpc", - "//src/main/java/build/buildfarm/common/resources", - "//src/main/java/build/buildfarm/common/resources:resource_java_proto", - "//src/main/java/build/buildfarm/common/services", - "//src/main/java/build/buildfarm/instance", - "//src/main/java/build/buildfarm/instance/shard", - "//src/main/java/build/buildfarm/metrics/prometheus", - "//src/main/java/build/buildfarm/operations", - "//src/main/java/build/buildfarm/server/services", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", - "//src/main/protobuf:build_buildfarm_v1test_buildfarm_proto", - "@googleapis//:google_rpc_code_java_proto", - "@googleapis//:google_rpc_error_details_java_proto", - "@googleapis//:google_rpc_error_details_proto", - "@maven//:com_github_pcj_google_options", - "@maven//:com_google_guava_guava", - "@maven//:com_google_protobuf_protobuf_java", - "@maven//:com_google_protobuf_protobuf_java_util", - "@maven//:com_googlecode_json_simple_json_simple", - "@maven//:com_jayway_jsonpath_json_path", - "@maven//:io_grpc_grpc_api", - "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_protobuf", - "@maven//:io_grpc_grpc_services", - "@maven//:io_prometheus_simpleclient", - "@maven//:javax_annotation_javax_annotation_api", - "@maven//:org_bouncycastle_bcprov_jdk15on", - "@maven//:org_projectlombok_lombok", - "@maven//:org_springframework_boot_spring_boot", - "@maven//:org_springframework_boot_spring_boot_autoconfigure", - "@maven//:org_springframework_boot_spring_boot_starter_web", - "@maven//:org_springframework_spring_beans", - "@maven//:org_springframework_spring_context", - "@maven//:org_springframework_spring_core", - "@maven//:org_springframework_spring_web", - "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", - ], -) diff --git a/src/main/java/build/buildfarm/server/controllers/WebController.java b/src/main/java/build/buildfarm/server/controllers/WebController.java deleted file mode 100644 index 5ed5c8250e..0000000000 --- a/src/main/java/build/buildfarm/server/controllers/WebController.java +++ /dev/null @@ -1,291 +0,0 @@ -// Copyright 2023 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package build.buildfarm.server.controllers; - -import build.bazel.remote.execution.v2.ExecuteOperationMetadata; -import build.bazel.remote.execution.v2.ExecuteResponse; -import build.bazel.remote.execution.v2.RequestMetadata; -import build.buildfarm.instance.shard.ShardInstance; -import build.buildfarm.operations.EnrichedOperation; -import build.buildfarm.v1test.CompletedOperationMetadata; -import build.buildfarm.v1test.ExecutingOperationMetadata; -import build.buildfarm.v1test.QueuedOperationMetadata; -import com.google.longrunning.Operation; -import com.google.protobuf.Any; -import com.google.protobuf.Duration; -import com.google.protobuf.InvalidProtocolBufferException; -import com.google.protobuf.Timestamp; -import com.google.protobuf.util.Durations; -import com.google.protobuf.util.JsonFormat; -import com.google.protobuf.util.Timestamps; -import com.google.rpc.PreconditionFailure; -import java.util.Map; -import java.util.Set; -import java.util.logging.Level; -import lombok.extern.java.Log; -import org.json.simple.JSONArray; -import org.json.simple.JSONObject; -import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; -import org.springframework.stereotype.Controller; -import org.springframework.ui.Model; -import org.springframework.web.bind.annotation.GetMapping; -import org.springframework.web.bind.annotation.PathVariable; - -// An HTTP frontend for providing a web UI in buildfarm. Its primarily used by developers so that -// they can introspect their build inovocations. Standalone tools that communicate over GRPC are -// sometimes less accessible. Nonetheless, the APIs should be similar. -// The web frontend can be deployed as part of the servers. However, the controller may also be -// included in other standalone tools if you'd like the frontend decoupled from the servers. -// This controller can provide web content directly through spring boot via thymeleaf. -// The controller also provides raw REST APIs for those wishing to build out their own frontend. -@Log -@Controller -@ConditionalOnProperty("ui.frontend.enable") -public class WebController { - private static ShardInstance instance; - - // Most of the routes require an instance to have their queries fulfilled. - // I wanted to have the instance set in the controller's constructor, but I was having trouble - // doing that with springboot's bean initialization and auto-wiring. - // Particularly because the controller is constructed before the instance is actually created. - // I settled for having the instance static and allowing the server startup to provide the - // instance once ready. - public static void setInstance(ShardInstance instanceIn) { - instance = instanceIn; - } - - // This is typically the starting for a developer looking to introspect their builds. - // When running a bazel client with `--bes_results_url` they will shown this URL route. - // The page is intented to give them a summary of their build invocation and allow them to drill - // down into more details. - @GetMapping("/invocation/{invocationId}") - public String invocation(Model model, @PathVariable String invocationId) { - // We need to look up the user's operations as fast as possible. Scanning all of the stored - // operations and filtering by invocatio ID (i.e. O(n)) does not scale. Instead, the invocation - // ID must be the primary key to their specific list of build operation IDs. We then lookup - // each - // operation by ID (preferably batched). This is technically two backend calls. 
It could be - // made faster - // at the expense of duplicate information stored in the backend. - Set operationIDs = instance.findOperationsByInvocationId(invocationId); - Iterable> foundOperations = instance.getOperations(operationIDs); - - // Populate the model for the page. - buildInvocationModel(model, invocationId, foundOperations); - - // Render page. - return "invocation"; - } - - private void buildInvocationModel( - Model model, String invocationId, Iterable> foundOperations) { - // Create an array representing table information about all the oprations involved in the - // invocation. - // This is the core content of the page. - JSONArray operationResults = new JSONArray(); - for (Map.Entry entry : foundOperations) { - Operation operation = jsonToOperation(entry.getValue()); - JSONObject obj = new JSONObject(); - obj.put("target", extractTargetId(operation)); - obj.put("mnemonic", extractActionMnemonic(operation)); - obj.put("stage", extractStatus(operation)); - obj.put("duration", extractDuration(operation)); - obj.put("worker", extractWorker(operation)); - - String operationId = entry.getKey(); - String id = operationId.substring(operationId.lastIndexOf('/')).substring(1); - obj.put("operationId", id); - - operationResults.add(obj); - } - - // Populate data to be provided to the frontend. - model.addAttribute("operation", operationResults.toJSONString()); - model.addAttribute("invocationId", String.format("Invocation: %s", invocationId)); - } - - // An operation represents an executed action. - // The operation has a target ID which corresponds to the bazel label what developers are - // typically thinking of when wishing to evaluate their build. - // This page shows them all of the information that we track related to the operation. - @GetMapping("/operation/{operationId}") - public String operation(Model model, @PathVariable String operationId) { - EnrichedOperation result = - instance.findEnrichedOperation(String.format("shard/operations/%s", operationId)); - model.addAttribute("fullOperation", result.asJsonString()); - return "operation"; - } - - // Information about the current deployment. Useful for verifying what is running. - @GetMapping("/info") - public String info(Model model) { - return "info"; - } - - /** - * @brief Convert string json into operation type. - * @details Parses json and returns null if invalid. - * @param json The json to convert to Operation type. - * @return The created operation. - * @note Suggested return identifier: operation. 
- */ - private static Operation jsonToOperation(String json) { - // create a json parser - JsonFormat.Parser operationParser = - JsonFormat.parser() - .usingTypeRegistry( - JsonFormat.TypeRegistry.newBuilder() - .add(CompletedOperationMetadata.getDescriptor()) - .add(ExecutingOperationMetadata.getDescriptor()) - .add(ExecuteOperationMetadata.getDescriptor()) - .add(QueuedOperationMetadata.getDescriptor()) - .add(PreconditionFailure.getDescriptor()) - .build()) - .ignoringUnknownFields(); - - if (json == null) { - log.log(Level.WARNING, "Operation Json is empty"); - return null; - } - try { - Operation.Builder operationBuilder = Operation.newBuilder(); - operationParser.merge(json, operationBuilder); - return operationBuilder.build(); - } catch (InvalidProtocolBufferException e) { - log.log(Level.WARNING, "InvalidProtocolBufferException while building an operation.", e); - return null; - } - } - - private String extractTargetId(Operation operation) { - return expectRequestMetadata(operation).getTargetId(); - } - - private String extractActionMnemonic(Operation operation) { - return expectRequestMetadata(operation).getActionMnemonic(); - } - - private String extractStatus(Operation operation) { - return String.valueOf(expectExecuteOperationMetadata(operation).getStage()); - } - - private String extractDuration(Operation operation) { - Any result = operation.getResponse(); - try { - Timestamp start = - result - .unpack(ExecuteResponse.class) - .getResult() - .getExecutionMetadata() - .getWorkerStartTimestamp(); - Timestamp end = - result - .unpack(ExecuteResponse.class) - .getResult() - .getExecutionMetadata() - .getWorkerCompletedTimestamp(); - Duration duration = Timestamps.between(start, end); - return Durations.toSecondsAsDouble(duration) + "s"; - } catch (InvalidProtocolBufferException e) { - System.out.println(e.toString()); - return "Unknown"; - } - } - - private String extractWorker(Operation operation) { - Any result = operation.getResponse(); - try { - return result.unpack(ExecuteResponse.class).getResult().getExecutionMetadata().getWorker(); - } catch (InvalidProtocolBufferException e) { - System.out.println(e.toString()); - return "Unknown"; - } - } - - private static ExecuteOperationMetadata expectExecuteOperationMetadata(Operation operation) { - String name = operation.getName(); - Any metadata = operation.getMetadata(); - QueuedOperationMetadata queuedOperationMetadata = maybeQueuedOperationMetadata(name, metadata); - if (queuedOperationMetadata != null) { - return queuedOperationMetadata.getExecuteOperationMetadata(); - } - ExecutingOperationMetadata executingOperationMetadata = - maybeExecutingOperationMetadata(name, metadata); - if (executingOperationMetadata != null) { - return executingOperationMetadata.getExecuteOperationMetadata(); - } - CompletedOperationMetadata completedOperationMetadata = - maybeCompletedOperationMetadata(name, metadata); - if (completedOperationMetadata != null) { - return completedOperationMetadata.getExecuteOperationMetadata(); - } - return ExecuteOperationMetadata.getDefaultInstance(); - } - - private static RequestMetadata expectRequestMetadata(Operation operation) { - String name = operation.getName(); - Any metadata = operation.getMetadata(); - QueuedOperationMetadata queuedOperationMetadata = maybeQueuedOperationMetadata(name, metadata); - if (queuedOperationMetadata != null) { - return queuedOperationMetadata.getRequestMetadata(); - } - ExecutingOperationMetadata executingOperationMetadata = - maybeExecutingOperationMetadata(name, metadata); 
- if (executingOperationMetadata != null) { - return executingOperationMetadata.getRequestMetadata(); - } - CompletedOperationMetadata completedOperationMetadata = - maybeCompletedOperationMetadata(name, metadata); - if (completedOperationMetadata != null) { - return completedOperationMetadata.getRequestMetadata(); - } - return RequestMetadata.getDefaultInstance(); - } - - private static QueuedOperationMetadata maybeQueuedOperationMetadata(String name, Any metadata) { - if (metadata.is(QueuedOperationMetadata.class)) { - try { - return metadata.unpack(QueuedOperationMetadata.class); - } catch (InvalidProtocolBufferException e) { - log.log(Level.SEVERE, String.format("invalid executing operation metadata %s", name), e); - } - } - return null; - } - - private static ExecutingOperationMetadata maybeExecutingOperationMetadata( - String name, Any metadata) { - if (metadata.is(ExecutingOperationMetadata.class)) { - try { - return metadata.unpack(ExecutingOperationMetadata.class); - } catch (InvalidProtocolBufferException e) { - log.log(Level.SEVERE, String.format("invalid executing operation metadata %s", name), e); - } - } - return null; - } - - private static CompletedOperationMetadata maybeCompletedOperationMetadata( - String name, Any metadata) { - if (metadata.is(CompletedOperationMetadata.class)) { - try { - return metadata.unpack(CompletedOperationMetadata.class); - } catch (InvalidProtocolBufferException e) { - log.log(Level.SEVERE, String.format("invalid completed operation metadata %s", name), e); - } - } - return null; - } -} diff --git a/src/main/java/build/buildfarm/server/services/AdminService.java b/src/main/java/build/buildfarm/server/services/AdminService.java deleted file mode 100644 index 968edc1572..0000000000 --- a/src/main/java/build/buildfarm/server/services/AdminService.java +++ /dev/null @@ -1,256 +0,0 @@ -// Copyright 2020 The Bazel Authors. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package build.buildfarm.server.services; - -import build.buildfarm.admin.Admin; -import build.buildfarm.admin.aws.AwsAdmin; -import build.buildfarm.admin.gcp.GcpAdmin; -import build.buildfarm.common.CasIndexResults; -import build.buildfarm.common.config.BuildfarmConfigs; -import build.buildfarm.instance.Instance; -import build.buildfarm.v1test.AdminGrpc; -import build.buildfarm.v1test.DisableScaleInProtectionRequest; -import build.buildfarm.v1test.DisableScaleInProtectionRequestResults; -import build.buildfarm.v1test.GetClientStartTimeRequest; -import build.buildfarm.v1test.GetClientStartTimeResult; -import build.buildfarm.v1test.GetHostsRequest; -import build.buildfarm.v1test.GetHostsResult; -import build.buildfarm.v1test.PrepareWorkerForGracefulShutDownRequest; -import build.buildfarm.v1test.ReindexCasRequest; -import build.buildfarm.v1test.ReindexCasRequestResults; -import build.buildfarm.v1test.ScaleClusterRequest; -import build.buildfarm.v1test.ShutDownWorkerGracefullyRequest; -import build.buildfarm.v1test.ShutDownWorkerGracefullyRequestResults; -import build.buildfarm.v1test.ShutDownWorkerGrpc; -import build.buildfarm.v1test.StopContainerRequest; -import build.buildfarm.v1test.TerminateHostRequest; -import com.google.rpc.Code; -import com.google.rpc.Status; -import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; -import io.grpc.stub.StreamObserver; -import java.util.logging.Level; -import lombok.extern.java.Log; - -@Log -public class AdminService extends AdminGrpc.AdminImplBase { - private final Admin adminController; - private final Instance instance; - - private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); - - public AdminService(Instance instance) { - this.adminController = getAdminController(); - this.instance = instance; - } - - @Override - public void terminateHost(TerminateHostRequest request, StreamObserver responseObserver) { - try { - if (adminController != null) { - adminController.terminateHost(request.getHostId()); - } - responseObserver.onNext(Status.newBuilder().setCode(Code.OK_VALUE).build()); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log(Level.SEVERE, "Could not terminate host.", e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - @Override - public void stopContainer(StopContainerRequest request, StreamObserver responseObserver) { - try { - if (adminController != null) { - adminController.stopContainer(request.getHostId(), request.getContainerName()); - } - responseObserver.onNext(Status.newBuilder().setCode(Code.OK_VALUE).build()); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log(Level.SEVERE, "Could not stop container.", e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - @Override - public void getHosts(GetHostsRequest request, StreamObserver responseObserver) { - try { - GetHostsResult result = null; - if (adminController != null) { - result = - adminController.getHosts( - request.getFilter(), request.getAgeInMinutes(), request.getStatus()); - } - responseObserver.onNext(result); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log(Level.SEVERE, "Could not get hosts.", e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - @Override - public void getClientStartTime( - GetClientStartTimeRequest request, - StreamObserver responseObserver) { - try { - GetClientStartTimeResult result = 
instance.getClientStartTime(request); - responseObserver.onNext(result); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log( - Level.SEVERE, - String.format("Could not get client start time for %s.", request.getInstanceName()), - e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - @Override - public void scaleCluster(ScaleClusterRequest request, StreamObserver responseObserver) { - try { - if (adminController != null) { - adminController.scaleCluster( - request.getScaleGroupName(), - request.getMinHosts(), - request.getMaxHosts(), - request.getTargetHosts(), - request.getTargetReservedHostsPercent()); - } - responseObserver.onNext(Status.newBuilder().setCode(Code.OK_VALUE).build()); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log(Level.SEVERE, "Could not scale cluster.", e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - @Override - public void reindexCas( - ReindexCasRequest request, StreamObserver responseObserver) { - try { - CasIndexResults results = instance.reindexCas(); - log.info(String.format("CAS Indexer Results: %s", results.toMessage())); - responseObserver.onNext( - ReindexCasRequestResults.newBuilder() - .setRemovedHosts(results.removedHosts) - .setRemovedKeys(results.removedKeys) - .setTotalKeys(results.totalKeys) - .build()); - responseObserver.onCompleted(); - } catch (Exception e) { - log.log(Level.SEVERE, "Could not reindex CAS.", e); - responseObserver.onError(io.grpc.Status.fromThrowable(e).asException()); - } - } - - /** - * Server-side implementation of ShutDownWorkerGracefully. This will reroute the request to target - * worker. - * - * @param request ShutDownWorkerGracefullyRequest received through grpc - * @param responseObserver grpc response observer - */ - @Override - public void shutDownWorkerGracefully( - ShutDownWorkerGracefullyRequest request, - StreamObserver responseObserver) { - try { - informWorkerToPrepareForShutdown(request.getWorkerName()); - responseObserver.onNext(ShutDownWorkerGracefullyRequestResults.newBuilder().build()); - responseObserver.onCompleted(); - } catch (Exception e) { - String errorMessage = - String.format( - "Could not inform the worker %s to prepare for graceful shutdown with error %s.", - request.getWorkerName(), e.getMessage()); - log.log(Level.SEVERE, errorMessage); - responseObserver.onError(new Exception(errorMessage)); - } - } - - /** - * Inform a worker to prepare for graceful shutdown. - * - * @param host the host that should be prepared for shutdown. - */ - @SuppressWarnings("ResultOfMethodCallIgnored") - private void informWorkerToPrepareForShutdown(String host) { - ManagedChannel channel = null; - try { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(host).negotiationType(NegotiationType.PLAINTEXT); - channel = builder.build(); - ShutDownWorkerGrpc.ShutDownWorkerBlockingStub shutDownWorkerBlockingStub = - ShutDownWorkerGrpc.newBlockingStub(channel); - shutDownWorkerBlockingStub.prepareWorkerForGracefulShutdown( - PrepareWorkerForGracefulShutDownRequest.newBuilder().build()); - } finally { - if (channel != null) { - channel.shutdown(); - } - } - } - - /** - * Server-side implementation of disableScaleInProtection. 
- * - * @param request grpc request - * @param responseObserver grpc response observer - */ - @Override - public void disableScaleInProtection( - DisableScaleInProtectionRequest request, - StreamObserver responseObserver) { - try { - String hostPrivateIp = trimHostPrivateDns(request.getInstanceName()); - adminController.disableHostScaleInProtection(hostPrivateIp); - responseObserver.onNext(DisableScaleInProtectionRequestResults.newBuilder().build()); - responseObserver.onCompleted(); - } catch (RuntimeException e) { - responseObserver.onError(e); - } - } - - /** - * The private dns get from worker might be suffixed with ":portNumber", which should be trimmed. - * - * @param hostPrivateIp the private dns should be trimmed. - * @return - */ - @SuppressWarnings("JavaDoc") - private String trimHostPrivateDns(String hostPrivateIp) { - String portSeparator = ":"; - if (hostPrivateIp.contains(portSeparator)) { - hostPrivateIp = hostPrivateIp.split(portSeparator)[0]; - } - return hostPrivateIp; - } - - private static Admin getAdminController() { - if (configs.getServer().getAdmin().getDeploymentEnvironment() == null) { - return null; - } - switch (configs.getServer().getAdmin().getDeploymentEnvironment()) { - default: - return null; - case AWS: - return new AwsAdmin(); - case GCP: - return new GcpAdmin(); - } - } -} diff --git a/src/main/java/build/buildfarm/server/services/BUILD b/src/main/java/build/buildfarm/server/services/BUILD index aff642ed7a..6bb44b94f8 100644 --- a/src/main/java/build/buildfarm/server/services/BUILD +++ b/src/main/java/build/buildfarm/server/services/BUILD @@ -4,17 +4,12 @@ java_library( plugins = ["//src/main/java/build/buildfarm/common:lombok"], visibility = ["//visibility:public"], deps = [ - "//src/main/java/build/buildfarm/admin", - "//src/main/java/build/buildfarm/admin/aws", - "//src/main/java/build/buildfarm/admin/gcp", "//src/main/java/build/buildfarm/common", "//src/main/java/build/buildfarm/common/config", "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/common/resources:resource_java_proto", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/metrics", - "//src/main/java/build/buildfarm/metrics/aws", - "//src/main/java/build/buildfarm/metrics/gcp", "//src/main/java/build/buildfarm/metrics/log", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_grpc", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", @@ -30,7 +25,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_services", "@maven//:io_grpc_grpc_stub", diff --git a/src/main/java/build/buildfarm/server/services/ExecutionService.java b/src/main/java/build/buildfarm/server/services/ExecutionService.java index 7f46c0bb3c..69799524fb 100644 --- a/src/main/java/build/buildfarm/server/services/ExecutionService.java +++ b/src/main/java/build/buildfarm/server/services/ExecutionService.java @@ -28,8 +28,6 @@ import build.buildfarm.common.grpc.TracingMetadataUtils; import build.buildfarm.instance.Instance; import build.buildfarm.metrics.MetricsPublisher; -import build.buildfarm.metrics.aws.AwsMetricsPublisher; -import build.buildfarm.metrics.gcp.GcpMetricsPublisher; import build.buildfarm.metrics.log.LogMetricsPublisher; import com.google.common.util.concurrent.FutureCallback; import com.google.common.util.concurrent.ListenableFuture; @@ -205,13 +203,6 @@ public void execute(ExecuteRequest 
request, StreamObserver responseOb } private static MetricsPublisher getMetricsPublisher() { - switch (configs.getServer().getMetrics().getPublisher()) { - default: - return new LogMetricsPublisher(); - case AWS: - return new AwsMetricsPublisher(); - case GCP: - return new GcpMetricsPublisher(); - } + return new LogMetricsPublisher(); } } diff --git a/src/main/java/build/buildfarm/server/services/OperationQueueService.java b/src/main/java/build/buildfarm/server/services/OperationQueueService.java index 6d3edb8178..11f6501667 100644 --- a/src/main/java/build/buildfarm/server/services/OperationQueueService.java +++ b/src/main/java/build/buildfarm/server/services/OperationQueueService.java @@ -28,9 +28,7 @@ import com.google.rpc.Code; import io.grpc.Status; import io.grpc.StatusRuntimeException; -import io.grpc.stub.ServerCallStreamObserver; import io.grpc.stub.StreamObserver; -import java.util.function.Consumer; public class OperationQueueService extends OperationQueueGrpc.OperationQueueImplBase { private final Instance instance; @@ -43,14 +41,11 @@ private static class OperationQueueMatchListener implements MatchListener { @SuppressWarnings("rawtypes") private final InterruptingPredicate onMatch; - private final Consumer setOnCancelHandler; private static final QueueEntry queueEntry = null; @SuppressWarnings("rawtypes") - OperationQueueMatchListener( - InterruptingPredicate onMatch, Consumer setOnCancelHandler) { + OperationQueueMatchListener(InterruptingPredicate onMatch) { this.onMatch = onMatch; - this.setOnCancelHandler = setOnCancelHandler; } @Override @@ -70,11 +65,6 @@ public void onError(Throwable t) { Throwables.throwIfUnchecked(t); throw new RuntimeException(t); } - - @Override - public void setOnCancelHandler(Runnable onCancelHandler) { - setOnCancelHandler.accept(onCancelHandler); - } } private InterruptingPredicate createOnMatch( @@ -97,14 +87,10 @@ private InterruptingPredicate createOnMatch( @Override public void take(TakeOperationRequest request, StreamObserver responseObserver) { - ServerCallStreamObserver callObserver = - (ServerCallStreamObserver) responseObserver; - try { instance.match( request.getPlatform(), - new OperationQueueMatchListener( - createOnMatch(instance, responseObserver), callObserver::setOnCancelHandler)); + new OperationQueueMatchListener(createOnMatch(instance, responseObserver))); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } diff --git a/src/main/java/build/buildfarm/tools/Ac.java b/src/main/java/build/buildfarm/tools/Ac.java index adfca598ff..9515fe0ec4 100644 --- a/src/main/java/build/buildfarm/tools/Ac.java +++ b/src/main/java/build/buildfarm/tools/Ac.java @@ -14,6 +14,8 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; + import build.bazel.remote.execution.v2.ActionResult; import build.buildfarm.common.DigestUtil; import build.buildfarm.common.DigestUtil.HashFunction; @@ -21,18 +23,10 @@ import build.buildfarm.instance.stub.StubInstance; import com.google.protobuf.ByteString; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; // This tool can be used to interact directly with the Action Cache API. 
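This Ac.java diff, and the tool diffs that follow, all delete an identical private createChannel helper in favor of the shared static import build.buildfarm.common.grpc.Channels.createChannel. The shared class itself is not shown in this patch; judging from the identical deleted bodies, it is presumably equivalent to:

```java
// Inferred shape of the consolidated helper; the real
// build.buildfarm.common.grpc.Channels may differ in detail.
package build.buildfarm.common.grpc;

import io.grpc.ManagedChannel;
import io.grpc.netty.NegotiationType;
import io.grpc.netty.NettyChannelBuilder;

public final class Channels {
  private Channels() {}

  public static ManagedChannel createChannel(String target) {
    // Plaintext negotiation, matching the per-tool helpers this replaces.
    return NettyChannelBuilder.forTarget(target)
        .negotiationType(NegotiationType.PLAINTEXT)
        .build();
  }
}
```

Consolidating the helper also lets each tool drop its direct @maven//:io_grpc_grpc_netty dependency, as the BUILD changes below show.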
// ./tool shard SHA256 class Ac { - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - public static void main(String[] args) throws Exception { // get arguments for establishing an instance String host = args[0]; diff --git a/src/main/java/build/buildfarm/tools/BUILD b/src/main/java/build/buildfarm/tools/BUILD index c2f9d94ece..44a4498ef9 100644 --- a/src/main/java/build/buildfarm/tools/BUILD +++ b/src/main/java/build/buildfarm/tools/BUILD @@ -5,6 +5,7 @@ java_binary( visibility = ["//visibility:public"], deps = [ "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/stub", "//src/main/java/build/buildfarm/worker", @@ -15,7 +16,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", ], @@ -57,6 +57,7 @@ java_binary( visibility = ["//visibility:public"], deps = [ "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/instance/stub", "@googleapis//:google_bytestream_bytestream_java_grpc", "@googleapis//:google_bytestream_bytestream_java_proto", @@ -67,7 +68,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_grpc", @@ -90,7 +90,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", @@ -108,6 +107,7 @@ java_binary( deps = [ ":worker-profiler-printer", "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/stub", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", @@ -119,7 +119,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", ], @@ -137,6 +136,7 @@ java_binary( ":worker-profiler-printer", "//src/main/java/build/buildfarm/common", "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/common/redis", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/shard", @@ -151,7 +151,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", ], @@ -168,6 +167,7 @@ java_binary( deps = [ ":worker-profiler-printer", "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/stub", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", @@ -179,22 +179,21 @@ java_binary( "@maven//:io_grpc_grpc_api", 
"@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", ], ) java_binary( - name = "GracefulShutdownTest", - srcs = ["GracefulShutdownTest.java"], - main_class = "build.buildfarm.tools.GracefulShutdownTest", + name = "GracefulShutdown", + srcs = ["GracefulShutdown.java"], + main_class = "build.buildfarm.tools.GracefulShutdown", visibility = ["//visibility:public"], deps = [ + "//src/main/java/build/buildfarm/common/grpc", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_grpc", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", "@maven//:io_grpc_grpc_api", - "@maven//:io_grpc_grpc_netty", ], ) @@ -205,6 +204,7 @@ java_binary( visibility = ["//visibility:public"], deps = [ "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/stub", "@googleapis//:google_longrunning_operations_java_proto", @@ -214,7 +214,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", @@ -228,6 +227,7 @@ java_binary( visibility = ["//visibility:public"], deps = [ "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/stub", "@googleapis//:google_longrunning_operations_java_proto", @@ -237,7 +237,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", @@ -251,6 +250,7 @@ java_binary( visibility = ["//visibility:public"], deps = [ "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/stub", "@googleapis//:google_longrunning_operations_java_proto", @@ -260,7 +260,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", @@ -281,12 +280,12 @@ java_binary( visibility = ["//visibility:public"], deps = [ "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/stub", "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", ], @@ -313,7 +312,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", ], diff --git a/src/main/java/build/buildfarm/tools/Cancel.java b/src/main/java/build/buildfarm/tools/Cancel.java index 5945df708e..24805034dc 100644 --- a/src/main/java/build/buildfarm/tools/Cancel.java +++ b/src/main/java/build/buildfarm/tools/Cancel.java @@ -14,20 +14,14 @@ 
package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; + import build.buildfarm.common.DigestUtil; import build.buildfarm.instance.Instance; import build.buildfarm.instance.stub.StubInstance; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; class Cancel { - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - public static void main(String[] args) throws Exception { String host = args[0]; String instanceName = args[1]; diff --git a/src/main/java/build/buildfarm/tools/Cat.java b/src/main/java/build/buildfarm/tools/Cat.java index 399ca7f1fb..bca404b3c1 100644 --- a/src/main/java/build/buildfarm/tools/Cat.java +++ b/src/main/java/build/buildfarm/tools/Cat.java @@ -14,6 +14,7 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; import static build.buildfarm.instance.Utils.getBlob; import static com.google.common.util.concurrent.MoreExecutors.shutdownAndAwaitTermination; import static java.lang.String.format; @@ -68,8 +69,6 @@ import io.grpc.Context; import io.grpc.ManagedChannel; import io.grpc.Status; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -86,12 +85,6 @@ import java.util.stream.StreamSupport; class Cat { - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - private static void printCapabilities(ServerCapabilities capabilities) { System.out.println(capabilities); } @@ -657,6 +650,9 @@ private static void getWorkerProfile(Instance instance) { private static void printStageInformation(StageInformation stage) { System.out.printf("%s slots configured: %d%n", stage.getName(), stage.getSlotsConfigured()); System.out.printf("%s slots used %d%n", stage.getName(), stage.getSlotsUsed()); + for (String operationName : stage.getOperationNamesList()) { + System.out.printf("%s operation %s\n", stage.getName(), operationName); + } } private static void printOperationTime(OperationTimesBetweenStages time) { diff --git a/src/main/java/build/buildfarm/tools/Executor.java b/src/main/java/build/buildfarm/tools/Executor.java index 77ca4002c1..4f2f9f3d6f 100644 --- a/src/main/java/build/buildfarm/tools/Executor.java +++ b/src/main/java/build/buildfarm/tools/Executor.java @@ -15,6 +15,7 @@ package build.buildfarm.tools; import static build.bazel.remote.execution.v2.ExecutionStage.Value.EXECUTING; +import static build.buildfarm.common.grpc.Channels.createChannel; import static build.buildfarm.common.io.Utils.stat; import static build.buildfarm.instance.stub.ByteStreamUploader.uploadResourceName; import static com.google.common.base.Preconditions.checkState; @@ -53,8 +54,6 @@ import com.google.rpc.Code; import io.grpc.Channel; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; import io.grpc.stub.StreamObserver; import java.io.IOException; import java.io.InputStream; @@ -223,12 +222,6 @@ static void executeActions( shutdownAndAwaitTermination(service, 1, SECONDS); } - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - 
NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - private static void loadFilesIntoCAS(String instanceName, Channel channel, Path blobsDir) throws Exception { ContentAddressableStorageBlockingStub casStub = diff --git a/src/main/java/build/buildfarm/tools/Extract.java b/src/main/java/build/buildfarm/tools/Extract.java index e4de193ae2..fed81ac267 100644 --- a/src/main/java/build/buildfarm/tools/Extract.java +++ b/src/main/java/build/buildfarm/tools/Extract.java @@ -14,6 +14,7 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; import static com.google.common.util.concurrent.MoreExecutors.listeningDecorator; import static java.util.concurrent.Executors.newSingleThreadExecutor; import static java.util.concurrent.Executors.newSingleThreadScheduledExecutor; @@ -41,8 +42,6 @@ import io.grpc.ManagedChannel; import io.grpc.Status; import io.grpc.Status.Code; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; import io.grpc.stub.StreamObserver; import java.io.IOException; import java.io.InputStream; @@ -61,12 +60,6 @@ import java.util.concurrent.atomic.AtomicLong; class Extract { - static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - public static void main(String[] args) throws Exception { String host = args[0]; String instanceName = args[1]; diff --git a/src/main/java/build/buildfarm/tools/FindOperations.java b/src/main/java/build/buildfarm/tools/FindOperations.java index c858f121f6..f1d8494dff 100644 --- a/src/main/java/build/buildfarm/tools/FindOperations.java +++ b/src/main/java/build/buildfarm/tools/FindOperations.java @@ -14,14 +14,14 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; + import build.buildfarm.common.DigestUtil; import build.buildfarm.instance.Instance; import build.buildfarm.instance.stub.StubInstance; import com.google.common.collect.ImmutableList; import com.google.longrunning.Operation; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; // This tool can be used to find Operations based on their particular properties. // For example, it could find all of the operations executed by a particular user or particular @@ -29,12 +29,6 @@ // ./tool shard SHA256 // The operations that match the query will be printed. 
class FindOperations { - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - public static void main(String[] args) throws Exception { // get arguments for establishing an instance String host = args[0]; diff --git a/src/main/java/build/buildfarm/tools/GracefulShutdownTest.java b/src/main/java/build/buildfarm/tools/GracefulShutdown.java similarity index 86% rename from src/main/java/build/buildfarm/tools/GracefulShutdownTest.java rename to src/main/java/build/buildfarm/tools/GracefulShutdown.java index f98cddbde4..689edfeff1 100644 --- a/src/main/java/build/buildfarm/tools/GracefulShutdownTest.java +++ b/src/main/java/build/buildfarm/tools/GracefulShutdown.java @@ -14,24 +14,18 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; + import build.buildfarm.v1test.AdminGrpc; import build.buildfarm.v1test.DisableScaleInProtectionRequest; import build.buildfarm.v1test.PrepareWorkerForGracefulShutDownRequest; import build.buildfarm.v1test.ShutDownWorkerGracefullyRequest; import build.buildfarm.v1test.ShutDownWorkerGrpc; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; - -class GracefulShutdownTest { - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } +class GracefulShutdown { /** - * Example command: GracefulShutdownTest ShutDown workerIp buildfarm-endpoint + * Example command: GracefulShutdown ShutDown workerIp buildfarm-endpoint * * @param args */ @@ -54,7 +48,7 @@ private static void shutDownGracefully(String[] args) { } /** - * Example command: GracefulShutdownTest PrepareWorker WorkerIp:port + * Example command: GracefulShutdown PrepareWorker WorkerIp:port * * @param args */ @@ -71,7 +65,7 @@ private static void prepareWorkerForShutDown(String[] args) { } /** - * Example command: GracefulShutdownTest DisableProtection WorkerIp buildfarm_endpoint + * Example command: GracefulShutdown DisableProtection WorkerIp buildfarm_endpoint * * @param args */ diff --git a/src/main/java/build/buildfarm/tools/Hist.java b/src/main/java/build/buildfarm/tools/Hist.java index c8ec6c2bfa..2abdf55f7d 100644 --- a/src/main/java/build/buildfarm/tools/Hist.java +++ b/src/main/java/build/buildfarm/tools/Hist.java @@ -14,6 +14,8 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; + import build.bazel.remote.execution.v2.ExecuteOperationMetadata; import build.bazel.remote.execution.v2.ExecutionStage; import build.buildfarm.common.DigestUtil; @@ -23,16 +25,8 @@ import com.google.longrunning.Operation; import com.google.protobuf.InvalidProtocolBufferException; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; class Hist { - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - @SuppressWarnings("ConstantConditions") private static void printHistogramValue(int executing) { StringBuilder s = new StringBuilder(); diff --git a/src/main/java/build/buildfarm/tools/IndexWorker.java b/src/main/java/build/buildfarm/tools/IndexWorker.java index 
a36e3f9217..317a5ff637 100644 --- a/src/main/java/build/buildfarm/tools/IndexWorker.java +++ b/src/main/java/build/buildfarm/tools/IndexWorker.java @@ -14,25 +14,19 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; + import build.buildfarm.common.CasIndexResults; import build.buildfarm.common.DigestUtil; import build.buildfarm.instance.Instance; import build.buildfarm.instance.stub.StubInstance; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; // This tool can be used to remove worker entries from the CAS. // This is usually done via the admin service when a worker is departing from the cluster. // ./tool shard SHA256 // The results of the removal are printed after the CAS entries have been removed. class IndexWorker { - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - public static void main(String[] args) throws Exception { String host = args[0]; String instanceName = args[1]; diff --git a/src/main/java/build/buildfarm/tools/Mount.java b/src/main/java/build/buildfarm/tools/Mount.java index 43061d12bc..a0a4528d22 100644 --- a/src/main/java/build/buildfarm/tools/Mount.java +++ b/src/main/java/build/buildfarm/tools/Mount.java @@ -14,6 +14,7 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; import static build.buildfarm.instance.Utils.getBlob; import static com.google.common.base.Preconditions.checkArgument; @@ -27,8 +28,6 @@ import build.buildfarm.worker.FuseCAS; import com.google.protobuf.ByteString; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; import java.io.IOException; import java.io.InputStream; import java.nio.file.Path; @@ -37,12 +36,6 @@ import java.util.Map; class Mount { - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - @SuppressWarnings("BusyWait") public static void main(String[] args) throws Exception { String host = args[0]; diff --git a/src/main/java/build/buildfarm/tools/WorkerProfile.java b/src/main/java/build/buildfarm/tools/WorkerProfile.java index d820446a53..69dbb8cdee 100644 --- a/src/main/java/build/buildfarm/tools/WorkerProfile.java +++ b/src/main/java/build/buildfarm/tools/WorkerProfile.java @@ -14,6 +14,8 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; + import build.buildfarm.common.DigestUtil; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.common.config.ShardWorkerOptions; @@ -30,10 +32,7 @@ import com.google.protobuf.InvalidProtocolBufferException; import com.google.protobuf.util.Durations; import com.google.protobuf.util.JsonFormat; -import io.grpc.ManagedChannel; import io.grpc.StatusRuntimeException; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; import java.io.IOException; import java.nio.file.Paths; import java.util.HashMap; @@ -46,12 +45,6 @@ class WorkerProfile { private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - 
NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - /** * Transform worker string from "ip-10-135-31-210.ec2:8981" to "10.135.31.210". * @@ -116,7 +109,7 @@ private static Set getWorkers(String[] args) throws ConfigurationExcepti } catch (IOException e) { System.out.println("Could not parse yml configuration file." + e); } - RedisClient client = new RedisClient(JedisClusterFactory.create().get()); + RedisClient client = new RedisClient(JedisClusterFactory.create("worker-profile").get()); return client.call(jedis -> fetchWorkers(jedis, System.currentTimeMillis())); } diff --git a/src/main/java/build/buildfarm/worker/BUILD b/src/main/java/build/buildfarm/worker/BUILD index 417d530e9a..e24b50d731 100644 --- a/src/main/java/build/buildfarm/worker/BUILD +++ b/src/main/java/build/buildfarm/worker/BUILD @@ -32,11 +32,11 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@maven//:io_prometheus_simpleclient", "@maven//:org_apache_commons_commons_compress", + "@maven//:org_apache_commons_commons_lang3", "@maven//:org_jetbrains_annotations", "@maven//:org_projectlombok_lombok", ], diff --git a/src/main/java/build/buildfarm/worker/DequeueMatchEvaluator.java b/src/main/java/build/buildfarm/worker/DequeueMatchEvaluator.java index c3c4c1dfc9..3536ed6bac 100644 --- a/src/main/java/build/buildfarm/worker/DequeueMatchEvaluator.java +++ b/src/main/java/build/buildfarm/worker/DequeueMatchEvaluator.java @@ -14,11 +14,12 @@ package build.buildfarm.worker; -import build.bazel.remote.execution.v2.Command; import build.bazel.remote.execution.v2.Platform; import build.buildfarm.common.ExecutionProperties; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.v1test.QueueEntry; +import build.buildfarm.worker.resources.LocalResourceSet; +import build.buildfarm.worker.resources.LocalResourceSetUtils; import com.google.common.collect.Iterables; import com.google.common.collect.SetMultimap; import org.jetbrains.annotations.NotNull; @@ -45,6 +46,7 @@ public class DequeueMatchEvaluator { * @brief Decide whether the worker should keep the operation or put it back on the queue. * @details Compares the platform properties of the worker to the operation's platform properties. * @param workerProvisions The provisions of the worker. + * @param resourceSet The limited resources that the worker has available. * @param queueEntry An entry recently removed from the queue. * @return Whether or not the worker should accept or reject the queue entry. * @note Overloaded. @@ -53,24 +55,10 @@ public class DequeueMatchEvaluator { @SuppressWarnings("NullableProblems") @NotNull public static boolean shouldKeepOperation( - SetMultimap workerProvisions, QueueEntry queueEntry) { - return shouldKeepViaPlatform(workerProvisions, queueEntry.getPlatform()); - } - - /** - * @brief Decide whether the worker should keep the operation or put it back on the queue. - * @details Compares the platform properties of the worker to the operation's platform properties. - * @param workerProvisions The provisions of the worker. - * @param command A command to evaluate. - * @return Whether or not the worker should accept or reject the queue entry. - * @note Overloaded. - * @note Suggested return identifier: shouldKeepOperation. 
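The DequeueMatchEvaluator change below gates acceptance of a dequeued entry on LocalResourceSetUtils.claimResources, putting the entry back on the queue when a claim fails. Neither LocalResourceSet nor LocalResourceSetUtils appears in this patch, so the following is only a counting-semaphore sketch of the claim-or-requeue idea; every name in it is hypothetical:

```java
// Hypothetical claim/return pattern for limited worker resources: take one
// permit per requested resource, rolling back on failure so the queue entry
// can be requeued without leaking permits.
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;

final class LocalResourceSketch {
  private final Map<String, Semaphore> pools = new ConcurrentHashMap<>();

  LocalResourceSketch(Map<String, Integer> capacities) {
    capacities.forEach((name, limit) -> pools.put(name, new Semaphore(limit)));
  }

  boolean claim(List<String> requested) {
    List<String> acquired = new ArrayList<>();
    for (String name : requested) {
      Semaphore pool = pools.get(name);
      if (pool == null || !pool.tryAcquire()) {
        // Roll back partial claims; the caller returns the entry to the queue.
        acquired.forEach(held -> pools.get(held).release());
        return false;
      }
      acquired.add(name);
    }
    return true;
  }

  void release(List<String> held) {
    held.forEach(name -> pools.get(name).release());
  }
}
```

Whatever the real implementation looks like, the Executor.java change below confirms the other half of the contract: claims are handed back via workerContext.returnLocalResources once execution finishes.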
- */ - @SuppressWarnings("NullableProblems") - @NotNull - public static boolean shouldKeepOperation( - SetMultimap<String, String> workerProvisions, Command command) { - return shouldKeepViaPlatform(workerProvisions, command.getPlatform()); + SetMultimap<String, String> workerProvisions, + LocalResourceSet resourceSet, + QueueEntry queueEntry) { + return shouldKeepViaPlatform(workerProvisions, resourceSet, queueEntry.getPlatform()); } /** @@ -79,6 +67,7 @@ public static boolean shouldKeepOperation( * @details Compares the platform properties of the worker to the platform properties of the * operation. * @param workerProvisions The provisions of the worker. + * @param resourceSet The limited resources that the worker has available. * @param platform The platforms of operation. * @return Whether or not the worker should accept or reject the operation. * @note Suggested return identifier: shouldKeepOperation. @@ -86,9 +75,15 @@ public static boolean shouldKeepOperation( @SuppressWarnings("NullableProblems") @NotNull private static boolean shouldKeepViaPlatform( - SetMultimap<String, String> workerProvisions, Platform platform) { - // attempt to execute everything the worker gets off the queue. + SetMultimap<String, String> workerProvisions, + LocalResourceSet resourceSet, + Platform platform) { + // attempt to execute everything the worker gets off the queue, + // provided there are enough resources to do so. // this is a recommended configuration. + if (!LocalResourceSetUtils.claimResources(platform, resourceSet)) { + return false; + } if (configs.getWorker().getDequeueMatchSettings().isAcceptEverything()) { return true; } diff --git a/src/main/java/build/buildfarm/worker/ExecuteActionStage.java b/src/main/java/build/buildfarm/worker/ExecuteActionStage.java index 6c5e247a8f..f1ad546676 100644 --- a/src/main/java/build/buildfarm/worker/ExecuteActionStage.java +++ b/src/main/java/build/buildfarm/worker/ExecuteActionStage.java @@ -107,7 +107,7 @@ public void releaseExecutor( int slotUsage = removeAndRelease(operationName, claims); executionTime.observe(usecs / 1000.0); executionStallTime.observe(stallUSecs / 1000.0); - logComplete( + complete( operationName, usecs, stallUSecs, @@ -141,7 +141,7 @@ protected void iterate() throws InterruptedException { executors.add(executorThread); int slotUsage = executorClaims.addAndGet(limits.cpu.claimed); executionSlotUsage.set(slotUsage); - logStart(operationContext.operation.getName(), getUsage(slotUsage)); + start(operationContext.operation.getName(), getUsage(slotUsage)); executorThread.start(); } } diff --git a/src/main/java/build/buildfarm/worker/Executor.java b/src/main/java/build/buildfarm/worker/Executor.java index 588d38c208..f4776a6f00 100644 --- a/src/main/java/build/buildfarm/worker/Executor.java +++ b/src/main/java/build/buildfarm/worker/Executor.java @@ -199,7 +199,7 @@ private long executePolled( Stopwatch stopwatch) throws InterruptedException { /* execute command */ - log.log(Level.FINE, "Executor: Operation " + operation.getName() + " Executing command"); + log.log(Level.FINER, "Executor: Operation " + operation.getName() + " Executing command"); ActionResult.Builder resultBuilder = operationContext.executeResponse.getResultBuilder(); resultBuilder @@ -291,7 +291,7 @@ private long executePolled( long executeUSecs = stopwatch.elapsed(MICROSECONDS); log.log( - Level.FINE, + Level.FINER, String.format( "Executor::executeCommand(%s): Completed command: exit code %d", operationName, resultBuilder.getExitCode())); @@ -309,7 +309,7 @@ private long executePolled( throw e; } } else { - log.log(Level.FINE,
"Executor: Operation " + operationName + " Failed to claim output"); + log.log(Level.FINER, "Executor: Operation " + operationName + " Failed to claim output"); boolean wasInterrupted = Thread.interrupted(); try { putError(); @@ -359,6 +359,9 @@ public void run(ResourceLimits limits) { } finally { boolean wasInterrupted = Thread.interrupted(); try { + // Now that the execution has finished we can return any of the claims against local + // resources. + workerContext.returnLocalResources(operationContext.queueEntry); owner.releaseExecutor( operationName, limits.cpu.claimed, diff --git a/src/main/java/build/buildfarm/worker/InputFetchStage.java b/src/main/java/build/buildfarm/worker/InputFetchStage.java index 6b0f741fd8..3953ef9595 100644 --- a/src/main/java/build/buildfarm/worker/InputFetchStage.java +++ b/src/main/java/build/buildfarm/worker/InputFetchStage.java @@ -72,13 +72,14 @@ public void releaseInputFetcher( int size = removeAndRelease(operationName); inputFetchTime.observe(usecs / 1000.0); inputFetchStallTime.observe(stallUSecs / 1000.0); - logComplete( + complete( operationName, usecs, stallUSecs, String.format("%s, %s", success ? "Success" : "Failure", getUsage(size))); } + @Override public int getSlotUsage() { return fetchers.size(); } @@ -106,8 +107,7 @@ protected void iterate() throws InterruptedException { fetchers.add(fetcher); int slotUsage = fetchers.size(); inputFetchSlotUsage.set(slotUsage); - logStart( - operationContext.queueEntry.getExecuteEntry().getOperationName(), getUsage(slotUsage)); + start(operationContext.queueEntry.getExecuteEntry().getOperationName(), getUsage(slotUsage)); fetcher.start(); } } diff --git a/src/main/java/build/buildfarm/worker/InputFetcher.java b/src/main/java/build/buildfarm/worker/InputFetcher.java index 1c99df6a91..7be7a29e96 100644 --- a/src/main/java/build/buildfarm/worker/InputFetcher.java +++ b/src/main/java/build/buildfarm/worker/InputFetcher.java @@ -168,7 +168,7 @@ static String getExecutablePath( @VisibleForTesting long fetchPolled(Stopwatch stopwatch) throws InterruptedException { String operationName = operationContext.queueEntry.getExecuteEntry().getOperationName(); - log.log(Level.FINE, format("fetching inputs: %s", operationName)); + log.log(Level.FINER, format("fetching inputs: %s", operationName)); ExecutedActionMetadata.Builder executedAction = operationContext @@ -278,7 +278,7 @@ private void proceedToOutput(Action action, Command command, Path execDir) } } else { String operationName = operationContext.queueEntry.getExecuteEntry().getOperationName(); - log.log(Level.FINE, "InputFetcher: Operation " + operationName + " Failed to claim output"); + log.log(Level.FINER, "InputFetcher: Operation " + operationName + " Failed to claim output"); owner.error().put(operationContext); } diff --git a/src/main/java/build/buildfarm/worker/MatchStage.java b/src/main/java/build/buildfarm/worker/MatchStage.java index e245ee68d6..663aee3c52 100644 --- a/src/main/java/build/buildfarm/worker/MatchStage.java +++ b/src/main/java/build/buildfarm/worker/MatchStage.java @@ -95,20 +95,15 @@ public void onError(Throwable t) { throw new RuntimeException(t); } - @Override - public void setOnCancelHandler(Runnable onCancelHandler) { - // never called, only blocking stub used - } - @SuppressWarnings("SameReturnValue") private boolean onOperationPolled() throws InterruptedException { String operationName = operationContext.queueEntry.getExecuteEntry().getOperationName(); - logStart(operationName); + start(operationName); long matchingAtUSecs = 
stopwatch.elapsed(MICROSECONDS);
OperationContext matchedOperationContext = match(operationContext);
long matchedInUSecs = stopwatch.elapsed(MICROSECONDS) - matchingAtUSecs;
- logComplete(operationName, matchedInUSecs, waitDuration, true);
+ complete(operationName, matchedInUSecs, waitDuration, true);
matchedOperationContext.poller.pause();
try {
output.put(matchedOperationContext);
@@ -139,7 +134,7 @@ protected void iterate() throws InterruptedException {
}
MatchOperationListener listener = new MatchOperationListener(operationContext, stopwatch);
try {
- logStart();
+ start();
workerContext.match(listener);
} finally {
if (!listener.wasMatched()) {
diff --git a/src/main/java/build/buildfarm/worker/OperationContext.java b/src/main/java/build/buildfarm/worker/OperationContext.java
index e07649d03a..71b1975783 100644
--- a/src/main/java/build/buildfarm/worker/OperationContext.java
+++ b/src/main/java/build/buildfarm/worker/OperationContext.java
@@ -74,11 +74,6 @@ private Builder(
this.queueEntry = queueEntry;
}
- public Builder setExecuteResponseBuilder(ExecuteResponse.Builder executeResponse) {
- this.executeResponse = executeResponse;
- return this;
- }
- public Builder setOperation(Operation operation) {
this.operation = operation;
return this;
}
diff --git a/src/main/java/build/buildfarm/worker/OutputDirectory.java b/src/main/java/build/buildfarm/worker/OutputDirectory.java
index 6245553803..caf77b273e 100644
--- a/src/main/java/build/buildfarm/worker/OutputDirectory.java
+++ b/src/main/java/build/buildfarm/worker/OutputDirectory.java
@@ -161,10 +161,15 @@ private static OutputDirectory parseDirectories(Iterable o
Iterables.addAll(sortedOutputDirs, outputDirs);
Collections.sort(sortedOutputDirs);
+ String currentOutputDir = "";
Builder currentBuilder = builder;
String prefix = "/";
for (OutputDirectoryEntry entry : sortedOutputDirs) {
String outputDir = entry.outputDirectory;
+ if (outputDir.equals(currentOutputDir)) {
+ continue;
+ }
+ currentOutputDir = outputDir;
while (!outputDir.startsWith(prefix)) {
currentBuilder = stack.pop();
int upPathSeparatorIndex = prefix.lastIndexOf('/', prefix.length() - 2);
diff --git a/src/main/java/build/buildfarm/worker/Pipeline.java b/src/main/java/build/buildfarm/worker/Pipeline.java
index 0ed4e7b40a..0228f883ac 100644
--- a/src/main/java/build/buildfarm/worker/Pipeline.java
+++ b/src/main/java/build/buildfarm/worker/Pipeline.java
@@ -44,14 +44,7 @@ public void add(PipelineStage stage, int closePriority) {
stageClosePriorities.put(stage, closePriority);
}
- /**
- * Start the pipeline.
- *
- * <p>
You can provide callback which is invoked when any stage has an uncaught exception, for - * instance to shutdown the worker gracefully - */ - public void start(SettableFuture uncaughtExceptionFuture) { - stageThreadGroup.setUncaughtExceptionFuture(uncaughtExceptionFuture); + public void start() { for (Thread stageThread : stageThreads.values()) { stageThread.start(); } @@ -70,9 +63,12 @@ public void close() throws InterruptedException { /** Inform MatchStage to stop matching or picking up new Operations from queue. */ public void stopMatchingOperations() { - for (PipelineStage stage : stageClosePriorities.keySet()) { + for (Map.Entry entry : stageThreads.entrySet()) { + PipelineStage stage = entry.getKey(); if (stage instanceof MatchStage) { - ((MatchStage) stage).prepareForGracefulShutdown(); + MatchStage matchStage = (MatchStage) stage; + matchStage.prepareForGracefulShutdown(); + entry.getValue().interrupt(); return; } } @@ -143,15 +139,17 @@ private void join(boolean closeStage) throws InterruptedException { } } if (stageToClose != null && !stageToClose.isClosed()) { - log.log(Level.FINE, "Closing stage at priority " + maxPriority); + log.log(Level.FINER, "Closing stage " + stageToClose + " at priority " + maxPriority); stageToClose.close(); } } + boolean longStageWait = !closeStage; for (Map.Entry stageThread : stageThreads.entrySet()) { PipelineStage stage = stageThread.getKey(); Thread thread = stageThread.getValue(); try { - thread.join(closeStage ? 1 : 1000); + // 0 is wait forever, no instant wait + thread.join(longStageWait ? 1000 : 1); } catch (InterruptedException e) { if (!closeStage) { synchronized (this) { @@ -166,7 +164,7 @@ private void join(boolean closeStage) throws InterruptedException { if (!thread.isAlive()) { log.log( - Level.FINE, + Level.FINER, "Stage " + stage.name() + " has exited at priority " @@ -181,8 +179,8 @@ private void join(boolean closeStage) throws InterruptedException { + stageClosePriorities.get(stage)); thread.interrupt(); } + longStageWait = false; } - closeStage = false; for (PipelineStage stage : inactiveStages) { synchronized (this) { stageThreads.remove(stage); diff --git a/src/main/java/build/buildfarm/worker/PipelineStage.java b/src/main/java/build/buildfarm/worker/PipelineStage.java index da34172a2a..ecc78f8e7e 100644 --- a/src/main/java/build/buildfarm/worker/PipelineStage.java +++ b/src/main/java/build/buildfarm/worker/PipelineStage.java @@ -14,11 +14,13 @@ package build.buildfarm.worker; +import static java.lang.String.format; import static java.util.concurrent.TimeUnit.MICROSECONDS; import com.google.common.base.Stopwatch; import java.util.logging.Level; import java.util.logging.Logger; +import javax.annotation.Nullable; public abstract class PipelineStage implements Runnable { protected final String name; @@ -30,6 +32,7 @@ public abstract class PipelineStage implements Runnable { private volatile boolean closed = false; private Thread tickThread = null; private boolean tickCancelledFlag = false; + private String operationName = null; PipelineStage( String name, WorkerContext workerContext, PipelineStage output, PipelineStage error) { @@ -39,18 +42,29 @@ public abstract class PipelineStage implements Runnable { this.error = error; } + public String getName() { + return name; + } + private void runInterruptible() throws InterruptedException { while (!output.isClosed() || isClaimed()) { iterate(); } } + public @Nullable String getOperationName() { + return operationName; + } + @Override public void run() { try { runInterruptible(); } catch 
(InterruptedException e) { // ignore + } catch (Exception e) { + getLogger() + .log(Level.SEVERE, format("%s::run(): stage terminated due to exception", name), e); } finally { boolean wasInterrupted = Thread.interrupted(); try { @@ -90,7 +104,7 @@ protected void iterate() throws InterruptedException { Stopwatch stopwatch = Stopwatch.createUnstarted(); try { operationContext = take(); - logStart(operationContext.operation.getName()); + start(operationContext.operation.getName()); stopwatch.start(); boolean valid = false; tickThread = Thread.currentThread(); @@ -124,35 +138,38 @@ protected void iterate() throws InterruptedException { } after(operationContext); long usecs = stopwatch.elapsed(MICROSECONDS); - logComplete( - operationContext.operation.getName(), usecs, stallUSecs, nextOperationContext != null); + complete(operationName, usecs, stallUSecs, nextOperationContext != null); + operationName = null; } private String logIterateId(String operationName) { - return String.format("%s::iterate(%s)", name, operationName); + return format("%s::iterate(%s)", name, operationName); } - protected void logStart() { - logStart(""); + protected void start() { + start(""); } - protected void logStart(String operationName) { - logStart(operationName, "Starting"); + protected void start(String operationName) { + start(operationName, "Starting"); } - protected void logStart(String operationName, String message) { - getLogger().log(Level.FINE, String.format("%s: %s", logIterateId(operationName), message)); + protected void start(String operationName, String message) { + // TODO to unary stage + this.operationName = operationName; + getLogger().log(Level.FINER, format("%s: %s", logIterateId(operationName), message)); } - protected void logComplete(String operationName, long usecs, long stallUSecs, boolean success) { - logComplete(operationName, usecs, stallUSecs, success ? "Success" : "Failed"); + protected void complete(String operationName, long usecs, long stallUSecs, boolean success) { + complete(operationName, usecs, stallUSecs, success ? 
"Success" : "Failed"); } - protected void logComplete(String operationName, long usecs, long stallUSecs, String status) { + protected void complete(String operationName, long usecs, long stallUSecs, String status) { + this.operationName = operationName; getLogger() .log( - Level.FINE, - String.format( + Level.FINER, + format( "%s: %g ms (%g ms stalled) %s", logIterateId(operationName), usecs / 1000.0f, stallUSecs / 1000.0f, status)); } diff --git a/src/main/java/build/buildfarm/worker/SuperscalarPipelineStage.java b/src/main/java/build/buildfarm/worker/SuperscalarPipelineStage.java index 64636d72c2..ed7e610a10 100644 --- a/src/main/java/build/buildfarm/worker/SuperscalarPipelineStage.java +++ b/src/main/java/build/buildfarm/worker/SuperscalarPipelineStage.java @@ -14,17 +14,21 @@ package build.buildfarm.worker; +import java.util.HashSet; +import java.util.Set; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.TimeUnit; import java.util.logging.Level; -abstract class SuperscalarPipelineStage extends PipelineStage { +public abstract class SuperscalarPipelineStage extends PipelineStage { protected final int width; @SuppressWarnings("rawtypes") protected final BlockingQueue claims; + protected Set operationNames = new HashSet<>(); + private volatile boolean catastrophic = false; // ensure that only a single claim waits for available slots for core count @@ -46,6 +50,39 @@ public SuperscalarPipelineStage( protected abstract int claimsRequired(OperationContext operationContext); + @Override + public String getOperationName() { + throw new UnsupportedOperationException("use getOperationNames on superscalar stages"); + } + + public int getWidth() { + return width; + } + + public abstract int getSlotUsage(); + + public Iterable getOperationNames() { + synchronized (operationNames) { + return new HashSet(operationNames); + } + } + + @Override + protected void start(String operationName, String message) { + synchronized (operationNames) { + operationNames.add(operationName); + } + super.start(operationName, message); + } + + @Override + protected void complete(String operationName, long usecs, long stallUSecs, String status) { + super.complete(operationName, usecs, stallUSecs, status); + synchronized (operationNames) { + operationNames.remove(operationName); + } + } + synchronized void waitForReleaseOrCatastrophe(BlockingQueue queue) { boolean interrupted = false; while (!catastrophic && isClaimed()) { diff --git a/src/main/java/build/buildfarm/worker/WorkerContext.java b/src/main/java/build/buildfarm/worker/WorkerContext.java index 873ad1b938..70060acea1 100644 --- a/src/main/java/build/buildfarm/worker/WorkerContext.java +++ b/src/main/java/build/buildfarm/worker/WorkerContext.java @@ -130,4 +130,6 @@ IOResource limitExecution( int commandExecutionClaims(Command command); ResourceLimits commandExecutionSettings(Command command); + + void returnLocalResources(QueueEntry queueEntry); } diff --git a/src/main/java/build/buildfarm/worker/resources/BUILD b/src/main/java/build/buildfarm/worker/resources/BUILD index ac2d69179b..36a4dc3d34 100644 --- a/src/main/java/build/buildfarm/worker/resources/BUILD +++ b/src/main/java/build/buildfarm/worker/resources/BUILD @@ -8,6 +8,7 @@ java_library( "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", "@maven//:com_google_guava_guava", "@maven//:com_googlecode_json_simple_json_simple", + "@maven//:io_prometheus_simpleclient", "@maven//:org_apache_commons_commons_lang3", ], ) diff 
--git a/src/main/java/build/buildfarm/worker/resources/LocalResourceSet.java b/src/main/java/build/buildfarm/worker/resources/LocalResourceSet.java
new file mode 100644
index 0000000000..97d64fe9b1
--- /dev/null
+++ b/src/main/java/build/buildfarm/worker/resources/LocalResourceSet.java
@@ -0,0 +1,37 @@
+// Copyright 2023 The Bazel Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package build.buildfarm.worker.resources;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.Semaphore;
+
+/**
+ * @class LocalResourceSet
+ * @brief A fixed amount of a specific resource.
+ * @details We define limited resources as counting semaphores whose configuration contains a name
+ * and a count representing a physical or logical group of units obtained by executors as a
+ * precondition to fulfill a long-running operation. These units are released upon the
+ * operation's completion. The resource is requested by the action's platform properties. These
+ * resources are specific to the individual worker.
+ */
+public class LocalResourceSet {
+ /**
+ * @field resources
+ * @brief A set containing resource semaphores organized by name.
+ * @details Key is the resource name; value is a semaphore tracking the remaining available amount.
+ */
+ public Map<String, Semaphore> resources = new HashMap<>();
+}
diff --git a/src/main/java/build/buildfarm/worker/resources/LocalResourceSetMetrics.java b/src/main/java/build/buildfarm/worker/resources/LocalResourceSetMetrics.java
new file mode 100644
index 0000000000..787a7840e5
--- /dev/null
+++ b/src/main/java/build/buildfarm/worker/resources/LocalResourceSetMetrics.java
@@ -0,0 +1,46 @@
+// Copyright 2020 The Bazel Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package build.buildfarm.worker.resources;
+
+import io.prometheus.client.Gauge;
+
+/**
+ * @class LocalResourceSetMetrics
+ * @brief Tracks metrics related to a worker's limited local resources.
+ * @details Answers how many resources exist, how many are claimed, and by how many requesters.
+ */
+public class LocalResourceSetMetrics {
+ public static final Gauge resourceUsageMetric =
+ Gauge.build()
+ .name("local_resource_usage")
+ .labelNames("resource_name")
+ .help("The number of claims for each resource currently being used for execution")
+ .register();
+
+ public static final Gauge resourceTotalMetric =
+ Gauge.build()
+ .name("local_resource_total")
+ .labelNames("resource_name")
+ .help("The total number of claims that exist for a particular resource")
+ .register();
+
+ public static final Gauge requestersMetric =
+ Gauge.build()
+ .name("local_resource_requesters")
+ .labelNames("resource_name")
+ .help(
+ "Tracks how many actions have requested local resources. This can help determine if resources are being hogged by some actions.")
+ .register();
+}
diff --git a/src/main/java/build/buildfarm/worker/resources/LocalResourceSetUtils.java b/src/main/java/build/buildfarm/worker/resources/LocalResourceSetUtils.java
new file mode 100644
index 0000000000..e8a1e56131
--- /dev/null
+++ b/src/main/java/build/buildfarm/worker/resources/LocalResourceSetUtils.java
@@ -0,0 +1,120 @@
+// Copyright 2020 The Bazel Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package build.buildfarm.worker.resources;
+
+import build.bazel.remote.execution.v2.Platform;
+import build.buildfarm.common.config.LimitedResource;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.Semaphore;
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * @class LocalResourceSetUtils
+ * @brief Utilities for working with the worker's set of local limited resources.
+ * @details The methods help with allocation / de-allocation of claims, as well as metrics printing.
+ */
+public class LocalResourceSetUtils {
+ private static final LocalResourceSetMetrics metrics = new LocalResourceSetMetrics();
+
+ public static LocalResourceSet create(List<LimitedResource> resources) {
+ LocalResourceSet resourceSet = new LocalResourceSet();
+ for (LimitedResource resource : resources) {
+ resourceSet.resources.put(resource.getName(), new Semaphore(resource.getAmount()));
+ metrics.resourceTotalMetric.labels(resource.getName()).set(resource.getAmount());
+ }
+ return resourceSet;
+ }
+
+ public static boolean claimResources(Platform platform, LocalResourceSet resourceSet) {
+ List<Map.Entry<String, Integer>> claimed = new ArrayList<>();
+
+ boolean allClaimed = true;
+ for (Platform.Property property : platform.getPropertiesList()) {
+ // Skip properties that are not requesting a limited resource.
+ String resourceName = getResourceName(property);
+ Semaphore resource = resourceSet.resources.get(resourceName);
+ if (resource == null) {
+ continue;
+ }
+
+ // Attempt to claim. If claiming fails, we must return all other claims.
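The loop above is the heart of the new semantics: claims are all-or-nothing per queue entry. A minimal, self-contained sketch of the behavior, using only types introduced in this patch (the `gpu` resource name and the standalone `main` are illustrative assumptions, not part of the change):

```java
import build.bazel.remote.execution.v2.Platform;
import build.buildfarm.worker.resources.LocalResourceSet;
import build.buildfarm.worker.resources.LocalResourceSetUtils;
import java.util.concurrent.Semaphore;

class LocalResourceClaimSketch {
  public static void main(String[] args) {
    // A worker offering a single "gpu" claim; in production this is built
    // from config via LocalResourceSetUtils.create.
    LocalResourceSet resourceSet = new LocalResourceSet();
    resourceSet.resources.put("gpu", new Semaphore(1));

    // An action requesting one gpu through its platform properties.
    Platform platform =
        Platform.newBuilder()
            .addProperties(
                Platform.Property.newBuilder().setName("resource:gpu").setValue("1"))
            .build();

    // The first dequeue claims the only permit; the second is rejected, and
    // any permits acquired before the failing resource are released again.
    System.out.println(LocalResourceSetUtils.claimResources(platform, resourceSet)); // true
    System.out.println(LocalResourceSetUtils.claimResources(platform, resourceSet)); // false
  }
}
```

Note that property names which do not correspond to a configured resource are skipped rather than rejected, so unrelated platform properties never block a dequeue.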
+ int requestAmount = getResourceRequestAmount(property);
+ boolean wasAcquired = semaphoreAcquire(resource, resourceName, requestAmount);
+ if (wasAcquired) {
+ claimed.add(new AbstractMap.SimpleEntry<>(resourceName, requestAmount));
+ } else {
+ allClaimed = false;
+ break;
+ }
+ }
+
+ // cleanup remaining resources if they were not all claimed.
+ if (!allClaimed) {
+ for (Map.Entry<String, Integer> claim : claimed) {
+ semaphoreRelease(
+ resourceSet.resources.get(claim.getKey()), claim.getKey(), claim.getValue());
+ }
+ }
+
+ return allClaimed;
+ }
+
+ public static void releaseClaims(Platform platform, LocalResourceSet resourceSet) {
+ for (Platform.Property property : platform.getPropertiesList()) {
+ String resourceName = getResourceName(property);
+ Semaphore resource = resourceSet.resources.get(resourceName);
+ if (resource == null) {
+ continue;
+ }
+ int requestAmount = getResourceRequestAmount(property);
+ semaphoreRelease(resource, resourceName, requestAmount);
+ }
+ }
+
+ private static boolean semaphoreAcquire(Semaphore resource, String resourceName, int amount) {
+ boolean wasAcquired = resource.tryAcquire(amount);
+ if (wasAcquired) {
+ metrics.resourceUsageMetric.labels(resourceName).inc(amount);
+ }
+ metrics.requestersMetric.labels(resourceName).inc();
+ return wasAcquired;
+ }
+
+ private static void semaphoreRelease(Semaphore resource, String resourceName, int amount) {
+ resource.release(amount);
+ metrics.resourceUsageMetric.labels(resourceName).dec(amount);
+ metrics.requestersMetric.labels(resourceName).dec();
+ }
+
+ private static int getResourceRequestAmount(Platform.Property property) {
+ // We support resource values that are not numbers and interpret them as a request for 1
+ // resource. For example "gpu:RTX-4090" is equivalent to "resource:gpu:1".
+ try {
+ return Integer.parseInt(property.getValue());
+ } catch (NumberFormatException e) {
+ return 1;
+ }
+ }
+
+ private static String getResourceName(Platform.Property property) {
+ // We match to keys whether they are prefixed "resource:" or not.
+ // "resource:gpu:1" requests the gpu resource in the same way that "gpu:1" does.
+ // The prefix originates from bazel's syntax for the --extra_resources flag.
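On the client side the request surface is plain platform properties. A hedged shell sketch (the target and `gpu` resource name are assumptions; whether the unprefixed form survives the rest of dequeue matching depends on the worker's provision config):

```shell
# Request one "gpu" claim for every remotely executed action.
$ bazel build //foo:bar --remote_default_exec_properties=resource:gpu=1

# A non-numeric value is parsed by the worker as a request for a single unit,
# so this also claims one gpu.
$ bazel build //foo:bar --remote_default_exec_properties=resource:gpu=RTX-4090
```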
+ return StringUtils.removeStart(property.getName(), "resource:"); + } +} diff --git a/src/main/java/build/buildfarm/worker/shard/BUILD b/src/main/java/build/buildfarm/worker/shard/BUILD index 3df1ab7e77..b62e4369de 100644 --- a/src/main/java/build/buildfarm/worker/shard/BUILD +++ b/src/main/java/build/buildfarm/worker/shard/BUILD @@ -4,8 +4,6 @@ java_library( plugins = ["//src/main/java/build/buildfarm/common:lombok"], visibility = ["//visibility:public"], deps = [ - "//src/main/java/build/buildfarm/admin", - "//src/main/java/build/buildfarm/admin/aws", "//src/main/java/build/buildfarm/backplane", "//src/main/java/build/buildfarm/cas", "//src/main/java/build/buildfarm/common", @@ -23,6 +21,8 @@ java_library( "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_grpc", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", "@googleapis//:google_rpc_error_details_java_proto", + "@io_grpc_grpc_proto//:health_java_proto", + "@io_grpc_grpc_proto//:health_proto", "@maven//:com_github_ben_manes_caffeine_caffeine", "@maven//:com_github_pcj_google_options", "@maven//:com_google_code_findbugs_jsr305", @@ -32,17 +32,11 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_services", "@maven//:io_grpc_grpc_stub", "@maven//:io_prometheus_simpleclient", "@maven//:javax_annotation_javax_annotation_api", "@maven//:org_projectlombok_lombok", - "@maven//:org_springframework_boot_spring_boot", - "@maven//:org_springframework_boot_spring_boot_autoconfigure", - "@maven//:org_springframework_spring_beans", - "@maven//:org_springframework_spring_context", - "@maven//:org_springframework_spring_core", ], ) diff --git a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java index e5371894c4..264151ef48 100644 --- a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java +++ b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java @@ -19,7 +19,6 @@ import static com.google.common.base.Preconditions.checkNotNull; import static com.google.common.base.Preconditions.checkState; import static com.google.common.collect.Iterables.concat; -import static com.google.common.collect.Iterables.filter; import static com.google.common.util.concurrent.Futures.allAsList; import static com.google.common.util.concurrent.Futures.catchingAsync; import static com.google.common.util.concurrent.Futures.immediateFailedFuture; @@ -41,13 +40,15 @@ import build.buildfarm.cas.ContentAddressableStorage; import build.buildfarm.cas.cfc.CASFileCache; import build.buildfarm.common.BuildfarmExecutors; +import build.buildfarm.common.DigestUtil; import build.buildfarm.common.io.Directories; import build.buildfarm.common.io.Dirent; import build.buildfarm.worker.ExecDirException; import build.buildfarm.worker.ExecDirException.ViolationException; import build.buildfarm.worker.OutputDirectory; import com.google.common.collect.ImmutableList; -import com.google.common.collect.Lists; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; import com.google.common.util.concurrent.ListenableFuture; import java.io.IOException; import java.io.InputStream; @@ -55,14 +56,17 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.attribute.UserPrincipal; -import java.util.Collections; +import java.util.Iterator; import java.util.List; import 
java.util.Map; +import java.util.Set; +import java.util.Stack; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.function.Consumer; import java.util.logging.Level; +import java.util.regex.Pattern; import javax.annotation.Nullable; import lombok.extern.java.Log; @@ -75,8 +79,11 @@ class CFCExecFileSystem implements ExecFileSystem { // perform first-available non-output symlinking and retain directories in cache private final boolean linkInputDirectories; - // override the symlinking above for a set of matching paths - private final Iterable realInputDirectories; + // indicate symlinking above for a set of matching paths + private final Iterable linkedInputDirectories; + + // permit symlinks to point to absolute paths in inputs + private final boolean allowSymlinkTargetAbsolute; private final Map> rootKeys = new ConcurrentHashMap<>(); private final Map> rootInputFiles = new ConcurrentHashMap<>(); @@ -91,14 +98,18 @@ class CFCExecFileSystem implements ExecFileSystem { CASFileCache fileCache, @Nullable UserPrincipal owner, boolean linkInputDirectories, - Iterable realInputDirectories, + Iterable linkedInputDirectories, + boolean allowSymlinkTargetAbsolute, ExecutorService removeDirectoryService, ExecutorService accessRecorder) { this.root = root; this.fileCache = fileCache; this.owner = owner; this.linkInputDirectories = linkInputDirectories; - this.realInputDirectories = realInputDirectories; + this.linkedInputDirectories = + Iterables.transform( + linkedInputDirectories, realInputDirectory -> Pattern.compile(realInputDirectory)); + this.allowSymlinkTargetAbsolute = allowSymlinkTargetAbsolute; this.removeDirectoryService = removeDirectoryService; this.accessRecorder = accessRecorder; } @@ -142,7 +153,8 @@ public void start(Consumer> onDigests, boolean skipLoad) } @Override - public void stop() { + public void stop() throws InterruptedException { + fileCache.stop(); if (!shutdownAndAwaitTermination(fetchService, 1, MINUTES)) { log.log(Level.SEVERE, "could not terminate fetchService"); } @@ -173,7 +185,7 @@ public InputStream newInput(Compressor.Value compressor, Digest digest, long off private ListenableFuture putSymlink(Path path, SymlinkNode symlinkNode) { Path symlinkPath = path.resolve(symlinkNode.getName()); Path relativeTargetPath = path.getFileSystem().getPath(symlinkNode.getTarget()); - checkState(!relativeTargetPath.isAbsolute()); + checkState(allowSymlinkTargetAbsolute || !relativeTargetPath.isAbsolute()); return listeningDecorator(fetchService) .submit( () -> { @@ -239,6 +251,7 @@ private Iterable> fetchInputs( Digest directoryDigest, Map directoriesIndex, OutputDirectory outputDirectory, + Set linkedInputDirectories, Consumer onKey, ImmutableList.Builder inputDirectories) throws IOException { @@ -262,13 +275,16 @@ private Iterable> fetchInputs( .map(symlinkNode -> putSymlink(path, symlinkNode)) .collect(ImmutableList.toImmutableList())); + ImmutableList.Builder> linkedDirectories = ImmutableList.builder(); for (DirectoryNode directoryNode : directory.getDirectoriesList()) { Digest digest = directoryNode.getDigest(); String name = directoryNode.getName(); OutputDirectory childOutputDirectory = outputDirectory != null ? 
outputDirectory.getChild(name) : null; Path dirPath = path.resolve(name); - if (childOutputDirectory != null || !linkInputDirectories) { + if (childOutputDirectory != null + || !linkInputDirectories + || !linkedInputDirectories.contains(dirPath)) { Files.createDirectories(dirPath); downloads = concat( @@ -279,32 +295,27 @@ private Iterable> fetchInputs( digest, directoriesIndex, childOutputDirectory, + linkedInputDirectories, onKey, inputDirectories)); } else { - downloads = - concat( - downloads, - ImmutableList.of( - transform( - linkDirectory(dirPath, digest, directoriesIndex), - (result) -> { - // note: this could non-trivial make sync due to - // the way decrementReferences is implemented. - // we saw null entries in the built immutable list - // without synchronization - synchronized (inputDirectories) { - inputDirectories.add(digest); - } - return null; - }, - fetchService))); + linkedDirectories.add( + transform( + linkDirectory(dirPath, digest, directoriesIndex), + (result) -> { + // we saw null entries in the built immutable list without synchronization + synchronized (inputDirectories) { + inputDirectories.add(digest); + } + return null; + }, + fetchService)); } if (Thread.currentThread().isInterrupted()) { break; } } - return downloads; + return concat(downloads, linkedDirectories.build()); } @SuppressWarnings("ConstantConditions") @@ -312,8 +323,14 @@ private ListenableFuture linkDirectory( Path execPath, Digest digest, Map directoriesIndex) { return transformAsync( fileCache.putDirectory(digest, directoriesIndex, fetchService), - (cachePath) -> { - Files.createSymbolicLink(execPath, cachePath); + pathResult -> { + Path path = pathResult.getPath(); + if (pathResult.getMissed()) { + log.fine( + String.format( + "putDirectory(%s, %s) created", execPath, DigestUtil.toString(digest))); + } + Files.createSymbolicLink(execPath, path); return immediateFuture(null); }, fetchService); @@ -325,30 +342,76 @@ private static void checkExecErrors(Path path, List errors) throws Ex } } - private static boolean treeContainsPath( - String directoryPath, Map directoriesIndex, Digest rootDigest) { - Directory directory = directoriesIndex.get(rootDigest); - for (String name : directoryPath.split("/")) { - List subdirs = directory.getDirectoriesList(); - int index = Collections.binarySearch(Lists.transform(subdirs, DirectoryNode::getName), name); - if (index < 0) { - return false; + private static Iterator directoriesIterator( + Digest digest, Map directoriesIndex) { + Directory root = directoriesIndex.get(digest); + return new Iterator() { + boolean atEnd = root.getDirectoriesCount() == 0; + Stack path = new Stack<>(); + Stack> route = new Stack<>(); + Iterator current = root.getDirectoriesList().iterator(); + + @Override + public boolean hasNext() { + return !atEnd; } - directory = directoriesIndex.get(subdirs.get(index).getDigest()); - } - return true; + + @Override + public String next() { + String nextPath; + DirectoryNode next = current.next(); + String name = next.getName(); + path.push(name); + nextPath = String.join("/", path); + Digest digest = next.getDigest(); + if (digest.getSizeBytes() != 0) { + route.push(current); + current = directoriesIndex.get(digest).getDirectoriesList().iterator(); + } else { + path.pop(); + } + while (!current.hasNext() && !route.isEmpty()) { + current = route.pop(); + path.pop(); + } + atEnd = !current.hasNext(); + return nextPath; + } + }; } - private Iterable realDirectories( + private Set linkedDirectories( Map directoriesIndex, Digest rootDigest) { // 
skip this search if all the directories are real if (linkInputDirectories) { - // somewhat inefficient, but would need many overrides to be painful - return filter( - realInputDirectories, - realInputDirectory -> treeContainsPath(realInputDirectory, directoriesIndex, rootDigest)); + ImmutableSet.Builder builder = ImmutableSet.builder(); + + Iterator dirs = directoriesIterator(rootDigest, directoriesIndex); + while (dirs.hasNext()) { + String dir = dirs.next(); + for (Pattern pattern : linkedInputDirectories) { + if (pattern.matcher(dir).matches()) { + builder.add(dir); + break; // avoid adding the same directory twice + } + } + } + return builder.build(); + } + return ImmutableSet.of(); + } + + private OutputDirectory createOutputDirectory(Command command) { + Iterable files; + Iterable dirs; + if (command.getOutputPathsCount() != 0) { + files = command.getOutputPathsList(); + dirs = ImmutableList.of(); // output paths require the action to create their own directory + } else { + files = command.getOutputFilesList(); + dirs = command.getOutputDirectoriesList(); } - return ImmutableList.of(); + return OutputDirectory.parse(files, dirs, command.getEnvironmentVariablesList()); } @Override @@ -357,13 +420,7 @@ public Path createExecDir( throws IOException, InterruptedException { log.log(Level.FINEST, "ExecFileSystem::createExecDir(" + operationName + ")"); Digest inputRootDigest = action.getInputRootDigest(); - OutputDirectory outputDirectory = - OutputDirectory.parse( - command.getOutputFilesList(), - concat( - command.getOutputDirectoriesList(), - realDirectories(directoriesIndex, inputRootDigest)), - command.getEnvironmentVariablesList()); + OutputDirectory outputDirectory = createOutputDirectory(command); Path execDir = root.resolve(operationName); if (Files.exists(execDir)) { @@ -374,6 +431,14 @@ public Path createExecDir( ImmutableList.Builder inputFiles = new ImmutableList.Builder<>(); ImmutableList.Builder inputDirectories = new ImmutableList.Builder<>(); + Set linkedInputDirectories = + ImmutableSet.copyOf( + Iterables.transform( + linkedDirectories(directoriesIndex, inputRootDigest), + path -> execDir.resolve(path))); // does this work on windows with / separators? 
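Since the renamed `linkedInputDirectories` entries are compiled to `java.util.regex.Pattern` and matched against every directory path in the input tree, a config can now name whole families of directories instead of exact paths. A hedged YAML sketch (key names assumed to mirror the getters used above):

```yaml
worker:
  linkInputDirectories: true
  linkedInputDirectories:
    - "external/.+"      # symlink any external repository input from the cache
    - ".*/node_modules"  # symlink a node_modules subtree wherever it appears
```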
+ + log.log( + Level.FINER, "ExecFileSystem::createExecDir(" + operationName + ") calling fetchInputs"); // Get lock keys so we can increment them prior to downloading // and no other threads can to create/delete during // eviction or the invocation of fetchInputs @@ -387,6 +452,7 @@ public Path createExecDir( inputRootDigest, directoriesIndex, outputDirectory, + linkedInputDirectories, key -> { synchronized (inputFiles) { inputFiles.add(key); @@ -440,7 +506,7 @@ public Path createExecDir( rootInputDirectories.put(execDir, inputDirectories.build()); log.log( - Level.FINE, + Level.FINER, "ExecFileSystem::createExecDir(" + operationName + ") stamping output directories"); boolean stamped = false; try { diff --git a/src/main/java/build/buildfarm/worker/shard/ExecFileSystem.java b/src/main/java/build/buildfarm/worker/shard/ExecFileSystem.java index 916b43cef0..b55601d598 100644 --- a/src/main/java/build/buildfarm/worker/shard/ExecFileSystem.java +++ b/src/main/java/build/buildfarm/worker/shard/ExecFileSystem.java @@ -30,7 +30,7 @@ public interface ExecFileSystem extends InputStreamFactory { void start(Consumer> onDigests, boolean skipLoad) throws IOException, InterruptedException; - void stop(); + void stop() throws InterruptedException; Path root(); diff --git a/src/main/java/build/buildfarm/worker/shard/RemoteCasWriter.java b/src/main/java/build/buildfarm/worker/shard/RemoteCasWriter.java index d7d5f0a776..7ec1aa1bcf 100644 --- a/src/main/java/build/buildfarm/worker/shard/RemoteCasWriter.java +++ b/src/main/java/build/buildfarm/worker/shard/RemoteCasWriter.java @@ -76,7 +76,7 @@ private void insertFileToCasMember(Digest digest, DigestFunction.Value digestFun Throwable cause = e.getCause(); Throwables.throwIfInstanceOf(cause, IOException.class); Throwables.throwIfUnchecked(cause); - throw new RuntimeException(cause); + throw new IOException(cause); } } @@ -93,7 +93,7 @@ private long writeToCasMember(Digest digest, DigestFunction.Value digestFunction Throwables.throwIfInstanceOf(cause, IOException.class); // prevent a discard of this frame Status status = Status.fromThrowable(cause); - throw status.asRuntimeException(); + throw new IOException(status.asException()); } } @@ -123,14 +123,14 @@ private void insertBlobToCasMember(Digest digest, DigestFunction.Value digestFun Throwable cause = e.getCause(); Throwables.throwIfInstanceOf(cause, IOException.class); Throwables.throwIfUnchecked(cause); - throw new RuntimeException(cause); + throw new IOException(cause); } } private String getRandomWorker() throws IOException { synchronized (workerSet) { if (workerSet.isEmpty()) { - throw new RuntimeException("no available workers"); + throw new IOException("no available workers"); } Random rand = new Random(); int index = rand.nextInt(workerSet.size()); diff --git a/src/main/java/build/buildfarm/worker/shard/ShardCASFileCache.java b/src/main/java/build/buildfarm/worker/shard/ShardCASFileCache.java index d745a6194b..78f4e11dd1 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShardCASFileCache.java +++ b/src/main/java/build/buildfarm/worker/shard/ShardCASFileCache.java @@ -38,7 +38,6 @@ class ShardCASFileCache extends CASFileCache { long maxEntrySizeInBytes, int maxBucketLevels, boolean storeFileDirsIndexInMemory, - boolean publishTtlMetric, boolean execRootFallback, DigestUtil digestUtil, ExecutorService expireService, @@ -53,7 +52,6 @@ class ShardCASFileCache extends CASFileCache { maxEntrySizeInBytes, maxBucketLevels, storeFileDirsIndexInMemory, - publishTtlMetric, execRootFallback, digestUtil, 
expireService, diff --git a/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java b/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java index 27e169960a..a5232d5153 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java +++ b/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java @@ -32,6 +32,7 @@ import build.bazel.remote.execution.v2.ExecutionStage; import build.bazel.remote.execution.v2.FileNode; import build.bazel.remote.execution.v2.Platform; +import build.bazel.remote.execution.v2.SymlinkNode; import build.bazel.remote.execution.v2.Tree; import build.buildfarm.backplane.Backplane; import build.buildfarm.common.CommandUtils; @@ -61,6 +62,8 @@ import build.buildfarm.worker.cgroup.Cpu; import build.buildfarm.worker.cgroup.Group; import build.buildfarm.worker.cgroup.Mem; +import build.buildfarm.worker.resources.LocalResourceSet; +import build.buildfarm.worker.resources.LocalResourceSetUtils; import build.buildfarm.worker.resources.ResourceDecider; import build.buildfarm.worker.resources.ResourceLimits; import com.google.common.annotations.VisibleForTesting; @@ -93,6 +96,7 @@ import java.util.Map; import java.util.Stack; import java.util.logging.Level; +import javax.annotation.Nullable; import lombok.extern.java.Log; @Log @@ -129,6 +133,7 @@ class ShardWorkerContext implements WorkerContext { private final Group operationsGroup = executionsGroup.getChild("operations"); private final CasWriter writer; private final boolean errorOperationRemainingResources; + private final LocalResourceSet resourceSet; static SetMultimap getMatchProvisions( Iterable policies, int executeStageWidth) { @@ -162,6 +167,7 @@ static SetMultimap getMatchProvisions( boolean onlyMulticoreTests, boolean allowBringYourOwnContainer, boolean errorOperationRemainingResources, + LocalResourceSet resourceSet, CasWriter writer) { this.name = name; this.matchProvisions = getMatchProvisions(policies, executeStageWidth); @@ -182,6 +188,7 @@ static SetMultimap getMatchProvisions( this.onlyMulticoreTests = onlyMulticoreTests; this.allowBringYourOwnContainer = allowBringYourOwnContainer; this.errorOperationRemainingResources = errorOperationRemainingResources; + this.resourceSet = resourceSet; this.writer = writer; } @@ -240,12 +247,12 @@ public void resumePoller( } else { operationPollerCounter.inc(); log.log( - Level.INFO, format("%s: poller: Completed Poll for %s: OK", name, operationName)); + Level.FINE, format("%s: poller: Completed Poll for %s: OK", name, operationName)); } return success; }, () -> { - log.log(Level.INFO, format("%s: poller: Deadline expired for %s", name, operationName)); + log.log(Level.FINE, format("%s: poller: Deadline expired for %s", name, operationName)); onFailure.run(); }, deadline); @@ -273,6 +280,12 @@ public QueuedOperation getQueuedOperation(QueueEntry queueEntry) @SuppressWarnings("ConstantConditions") private void matchInterruptible(MatchListener listener) throws IOException, InterruptedException { + QueueEntry queueEntry = takeEntryOffOperationQueue(listener); + decideWhetherToKeepOperation(queueEntry, listener); + } + + private QueueEntry takeEntryOffOperationQueue(MatchListener listener) + throws IOException, InterruptedException { listener.onWaitStart(); QueueEntry queueEntry = null; try { @@ -294,9 +307,13 @@ private void matchInterruptible(MatchListener listener) throws IOException, Inte // transient backplane errors will propagate a null queueEntry } listener.onWaitEnd(); + return queueEntry; + } + private void 
decideWhetherToKeepOperation(QueueEntry queueEntry, MatchListener listener) + throws IOException, InterruptedException { if (queueEntry == null - || DequeueMatchEvaluator.shouldKeepOperation(matchProvisions, queueEntry)) { + || DequeueMatchEvaluator.shouldKeepOperation(matchProvisions, resourceSet, queueEntry)) { listener.onEntry(queueEntry); } else { backplane.rejectOperation(queueEntry); @@ -306,6 +323,11 @@ private void matchInterruptible(MatchListener listener) throws IOException, Inte } } + @Override + public void returnLocalResources(QueueEntry queueEntry) { + LocalResourceSetUtils.releaseClaims(queueEntry.getPlatform(), resourceSet); + } + @Override public void match(MatchListener listener) throws InterruptedException { RetryingMatchListener dedupMatchListener = @@ -314,7 +336,7 @@ public void match(MatchListener listener) throws InterruptedException { @Override public boolean getMatched() { - return !matched; + return matched; } @Override @@ -328,20 +350,28 @@ public void onWaitEnd() { } @Override - public boolean onEntry(QueueEntry queueEntry) throws InterruptedException { + public boolean onEntry(@Nullable QueueEntry queueEntry) throws InterruptedException { if (queueEntry == null) { matched = true; return listener.onEntry(null); } + return onValidEntry(queueEntry); + } + + private boolean onValidEntry(QueueEntry queueEntry) throws InterruptedException { String operationName = queueEntry.getExecuteEntry().getOperationName(); if (activeOperations.putIfAbsent(operationName, queueEntry) != null) { log.log(Level.WARNING, "matched duplicate operation " + operationName); return false; } + return onUniqueEntry(queueEntry); + } + + private boolean onUniqueEntry(QueueEntry queueEntry) throws InterruptedException { matched = true; boolean success = listener.onEntry(queueEntry); if (!success) { - requeue(operationName); + requeue(queueEntry.getExecuteEntry().getOperationName()); } return success; } @@ -351,13 +381,8 @@ public void onError(Throwable t) { Throwables.throwIfUnchecked(t); throw new RuntimeException(t); } - - @Override - public void setOnCancelHandler(Runnable onCancelHandler) { - listener.setOnCancelHandler(onCancelHandler); - } }; - while (dedupMatchListener.getMatched()) { + while (!dedupMatchListener.getMatched()) { try { matchInterruptible(dedupMatchListener); } catch (IOException e) { @@ -485,7 +510,7 @@ private void uploadOutputFile( throws IOException, InterruptedException { String outputFile = actionRoot.relativize(outputPath).toString(); if (!Files.exists(outputPath)) { - log.log(Level.FINE, "ReportResultStage: " + outputFile + " does not exist..."); + log.log(Level.FINER, "ReportResultStage: " + outputFile + " does not exist..."); return; } @@ -493,7 +518,7 @@ private void uploadOutputFile( String message = String.format( "ReportResultStage: %s is a directory but it should have been a file", outputPath); - log.log(Level.FINE, message); + log.log(Level.FINER, message); preconditionFailure .addViolationsBuilder() .setType(VIOLATION_TYPE_INVALID) @@ -550,6 +575,7 @@ private void uploadOutputFile( static class OutputDirectoryContext { private final List files = new ArrayList<>(); private final List directories = new ArrayList<>(); + private final List symlinks = new ArrayList<>(); void addFile(FileNode fileNode) { files.add(fileNode); @@ -559,10 +585,19 @@ void addDirectory(DirectoryNode directoryNode) { directories.add(directoryNode); } + void addSymlink(SymlinkNode symlinkNode) { + symlinks.add(symlinkNode); + } + Directory toDirectory() { 
files.sort(Comparator.comparing(FileNode::getName)); directories.sort(Comparator.comparing(DirectoryNode::getName)); - return Directory.newBuilder().addAllFiles(files).addAllDirectories(directories).build(); + symlinks.sort(Comparator.comparing(SymlinkNode::getName)); + return Directory.newBuilder() + .addAllFiles(files) + .addAllDirectories(directories) + .addAllSymlinks(symlinks) + .build(); } } @@ -574,12 +609,12 @@ private void uploadOutputDirectory( throws IOException, InterruptedException { String outputDir = actionRoot.relativize(outputDirPath).toString(); if (!Files.exists(outputDirPath)) { - log.log(Level.FINE, "ReportResultStage: " + outputDir + " does not exist..."); + log.log(Level.FINER, "ReportResultStage: " + outputDir + " does not exist..."); return; } if (!Files.isDirectory(outputDirPath)) { - log.log(Level.FINE, "ReportResultStage: " + outputDir + " is not a directory..."); + log.log(Level.FINER, "ReportResultStage: " + outputDir + " is not a directory..."); preconditionFailure .addViolationsBuilder() .setType(VIOLATION_TYPE_INVALID) @@ -599,8 +634,30 @@ private void uploadOutputDirectory( @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + if (configs.getWorker().isCreateSymlinkOutputs() && attrs.isSymbolicLink()) { + visitSymbolicLink(file); + } else { + visitRegularFile(file, attrs); + } + return FileVisitResult.CONTINUE; + } + + private void visitSymbolicLink(Path file) throws IOException { + // TODO convert symlinks with absolute targets within execution root to relative ones + currentDirectory.addSymlink( + SymlinkNode.newBuilder() + .setName(file.getFileName().toString()) + .setTarget(Files.readSymbolicLink(file).toString()) + .build()); + } + + private void visitRegularFile(Path file, BasicFileAttributes attrs) throws IOException { Digest digest; try { + // should we create symlink nodes in output? + // is buildstream trying to execute in a specific container?? + // can get to NSFE for nonexistent symlinks + // can fail outright for a symlink to a directory digest = getDigestUtil().compute(file); } catch (NoSuchFileException e) { log.log( @@ -609,7 +666,7 @@ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) "error visiting file %s under output dir %s", outputDirPath.relativize(file), outputDirPath.toAbsolutePath()), e); - return FileVisitResult.CONTINUE; + return; } // should we cast to PosixFilePermissions and do gymnastics there for executable? 
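The symlink branch above only runs when the worker opts in. A hedged config sketch (key name assumed from the `isCreateSymlinkOutputs()` getter):

```yaml
worker:
  createSymlinkOutputs: true  # report output symlinks as SymlinkNodes instead of digesting their targets
```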
@@ -633,7 +690,6 @@ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) .setDescription( "An output could not be uploaded because it exceeded the maximum size of an entry"); } - return FileVisitResult.CONTINUE; } @Override @@ -702,7 +758,7 @@ public boolean putOperation(Operation operation) throws IOException, Interrupted boolean success = createBackplaneRetrier().execute(() -> instance.putOperation(operation)); if (success && operation.getDone()) { completedOperations.inc(); - log.log(Level.FINE, "CompletedOperation: " + operation.getName()); + log.log(Level.FINER, "CompletedOperation: " + operation.getName()); } return success; } diff --git a/src/main/java/build/buildfarm/worker/shard/ShutDownWorkerGracefully.java b/src/main/java/build/buildfarm/worker/shard/ShutDownWorkerGracefully.java index 63b1b205cb..7d394c9731 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShutDownWorkerGracefully.java +++ b/src/main/java/build/buildfarm/worker/shard/ShutDownWorkerGracefully.java @@ -14,19 +14,14 @@ package build.buildfarm.worker.shard; -import static java.util.logging.Level.WARNING; - -import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.v1test.PrepareWorkerForGracefulShutDownRequest; import build.buildfarm.v1test.PrepareWorkerForGracefulShutDownRequestResults; import build.buildfarm.v1test.ShutDownWorkerGrpc; import io.grpc.stub.StreamObserver; -import java.util.concurrent.CompletableFuture; import lombok.extern.java.Log; @Log public class ShutDownWorkerGracefully extends ShutDownWorkerGrpc.ShutDownWorkerImplBase { - private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); private final Worker worker; public ShutDownWorkerGracefully(Worker worker) { @@ -44,35 +39,8 @@ public ShutDownWorkerGracefully(Worker worker) { public void prepareWorkerForGracefulShutdown( PrepareWorkerForGracefulShutDownRequest request, StreamObserver responseObserver) { - String clusterId = configs.getServer().getClusterId(); - String clusterEndpoint = configs.getServer().getAdmin().getClusterEndpoint(); - if (clusterId == null - || clusterId.equals("") - || clusterEndpoint == null - || clusterEndpoint.equals("")) { - String errorMessage = - String.format( - "Current AdminConfig doesn't have cluster_id or cluster_endpoint set, " - + "the worker %s won't be shut down.", - configs.getWorker().getPublicName()); - log.log(WARNING, errorMessage); - responseObserver.onError(new RuntimeException(errorMessage)); - return; - } - - if (!configs.getServer().getAdmin().isEnableGracefulShutdown()) { - String errorMessage = - String.format( - "Current AdminConfig doesn't support shut down worker gracefully, " - + "the worker %s won't be shut down.", - configs.getWorker().getPublicName()); - log.log(WARNING, errorMessage); - responseObserver.onError(new RuntimeException(errorMessage)); - return; - } - try { - CompletableFuture.runAsync(worker::prepareWorkerForGracefulShutdown); + worker.initiateShutdown(); responseObserver.onNext(PrepareWorkerForGracefulShutDownRequestResults.newBuilder().build()); responseObserver.onCompleted(); } catch (Exception e) { diff --git a/src/main/java/build/buildfarm/worker/shard/Worker.java b/src/main/java/build/buildfarm/worker/shard/Worker.java index 9d99018688..7f1bee11b8 100644 --- a/src/main/java/build/buildfarm/worker/shard/Worker.java +++ b/src/main/java/build/buildfarm/worker/shard/Worker.java @@ -28,7 +28,6 @@ import build.bazel.remote.execution.v2.Compressor; import build.bazel.remote.execution.v2.Digest; -import 
build.buildfarm.admin.aws.AwsAdmin; import build.buildfarm.backplane.Backplane; import build.buildfarm.cas.ContentAddressableStorage; import build.buildfarm.cas.ContentAddressableStorage.Blob; @@ -37,11 +36,13 @@ import build.buildfarm.common.BuildfarmExecutors; import build.buildfarm.common.DigestUtil; import build.buildfarm.common.InputStreamFactory; +import build.buildfarm.common.LoggingMain; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.common.config.Cas; import build.buildfarm.common.config.GrpcMetrics; import build.buildfarm.common.grpc.Retrier; import build.buildfarm.common.grpc.Retrier.Backoff; +import build.buildfarm.common.grpc.TracingMetadataUtils.ServerHeadersInterceptor; import build.buildfarm.common.services.ByteStreamService; import build.buildfarm.common.services.ContentAddressableStorageService; import build.buildfarm.instance.Instance; @@ -58,6 +59,8 @@ import build.buildfarm.worker.PipelineStage; import build.buildfarm.worker.PutOperationStage; import build.buildfarm.worker.ReportResultStage; +import build.buildfarm.worker.SuperscalarPipelineStage; +import build.buildfarm.worker.resources.LocalResourceSetUtils; import com.google.common.cache.LoadingCache; import com.google.common.collect.Lists; import com.google.common.util.concurrent.SettableFuture; @@ -70,6 +73,7 @@ import io.grpc.Status; import io.grpc.Status.Code; import io.grpc.health.v1.HealthCheckResponse.ServingStatus; +import io.grpc.protobuf.services.ProtoReflectionService; import io.grpc.services.HealthStatusManager; import io.prometheus.client.Counter; import io.prometheus.client.Gauge; @@ -88,22 +92,14 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.logging.Level; import javax.annotation.Nullable; -import javax.annotation.PostConstruct; -import javax.annotation.PreDestroy; import javax.naming.ConfigurationException; import lombok.extern.java.Log; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.SpringApplication; -import org.springframework.boot.autoconfigure.SpringBootApplication; -import org.springframework.context.ApplicationContext; -import org.springframework.context.annotation.ComponentScan; @Log -@SpringBootApplication -@ComponentScan("build.buildfarm") -public class Worker { +public final class Worker extends LoggingMain { private static final java.util.logging.Logger nettyLogger = java.util.logging.Logger.getLogger("io.grpc.netty"); private static final Counter healthCheckMetric = @@ -132,7 +128,7 @@ public class Worker { private boolean inGracefulShutdown = false; private boolean isPaused = false; - private ShardWorkerInstance instance; + private WorkerInstance instance; @SuppressWarnings("deprecation") private final HealthStatusManager healthStatusManager = new HealthStatusManager(); @@ -144,92 +140,56 @@ public class Worker { private Pipeline pipeline; private Backplane backplane; private LoadingCache workerStubs; - @Autowired private AwsAdmin awsAdmin; + private AtomicBoolean released = new AtomicBoolean(true); - @Autowired private ApplicationContext springContext; /** - * The method will prepare the worker for graceful shutdown and send out grpc request to disable - * scale in protection when the worker is ready. If unexpected errors happened, it will cancel the - * graceful shutdown progress make the worker available again. 
+ * The method will prepare the worker for graceful shutdown when the worker is ready. Note on
+ * using stderr here instead of log: by the time this is called in PreDestroy, the log is no
+ * longer available and is not logging messages.
*/
public void prepareWorkerForGracefulShutdown() {
- inGracefulShutdown = true;
- log.log(
- Level.INFO,
- "The current worker will not be registered again and should be shutdown gracefully!");
- pipeline.stopMatchingOperations();
- int scanRate = 30; // check every 30 seconds
- int timeWaited = 0;
- int timeOut = 60 * 15; // 15 minutes
-
- try {
- while (!pipeline.isEmpty() && timeWaited < timeOut) {
- SECONDS.sleep(scanRate);
- timeWaited += scanRate;
- log.log(INFO, String.format("Pipeline is still not empty after %d seconds.", timeWaited));
- }
- } catch (InterruptedException e) {
- log.log(Level.SEVERE, "The worker gracefully shutdown is interrupted: " + e.getMessage());
- } finally {
- // make a grpc call to disable scale protection
- String clusterEndpoint = configs.getServer().getAdmin().getClusterEndpoint();
- log.log(
- INFO,
+ if (configs.getWorker().getGracefulShutdownSeconds() == 0) {
+ log.info(
String.format(
- "It took the worker %d seconds to %s",
- timeWaited,
- pipeline.isEmpty() ? "finish all actions" : "but still cannot finish all actions"));
+ "Graceful Shutdown is not enabled. Worker is shutting down without finishing executions in progress."));
+ } else {
+ inGracefulShutdown = true;
+ log.info(
+ "Graceful Shutdown - The current worker will not be registered again and should be shut down gracefully!");
+ pipeline.stopMatchingOperations();
+ int scanRate = 30; // check every 30 seconds
+ int timeWaited = 0;
+ int timeOut = configs.getWorker().getGracefulShutdownSeconds();
try {
- awsAdmin.disableHostScaleInProtection(clusterEndpoint, configs.getWorker().getPublicName());
- } catch (Exception e) {
- log.log(
- SEVERE,
+ if (pipeline.isEmpty()) {
+ log.info("Graceful Shutdown - no work in the pipeline.");
+ } else {
+ log.info("Graceful Shutdown - waiting for executions to finish.");
+ }
+ while (!pipeline.isEmpty() && timeWaited < timeOut) {
+ SECONDS.sleep(scanRate);
+ timeWaited += scanRate;
+ log.info(
+ String.format(
+ "Graceful Shutdown - Pipeline is still not empty after %d seconds.", timeWaited));
+ }
+ } catch (InterruptedException e) {
+ log.info(
+ "Graceful Shutdown - The graceful shutdown was interrupted: " + e.getMessage());
+ } finally {
+ log.info(
String.format(
- "gRPC call to AdminService to disable scale in protection failed with exception: %s and stacktrace %s",
- e.getMessage(), Arrays.toString(e.getStackTrace())));
- // Gracefully shutdown cannot be performed successfully because of error in
- // AdminService side. Under this scenario, the worker has to be added back to the worker
- // pool.
- inGracefulShutdown = false;
+ "Graceful Shutdown - It took the worker %d seconds to %s",
+ timeWaited,
+ pipeline.isEmpty()
+ ? "finish all actions"
+ : "shut down gracefully without finishing all actions"));
}
}
}
- private void exitPostPipelineFailure() {
- // Shutdown the worker if a pipeline fails. By means of the spring lifecycle
- // hooks - e.g. the `PreDestroy` hook here - it will attempt to gracefully
- // spin down the pipeline
-
- // By calling these spring shutdown facilities; we're open to the risk that
- // a subsystem may be hanging a criticial thread indeffinitly.
Deadline the - // shutdown workflow to ensure we don't leave a zombie worker in this - // situation - ScheduledExecutorService shutdownDeadlineExecutor = newSingleThreadScheduledExecutor(); - - // This may be shorter than the action timeout; assume we have interrupted - // actions in a fatal uncaught exception. - int forceShutdownDeadline = 60; - ScheduledFuture termFuture = - shutdownDeadlineExecutor.schedule( - new Runnable() { - public void run() { - log.log( - Level.SEVERE, - String.format( - "Force terminating due to shutdown deadline exceeded (%d seconds)", - forceShutdownDeadline)); - System.exit(1); - } - }, - forceShutdownDeadline, - SECONDS); - - // Consider defining exit codes to better afford out of band instance - // recovery - int code = SpringApplication.exit(springContext, () -> 1); - termFuture.cancel(false); - shutdownDeadlineExecutor.shutdown(); - System.exit(code); + public Worker() { + super("BuildFarmShardWorker"); } private Operation stripOperation(Operation operation) { @@ -242,7 +202,7 @@ private Operation stripQueuedOperation(Operation operation) { private Server createServer( ServerBuilder serverBuilder, - ContentAddressableStorage storage, + @Nullable CASFileCache storage, Instance instance, Pipeline pipeline, ShardWorkerContext context) { @@ -250,19 +210,20 @@ private Server createServer( serverBuilder.addService(new ContentAddressableStorageService(instance)); serverBuilder.addService(new ByteStreamService(instance)); serverBuilder.addService(new ShutDownWorkerGracefully(this)); + serverBuilder.addService(ProtoReflectionService.newInstance()); // We will build a worker's server based on it's capabilities. // A worker that is capable of execution will construct an execution pipeline. // It will use various execution phases for it's profile service. // On the other hand, a worker that is only capable of CAS storage does not need a pipeline. 
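Returning to the shutdown rework above: the drain window is now configuration-driven rather than a hard-coded 15 minutes. A hedged YAML sketch before the capability check picks up below (key name assumed from the `getGracefulShutdownSeconds()` getter; 0 skips draining entirely, per the check above):

```yaml
worker:
  gracefulShutdownSeconds: 600  # wait up to 10 minutes for in-flight executions to finish
```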
if (configs.getWorker().getCapabilities().isExecution()) { - PipelineStage completeStage = - new PutOperationStage((operation) -> context.deactivate(operation.getName())); + PutOperationStage completeStage = + new PutOperationStage(operation -> context.deactivate(operation.getName())); PipelineStage errorStage = completeStage; /* new ErrorStage(); */ PipelineStage reportResultStage = new ReportResultStage(context, completeStage, errorStage); - PipelineStage executeActionStage = + SuperscalarPipelineStage executeActionStage = new ExecuteActionStage(context, reportResultStage, errorStage); - PipelineStage inputFetchStage = + SuperscalarPipelineStage inputFetchStage = new InputFetchStage(context, executeActionStage, new PutOperationStage(context::requeue)); PipelineStage matchStage = new MatchStage(context, inputFetchStage, errorStage); @@ -273,9 +234,16 @@ private Server createServer( serverBuilder.addService( new WorkerProfileService( - storage, inputFetchStage, executeActionStage, context, completeStage, backplane)); + storage, + matchStage, + inputFetchStage, + executeActionStage, + reportResultStage, + completeStage, + backplane)); } GrpcMetrics.handleGrpcMetricIntercepts(serverBuilder, configs.getWorker().getGrpcMetrics()); + serverBuilder.intercept(new ServerHeadersInterceptor()); return serverBuilder.build(); } @@ -398,7 +366,6 @@ private ContentAddressableStorage createStorage( // delegate level cas.getHexBucketLevels(), cas.isFileDirectoriesIndexInMemory(), - cas.isPublishTtlMetric(), cas.isExecRootCopyFallback(), digestUtil, removeDirectoryService, @@ -420,7 +387,8 @@ private ExecFileSystem createCFCExecFileSystem( fileCache, owner, configs.getWorker().isLinkInputDirectories(), - configs.getWorker().getRealInputDirectories(), + configs.getWorker().getLinkedInputDirectories(), + configs.isAllowSymlinkTargetAbsolute(), removeDirectoryService, accessRecorder /* deadlineAfter=*/ @@ -546,13 +514,6 @@ public void run() { } } catch (InterruptedException e) { // ignore - } finally { - try { - stop(); - } catch (InterruptedException ie) { - log.log(SEVERE, "interrupted while stopping worker", ie); - // ignore - } } } }, @@ -572,6 +533,7 @@ private long loadWorkerStartTimeInMillis() { } public void start() throws ConfigurationException, InterruptedException, IOException { + released.set(false); String session = UUID.randomUUID().toString(); ServerBuilder serverBuilder = ServerBuilder.forPort(configs.getWorker().getPort()); String identifier = "buildfarm-worker-" + configs.getWorker().getPublicName() + "-" + session; @@ -584,7 +546,12 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep if (SHARD.equals(configs.getBackplane().getType())) { backplane = - new RedisShardBackplane(identifier, this::stripOperation, this::stripQueuedOperation); + new RedisShardBackplane( + identifier, + /* subscribeToBackplane=*/ false, + /* runFailsafeOperation=*/ false, + this::stripOperation, + this::stripQueuedOperation); backplane.start(configs.getWorker().getPublicName()); } else { throw new IllegalArgumentException("Shard Backplane not set in config"); @@ -616,8 +583,7 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep remoteInputStreamFactory, removeDirectoryService, accessRecorder, storage); instance = - new ShardWorkerInstance( - configs.getWorker().getPublicName(), digestUtil, backplane, storage); + new WorkerInstance(configs.getWorker().getPublicName(), digestUtil, backplane, storage); // Create the appropriate writer for the context 
CasWriter writer; @@ -650,10 +616,11 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep configs.getWorker().isOnlyMulticoreTests(), configs.getWorker().isAllowBringYourOwnContainer(), configs.getWorker().isErrorOperationRemainingResources(), + LocalResourceSetUtils.create(configs.getWorker().getResources()), writer); pipeline = new Pipeline(); - server = createServer(serverBuilder, storage, instance, pipeline, context); + server = createServer(serverBuilder, (CASFileCache) storage, instance, pipeline, context); removeWorker(configs.getWorker().getPublicName()); @@ -667,12 +634,7 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep PrometheusPublisher.startHttpServer(configs.getPrometheusPort()); startFailsafeRegistration(); - // Listen for pipeline unhandled exceptions - ExecutorService pipelineExceptionExecutor = newSingleThreadExecutor(); - SettableFuture pipelineExceptionFuture = SettableFuture.create(); - pipelineExceptionFuture.addListener(this::exitPostPipelineFailure, pipelineExceptionExecutor); - - pipeline.start(pipelineExceptionFuture); + pipeline.start(); healthCheckMetric.labels("start").inc(); executionSlotsTotal.set(configs.getWorker().getExecuteStageWidth()); @@ -681,9 +643,42 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep log.log(INFO, String.format("%s initialized", identifier)); } - @PreDestroy - public void stop() throws InterruptedException { - System.err.println("*** shutting down gRPC server since JVM is shutting down"); + @Override + protected void onShutdown() throws InterruptedException { + initiateShutdown(); + awaitRelease(); + } + + private void awaitTermination() throws InterruptedException { + pipeline.join(); + server.awaitTermination(); + } + + public void initiateShutdown() { + pipeline.stopMatchingOperations(); + if (server != null) { + server.shutdown(); + } + } + + private synchronized void awaitRelease() throws InterruptedException { + while (!released.get()) { + wait(); + } + } + + public synchronized void stop() throws InterruptedException { + try { + shutdown(); + } finally { + released.set(true); + notify(); + } + } + + private void shutdown() throws InterruptedException { + log.info("*** shutting down gRPC server since JVM is shutting down"); + prepareWorkerForGracefulShutdown(); PrometheusPublisher.stopHttpServer(); boolean interrupted = Thread.interrupted(); if (pipeline != null) { @@ -701,11 +696,12 @@ public void stop() throws InterruptedException { executionSlotsTotal.set(0); inputFetchSlotsTotal.set(0); if (execFileSystem != null) { - log.log(INFO, "Stopping exec filesystem"); + log.info("Stopping exec filesystem"); execFileSystem.stop(); + execFileSystem = null; } if (server != null) { - log.log(INFO, "Shutting down the server"); + log.info("Shutting down the server"); server.shutdown(); try { @@ -716,26 +712,28 @@ public void stop() throws InterruptedException { } finally { server.shutdownNow(); } + server = null; } if (backplane != null) { try { backplane.stop(); + backplane = null; } catch (InterruptedException e) { interrupted = true; } } if (workerStubs != null) { workerStubs.invalidateAll(); + workerStubs = null; } if (interrupted) { Thread.currentThread().interrupt(); throw new InterruptedException(); } - System.err.println("*** server shut down"); + log.info("*** server shut down"); } - @PostConstruct - public void init() throws OptionsParsingException { + public static void main(String[] args) throws Exception { // Only log severe 
log messages from Netty. Otherwise it logs warnings that look like this: // // 170714 08:16:28.552:WT 18 [io.grpc.netty.NettyServerHandler.onStreamError] Stream Error @@ -743,19 +741,17 @@ public void init() throws OptionsParsingException { // unknown stream 11369 nettyLogger.setLevel(SEVERE); + configs = BuildfarmConfigs.loadWorkerConfigs(args); + Worker worker = new Worker(); try { - start(); + worker.start(); + worker.awaitTermination(); } catch (IOException e) { - System.err.println("error: " + formatIOError(e)); + log.severe(formatIOError(e)); } catch (InterruptedException e) { - System.out.println("error: interrupted"); - } catch (ConfigurationException e) { - throw new RuntimeException(e); + log.log(Level.WARNING, "interrupted", e); + } finally { + worker.stop(); } } - - public static void main(String[] args) throws ConfigurationException { - configs = BuildfarmConfigs.loadWorkerConfigs(args); - SpringApplication.run(Worker.class, args); - } } diff --git a/src/main/java/build/buildfarm/worker/shard/ShardWorkerInstance.java b/src/main/java/build/buildfarm/worker/shard/WorkerInstance.java similarity index 97% rename from src/main/java/build/buildfarm/worker/shard/ShardWorkerInstance.java rename to src/main/java/build/buildfarm/worker/shard/WorkerInstance.java index 04c11d315b..e100417f53 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShardWorkerInstance.java +++ b/src/main/java/build/buildfarm/worker/shard/WorkerInstance.java @@ -36,7 +36,7 @@ import build.buildfarm.common.Write; import build.buildfarm.common.grpc.UniformDelegateServerCallStreamObserver; import build.buildfarm.instance.MatchListener; -import build.buildfarm.instance.server.AbstractServerInstance; +import build.buildfarm.instance.server.NodeInstance; import build.buildfarm.operations.EnrichedOperation; import build.buildfarm.operations.FindOperationsResults; import build.buildfarm.v1test.BackplaneStatus; @@ -56,6 +56,7 @@ import io.grpc.Status; import io.grpc.Status.Code; import io.grpc.stub.ServerCallStreamObserver; +import io.prometheus.client.Counter; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; @@ -67,10 +68,13 @@ import lombok.extern.java.Log; @Log -public class ShardWorkerInstance extends AbstractServerInstance { +public class WorkerInstance extends NodeInstance { + private static final Counter IO_METRIC = + Counter.build().name("io_bytes_read").help("Read I/O (bytes)").register(); + private final Backplane backplane; - public ShardWorkerInstance( + public WorkerInstance( String name, DigestUtil digestUtil, Backplane backplane, @@ -132,6 +136,7 @@ public void getBlob( @Override public void onNext(ByteString data) { blobObserver.onNext(data); + IO_METRIC.inc(data.size()); } void removeBlobLocation() { @@ -341,7 +346,7 @@ protected static ExecuteOperationMetadata expectExecuteOperationMetadata(Operati return null; } } else { - return AbstractServerInstance.expectExecuteOperationMetadata(operation); + return NodeInstance.expectExecuteOperationMetadata(operation); } } diff --git a/src/main/java/build/buildfarm/worker/shard/WorkerProfileService.java b/src/main/java/build/buildfarm/worker/shard/WorkerProfileService.java index 6eee990426..f00c57cc3d 100644 --- a/src/main/java/build/buildfarm/worker/shard/WorkerProfileService.java +++ b/src/main/java/build/buildfarm/worker/shard/WorkerProfileService.java @@ -15,7 +15,6 @@ package build.buildfarm.worker.shard; import build.buildfarm.backplane.Backplane; -import build.buildfarm.cas.ContentAddressableStorage; import 
build.buildfarm.cas.cfc.CASFileCache; import build.buildfarm.v1test.OperationTimesBetweenStages; import build.buildfarm.v1test.StageInformation; @@ -24,67 +23,90 @@ import build.buildfarm.v1test.WorkerProfileGrpc; import build.buildfarm.v1test.WorkerProfileMessage; import build.buildfarm.v1test.WorkerProfileRequest; -import build.buildfarm.worker.ExecuteActionStage; -import build.buildfarm.worker.InputFetchStage; import build.buildfarm.worker.PipelineStage; import build.buildfarm.worker.PutOperationStage; import build.buildfarm.worker.PutOperationStage.OperationStageDurations; -import build.buildfarm.worker.WorkerContext; +import build.buildfarm.worker.SuperscalarPipelineStage; import io.grpc.stub.StreamObserver; import java.io.IOException; +import javax.annotation.Nullable; public class WorkerProfileService extends WorkerProfileGrpc.WorkerProfileImplBase { - private final CASFileCache storage; - private final InputFetchStage inputFetchStage; - private final ExecuteActionStage executeActionStage; - private final WorkerContext context; + private final @Nullable CASFileCache storage; + private final PipelineStage matchStage; + private final SuperscalarPipelineStage inputFetchStage; + private final SuperscalarPipelineStage executeActionStage; + private final PipelineStage reportResultStage; private final PutOperationStage completeStage; private final Backplane backplane; public WorkerProfileService( - ContentAddressableStorage storage, - PipelineStage inputFetchStage, - PipelineStage executeActionStage, - WorkerContext context, - PipelineStage completeStage, + @Nullable CASFileCache storage, + PipelineStage matchStage, + SuperscalarPipelineStage inputFetchStage, + SuperscalarPipelineStage executeActionStage, + PipelineStage reportResultStage, + PutOperationStage completeStage, Backplane backplane) { - this.storage = (CASFileCache) storage; - this.inputFetchStage = (InputFetchStage) inputFetchStage; - this.executeActionStage = (ExecuteActionStage) executeActionStage; - this.context = context; + this.storage = storage; + this.matchStage = matchStage; + this.inputFetchStage = inputFetchStage; + this.executeActionStage = executeActionStage; + this.reportResultStage = reportResultStage; this.completeStage = (PutOperationStage) completeStage; this.backplane = backplane; } + private StageInformation unaryStageInformation(String name, @Nullable String operationName) { + StageInformation.Builder builder = + StageInformation.newBuilder().setName(name).setSlotsConfigured(1); + if (operationName != null) { + builder.setSlotsUsed(1).addOperationNames(operationName); + } + return builder.build(); + } + + private StageInformation superscalarStageInformation(SuperscalarPipelineStage stage) { + return StageInformation.newBuilder() + .setName(stage.getName()) + .setSlotsConfigured(stage.getWidth()) + .setSlotsUsed(stage.getSlotUsage()) + .addAllOperationNames(stage.getOperationNames()) + .build(); + } + @Override public void getWorkerProfile( WorkerProfileRequest request, StreamObserver responseObserver) { // get usage of CASFileCache - WorkerProfileMessage.Builder replyBuilder = - WorkerProfileMessage.newBuilder() - .setCasSize(storage.size()) - .setCasEntryCount(storage.entryCount()) - .setCasMaxSize(storage.maxSize()) - .setCasMaxEntrySize(storage.maxEntrySize()) - .setCasUnreferencedEntryCount(storage.unreferencedEntryCount()) - .setCasDirectoryEntryCount(storage.directoryStorageCount()) - .setCasEvictedEntryCount(storage.getEvictedCount()) - .setCasEvictedEntrySize(storage.getEvictedSize()); + 
WorkerProfileMessage.Builder replyBuilder = WorkerProfileMessage.newBuilder(); + + // FIXME deliver full local storage chain + if (storage != null) { + replyBuilder + .setCasSize(storage.size()) + .setCasEntryCount(storage.entryCount()) + .setCasMaxSize(storage.maxSize()) + .setCasMaxEntrySize(storage.maxEntrySize()) + .setCasUnreferencedEntryCount(storage.unreferencedEntryCount()) + .setCasDirectoryEntryCount(storage.directoryStorageCount()) + .setCasEvictedEntryCount(storage.getEvictedCount()) + .setCasEvictedEntrySize(storage.getEvictedSize()); + } // get slots configured and used of superscalar stages + // prefer reverse pipeline order to reduce double counting where possible + // these stats are not sampled atomically, so slot counts may disagree with + // operation lists, and an operation may appear in multiple stages even in + // reverse order as claims progress + // in short: this is for monitoring, not for guaranteed consistency checks + String reportResultOperation = reportResultStage.getOperationName(); + String matchOperation = matchStage.getOperationName(); replyBuilder - .addStages( - StageInformation.newBuilder() - .setName("InputFetchStage") - .setSlotsConfigured(context.getInputFetchStageWidth()) - .setSlotsUsed(inputFetchStage.getSlotUsage()) - .build()) - .addStages( - StageInformation.newBuilder() - .setName("ExecuteActionStage") - .setSlotsConfigured(context.getExecuteStageWidth()) - .setSlotsUsed(executeActionStage.getSlotUsage()) - .build()); + .addStages(unaryStageInformation(reportResultStage.getName(), reportResultOperation)) + .addStages(superscalarStageInformation(executeActionStage)) + .addStages(superscalarStageInformation(inputFetchStage)) + .addStages(unaryStageInformation(matchStage.getName(), matchOperation)); // get average time costs on each stage OperationStageDurations[] durations = completeStage.getAverageTimeCostPerStage(); diff --git a/src/main/protobuf/build/buildfarm/v1test/buildfarm.proto b/src/main/protobuf/build/buildfarm/v1test/buildfarm.proto index 56216577f0..cf3ce805b7 100644 --- a/src/main/protobuf/build/buildfarm/v1test/buildfarm.proto +++ b/src/main/protobuf/build/buildfarm/v1test/buildfarm.proto @@ -616,6 +616,8 @@ message StageInformation { // number of slots used for this stage int32 slots_used = 3; + + repeated string operation_names = 4; } message WorkerProfileMessage { diff --git a/src/test/java/build/buildfarm/cas/BUILD b/src/test/java/build/buildfarm/cas/BUILD index ff7774e067..fc127b1ea6 100644 --- a/src/test/java/build/buildfarm/cas/BUILD +++ b/src/test/java/build/buildfarm/cas/BUILD @@ -1,10 +1,10 @@ -load("//:jvm_flags.bzl", "ensure_accurate_metadata") +load("//:jvm_flags.bzl", "add_opens_sun_nio_fs", "ensure_accurate_metadata") java_test( name = "tests", size = "small", srcs = glob(["**/*.java"]), - jvm_flags = ensure_accurate_metadata(), + jvm_flags = ensure_accurate_metadata() + add_opens_sun_nio_fs(), test_class = "build.buildfarm.AllTests", deps = [ "//src/main/java/build/buildfarm/cas", diff --git a/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java b/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java index 1bd1791999..caaab02b35 100644 --- a/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java +++ b/src/test/java/build/buildfarm/cas/cfc/CASFileCacheTest.java @@ -19,8 +19,11 @@ import static com.google.common.truth.Truth.assertThat; import static com.google.common.util.concurrent.MoreExecutors.directExecutor; import static com.google.common.util.concurrent.MoreExecutors.shutdownAndAwaitTermination;
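The `io_bytes_read` counter registered in `WorkerInstance` earlier in this change follows the standard io.prometheus simpleclient pattern: build and register the collector once, then increment it for each chunk streamed. A minimal sketch under that assumption; `IoMetrics` and `onChunk` are illustrative names only:

```java
import io.prometheus.client.Counter;

final class IoMetrics {
  // Registered once in the default CollectorRegistry, as WorkerInstance does.
  static final Counter IO_BYTES_READ =
      Counter.build().name("io_bytes_read").help("Read I/O (bytes)").register();

  // Called per streamed chunk; Counter increments are thread-safe.
  static void onChunk(int chunkSizeBytes) {
    IO_BYTES_READ.inc(chunkSizeBytes);
  }
}
```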
+import static java.lang.Thread.State.TERMINATED; +import static java.lang.Thread.State.WAITING; import static java.util.concurrent.Executors.newSingleThreadExecutor; import static java.util.concurrent.TimeUnit.MICROSECONDS; +import static java.util.concurrent.TimeUnit.MILLISECONDS; import static java.util.concurrent.TimeUnit.SECONDS; import static org.junit.Assert.fail; import static org.mockito.Mockito.any; @@ -58,6 +61,7 @@ import com.google.common.collect.Maps; import com.google.common.jimfs.Configuration; import com.google.common.jimfs.Jimfs; +import com.google.common.util.concurrent.FutureCallback; import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.SettableFuture; @@ -77,6 +81,7 @@ import java.util.Map; import java.util.UUID; import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.CyclicBarrier; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; @@ -147,7 +152,6 @@ public void setUp() throws IOException, InterruptedException { /* maxEntrySizeInBytes=*/ 1024, /* hexBucketLevels=*/ 1, storeFileDirsIndexInMemory, - /* publishTtlMetric=*/ false, /* execRootFallback=*/ false, DIGEST_UTIL, expireService, @@ -241,7 +245,8 @@ public void putDirectoryCreatesTree() throws IOException, InterruptedException { subdirDigest, subDirectory); Path dirPath = getInterruptiblyOrIOException( - fileCache.putDirectory(dirDigest, directoriesIndex, putService)); + fileCache.putDirectory(dirDigest, directoriesIndex, putService)) + .getPath(); assertThat(Files.isDirectory(dirPath)).isTrue(); assertThat(Files.exists(dirPath.resolve("file"))).isTrue(); assertThat(Files.isDirectory(dirPath.resolve("subdir"))).isTrue(); @@ -1111,7 +1116,6 @@ public void copyExternalInputRetries() throws Exception { /* maxEntrySizeInBytes=*/ 1024, /* hexBucketLevels=*/ 1, storeFileDirsIndexInMemory, - /* publishTtlMetric=*/ false, /* execRootFallback=*/ false, DIGEST_UTIL, expireService, @@ -1175,7 +1179,6 @@ public void newInputThrowsNoSuchFileExceptionWithoutDelegate() throws Exception /* maxEntrySizeInBytes=*/ 1024, /* hexBucketLevels=*/ 1, storeFileDirsIndexInMemory, - /* publishTtlMetric=*/ false, /* execRootFallback=*/ false, DIGEST_UTIL, expireService, @@ -1209,6 +1212,114 @@ protected InputStream newExternalInput( assertThat(expected).isNotNull(); } + @Test + public void testConcurrentWrites() throws Exception { + ByteString blob = ByteString.copyFromUtf8("concurrent write"); + Digest digest = DIGEST_UTIL.compute(blob); + UUID uuid = UUID.randomUUID(); + // The same instance of Write will be passed to both threads, so that both threads + // try to get the same output stream. + Write write = + fileCache.getWrite( + Compressor.Value.IDENTITY, digest, uuid, RequestMetadata.getDefaultInstance()); + + CyclicBarrier barrier = new CyclicBarrier(3); + + Thread write1 = + new Thread( + () -> { + try { + ConcurrentWriteStreamObserver writeStreamObserver = + new ConcurrentWriteStreamObserver(write); + writeStreamObserver.registerCallback(); + barrier.await(); // let both threads get the same write stream.
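+ // The second thread takes ownership of the stream before the barrier releases, + // so the ownStream() call below parks this thread in WAITING until the stream becomes available.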
+ writeStreamObserver.ownStream(); // let the other thread get ownership of the stream + writeStreamObserver.write(blob); + writeStreamObserver.close(); + } catch (Exception e) { + // do nothing + } + }, + "FirstRequest"); + Thread write2 = + new Thread( + () -> { + try { + ConcurrentWriteStreamObserver writeStreamObserver = + new ConcurrentWriteStreamObserver(write); + writeStreamObserver.registerCallback(); + writeStreamObserver.ownStream(); // this thread will get ownership of the stream + barrier.await(); // let both threads get the same write stream. + while (write1.getState() != WAITING) ; // wait for the first request to enter the waiting state + writeStreamObserver.write(blob); + writeStreamObserver.close(); + } catch (Exception e) { + // do nothing + } + }, + "SecondRequest"); + write1.start(); + write2.start(); + barrier.await(); // let both requests reach the critical section + + // Wait for each write operation to complete, allowing a maximum of 100ms per write. + // Note: A 100ms wait time allowed 1000 * 8 successful test runs. + // In certain scenarios even this wait time may not be enough, and the test may still be flaky. + // But a wait time of 0 may cause the test to wait forever (if there is an issue in the code) and + // the build might fail with a timeout error. + write1.join(100); + write2.join(100); + + assertThat(write1.getState()).isEqualTo(TERMINATED); + assertThat(write2.getState()).isEqualTo(TERMINATED); + } + + static class ConcurrentWriteStreamObserver { + Write write; + FeedbackOutputStream out; + + ConcurrentWriteStreamObserver(Write write) { + this.write = write; + } + + void registerCallback() { + Futures.addCallback( + write.getFuture(), + new FutureCallback<Long>() { + @Override + public void onSuccess(Long committedSize) { + commit(); + } + + @Override + public void onFailure(Throwable t) { + // do nothing + } + }, + directExecutor()); + } + + synchronized void ownStream() throws Exception { + this.out = write.getOutput(10, MILLISECONDS, () -> {}); + } + /** + * Request 1 may invoke this method for request 2 or vice versa via callback on + * write.getFuture(). Synchronization is necessary to prevent conflicts when this method is + * called simultaneously by different threads.
+ */ + synchronized void commit() { + // critical section + } + + void write(ByteString data) throws IOException { + data.writeTo(out); + } + + void close() throws IOException { + out.close(); + } + } + @RunWith(JUnit4.class) public static class NativeFileDirsIndexInMemoryCASFileCacheTest extends CASFileCacheTest { public NativeFileDirsIndexInMemoryCASFileCacheTest() throws IOException { diff --git a/src/test/java/build/buildfarm/common/BUILD b/src/test/java/build/buildfarm/common/BUILD index bda4f4243f..70ff4abc94 100644 --- a/src/test/java/build/buildfarm/common/BUILD +++ b/src/test/java/build/buildfarm/common/BUILD @@ -1,10 +1,10 @@ -load("//:jvm_flags.bzl", "ensure_accurate_metadata") +load("//:jvm_flags.bzl", "add_opens_sun_nio_fs", "ensure_accurate_metadata") java_test( name = "tests", size = "small", srcs = glob(["*.java"]), - jvm_flags = ensure_accurate_metadata(), + jvm_flags = ensure_accurate_metadata() + add_opens_sun_nio_fs(), test_class = "build.buildfarm.AllTests", deps = [ "//src/main/java/build/buildfarm/common", diff --git a/src/test/java/build/buildfarm/common/config/GrpcMetricsTest.java b/src/test/java/build/buildfarm/common/config/GrpcMetricsTest.java index 5301b82608..547f81b2ca 100644 --- a/src/test/java/build/buildfarm/common/config/GrpcMetricsTest.java +++ b/src/test/java/build/buildfarm/common/config/GrpcMetricsTest.java @@ -30,7 +30,6 @@ public void testHandleGrpcMetricIntercepts_withLatencyBucket() { grpcMetrics.setEnabled(true); grpcMetrics.setProvideLatencyHistograms(true); grpcMetrics.setLatencyBuckets(new double[] {1, 2, 3}); - GrpcMetrics.handleGrpcMetricIntercepts(serverBuilder, grpcMetrics); verify(serverBuilder, times(1)).intercept(any(MonitoringServerInterceptor.class)); } diff --git a/src/test/java/build/buildfarm/instance/server/AbstractServerInstanceTest.java b/src/test/java/build/buildfarm/instance/server/NodeInstanceTest.java similarity index 89% rename from src/test/java/build/buildfarm/instance/server/AbstractServerInstanceTest.java rename to src/test/java/build/buildfarm/instance/server/NodeInstanceTest.java index 719806b9f7..319a4a5cc5 100644 --- a/src/test/java/build/buildfarm/instance/server/AbstractServerInstanceTest.java +++ b/src/test/java/build/buildfarm/instance/server/NodeInstanceTest.java @@ -17,12 +17,13 @@ import static build.buildfarm.common.Actions.checkPreconditionFailure; import static build.buildfarm.common.Errors.VIOLATION_TYPE_INVALID; import static build.buildfarm.common.Errors.VIOLATION_TYPE_MISSING; -import static build.buildfarm.instance.server.AbstractServerInstance.ACTION_INPUT_ROOT_DIRECTORY_PATH; -import static build.buildfarm.instance.server.AbstractServerInstance.DIRECTORY_NOT_SORTED; -import static build.buildfarm.instance.server.AbstractServerInstance.DUPLICATE_DIRENT; -import static build.buildfarm.instance.server.AbstractServerInstance.INVALID_COMMAND; -import static build.buildfarm.instance.server.AbstractServerInstance.OUTPUT_DIRECTORY_IS_OUTPUT_ANCESTOR; -import static build.buildfarm.instance.server.AbstractServerInstance.OUTPUT_FILE_IS_OUTPUT_ANCESTOR; +import static build.buildfarm.instance.server.NodeInstance.ACTION_INPUT_ROOT_DIRECTORY_PATH; +import static build.buildfarm.instance.server.NodeInstance.DIRECTORY_NOT_SORTED; +import static build.buildfarm.instance.server.NodeInstance.DUPLICATE_DIRENT; +import static build.buildfarm.instance.server.NodeInstance.INVALID_COMMAND; +import static build.buildfarm.instance.server.NodeInstance.OUTPUT_DIRECTORY_IS_OUTPUT_ANCESTOR; +import static 
build.buildfarm.instance.server.NodeInstance.OUTPUT_FILE_IS_OUTPUT_ANCESTOR; +import static build.buildfarm.instance.server.NodeInstance.SYMLINK_TARGET_ABSOLUTE; import static com.google.common.truth.Truth.assertThat; import static com.google.common.util.concurrent.Futures.immediateFuture; import static org.mockito.Mockito.any; @@ -43,6 +44,7 @@ import build.bazel.remote.execution.v2.OutputDirectory; import build.bazel.remote.execution.v2.Platform; import build.bazel.remote.execution.v2.RequestMetadata; +import build.bazel.remote.execution.v2.SymlinkNode; import build.bazel.remote.execution.v2.Tree; import build.buildfarm.actioncache.ActionCache; import build.buildfarm.cas.ContentAddressableStorage; @@ -97,10 +99,10 @@ @RunWith(JUnit4.class) @Log -public class AbstractServerInstanceTest { +public class NodeInstanceTest { private static final DigestUtil DIGEST_UTIL = new DigestUtil(HashFunction.SHA256); - static class DummyServerInstance extends AbstractServerInstance { + static class DummyServerInstance extends NodeInstance { DummyServerInstance( ContentAddressableStorage contentAddressableStorage, ActionCache actionCache) { super( @@ -259,7 +261,7 @@ public PrepareWorkerForGracefulShutDownRequestResults shutDownWorkerGracefully() @Test public void duplicateFileInputIsInvalid() { PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, Directory.newBuilder() .addAllFiles( @@ -270,6 +272,7 @@ public void duplicateFileInputIsInvalid() { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ Maps.newHashMap(), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFile=*/ file -> {}, /* onInputDirectorie=*/ directory -> {}, /* onInputDigest=*/ digest -> {}, @@ -287,7 +290,7 @@ public void duplicateEmptyDirectoryCheckPasses() throws StatusException { Directory emptyDirectory = Directory.getDefaultInstance(); Digest emptyDirectoryDigest = DIGEST_UTIL.compute(emptyDirectory); PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, Directory.newBuilder() .addAllDirectories( @@ -304,6 +307,7 @@ public void duplicateEmptyDirectoryCheckPasses() throws StatusException { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ ImmutableMap.of(Digest.getDefaultInstance(), emptyDirectory), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -316,7 +320,7 @@ public void duplicateEmptyDirectoryCheckPasses() throws StatusException { @Test public void unsortedFileInputIsInvalid() { PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, Directory.newBuilder() .addAllFiles( @@ -327,6 +331,7 @@ public void unsortedFileInputIsInvalid() { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ Maps.newHashMap(), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -344,7 +349,7 @@ public void duplicateDirectoryInputIsInvalid() { Directory 
emptyDirectory = Directory.getDefaultInstance(); Digest emptyDirectoryDigest = DIGEST_UTIL.compute(emptyDirectory); PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, Directory.newBuilder() .addAllDirectories( @@ -361,6 +366,7 @@ public void duplicateDirectoryInputIsInvalid() { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ ImmutableMap.of(emptyDirectoryDigest, emptyDirectory), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -378,7 +384,7 @@ public void unsortedDirectoryInputIsInvalid() { Directory emptyDirectory = Directory.getDefaultInstance(); Digest emptyDirectoryDigest = DIGEST_UTIL.compute(emptyDirectory); PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, Directory.newBuilder() .addAllDirectories( @@ -395,6 +401,7 @@ public void unsortedDirectoryInputIsInvalid() { /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ ImmutableMap.of(emptyDirectoryDigest, emptyDirectory), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -407,10 +414,52 @@ public void unsortedDirectoryInputIsInvalid() { assertThat(violation.getDescription()).isEqualTo(DIRECTORY_NOT_SORTED); } + @Test + public void shouldValidateIfSymlinkTargetAbsolute() { + // invalid for disallowed + PreconditionFailure.Builder preconditionFailure = PreconditionFailure.newBuilder(); + Directory absoluteSymlinkDirectory = + Directory.newBuilder() + .addSymlinks(SymlinkNode.newBuilder().setName("foo").setTarget("/root/secret").build()) + .build(); + NodeInstance.validateActionInputDirectory( + ACTION_INPUT_ROOT_DIRECTORY_PATH, + absoluteSymlinkDirectory, + /* pathDigests=*/ new Stack<>(), + /* visited=*/ Sets.newHashSet(), + /* directoriesIndex=*/ Maps.newHashMap(), + /* allowSymlinkTargetAbsolute=*/ false, + /* onInputFile=*/ file -> {}, + /* onInputDirectorie=*/ directory -> {}, + /* onInputDigest=*/ digest -> {}, + preconditionFailure); + + assertThat(preconditionFailure.getViolationsCount()).isEqualTo(1); + Violation violation = preconditionFailure.getViolationsList().get(0); + assertThat(violation.getType()).isEqualTo(VIOLATION_TYPE_INVALID); + assertThat(violation.getSubject()).isEqualTo("/: foo -> /root/secret"); + assertThat(violation.getDescription()).isEqualTo(SYMLINK_TARGET_ABSOLUTE); + + // valid for allowed + preconditionFailure = PreconditionFailure.newBuilder(); + NodeInstance.validateActionInputDirectory( + ACTION_INPUT_ROOT_DIRECTORY_PATH, + absoluteSymlinkDirectory, + /* pathDigests=*/ new Stack<>(), + /* visited=*/ Sets.newHashSet(), + /* directoriesIndex=*/ Maps.newHashMap(), + /* allowSymlinkTargetAbsolute=*/ true, + /* onInputFile=*/ file -> {}, + /* onInputDirectorie=*/ directory -> {}, + /* onInputDigest=*/ digest -> {}, + preconditionFailure); + assertThat(preconditionFailure.getViolationsCount()).isEqualTo(0); + } + @Test public void nestedOutputDirectoriesAreInvalid() { PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateOutputs( 
+ NodeInstance.validateOutputs( ImmutableSet.of(), ImmutableSet.of(), ImmutableSet.of(), @@ -427,7 +476,7 @@ public void nestedOutputDirectoriesAreInvalid() { @Test public void outputDirectoriesContainingOutputFilesAreInvalid() { PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateOutputs( + NodeInstance.validateOutputs( ImmutableSet.of(), ImmutableSet.of(), ImmutableSet.of("foo/bar"), @@ -444,7 +493,7 @@ public void outputDirectoriesContainingOutputFilesAreInvalid() { @Test public void outputFilesAsOutputDirectoryAncestorsAreInvalid() { PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); - AbstractServerInstance.validateOutputs( + NodeInstance.validateOutputs( ImmutableSet.of(), ImmutableSet.of(), ImmutableSet.of("foo"), @@ -460,7 +509,7 @@ public void outputFilesAsOutputDirectoryAncestorsAreInvalid() { @Test public void emptyArgumentListIsInvalid() { - AbstractServerInstance instance = new DummyServerInstance(); + NodeInstance instance = new DummyServerInstance(); PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); instance.validateCommand( @@ -480,7 +529,7 @@ public void emptyArgumentListIsInvalid() { @Test public void absoluteWorkingDirectoryIsInvalid() { - AbstractServerInstance instance = new DummyServerInstance(); + NodeInstance instance = new DummyServerInstance(); PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); instance.validateCommand( @@ -500,7 +549,7 @@ public void absoluteWorkingDirectoryIsInvalid() { @Test public void undeclaredWorkingDirectoryIsInvalid() { - AbstractServerInstance instance = new DummyServerInstance(); + NodeInstance instance = new DummyServerInstance(); Digest inputRootDigest = DIGEST_UTIL.compute(Directory.getDefaultInstance()); PreconditionFailure.Builder preconditionFailureBuilder = PreconditionFailure.newBuilder(); @@ -541,12 +590,13 @@ public void multipleIdenticalDirectoryMissingAreAllPreconditionFailures() { .setDigest(missingDirectoryDigest) .build())) .build(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, root, /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ ImmutableMap.of(), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -602,12 +652,13 @@ public void validationRevisitReplicatesPreconditionFailures() { DirectoryNode.newBuilder().setName("bar").setDigest(fooDigest).build(), DirectoryNode.newBuilder().setName("foo").setDigest(fooDigest).build())) .build(); - AbstractServerInstance.validateActionInputDirectory( + NodeInstance.validateActionInputDirectory( ACTION_INPUT_ROOT_DIRECTORY_PATH, root, /* pathDigests=*/ new Stack<>(), /* visited=*/ Sets.newHashSet(), /* directoriesIndex=*/ ImmutableMap.of(fooDigest, foo), + /* allowSymlinkTargetAbsolute=*/ false, /* onInputFiles=*/ file -> {}, /* onInputDirectories=*/ directory -> {}, /* onInputDigests=*/ digest -> {}, @@ -670,8 +721,7 @@ public void outputDirectoriesFilesAreEnsuredPresent() throws Exception { .build(); ContentAddressableStorage contentAddressableStorage = mock(ContentAddressableStorage.class); ActionCache actionCache = mock(ActionCache.class); - AbstractServerInstance instance = - new DummyServerInstance(contentAddressableStorage, actionCache); + NodeInstance instance = 
new DummyServerInstance(contentAddressableStorage, actionCache); Tree tree = Tree.newBuilder() @@ -731,7 +781,7 @@ public void fetchBlobWriteCompleteIsSuccess() throws Exception { Digest expectedDigest = contentDigest.toBuilder().setSizeBytes(-1).build(); ContentAddressableStorage contentAddressableStorage = mock(ContentAddressableStorage.class); - AbstractServerInstance instance = new DummyServerInstance(contentAddressableStorage, null); + NodeInstance instance = new DummyServerInstance(contentAddressableStorage, null); RequestMetadata requestMetadata = RequestMetadata.getDefaultInstance(); Write write = mock(Write.class); diff --git a/src/test/java/build/buildfarm/instance/shard/BUILD b/src/test/java/build/buildfarm/instance/shard/BUILD index 6a501a1b68..2f08b1003d 100644 --- a/src/test/java/build/buildfarm/instance/shard/BUILD +++ b/src/test/java/build/buildfarm/instance/shard/BUILD @@ -1,7 +1,158 @@ java_test( - name = "tests", + name = "DispatchedMonitorTest", size = "small", - srcs = glob(["*.java"]), + srcs = [ + "DispatchedMonitorTest.java", + "UnobservableWatcher.java", + ], + data = ["//examples:example_configs"], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/actioncache", + "//src/main/java/build/buildfarm/backplane", + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/instance", + "//src/main/java/build/buildfarm/instance/server", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@googleapis//:google_longrunning_operations_java_proto", + "@googleapis//:google_rpc_code_java_proto", + "@googleapis//:google_rpc_error_details_java_proto", + "@maven//:com_github_ben_manes_caffeine_caffeine", + "@maven//:com_google_guava_guava", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:com_google_protobuf_protobuf_java_util", + "@maven//:com_google_truth_truth", + "@maven//:io_grpc_grpc_api", + "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_protobuf", + "@maven//:io_grpc_grpc_stub", + "@maven//:org_mockito_mockito_core", + "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", + ], +) + +java_test( + name = "RedisShardBackplaneTest", + size = "small", + srcs = [ + "RedisShardBackplaneTest.java", + "UnobservableWatcher.java", + ], + data = ["//examples:example_configs"], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/actioncache", + "//src/main/java/build/buildfarm/backplane", + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/instance", + "//src/main/java/build/buildfarm/instance/server", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@googleapis//:google_longrunning_operations_java_proto", + "@googleapis//:google_rpc_code_java_proto", + "@googleapis//:google_rpc_error_details_java_proto", + "@maven//:com_github_ben_manes_caffeine_caffeine", + "@maven//:com_google_guava_guava", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:com_google_protobuf_protobuf_java_util", + "@maven//:com_google_truth_truth", + "@maven//:io_grpc_grpc_api", + "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_protobuf", + 
"@maven//:io_grpc_grpc_stub", + "@maven//:org_mockito_mockito_core", + "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", + ], +) + +java_test( + name = "RedisShardSubscriberTest", + size = "small", + srcs = [ + "RedisShardSubscriberTest.java", + "UnobservableWatcher.java", + ], + data = ["//examples:example_configs"], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/actioncache", + "//src/main/java/build/buildfarm/backplane", + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/instance", + "//src/main/java/build/buildfarm/instance/server", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@googleapis//:google_longrunning_operations_java_proto", + "@googleapis//:google_rpc_code_java_proto", + "@googleapis//:google_rpc_error_details_java_proto", + "@maven//:com_github_ben_manes_caffeine_caffeine", + "@maven//:com_google_guava_guava", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:com_google_protobuf_protobuf_java_util", + "@maven//:com_google_truth_truth", + "@maven//:io_grpc_grpc_api", + "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_protobuf", + "@maven//:io_grpc_grpc_stub", + "@maven//:org_mockito_mockito_core", + "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", + ], +) + +java_test( + name = "ServerInstanceTest", + size = "small", + srcs = [ + "ServerInstanceTest.java", + "UnobservableWatcher.java", + ], + data = ["//examples:example_configs"], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/actioncache", + "//src/main/java/build/buildfarm/backplane", + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/instance", + "//src/main/java/build/buildfarm/instance/server", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@googleapis//:google_longrunning_operations_java_proto", + "@googleapis//:google_rpc_code_java_proto", + "@googleapis//:google_rpc_error_details_java_proto", + "@maven//:com_github_ben_manes_caffeine_caffeine", + "@maven//:com_google_guava_guava", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:com_google_protobuf_protobuf_java_util", + "@maven//:com_google_truth_truth", + "@maven//:io_grpc_grpc_api", + "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_protobuf", + "@maven//:io_grpc_grpc_stub", + "@maven//:org_mockito_mockito_core", + "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", + ], +) + +java_test( + name = "TimedWatcherTest", + size = "small", + srcs = [ + "TimedWatcherTest.java", + "UnobservableWatcher.java", + ], data = ["//examples:example_configs"], test_class = "build.buildfarm.AllTests", deps = [ @@ -31,3 +182,59 @@ java_test( "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", ], ) + +java_test( + name = "UtilTest", + size = "small", + srcs = [ + "UnobservableWatcher.java", + "UtilTest.java", + ], + data = ["//examples:example_configs"], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/actioncache", + "//src/main/java/build/buildfarm/backplane", + 
"//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/instance", + "//src/main/java/build/buildfarm/instance/server", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@googleapis//:google_longrunning_operations_java_proto", + "@googleapis//:google_rpc_code_java_proto", + "@googleapis//:google_rpc_error_details_java_proto", + "@maven//:com_github_ben_manes_caffeine_caffeine", + "@maven//:com_google_guava_guava", + "@maven//:com_google_protobuf_protobuf_java", + "@maven//:com_google_protobuf_protobuf_java_util", + "@maven//:com_google_truth_truth", + "@maven//:io_grpc_grpc_api", + "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_protobuf", + "@maven//:io_grpc_grpc_stub", + "@maven//:org_mockito_mockito_core", + "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", + ], +) + +java_test( + name = "JedisCasWorkerMapTest", + size = "small", + srcs = [ + "JedisCasWorkerMapTest.java", + ], + test_class = "build.buildfarm.AllTests", + deps = [ + "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/redis", + "//src/main/java/build/buildfarm/instance/shard", + "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", + "//src/test/java/build/buildfarm:test_runner", + "//third_party/jedis", + "@maven//:com_github_fppt_jedis_mock", + "@maven//:com_google_truth_truth", + ], +) diff --git a/src/test/java/build/buildfarm/instance/shard/JedisCasWorkerMapTest.java b/src/test/java/build/buildfarm/instance/shard/JedisCasWorkerMapTest.java new file mode 100644 index 0000000000..caa69536c1 --- /dev/null +++ b/src/test/java/build/buildfarm/instance/shard/JedisCasWorkerMapTest.java @@ -0,0 +1,63 @@ +package build.buildfarm.instance.shard; + +import static com.google.common.truth.Truth.assertThat; + +import build.bazel.remote.execution.v2.Digest; +import build.buildfarm.common.DigestUtil; +import build.buildfarm.common.redis.RedisClient; +import com.github.fppt.jedismock.RedisServer; +import com.github.fppt.jedismock.server.ServiceOptions; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import redis.clients.jedis.HostAndPort; +import redis.clients.jedis.JedisCluster; + +@RunWith(JUnit4.class) +public class JedisCasWorkerMapTest { + + private static final String CAS_PREFIX = "ContentAddressableStorage"; + + private RedisServer redisServer; + private RedisClient redisClient; + private JedisCasWorkerMap jedisCasWorkerMap; + + @Before + public void setup() throws IOException { + redisServer = + RedisServer.newRedisServer() + .setOptions(ServiceOptions.defaultOptions().withClusterModeEnabled()) + .start(); + redisClient = + new RedisClient( + new JedisCluster( + Collections.singleton( + new HostAndPort(redisServer.getHost(), redisServer.getBindPort())))); + jedisCasWorkerMap = new JedisCasWorkerMap(CAS_PREFIX, 60); + } + + @Test + public void testSetExpire() throws IOException { + Digest testDigest1 = Digest.newBuilder().setHash("abc").build(); + Digest testDigest2 = Digest.newBuilder().setHash("xyz").build(); + + String casKey1 = CAS_PREFIX + ":" + DigestUtil.toString(testDigest1); + String casKey2 = CAS_PREFIX + ":" + DigestUtil.toString(testDigest2); 
+ + redisClient.run(jedis -> jedis.sadd(casKey1, "worker1")); + jedisCasWorkerMap.setExpire(redisClient, Arrays.asList(testDigest1, testDigest2)); + + assertThat((Long) redisClient.call(jedis -> jedis.ttl(casKey1))).isGreaterThan(0L); + assertThat((Long) redisClient.call(jedis -> jedis.ttl(casKey2))).isEqualTo(-2L); + } + + @After + public void tearDown() throws IOException { + redisServer.stop(); + } +} diff --git a/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java b/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java index 51c7c0568d..a77e74a3bd 100644 --- a/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java +++ b/src/test/java/build/buildfarm/instance/shard/RedisShardBackplaneTest.java @@ -59,7 +59,6 @@ @RunWith(JUnit4.class) public class RedisShardBackplaneTest { - private RedisShardBackplane backplane; private BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); @Mock Supplier mockJedisClusterFactory; @@ -67,12 +66,20 @@ public class RedisShardBackplaneTest { @Before public void setUp() throws IOException { configs.getBackplane().setOperationExpire(10); - configs.getBackplane().setSubscribeToBackplane(false); - configs.getBackplane().setRunFailsafeOperation(false); configs.getBackplane().setQueues(new Queue[] {}); MockitoAnnotations.initMocks(this); } + public RedisShardBackplane createBackplane(String name) { + return new RedisShardBackplane( + name, + /* subscribeToBackplane=*/ false, + /* runFailsafeOperation=*/ false, + o -> o, + o -> o, + mockJedisClusterFactory); + } + @Test public void workersWithInvalidProtobufAreRemoved() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); @@ -81,9 +88,7 @@ public void workersWithInvalidProtobufAreRemoved() throws IOException { .thenReturn(ImmutableMap.of("foo", "foo")); when(jedisCluster.hdel(configs.getBackplane().getWorkersHashName() + "_storage", "foo")) .thenReturn(1L); - backplane = - new RedisShardBackplane( - "invalid-protobuf-worker-removed-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("invalid-protobuf-worker-removed-test"); backplane.start("startTime/test:0000"); assertThat(backplane.getStorageWorkers()).isEmpty(); @@ -100,12 +105,10 @@ public void workersWithInvalidProtobufAreRemoved() throws IOException { assertThat(workerChange.getTypeCase()).isEqualTo(WorkerChange.TypeCase.REMOVE); } - void verifyChangePublished(JedisCluster jedis) throws IOException { + OperationChange verifyChangePublished(String channel, JedisCluster jedis) throws IOException { ArgumentCaptor changeCaptor = ArgumentCaptor.forClass(String.class); - verify(jedis, times(1)).publish(eq(backplane.operationChannel("op")), changeCaptor.capture()); - OperationChange opChange = parseOperationChange(changeCaptor.getValue()); - assertThat(opChange.hasReset()).isTrue(); - assertThat(opChange.getReset().getOperation().getName()).isEqualTo("op"); + verify(jedis, times(1)).publish(eq(channel), changeCaptor.capture()); + return parseOperationChange(changeCaptor.getValue()); } String operationName(String name) { @@ -116,9 +119,7 @@ String operationName(String name) { public void prequeueUpdatesOperationPrequeuesAndPublishes() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "prequeue-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = 
createBackplane("prequeue-operation-test"); backplane.start("startTime/test:0000"); final String opName = "op"; @@ -136,32 +137,34 @@ public void prequeueUpdatesOperationPrequeuesAndPublishes() throws IOException { .lpush( configs.getBackplane().getPreQueuedOperationsListName(), JsonFormat.printer().print(executeEntry)); - verifyChangePublished(jedisCluster); + OperationChange opChange = + verifyChangePublished(backplane.operationChannel(opName), jedisCluster); + assertThat(opChange.hasReset()).isTrue(); + assertThat(opChange.getReset().getOperation().getName()).isEqualTo(opName); } @Test public void queuingPublishes() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "requeue-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("requeue-operation-test"); backplane.start("startTime/test:0000"); final String opName = "op"; backplane.queueing(opName); verify(mockJedisClusterFactory, times(1)).get(); - verifyChangePublished(jedisCluster); + OperationChange opChange = + verifyChangePublished(backplane.operationChannel(opName), jedisCluster); + assertThat(opChange.hasReset()).isTrue(); + assertThat(opChange.getReset().getOperation().getName()).isEqualTo(opName); } @Test public void requeueDispatchedOperationQueuesAndPublishes() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "requeue-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("requeue-operation-test"); backplane.start("startTime/test:0000"); final String opName = "op"; @@ -181,7 +184,10 @@ public void requeueDispatchedOperationQueuesAndPublishes() throws IOException { .lpush( configs.getBackplane().getQueuedOperationsListName(), JsonFormat.printer().print(queueEntry)); - verifyChangePublished(jedisCluster); + OperationChange opChange = + verifyChangePublished(backplane.operationChannel(opName), jedisCluster); + assertThat(opChange.hasReset()).isTrue(); + assertThat(opChange.getReset().getOperation().getName()).isEqualTo(opName); } @Test @@ -195,9 +201,7 @@ public void dispatchedOperationsShowProperRequeueAmount0to1() // create a backplane JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "requeue-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("requeue-operation-test"); backplane.start("startTime/test:0000"); // ARRANGE @@ -210,8 +214,7 @@ public void dispatchedOperationsShowProperRequeueAmount0to1() .setRequeueAttempts(STARTING_REQUEUE_AMOUNT) .build(); String queueEntryJson = JsonFormat.printer().print(queueEntry); - when(jedisCluster.brpoplpush(any(String.class), any(String.class), any(Integer.class))) - .thenReturn(queueEntryJson); + when(jedisCluster.rpoplpush(any(String.class), any(String.class))).thenReturn(queueEntryJson); // PRE-ASSERT when(jedisCluster.hsetnx(any(String.class), any(String.class), any(String.class))) @@ -251,9 +254,7 @@ public void dispatchedOperationsShowProperRequeueAmount1to2() // create a backplane JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - 
"requeue-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("requeue-operation-test"); backplane.start("startTime/test:0000"); // Assume the operation queue is already populated from a first re-queue. @@ -265,8 +266,7 @@ public void dispatchedOperationsShowProperRequeueAmount1to2() .setRequeueAttempts(STARTING_REQUEUE_AMOUNT) .build(); String queueEntryJson = JsonFormat.printer().print(queueEntry); - when(jedisCluster.brpoplpush(any(String.class), any(String.class), any(Integer.class))) - .thenReturn(queueEntryJson); + when(jedisCluster.rpoplpush(any(String.class), any(String.class))).thenReturn(queueEntryJson); // PRE-ASSERT when(jedisCluster.hsetnx(any(String.class), any(String.class), any(String.class))) @@ -299,9 +299,7 @@ public void dispatchedOperationsShowProperRequeueAmount1to2() public void completeOperationUndispatches() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "complete-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("complete-operation-test"); backplane.start("startTime/test:0000"); final String opName = "op"; @@ -318,9 +316,7 @@ public void completeOperationUndispatches() throws IOException { public void deleteOperationDeletesAndPublishes() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "delete-operation-test", (o) -> o, (o) -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("delete-operation-test"); backplane.start("startTime/test:0000"); final String opName = "op"; @@ -331,7 +327,10 @@ public void deleteOperationDeletesAndPublishes() throws IOException { verify(jedisCluster, times(1)) .hdel(configs.getBackplane().getDispatchedOperationsHashName(), opName); verify(jedisCluster, times(1)).del(operationName(opName)); - verifyChangePublished(jedisCluster); + OperationChange opChange = + verifyChangePublished(backplane.operationChannel(opName), jedisCluster); + assertThat(opChange.hasReset()).isTrue(); + assertThat(opChange.getReset().getOperation().getName()).isEqualTo(opName); } @Test @@ -342,9 +341,7 @@ public void invocationsCanBeBlacklisted() throws IOException { configs.getBackplane().getInvocationBlacklistPrefix() + ":" + toolInvocationId; when(jedisCluster.exists(invocationBlacklistKey)).thenReturn(true); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane( - "invocation-blacklist-test", o -> o, o -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("invocation-blacklist-test"); backplane.start("startTime/test:0000"); assertThat( @@ -362,8 +359,7 @@ public void invocationsCanBeBlacklisted() throws IOException { public void testGetWorkersStartTime() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane("workers-starttime-test", o -> o, o -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("workers-starttime-test"); backplane.start("startTime/test:0000"); Set workerNames = ImmutableSet.of("worker1", "worker2", "missing_worker"); @@ -387,8 +383,7 @@ public void testGetWorkersStartTime() throws IOException { public void 
getDigestInsertTime() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); - backplane = - new RedisShardBackplane("digest-inserttime-test", o -> o, o -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("digest-inserttime-test"); backplane.start("startTime/test:0000"); long ttl = 3600L; long expirationInSecs = configs.getBackplane().getCasExpire(); @@ -412,8 +407,7 @@ public void testAddWorker() throws IOException { JedisCluster jedisCluster = mock(JedisCluster.class); when(mockJedisClusterFactory.get()).thenReturn(jedisCluster); when(jedisCluster.hset(anyString(), anyString(), anyString())).thenReturn(1L); - backplane = - new RedisShardBackplane("digest-inserttime-test", o -> o, o -> o, mockJedisClusterFactory); + RedisShardBackplane backplane = createBackplane("digest-inserttime-test"); backplane.start("addWorker/test:0000"); backplane.addWorker(shardWorker); verify(jedisCluster, times(1)) diff --git a/src/test/java/build/buildfarm/instance/shard/ShardInstanceTest.java b/src/test/java/build/buildfarm/instance/shard/ServerInstanceTest.java similarity index 97% rename from src/test/java/build/buildfarm/instance/shard/ShardInstanceTest.java rename to src/test/java/build/buildfarm/instance/shard/ServerInstanceTest.java index 908b3c826b..87554bc3e0 100644 --- a/src/test/java/build/buildfarm/instance/shard/ShardInstanceTest.java +++ b/src/test/java/build/buildfarm/instance/shard/ServerInstanceTest.java @@ -20,9 +20,9 @@ import static build.buildfarm.common.Actions.invalidActionVerboseMessage; import static build.buildfarm.common.Errors.VIOLATION_TYPE_INVALID; import static build.buildfarm.common.Errors.VIOLATION_TYPE_MISSING; -import static build.buildfarm.instance.server.AbstractServerInstance.INVALID_PLATFORM; -import static build.buildfarm.instance.server.AbstractServerInstance.MISSING_ACTION; -import static build.buildfarm.instance.server.AbstractServerInstance.MISSING_COMMAND; +import static build.buildfarm.instance.server.NodeInstance.INVALID_PLATFORM; +import static build.buildfarm.instance.server.NodeInstance.MISSING_ACTION; +import static build.buildfarm.instance.server.NodeInstance.MISSING_COMMAND; import static com.google.common.base.Predicates.notNull; import static com.google.common.truth.Truth.assertThat; import static com.google.common.util.concurrent.Futures.immediateFuture; @@ -31,9 +31,13 @@ import static java.util.concurrent.Executors.newSingleThreadExecutor; import static java.util.concurrent.TimeUnit.SECONDS; import static org.mockito.AdditionalAnswers.answer; +import static org.mockito.ArgumentMatchers.anyIterable; +import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.ArgumentMatchers.argThat; import static org.mockito.Mockito.any; +import static org.mockito.Mockito.atLeast; import static org.mockito.Mockito.atLeastOnce; +import static org.mockito.Mockito.atMost; import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.eq; import static org.mockito.Mockito.mock; @@ -81,6 +85,7 @@ import com.google.common.collect.Iterables; import com.google.common.collect.Maps; import com.google.common.collect.Sets; +import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import com.google.longrunning.Operation; import com.google.protobuf.Any; @@ -96,6 +101,7 @@ import io.grpc.stub.ServerCallStreamObserver; import io.grpc.stub.StreamObserver; import java.io.IOException; +import 
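The hunks above lean on two helpers, `createBackplane` and a `verifyChangePublished` overload that takes a channel name, whose definitions sit earlier in `RedisShardBackplaneTest` and are not part of this excerpt. A minimal sketch of what they plausibly look like, inferred only from the call sites (the exact signatures are assumptions, not copied from the patch):

```java
// Hypothetical reconstruction -- inferred from the call sites above.
private RedisShardBackplane createBackplane(String name) {
  // Identity functions stand in for the operation transform hooks that every
  // deleted constructor call passed as (o) -> o.
  return new RedisShardBackplane(name, (o) -> o, (o) -> o, mockJedisClusterFactory);
}

private OperationChange verifyChangePublished(String channel, JedisCluster jedis)
    throws InvalidProtocolBufferException {
  // Capture the JSON payload published to the operation's channel and parse it
  // back into an OperationChange so callers can assert on its contents.
  ArgumentCaptor<String> changeCaptor = ArgumentCaptor.forClass(String.class);
  verify(jedis, times(1)).publish(eq(channel), changeCaptor.capture());
  OperationChange.Builder builder = OperationChange.newBuilder();
  JsonFormat.parser().merge(changeCaptor.getValue(), builder);
  return builder.build();
}
```

This shape matches the new assertions, which expect a parsed `OperationChange` whose `reset` carries the operation name.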
diff --git a/src/test/java/build/buildfarm/instance/shard/ShardInstanceTest.java b/src/test/java/build/buildfarm/instance/shard/ServerInstanceTest.java
similarity index 97%
rename from src/test/java/build/buildfarm/instance/shard/ShardInstanceTest.java
rename to src/test/java/build/buildfarm/instance/shard/ServerInstanceTest.java
index 908b3c826b..87554bc3e0 100644
--- a/src/test/java/build/buildfarm/instance/shard/ShardInstanceTest.java
+++ b/src/test/java/build/buildfarm/instance/shard/ServerInstanceTest.java
@@ -20,9 +20,9 @@
 import static build.buildfarm.common.Actions.invalidActionVerboseMessage;
 import static build.buildfarm.common.Errors.VIOLATION_TYPE_INVALID;
 import static build.buildfarm.common.Errors.VIOLATION_TYPE_MISSING;
-import static build.buildfarm.instance.server.AbstractServerInstance.INVALID_PLATFORM;
-import static build.buildfarm.instance.server.AbstractServerInstance.MISSING_ACTION;
-import static build.buildfarm.instance.server.AbstractServerInstance.MISSING_COMMAND;
+import static build.buildfarm.instance.server.NodeInstance.INVALID_PLATFORM;
+import static build.buildfarm.instance.server.NodeInstance.MISSING_ACTION;
+import static build.buildfarm.instance.server.NodeInstance.MISSING_COMMAND;
 import static com.google.common.base.Predicates.notNull;
 import static com.google.common.truth.Truth.assertThat;
 import static com.google.common.util.concurrent.Futures.immediateFuture;
@@ -31,9 +31,13 @@
 import static java.util.concurrent.Executors.newSingleThreadExecutor;
 import static java.util.concurrent.TimeUnit.SECONDS;
 import static org.mockito.AdditionalAnswers.answer;
+import static org.mockito.ArgumentMatchers.anyIterable;
+import static org.mockito.ArgumentMatchers.anyString;
 import static org.mockito.ArgumentMatchers.argThat;
 import static org.mockito.Mockito.any;
+import static org.mockito.Mockito.atLeast;
 import static org.mockito.Mockito.atLeastOnce;
+import static org.mockito.Mockito.atMost;
 import static org.mockito.Mockito.doAnswer;
 import static org.mockito.Mockito.eq;
 import static org.mockito.Mockito.mock;
@@ -81,6 +85,7 @@
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
+import com.google.common.util.concurrent.Futures;
 import com.google.common.util.concurrent.ListenableFuture;
 import com.google.longrunning.Operation;
 import com.google.protobuf.Any;
@@ -96,6 +101,7 @@
 import io.grpc.stub.ServerCallStreamObserver;
 import io.grpc.stub.StreamObserver;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -122,14 +128,14 @@
 import org.mockito.stubbing.Answer;
 
 @RunWith(JUnit4.class)
-public class ShardInstanceTest {
+public class ServerInstanceTest {
   private static final DigestUtil DIGEST_UTIL = new DigestUtil(HashFunction.SHA256);
   private static final long QUEUE_TEST_TIMEOUT_SECONDS = 3;
   private static final Duration DEFAULT_TIMEOUT = Durations.fromSeconds(60);
   private static final Command SIMPLE_COMMAND =
       Command.newBuilder().addAllArguments(ImmutableList.of("true")).build();
 
-  private ShardInstance instance;
+  private ServerInstance instance;
   private Map blobDigests;
 
   @Mock private Backplane mockBackplane;
@@ -146,7 +152,7 @@ public void setUp() throws InterruptedException {
     blobDigests = Maps.newHashMap();
     ActionCache actionCache = new ShardActionCache(10, mockBackplane, newDirectExecutorService());
     instance =
-        new ShardInstance(
+        new ServerInstance(
             "shard",
             DIGEST_UTIL,
             mockBackplane,
@@ -1055,7 +1061,6 @@ public void containsBlobReflectsWorkerWithUnknownSize() throws Exception {
 
   @Test
   public void findMissingBlobsTest_ViaBackPlane() throws Exception {
-
     Set<String> activeWorkers = ImmutableSet.of("worker1", "worker2", "worker3");
     Set<String> expiredWorkers = ImmutableSet.of("workerX", "workerY", "workerZ");
     Set<String> imposterWorkers = ImmutableSet.of("imposter1", "imposter2", "imposter3");
@@ -1114,8 +1119,12 @@ public void findMissingBlobsTest_ViaBackPlane() throws Exception {
     buildfarmConfigs.getServer().setFindMissingBlobsViaBackplane(true);
     Set<String> activeAndImposterWorkers =
         Sets.newHashSet(Iterables.concat(activeWorkers, imposterWorkers));
+    when(mockBackplane.getStorageWorkers()).thenReturn(activeAndImposterWorkers);
     when(mockBackplane.getBlobDigestsWorkers(any(Iterable.class))).thenReturn(digestAndWorkersMap);
+    when(mockInstanceLoader.load(anyString())).thenReturn(mockWorkerInstance);
+    when(mockWorkerInstance.findMissingBlobs(anyIterable(), any(RequestMetadata.class)))
+        .thenReturn(Futures.immediateFuture(new ArrayList<>()));
 
     long serverStartTime = 1686951033L; // june 15th, 2023
     Map<String, Long> workersStartTime = new HashMap<>();
@@ -1138,6 +1147,10 @@ public void findMissingBlobsTest_ViaBackPlane() throws Exception {
         Iterables.concat(missingDigests, digestAvailableOnImposters);
     assertThat(actualMissingDigests).containsExactlyElementsIn(expectedMissingDigests);
 
+    verify(mockWorkerInstance, atMost(3))
+        .findMissingBlobs(anyIterable(), any(RequestMetadata.class));
+    verify(mockWorkerInstance, atLeast(1))
+        .findMissingBlobs(anyIterable(), any(RequestMetadata.class));
 
     for (Digest digest : actualMissingDigests) {
       assertThat(digest).isNotIn(availableDigests);
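The rewritten `findMissingBlobsTest_ViaBackPlane` now stubs a worker instance and brackets its call count with `atLeast(1)`/`atMost(3)`, since how many of the three imposter workers actually get probed can depend on short-circuiting. A self-contained sketch of that Mockito pattern (the `Prober` type below is invented purely for illustration):

```java
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.atLeast;
import static org.mockito.Mockito.atMost;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

import java.util.Collections;
import java.util.List;

class VerificationBoundsSketch {
  interface Prober {
    List<String> probe(String worker);
  }

  static void demo() {
    Prober prober = mock(Prober.class);
    when(prober.probe(anyString())).thenReturn(Collections.emptyList());

    // The code under test may probe one, two, or all three imposters.
    prober.probe("imposter1");
    prober.probe("imposter2");

    // Bounded verification tolerates that nondeterminism without overspecifying.
    verify(prober, atLeast(1)).probe(anyString());
    verify(prober, atMost(3)).probe(anyString());
  }
}
```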
"@maven//:com_google_jimfs_jimfs", "@maven//:com_google_protobuf_protobuf_java", diff --git a/src/test/java/build/buildfarm/metrics/MetricsPublisherTest.java b/src/test/java/build/buildfarm/metrics/MetricsPublisherTest.java index 21c9685efd..f30adb55c6 100644 --- a/src/test/java/build/buildfarm/metrics/MetricsPublisherTest.java +++ b/src/test/java/build/buildfarm/metrics/MetricsPublisherTest.java @@ -22,7 +22,6 @@ import build.bazel.remote.execution.v2.RequestMetadata; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.common.config.Metrics; -import build.buildfarm.metrics.aws.AwsMetricsPublisher; import build.buildfarm.metrics.log.LogMetricsPublisher; import build.buildfarm.v1test.OperationRequestMetadata; import com.google.longrunning.Operation; @@ -69,7 +68,7 @@ public class MetricsPublisherTest { public void setUp() throws IOException { configs.getServer().setCloudRegion("test"); configs.getServer().setClusterId("buildfarm-test"); - configs.getServer().getMetrics().setPublisher(Metrics.PUBLISHER.AWS); + configs.getServer().getMetrics().setPublisher(Metrics.PUBLISHER.LOG); } @Test @@ -81,7 +80,7 @@ public void publishCompleteMetricsTest() throws InvalidProtocolBufferException { .setMetadata(Any.pack(defaultExecuteOperationMetadata)) .build(); - AwsMetricsPublisher metricsPublisher = new AwsMetricsPublisher(); + LogMetricsPublisher metricsPublisher = new LogMetricsPublisher(); assertThat( AbstractMetricsPublisher.formatRequestMetadataToJson( metricsPublisher.populateRequestMetadata(operation, defaultRequestMetadata))) @@ -110,7 +109,7 @@ public void publishMetricsWithNoExecuteResponseTest() { Operation operation = defaultOperation.toBuilder().setMetadata(Any.pack(defaultExecuteOperationMetadata)).build(); - assertThat(new AwsMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata)) + assertThat(new LogMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata)) .isNotNull(); } @@ -119,7 +118,7 @@ public void publishMetricsWithNoExecuteOperationMetadataTest() { Operation operation = defaultOperation.toBuilder().setResponse(Any.pack(defaultExecuteResponse)).build(); - assertThat(new AwsMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata)) + assertThat(new LogMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata)) .isNotNull(); } @@ -135,7 +134,7 @@ public void preconditionFailureTest() { .setMetadata(Any.pack(defaultExecuteOperationMetadata)) .build(); - assertThat(new AwsMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata)) + assertThat(new LogMetricsPublisher().populateRequestMetadata(operation, defaultRequestMetadata)) .isNotNull(); } diff --git a/src/test/java/build/buildfarm/worker/DequeueMatchEvaluatorTest.java b/src/test/java/build/buildfarm/worker/DequeueMatchEvaluatorTest.java index bc7acd00a4..47f25cdf79 100644 --- a/src/test/java/build/buildfarm/worker/DequeueMatchEvaluatorTest.java +++ b/src/test/java/build/buildfarm/worker/DequeueMatchEvaluatorTest.java @@ -19,8 +19,10 @@ import build.bazel.remote.execution.v2.Platform; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.v1test.QueueEntry; +import build.buildfarm.worker.resources.LocalResourceSet; import com.google.common.collect.HashMultimap; import com.google.common.collect.SetMultimap; +import java.util.concurrent.Semaphore; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -52,10 +54,12 @@ public class 
diff --git a/src/test/java/build/buildfarm/worker/DequeueMatchEvaluatorTest.java b/src/test/java/build/buildfarm/worker/DequeueMatchEvaluatorTest.java
index bc7acd00a4..47f25cdf79 100644
--- a/src/test/java/build/buildfarm/worker/DequeueMatchEvaluatorTest.java
+++ b/src/test/java/build/buildfarm/worker/DequeueMatchEvaluatorTest.java
@@ -19,8 +19,10 @@
 import build.bazel.remote.execution.v2.Platform;
 import build.buildfarm.common.config.BuildfarmConfigs;
 import build.buildfarm.v1test.QueueEntry;
+import build.buildfarm.worker.resources.LocalResourceSet;
 import com.google.common.collect.HashMultimap;
 import com.google.common.collect.SetMultimap;
+import java.util.concurrent.Semaphore;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
@@ -52,10 +54,12 @@ public class DequeueMatchEvaluatorTest {
   public void shouldKeepOperationKeepEmptyQueueEntry() throws Exception {
     // ARRANGE
     SetMultimap<String, String> workerProvisions = HashMultimap.create();
+    LocalResourceSet resourceSet = new LocalResourceSet();
     QueueEntry entry = QueueEntry.newBuilder().setPlatform(Platform.newBuilder()).build();
 
     // ACT
-    boolean shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry);
+    boolean shouldKeep =
+        DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry);
 
     // ASSERT
     assertThat(shouldKeep).isTrue();
@@ -70,6 +74,7 @@ public void shouldKeepOperationKeepEmptyQueueEntry() throws Exception {
   public void shouldKeepOperationValidMinCoresQueueEntry() throws Exception {
     // ARRANGE
     SetMultimap<String, String> workerProvisions = HashMultimap.create();
+    LocalResourceSet resourceSet = new LocalResourceSet();
     workerProvisions.put("cores", "11");
 
     QueueEntry entry =
@@ -81,7 +86,8 @@ public void shouldKeepOperationValidMinCoresQueueEntry() throws Exception {
             .build();
 
     // ACT
-    boolean shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry);
+    boolean shouldKeep =
+        DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry);
 
     // ASSERT
     // the worker accepts because it has more cores than the min-cores requested
@@ -98,6 +104,7 @@ public void shouldKeepOperationInvalidMinCoresQueueEntry() throws Exception {
     // ARRANGE
     configs.getWorker().getDequeueMatchSettings().setAcceptEverything(false);
     SetMultimap<String, String> workerProvisions = HashMultimap.create();
+    LocalResourceSet resourceSet = new LocalResourceSet();
     workerProvisions.put("cores", "10");
 
     QueueEntry entry =
@@ -109,7 +116,8 @@ public void shouldKeepOperationInvalidMinCoresQueueEntry() throws Exception {
             .build();
 
     // ACT
-    boolean shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry);
+    boolean shouldKeep =
+        DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry);
 
     // ASSERT
     // the worker rejects because it has less cores than the min-cores requested
@@ -123,6 +131,7 @@ public void shouldKeepOperationInvalidMinCoresQueueEntry() throws Exception {
   public void shouldKeepOperationMaxCoresDoNotInfluenceAcceptance() throws Exception {
     // ARRANGE
     SetMultimap<String, String> workerProvisions = HashMultimap.create();
+    LocalResourceSet resourceSet = new LocalResourceSet();
     workerProvisions.put("cores", "10");
 
     QueueEntry entry =
@@ -136,7 +145,8 @@ public void shouldKeepOperationMaxCoresDoNotInfluenceAcceptance() throws Exception {
             .build();
 
     // ACT
-    boolean shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry);
+    boolean shouldKeep =
+        DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry);
 
     // ASSERT
     // the worker accepts because it has the same cores as the min-cores requested
@@ -153,6 +163,7 @@ public void shouldKeepOperationUnmatchedPropertiesRejectionAcceptance() throws Exception {
     configs.getWorker().getDequeueMatchSettings().setAcceptEverything(false);
     configs.getWorker().getDequeueMatchSettings().setAllowUnmatched(false);
     SetMultimap<String, String> workerProvisions = HashMultimap.create();
+    LocalResourceSet resourceSet = new LocalResourceSet();
 
     QueueEntry entry =
         QueueEntry.newBuilder()
@@ -163,7 +174,8 @@ public void shouldKeepOperationUnmatchedPropertiesRejectionAcceptance() throws Exception {
             .build();
 
     // ACT
-    boolean shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry);
+    boolean shouldKeep =
+        DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry);
 
     // ASSERT
     assertThat(shouldKeep).isFalse();
@@ -172,7 +184,7 @@ public void shouldKeepOperationUnmatchedPropertiesRejectionAcceptance() throws Exception {
     configs.getWorker().getDequeueMatchSettings().setAcceptEverything(true);
 
     // ACT
-    shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry);
+    shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry);
 
     // ASSERT
     assertThat(shouldKeep).isTrue();
@@ -181,9 +193,151 @@ public void shouldKeepOperationUnmatchedPropertiesRejectionAcceptance() throws Exception {
     configs.getWorker().getDequeueMatchSettings().setAllowUnmatched(true);
 
     // ACT
-    shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, entry);
+    shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry);
 
     // ASSERT
     assertThat(shouldKeep).isTrue();
   }
+
+  // Function under test: shouldKeepOperation
+  // Reason for testing: the local resource should be claimed
+  // Failure explanation: semaphore claim did not work as expected.
+  @Test
+  public void shouldKeepOperationClaimsResource() throws Exception {
+    // ARRANGE
+    configs.getWorker().getDequeueMatchSettings().setAcceptEverything(true);
+    configs.getWorker().getDequeueMatchSettings().setAllowUnmatched(true);
+    SetMultimap<String, String> workerProvisions = HashMultimap.create();
+    LocalResourceSet resourceSet = new LocalResourceSet();
+    resourceSet.resources.put("FOO", new Semaphore(1));
+
+    QueueEntry entry =
+        QueueEntry.newBuilder()
+            .setPlatform(
+                Platform.newBuilder()
+                    .addProperties(
+                        Platform.Property.newBuilder().setName("resource:FOO").setValue("1")))
+            .build();
+
+    // PRE-ASSERT
+    assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(1);
+
+    // ACT
+    boolean shouldKeep =
+        DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry);
+
+    // ASSERT
+    // the worker accepts because the resource is available.
+    assertThat(shouldKeep).isTrue();
+    assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(0);
+
+    // ACT
+    shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry);
+
+    // ASSERT
+    // the worker rejects because there are no resources left.
+    assertThat(shouldKeep).isFalse();
+    assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(0);
+  }
+
+  // Function under test: shouldKeepOperation
+  // Reason for testing: the local resources should be claimed
+  // Failure explanation: semaphore claim did not work as expected.
+  @Test
+  public void shouldKeepOperationClaimsMultipleResource() throws Exception {
+    // ARRANGE
+    configs.getWorker().getDequeueMatchSettings().setAcceptEverything(true);
+    configs.getWorker().getDequeueMatchSettings().setAllowUnmatched(true);
+    SetMultimap<String, String> workerProvisions = HashMultimap.create();
+    LocalResourceSet resourceSet = new LocalResourceSet();
+    resourceSet.resources.put("FOO", new Semaphore(2));
+    resourceSet.resources.put("BAR", new Semaphore(4));
+
+    QueueEntry entry =
+        QueueEntry.newBuilder()
+            .setPlatform(
+                Platform.newBuilder()
+                    .addProperties(
+                        Platform.Property.newBuilder().setName("resource:FOO").setValue("1"))
+                    .addProperties(
+                        Platform.Property.newBuilder().setName("resource:BAR").setValue("2")))
+            .build();
+
+    // PRE-ASSERT
+    assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(2);
+    assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(4);
+
+    // ACT
+    boolean shouldKeep =
+        DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry);
+
+    // ASSERT
+    // the worker accepts because the resources are available.
+    assertThat(shouldKeep).isTrue();
+    assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(1);
+    assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(2);
+
+    // ACT
+    shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry);
+
+    // ASSERT
+    // the worker accepts because the resources are still available.
+    assertThat(shouldKeep).isTrue();
+    assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(0);
+    assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(0);
+
+    // ACT
+    shouldKeep = DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry);
+
+    // ASSERT
+    // the worker rejects because there are no resources left.
+    assertThat(shouldKeep).isFalse();
+    assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(0);
+    assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(0);
+  }
+
+  // Function under test: shouldKeepOperation
+  // Reason for testing: a failed claim on any local resource should leave the
+  // available amounts unchanged.
+  // Failure explanation: semaphore claim did not work as expected.
+  @Test
+  public void shouldKeepOperationFailsToClaimSameAmountRemains() throws Exception {
+    // ARRANGE
+    configs.getWorker().getDequeueMatchSettings().setAcceptEverything(true);
+    configs.getWorker().getDequeueMatchSettings().setAllowUnmatched(true);
+    SetMultimap<String, String> workerProvisions = HashMultimap.create();
+    LocalResourceSet resourceSet = new LocalResourceSet();
+    resourceSet.resources.put("FOO", new Semaphore(50));
+    resourceSet.resources.put("BAR", new Semaphore(100));
+    resourceSet.resources.put("BAZ", new Semaphore(200));
+
+    QueueEntry entry =
+        QueueEntry.newBuilder()
+            .setPlatform(
+                Platform.newBuilder()
+                    .addProperties(
+                        Platform.Property.newBuilder().setName("resource:FOO").setValue("20"))
+                    .addProperties(
+                        Platform.Property.newBuilder().setName("resource:BAR").setValue("101"))
+                    .addProperties(
+                        Platform.Property.newBuilder().setName("resource:BAZ").setValue("20")))
+            .build();
+
+    // PRE-ASSERT
+    assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(50);
+    assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(100);
+    assertThat(resourceSet.resources.get("BAZ").availablePermits()).isEqualTo(200);
+
+    // ACT
+    boolean shouldKeep =
+        DequeueMatchEvaluator.shouldKeepOperation(workerProvisions, resourceSet, entry);
+
+    // ASSERT
+    // the worker rejects because the BAR request exceeds the available permits,
+    // and the same amounts remain available afterward.
+    assertThat(shouldKeep).isFalse();
+    assertThat(resourceSet.resources.get("FOO").availablePermits()).isEqualTo(50);
+    assertThat(resourceSet.resources.get("BAR").availablePermits()).isEqualTo(100);
+    assertThat(resourceSet.resources.get("BAZ").availablePermits()).isEqualTo(200);
+  }
 }
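The three resource tests above pin down the claim semantics for `LocalResourceSet`: a dequeue claims every requested `resource:*` amount or none of them, and a failed claim must leave all permit counts untouched. The actual implementation lives in `DequeueMatchEvaluator` and is not shown in this patch; what follows is a minimal sketch of the implied all-or-nothing algorithm, with semantics inferred only from the assertions above:

```java
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Map;
import java.util.concurrent.Semaphore;

final class ResourceClaimSketch {
  /**
   * Try to claim every requested amount; on the first failure, release
   * everything already claimed so available permits end up unchanged.
   */
  static boolean claimAll(Map<String, Semaphore> resources, Map<String, Integer> requests) {
    Deque<Runnable> undo = new ArrayDeque<>();
    for (Map.Entry<String, Integer> request : requests.entrySet()) {
      Semaphore resource = resources.get(request.getKey());
      int amount = request.getValue();
      if (resource == null || !resource.tryAcquire(amount)) {
        undo.forEach(Runnable::run); // roll back partial claims
        return false;
      }
      undo.push(() -> resource.release(amount));
    }
    return true;
  }
}
```

This rollback is why `shouldKeepOperationFailsToClaimSameAmountRemains` can assert that `FOO` still has all 50 permits even though its individual claim of 20 would have succeeded.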
diff --git a/src/test/java/build/buildfarm/worker/PipelineTest.java b/src/test/java/build/buildfarm/worker/PipelineTest.java
index 713f1b0345..f414b8cefe 100644
--- a/src/test/java/build/buildfarm/worker/PipelineTest.java
+++ b/src/test/java/build/buildfarm/worker/PipelineTest.java
@@ -53,7 +53,7 @@ public void stageThreadReturnCompletesJoin() throws InterruptedException {
           public void run() {}
         },
         1);
-    pipeline.start(null);
+    pipeline.start();
     pipeline.join();
   }
 
@@ -68,7 +68,7 @@ public void run() {
           }
         },
         1);
-    pipeline.start(null);
+    pipeline.start();
     pipeline.join();
   }
 }
diff --git a/src/test/java/build/buildfarm/worker/StubWorkerContext.java b/src/test/java/build/buildfarm/worker/StubWorkerContext.java
index d288d67a6f..7ccf1182d3 100644
--- a/src/test/java/build/buildfarm/worker/StubWorkerContext.java
+++ b/src/test/java/build/buildfarm/worker/StubWorkerContext.java
@@ -222,4 +222,9 @@ public ResourceLimits commandExecutionSettings(Command command) {
   public boolean shouldErrorOperationOnRemainingResources() {
     throw new UnsupportedOperationException();
   }
+
+  @Override
+  public void returnLocalResources(QueueEntry queueEntry) {
+    throw new UnsupportedOperationException();
+  }
 }
diff --git a/src/test/java/build/buildfarm/worker/SuperscalarPipelineStageTest.java b/src/test/java/build/buildfarm/worker/SuperscalarPipelineStageTest.java
index 60cb0d8be9..3716513102 100644
--- a/src/test/java/build/buildfarm/worker/SuperscalarPipelineStageTest.java
+++ b/src/test/java/build/buildfarm/worker/SuperscalarPipelineStageTest.java
@@ -72,6 +72,11 @@ protected int claimsRequired(OperationContext operationContext) {
     boolean isFull() {
       return claims.size() == width;
     }
+
+    @Override
+    public int getSlotUsage() {
+      return 0;
+    }
   }
 
   @Test
diff --git a/src/test/java/build/buildfarm/worker/shard/BUILD b/src/test/java/build/buildfarm/worker/shard/BUILD
index b8a7b31ec6..c254e41fa4 100644
--- a/src/test/java/build/buildfarm/worker/shard/BUILD
+++ b/src/test/java/build/buildfarm/worker/shard/BUILD
@@ -11,6 +11,7 @@ java_test(
         "//src/main/java/build/buildfarm/common/config",
         "//src/main/java/build/buildfarm/instance",
         "//src/main/java/build/buildfarm/worker",
+        "//src/main/java/build/buildfarm/worker/resources",
         "//src/main/java/build/buildfarm/worker/shard",
         "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto",
         "//src/test/java/build/buildfarm:test_runner",
diff --git a/src/test/java/build/buildfarm/worker/shard/ShardWorkerContextTest.java b/src/test/java/build/buildfarm/worker/shard/ShardWorkerContextTest.java
index dd584c49af..c24e681abb 100644
--- a/src/test/java/build/buildfarm/worker/shard/ShardWorkerContextTest.java
+++ b/src/test/java/build/buildfarm/worker/shard/ShardWorkerContextTest.java
@@ -35,6 +35,7 @@
 import build.buildfarm.instance.MatchListener;
 import build.buildfarm.v1test.QueueEntry;
 import build.buildfarm.worker.WorkerContext;
+import build.buildfarm.worker.resources.LocalResourceSet;
 import com.google.common.collect.ImmutableList;
 import com.google.protobuf.Duration;
 import java.util.ArrayList;
@@ -105,6 +106,7 @@ WorkerContext createTestContext(Iterable policies) {
         /* onlyMulticoreTests=*/ false,
         /* allowBringYourOwnContainer=*/ false,
         /* errorOperationRemainingResources=*/ false,
+        /* resourceSet=*/ new LocalResourceSet(),
         writer);
   }
diff --git a/src/test/java/build/buildfarm/worker/shard/ShardWorkerInstanceTest.java b/src/test/java/build/buildfarm/worker/shard/WorkerInstanceTest.java
similarity index 97%
rename from src/test/java/build/buildfarm/worker/shard/ShardWorkerInstanceTest.java
rename to src/test/java/build/buildfarm/worker/shard/WorkerInstanceTest.java
index 7a4e40be26..3df73187bb 100644
--- a/src/test/java/build/buildfarm/worker/shard/ShardWorkerInstanceTest.java
+++ b/src/test/java/build/buildfarm/worker/shard/WorkerInstanceTest.java
@@ -50,19 +50,19 @@
 import org.mockito.MockitoAnnotations;
 
 @RunWith(JUnit4.class)
-public class ShardWorkerInstanceTest {
+public class WorkerInstanceTest {
   private final DigestUtil DIGEST_UTIL = new DigestUtil(HashFunction.SHA256);
 
   @Mock private Backplane backplane;
 
   @Mock private ContentAddressableStorage storage;
 
-  private ShardWorkerInstance instance;
+  private WorkerInstance instance;
 
   @Before
   public void setUp() throws Exception {
     MockitoAnnotations.initMocks(this);
-    instance = new ShardWorkerInstance("test", DIGEST_UTIL, backplane, storage);
+    instance = new WorkerInstance("test", DIGEST_UTIL, backplane, storage);
   }
 
   @SuppressWarnings("unchecked")
diff --git a/third_party/docker_go_toolchain.patch b/third_party/docker_go_toolchain.patch
new file mode 100644
index 0000000000..3b00ff333c
--- /dev/null
+++ b/third_party/docker_go_toolchain.patch
@@ -0,0 +1,11 @@
+--- repositories/go_repositories.bzl.orig	2023-09-23 08:36:00.148468653 -0400
++++ repositories/go_repositories.bzl	2023-09-23 08:33:22.502127476 -0400
+@@ -37,7 +37,7 @@
+         go_repository_default_config (str, optional): A file used to determine the root of the workspace.
+     """
+     go_rules_dependencies()
+-    go_register_toolchains()
++    go_register_toolchains("1.21.0")
+     gazelle_dependencies(go_repository_default_config = go_repository_default_config)
+     excludes = native.existing_rules().keys()
+     if "com_github_google_go_containerregistry" not in excludes:
diff --git a/tools/buildfarm-indexer.py b/tools/buildfarm-indexer.py
index 824763dd21..b020cb91d5 100755
--- a/tools/buildfarm-indexer.py
+++ b/tools/buildfarm-indexer.py
@@ -1,5 +1,5 @@
 from redis.client import Pipeline
-from rediscluster import StrictRedisCluster
+from rediscluster import RedisCluster
 import sys
 
 def get_cas_page(r, cursor, count):
@@ -15,7 +15,7 @@ def get_cas_page(r, cursor, count):
     print ("usage: buildfarm-indexer.py <redis-host>")
     sys.exit(1)
 
-r = StrictRedisCluster(startup_nodes=[{"host": redis_host, "port": 6379}], skip_full_coverage_check=True)
+r = RedisCluster(startup_nodes=[{"host": redis_host, "port": 6379}], skip_full_coverage_check=True)
 
 nodes = r.connection_pool.nodes
 
@@ -30,14 +30,15 @@ def get_cas_page(r, cursor, count):
     slots.remove(slot)
     node_keys[slot] = str(node_key)
 
-workers = r.hkeys("Workers")
+# config f"{backplane.workersHashName}_storage"
+workers = r.hkeys("Workers_storage")
 
 worker_count = len(workers)
 
 print ("%d workers" % worker_count)
 
 p = r.pipeline()
-for node_key in node_keys.viewvalues():
+for node_key in node_keys.values():
     p.delete("{%s}:intersecting-workers" % node_key)
     p.sadd("{%s}:intersecting-workers" % node_key, *workers)
 p.execute()
@@ -101,8 +102,9 @@ def process(self, cas_names, conn):
         count = len(cas_names)
         p = self.pipeline(conn)
         for i in range(count):
-            name = cas_names[i]
-            node_key = node_keys[nodes.keyslot(str(name))]
+            name = cas_names[i].decode()
+            keyslot = nodes.keyslot(name)
+            node_key = node_keys[keyslot]
             set_key = "{%s}:intersecting-workers" % node_key
             p.sinterstore(name, set_key, name)
         p.execute()
@@ -116,8 +118,8 @@ def process(self, cas_names, conn):
 map_cas_page(r, 10000, indexer.process)
 
 p = r.pipeline()
-for node_key in node_keys.viewvalues():
+for node_key in node_keys.values():
     p.delete("{%s}:intersecting-workers" % node_key)
 p.execute()
 
-print("\n%d processed" % (indexer.processed))
\ No newline at end of file
+print("\n%d processed" % (indexer.processed))