diff --git a/_site/docs/architecture/content_addressable_storage.md b/_site/docs/architecture/content_addressable_storage.md index a8955b4ce3..41b50d9948 100644 --- a/_site/docs/architecture/content_addressable_storage.md +++ b/_site/docs/architecture/content_addressable_storage.md @@ -38,9 +38,9 @@ This is the example presentation of a CAS in the memory instance available [here ``` worker: - cas: - type: MEMORY - maxSizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024 + storages: + - type: MEMORY + maxSizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024 ``` ## GRPC @@ -53,9 +53,11 @@ A grpc config example is available in the alternate instance specification in th server: name: shard worker: - cas: - type: GRPC - target: + storages: + - type: FILESYSTEM + path: "cache" + - type: GRPC + target: ``` ## HTTP/1 @@ -89,11 +91,10 @@ The CASFileCache is also available on MemoryInstance servers, where it can repre ``` worker: - cas: - type: FILESYSTEM - path: "cache" - maxSizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024 - maxEntrySizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024 + storages: + - type: FILESYSTEM + path: "cache" + maxSizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024 ``` CASTest is a standalone tool to load the cache and print status information about it. 
diff --git a/_site/docs/configuration/configuration.md b/_site/docs/configuration/configuration.md index c6df9b58d3..cbd286587f 100644 --- a/_site/docs/configuration/configuration.md +++ b/_site/docs/configuration/configuration.md @@ -238,26 +238,27 @@ backplane: ### Worker -| Configuration | Accepted and _Default_ Values | Environment Var | Description | -|----------------------------------|-------------------------------|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| port | Integer, _8981_ | | Listening port of the worker | -| publicName | String, _DERIVED:port_ | INSTANCE_NAME | Host:port of the GRPC server, required to be accessible by all servers | -| root | String, _/tmp/worker_ | | Path for all operation content storage | -| inlineContentLimit | Integer, _1048567_ | | Total size in bytes of inline content for action results, output files, stdout, stderr content | -| operationPollPeriod | Integer, _1_ | | Period between poll operations at any stage | -| executeStageWidth | Integer, _0_ | EXECUTION_STAGE_WIDTH | Number of CPU cores available for execution (0 = system available cores) | -| executeStageWidthOffset | Integer, _0_ | | Offset number of CPU cores available for execution (to allow for use by other processes) | -| inputFetchStageWidth | Integer, _0_ | | Number of concurrently available slots to fetch inputs (0 = system calculated based on CPU cores) | -| inputFetchDeadline | Integer, _60_ | | Limit on time (seconds) for input fetch stage to fetch inputs | -| linkInputDirectories | boolean, _true_ | | Use an input directory creation strategy which creates a single directory tree at the highest level containing no output paths of any kind, and symlinks that directory into an action's 
execroot, saving large amounts of time spent manufacturing the same read-only input hierirchy over multiple actions' executions | -| execOwner | String, _null_ | | Create exec trees containing directories that are owned by this user | -| hexBucketLevels | Integer, _0_ | | Number of levels to create for directory storage by leading byte of the hash (problematic, not recommended) | -| defaultMaxCores | Integer, _0_ | | Constrain all executions to this logical core count unless otherwise specified via min/max-cores (0 = no limit) | -| limitGlobalExecution | boolean, _false_ | | Constrain all executions to a pool of logical cores specified in executeStageWidth | -| onlyMulticoreTests | boolean, _false_ | | Only permit tests to exceed the default coresvalue for their min/max-cores range specification (only works with non-zero defaultMaxCores) | -| allowBringYourOwnContainer | boolean, _false_ | | Enable execution in a custom Docker container | -| errorOperationRemainingResources | boolean, _false_ | | | -| realInputDirectories | List of Strings, _external_ | | A list of paths that will not be subject to the effects of linkInputDirectories setting, may also be used to provide writable directories as input roots for actions which expect to be able to write to an input location and will fail if they cannot | +| Configuration | Accepted and _Default_ Values | Environment Var | Description | +|----------------------------------|-------------------------------|-----------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| port | Integer, _8981_ | | Listening port of the worker | +| publicName | String, _DERIVED:port_ | INSTANCE_NAME | Host:port of the GRPC server, required to be accessible by all 
servers | +| root | String, _/tmp/worker_ | | Path for all operation content storage | +| inlineContentLimit | Integer, _1048567_ | | Total size in bytes of inline content for action results, output files, stdout, stderr content | +| operationPollPeriod | Integer, _1_ | | Period between poll operations at any stage | +| executeStageWidth | Integer, _0_ | EXECUTION_STAGE_WIDTH | Number of CPU cores available for execution (0 = system available cores) | +| executeStageWidthOffset | Integer, _0_ | | Offset number of CPU cores available for execution (to allow for use by other processes) | +| inputFetchStageWidth | Integer, _0_ | | Number of concurrently available slots to fetch inputs (0 = system calculated based on CPU cores) | +| inputFetchDeadline | Integer, _60_ | | Limit on time (seconds) for input fetch stage to fetch inputs | +| linkInputDirectories | boolean, _true_ | | Use an input directory creation strategy which creates a single directory tree at the highest level containing no output paths of any kind, and symlinks that directory into an action's execroot, saving large amounts of time spent manufacturing the same read-only input hierirchy over multiple actions' executions | +| execOwner | String, _null_ | | Create exec trees containing directories that are owned by this user | +| hexBucketLevels | Integer, _0_ | | Number of levels to create for directory storage by leading byte of the hash (problematic, not recommended) | +| defaultMaxCores | Integer, _0_ | | Constrain all executions to this logical core count unless otherwise specified via min/max-cores (0 = no limit) | +| limitGlobalExecution | boolean, _false_ | | Constrain all executions to a pool of logical cores specified in executeStageWidth | +| onlyMulticoreTests | boolean, _false_ | | Only permit tests to exceed the default coresvalue for their min/max-cores range specification (only works with non-zero defaultMaxCores) | +| allowBringYourOwnContainer | boolean, _false_ | | Enable execution in a 
custom Docker container | +| errorOperationRemainingResources | boolean, _false_ | | | +| realInputDirectories | List of Strings, _external_ | | A list of paths that will not be subject to the effects of linkInputDirectories setting, may also be used to provide writable directories as input roots for actions which expect to be able to write to an input location and will fail if they cannot | +| gracefulShutdownSeconds | Integer, 0 | | Time in seconds to allow for operations in flight to finish when shutdown signal is received | ``` worker: @@ -332,20 +333,21 @@ Example: ``` worker: - cas: - type: FILESYSTEM - path: "cache" - maxSizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024 - maxEntrySizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024 - target: + storages: + - type: FILESYSTEM + path: "cache" + maxSizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024 + maxEntrySizeBytes: 2147483648 # 2 * 1024 * 1024 * 1024 + target: ``` ``` worker: - cas: - type: GRPC - instanceName: external-cas - target: "cas.external.com:1234" + storages: + - type: FILESYSTEM + path: "cache" + - type: GRPC + target: "cas.external.com:1234" ``` ### Execution Policies diff --git a/_site/docs/metrics/metrics.md b/_site/docs/metrics/metrics.md index 5aa96fe56f..6079b7e9b7 100644 --- a/_site/docs/metrics/metrics.md +++ b/_site/docs/metrics/metrics.md @@ -124,6 +124,10 @@ Gauge for the number of operations in each stage (using a stage_name for each in Gauge for the completed operations status (using a status_code label for each individual GRPC code) +**operation_exit_code** + +Gauge for the completed operations exit code (using a exit_code label for each individual execution exit code) + **operation_worker** Gauge for the number of operations executed on each worker (using a worker_name label for each individual worker) diff --git a/_site/docs/quick_start.md b/_site/docs/quick_start.md index 5383c31199..7af957ee63 100644 --- a/_site/docs/quick_start.md +++ b/_site/docs/quick_start.md @@ -10,7 +10,7 @@ Here we 
describe how to use bazel remote caching or remote execution with buildf ## Setup -You can run this quick start on a single computer running nearly any flavor of linux. This computer is the localhost for the rest of the description. +You can run this quick start on a single computer running any flavor of linux that bazel supports. A C++ compiler is used here to demonstrate action execution. This computer is the localhost for the rest of the description. ### Backplane @@ -44,7 +44,7 @@ cc_binary( And an empty WORKSPACE file. -As a test, verify that `bazel run :main` builds your main program and runs it, and prints `Hello, World!`. This will ensure that you have properly installed bazel and a C++ compiler, and have a working target before moving on to remote execution. +As a test, verify that `bazel run :main` builds your main program and runs it, and prints `Hello, World!`. This will ensure that you have properly installed `bazel` and a C++ compiler, and have a working target before moving on to remote caching or remote execution. Download and extract the buildfarm repository. Each command sequence below will have the intended working directory indicated, between the client (workspace running bazel), and buildfarm. @@ -52,25 +52,35 @@ This tutorial assumes that you have a bazel binary in your path and you are in t ## Remote Caching -A Buildfarm server with an instance can be used strictly as an ActionCache and ContentAddressableStorage to improve build performance. This is an example of running a bazel client that will retrieve results if available, and store them if the cache is missed and the execution needs to run locally. +A Buildfarm cluster can be used strictly as an ActionCache (AC) and ContentAddressableStorage (CAS) to improve build performance. This is an example of running a bazel client that will retrieve results if available, otherwise store them on a cache miss after executing locally. 
Download the buildfarm repository and change into its directory, then: -run `bazelisk run src/main/java/build/buildfarm:buildfarm-server $PWD/examples/config.minimal.yml` + * run `bazel run src/main/java/build/buildfarm:buildfarm-server $PWD/examples/config.minimal.yml` This will wait while the server runs, indicating that it is ready for requests. -From another prompt (i.e. a separate terminal) in your newly created workspace directory from above: +A server alone does not itself store the content of action results. It acts as an endpoint for any number of workers that present storage, so we must also start a single worker. -run `bazel clean` -run `bazel run --remote_cache=grpc://localhost:8980 :main` +From another prompt (i.e. a separate terminal) in the buildfarm repository directory: + + * run `bazel run src/main/java/build/buildfarm:buildfarm-shard-worker -- --prometheus_port=9091 $PWD/examples/config.minimal.yml` + +The `--` option is bazel convention to treat all subsequent arguments as parameters to the running app, like our `--prometheus_port`, instead of interpreting them with `run` +The `--prometheus_port=9091` option allows this worker to run alongside our server, who will have started and logged that it has started a service on port `9090`. You can also turn this option off (with `--` separator), with `--prometheus_option=0` for either server or worker. +This will also wait while the worker runs, indicating it will be available to store cache content. + +From another prompt in your newly created workspace directory from above: + + * run `bazel clean` + * run `bazel run --remote_cache=grpc://localhost:8980 :main` Why do we clean here? Since we're verifying re-execution and caching, this ensures that we will execute any actions in the `run` step and interact with the remote cache. 
We should be attempting to retrieve cached results, and then when we miss - since we just started this memory resident server - bazel will upload the results of the execution for later use. There will be no change in the output of this bazel run if everything worked, since bazel does not provide output each time it uploads results. To prove that we have placed something in the action cache, we need to do the following: -run `bazel clean` -run `bazel run --remote_cache=localhost:8980 :main` + * run `bazel clean` + * run `bazel run --remote_cache=localhost:8980 :main` This should now print statistics on the `processes` line that indicate that you've retrieved results from the cache for your actions: @@ -80,20 +90,22 @@ INFO: 2 processes: 2 remote cache hit. ## Remote Execution (and caching) -Now we will use buildfarm for remote execution with a minimal configuration - a single memory instance, with a worker on the localhost that can execute a single process at a time - via a bazel invocation on our workspace. +Now we will use buildfarm for remote execution with a minimal configuration with a worker on the localhost that can execute a single process at a time, via a bazel invocation on our workspace. -First, we should restart the buildfarm server to ensure that we get remote execution (this can also be forced from the client by using `--noremote_accept_cached`). From the buildfarm server prompt and directory: +First, to clean out the results from the previous cached actions, flush your local redis database: -interrupt a running `buildfarm-server` -run `bazelisk run src/main/java/build/buildfarm:buildfarm-server $PWD/examples/config.minimal.yml` + * run `redis-cli flushdb` -From another prompt in the buildfarm repository directory: +Next, we should restart the buildfarm server, and delete the worker's cas storage to ensure that we get remote execution (this can also be forced from the client by using `--noremote_accept_cached`). 
From the buildfarm server prompt and directory: -run `bazelisk run src/main/java/build/buildfarm:buildfarm-shard-worker $PWD/examples/config.minimal.yml` + * interrupt the running `buildfarm-server` (i.e. Ctrl-C) + * run `bazel run src/main/java/build/buildfarm:buildfarm-server $PWD/examples/config.minimal.yml` + +You can leave the worker running from the Remote Caching step, it will not require a restart From another prompt, in your client workspace: -run `bazel run --remote_executor=grpc://localhost:8980 :main` + * run `bazel run --remote_executor=grpc://localhost:8980 :main` Your build should now print out the following on its `processes` summary line: @@ -117,6 +129,10 @@ To stop the containers, run: ./examples/bf-run stop ``` +## Next Steps + +We've started our worker on the same host as our server, and also the same host on which we built with bazel, but these services can be spread across many machines, per 'remote'. A large number of workers, with a relatively small number of servers (10:1 and 100:1 ratios have been used in practice), consolidating large disks and beefy multicore cpus/gpus on workers, with specialization of what work they perform for bazel builds (or other client work), and specializing servers to have hefty network connections to funnel content traffic. A buildfarm deployment can service hundreds or thousands of developers or CI processes, enabling them to benefit from each others' shared context in the AC/CAS, and the pooled execution of a fleet of worker hosts eager to consume operations and deliver results. + ## Buildfarm Manager You can now easily launch a new Buildfarm cluster locally or in AWS using an open sourced [Buildfarm Manager](https://github.com/80degreeswest/bfmgr). 
diff --git a/defs.bzl b/defs.bzl index 08f9a5c561..4b78136506 100644 --- a/defs.bzl +++ b/defs.bzl @@ -96,7 +96,7 @@ def buildfarm_init(name = "buildfarm"): "com.google.errorprone:error_prone_annotations:2.9.0", "com.google.errorprone:error_prone_core:0.92", "com.google.guava:failureaccess:1.0.1", - "com.google.guava:guava:31.1-jre", + "com.google.guava:guava:32.1.1-jre", "com.google.j2objc:j2objc-annotations:1.1", "com.google.jimfs:jimfs:1.1", "com.google.protobuf:protobuf-java-util:3.10.0", @@ -139,8 +139,8 @@ def buildfarm_init(name = "buildfarm"): ], generate_compat_repositories = True, repositories = [ - "https://repo.maven.apache.org/maven2", - "https://jcenter.bintray.com", + "https://repo1.maven.org/maven2", + "https://mirrors.ibiblio.org/pub/mirrors/maven2", ], ) diff --git a/deps.bzl b/deps.bzl index 5f5f0073b2..c35228c2ec 100644 --- a/deps.bzl +++ b/deps.bzl @@ -13,10 +13,10 @@ def archive_dependencies(third_party): { "name": "platforms", "urls": [ - "https://mirror.bazel.build/github.com/bazelbuild/platforms/releases/download/0.0.6/platforms-0.0.6.tar.gz", - "https://github.com/bazelbuild/platforms/releases/download/0.0.6/platforms-0.0.6.tar.gz", + "https://mirror.bazel.build/github.com/bazelbuild/platforms/releases/download/0.0.7/platforms-0.0.7.tar.gz", + "https://github.com/bazelbuild/platforms/releases/download/0.0.7/platforms-0.0.7.tar.gz", ], - "sha256": "5308fc1d8865406a49427ba24a9ab53087f17f5266a7aabbfc28823f3916e1ca", + "sha256": "3a561c99e7bdbe9173aa653fd579fe849f1d8d67395780ab4770b1f381431d51", }, { "name": "rules_jvm_external", @@ -111,10 +111,29 @@ def archive_dependencies(third_party): "patch_args": ["-p1"], "patches": ["%s:clang_toolchain.patch" % third_party], }, + + # Used to build release container images { "name": "io_bazel_rules_docker", "sha256": "b1e80761a8a8243d03ebca8845e9cc1ba6c82ce7c5179ce2b295cd36f7e394bf", "urls": ["https://github.com/bazelbuild/rules_docker/releases/download/v0.25.0/rules_docker-v0.25.0.tar.gz"], + 
"patch_args": ["-p0"], + "patches": ["%s:docker_go_toolchain.patch" % third_party], + }, + + # Updated versions of io_bazel_rules_docker dependencies for bazel compatibility + { + "name": "io_bazel_rules_go", + "sha256": "278b7ff5a826f3dc10f04feaf0b70d48b68748ccd512d7f98bf442077f043fe3", + "urls": [ + "https://mirror.bazel.build/github.com/bazelbuild/rules_go/releases/download/v0.41.0/rules_go-v0.41.0.zip", + "https://github.com/bazelbuild/rules_go/releases/download/v0.41.0/rules_go-v0.41.0.zip", + ], + }, + { + "name": "bazel_gazelle", + "sha256": "d3fa66a39028e97d76f9e2db8f1b0c11c099e8e01bf363a923074784e451f809", + "urls": ["https://github.com/bazelbuild/bazel-gazelle/releases/download/v0.33.0/bazel-gazelle-v0.33.0.tar.gz"], }, # Bazel is referenced as a dependency so that buildfarm can access the linux-sandbox as a potential execution wrapper. @@ -188,9 +207,9 @@ def buildfarm_dependencies(repository_name = "build_buildfarm"): maybe( http_jar, "opentelemetry", - sha256 = "0523287984978c091be0d22a5c61f0bce8267eeafbbae58c98abaf99c9396832", + sha256 = "eccd069da36031667e5698705a6838d173d527a5affce6cc514a14da9dbf57d7", urls = [ - "https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.11.0/opentelemetry-javaagent.jar", + "https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.28.0/opentelemetry-javaagent.jar", ], ) diff --git a/examples/config.yml b/examples/config.yml index 3c435fccc7..dd165a4d46 100644 --- a/examples/config.yml +++ b/examples/config.yml @@ -38,7 +38,6 @@ server: admin: deploymentEnvironment: AWS clusterEndpoint: "grpc://localhost" - enableGracefulShutdown: false metrics: publisher: LOG logLevel: FINEST @@ -126,6 +125,7 @@ worker: onlyMulticoreTests: false allowBringYourOwnContainer: false errorOperationRemainingResources: false + gracefulShutdownSeconds: 0 sandboxSettings: alwaysUse: false selectForBlockNetwork: false diff --git a/jvm_flags.bzl b/jvm_flags.bzl index 
363f161465..4f628423df 100644 --- a/jvm_flags.bzl +++ b/jvm_flags.bzl @@ -46,7 +46,7 @@ RECOMMENDED_JVM_FLAGS = [ "-XX:+HeapDumpOnOutOfMemoryError", ] -DEFAULT_LOGGING_CONFIG = ["-Dlogging.config=file:/app/build_buildfarm/src/main/java/build/buildfarm/logging.properties"] +DEFAULT_LOGGING_CONFIG = ["-Dlogging.config=file:/etc/bazel-re/logging.properties"] def ensure_accurate_metadata(): return select({ diff --git a/kubernetes/helm-charts/buildfarm/.gitignore b/kubernetes/helm-charts/buildfarm/.gitignore new file mode 100644 index 0000000000..8d8946152c --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/.gitignore @@ -0,0 +1,2 @@ +charts +Chart.lock diff --git a/kubernetes/helm-charts/buildfarm/.helmignore b/kubernetes/helm-charts/buildfarm/.helmignore new file mode 100644 index 0000000000..0e8a0eb36f --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/kubernetes/helm-charts/buildfarm/Chart.yaml b/kubernetes/helm-charts/buildfarm/Chart.yaml new file mode 100644 index 0000000000..ce443957dd --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/Chart.yaml @@ -0,0 +1,30 @@ +apiVersion: v2 +name: buildfarm +description: A Helm chart for bazel buildfarm + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. 
Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "v2.5.0" + +dependencies: + - condition: redis.enabled + name: redis + repository: https://charts.helm.sh/stable + version: 10.5.7 \ No newline at end of file diff --git a/kubernetes/helm-charts/buildfarm/templates/NOTES.txt b/kubernetes/helm-charts/buildfarm/templates/NOTES.txt new file mode 100644 index 0000000000..92421375fb --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/NOTES.txt @@ -0,0 +1,22 @@ +1. Get the application URL by running these commands: +{{- if .Values.server.ingress.enabled }} +{{- range $host := .Values.server.ingress.hosts }} + {{- range .paths }} + http{{ if $.Values.server.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} + {{- end }} +{{- end }} +{{- else if contains "NodePort" .Values.server.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "buildfarm.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.server.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. 
+ You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "buildfarm.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "buildfarm.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.server.service.port }} +{{- else if contains "ClusterIP" .Values.server.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "buildfarm.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT +{{- end }} diff --git a/kubernetes/helm-charts/buildfarm/templates/_helpers.tpl b/kubernetes/helm-charts/buildfarm/templates/_helpers.tpl new file mode 100644 index 0000000000..dffd8587bd --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/_helpers.tpl @@ -0,0 +1,73 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "buildfarm.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "buildfarm.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "buildfarm.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "buildfarm.labels" -}} +helm.sh/chart: {{ include "buildfarm.chart" . }} +{{ include "buildfarm.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "buildfarm.selectorLabels" -}} +app.kubernetes.io/name: {{ include "buildfarm.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "buildfarm.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "buildfarm.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} + + +{{/* Checks for `externalRedis` */}} +{{- if .Values.externalRedis.host }} + {{/* check if they are using externalRedis (the default value for `externalRedis.host` is "localhost") */}} + {{- if not (eq .Values.externalRedis.host "localhost") }} + {{- if .Values.redis.enabled }} + {{ required "If `externalRedis.host` is set, then `redis.enabled` should be `false`!" 
nil }} + {{- end }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/kubernetes/helm-charts/buildfarm/templates/configmap.yaml b/kubernetes/helm-charts/buildfarm/templates/configmap.yaml new file mode 100644 index 0000000000..809d83e049 --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/configmap.yaml @@ -0,0 +1,28 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "buildfarm.fullname" . }}-config +data: + config.yml: |- + {{- range $key, $value := .Values.config }} + {{- if kindIs "map" $value }} + {{- else }} + {{ $key }}: {{ $value }}{{- end }} + {{- end }} + backplane: + {{- if .Values.redis.enabled }} + redisUri: "{{ .Values.redis.scheme }}://{{ printf "%s-redis-master.%s" (include "redis.fullname" .) (.Release.Namespace) }}:{{ "6379" }}" + {{- else }} + redisUri: "{{ .Values.externalRedis.uri }}" + {{- end }} + {{- with .Values.config.backplane }} + {{- toYaml . | nindent 6 }} + {{- end }} + {{- with .Values.config.server }} + server: + {{- toYaml . | nindent 6 }} + {{- end }} + {{- with .Values.config.worker }} + worker: + {{- toYaml . | nindent 6 }} + {{- end }} diff --git a/kubernetes/helm-charts/buildfarm/templates/server/deployment.yaml b/kubernetes/helm-charts/buildfarm/templates/server/deployment.yaml new file mode 100644 index 0000000000..e56261f1fd --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/server/deployment.yaml @@ -0,0 +1,78 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "buildfarm.fullname" . }}-server + labels: + name: {{ include "buildfarm.fullname" . }}-server + {{- include "buildfarm.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.server.replicaCount }} + selector: + matchLabels: + name: {{ include "buildfarm.fullname" . }}-server + {{- include "buildfarm.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + checksum/server-config: {{ include (print $.Template.BasePath "/configmap.yaml") . 
| sha256sum }} + {{- with .Values.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + name: {{ include "buildfarm.fullname" . }}-server + {{- include "buildfarm.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "buildfarm.serviceAccountName" . }} + containers: + - name: buildfarm-server + image: "{{ .Values.server.image.repository }}:{{ .Values.server.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.server.image.pullPolicy }} + command: + - bash + - /app/build_buildfarm/buildfarm-server.binary + args: + - /config/config.yml + env: + {{- if .Values.server.extraEnv }} + {{- toYaml .Values.server.extraEnv | nindent 12 }} + {{- end }} + ports: + - containerPort: 8980 + name: "server-comm" + - containerPort: 9090 + name: "metrics" + livenessProbe: + httpGet: + path: / + port: metrics + readinessProbe: + httpGet: + path: / + port: metrics + resources: + {{- toYaml .Values.server.resources | nindent 12 }} + volumeMounts: + - mountPath: /config + name: config + readOnly: true + {{- with .Values.server.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.server.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.server.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + volumes: + - configMap: + defaultMode: 420 + name: {{ include "buildfarm.fullname" . }}-config + name: config diff --git a/kubernetes/helm-charts/buildfarm/templates/server/service.yaml b/kubernetes/helm-charts/buildfarm/templates/server/service.yaml new file mode 100644 index 0000000000..6079f92dc0 --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/server/service.yaml @@ -0,0 +1,25 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "buildfarm.fullname" . }}-server + labels: + name: {{ include "buildfarm.fullname" . 
}}-server + {{- include "buildfarm.labels" . | nindent 4 }} + {{- with .Values.server.service.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + type: {{ .Values.server.service.type }} + ports: + - port: {{ .Values.server.service.port }} + targetPort: server-comm + protocol: TCP + name: grpc + - port: 9090 + targetPort: metrics + protocol: TCP + name: metrics + selector: + name: {{ include "buildfarm.fullname" . }}-server + {{- include "buildfarm.selectorLabels" . | nindent 4 }} diff --git a/kubernetes/helm-charts/buildfarm/templates/server/servicemonitor.yaml b/kubernetes/helm-charts/buildfarm/templates/server/servicemonitor.yaml new file mode 100644 index 0000000000..fe8a12b649 --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/server/servicemonitor.yaml @@ -0,0 +1,37 @@ +{{- if .Values.server.serviceMonitor.enabled }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "buildfarm.fullname" . }}-server + labels: + {{- include "buildfarm.labels" . | nindent 4 }} +spec: + endpoints: + - port: "metrics" + {{- with .Values.server.serviceMonitor.interval }} + interval: {{ . }} + {{- end }} + {{- with .Values.server.serviceMonitor.scrapeTimeout }} + scrapeTimeout: {{ . }} + {{- end }} + honorLabels: true + path: {{ .Values.server.serviceMonitor.path }} + scheme: {{ .Values.server.serviceMonitor.scheme }} + {{- with .Values.server.serviceMonitor.relabelings }} + relabelings: + {{- toYaml . | nindent 6 }} + {{- end }} + jobLabel: "{{ .Release.Name }}" + selector: + matchLabels: + name: {{ include "buildfarm.fullname" . }}-server + {{- include "buildfarm.labels" . | nindent 6 }} + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + {{- with .Values.server.serviceMonitor.targetLabels }} + targetLabels: + {{- toYaml . 
| nindent 4 }} + {{- end }} +{{- end }} diff --git a/kubernetes/helm-charts/buildfarm/templates/serviceaccount.yaml b/kubernetes/helm-charts/buildfarm/templates/serviceaccount.yaml new file mode 100644 index 0000000000..f28779e3e4 --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/serviceaccount.yaml @@ -0,0 +1,12 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "buildfarm.serviceAccountName" . }} + labels: + {{- include "buildfarm.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/kubernetes/helm-charts/buildfarm/templates/shard-worker/autoscaler.yaml b/kubernetes/helm-charts/buildfarm/templates/shard-worker/autoscaler.yaml new file mode 100644 index 0000000000..4389c793b6 --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/shard-worker/autoscaler.yaml @@ -0,0 +1,21 @@ +{{- if .Values.shardWorker.autoscaling.enabled -}} +apiVersion: autoscaling/v1 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "buildfarm.fullname" . }}-shard-worker + labels: + name: {{ include "buildfarm.fullname" . }}-shard-worker + {{- include "buildfarm.labels" . | nindent 4 }} + {{- with .Values.shardWorker.service.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + maxReplicas: {{ .Values.shardWorker.autoscaling.maxReplicas }} + minReplicas: {{ .Values.shardWorker.autoscaling.minReplicas }} + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "buildfarm.fullname" . 
}}-shard-worker + targetCPUUtilizationPercentage: {{ .Values.shardWorker.autoscaling.targetCPUUtilizationPercentage }} +{{- end }} diff --git a/kubernetes/helm-charts/buildfarm/templates/shard-worker/service.yaml b/kubernetes/helm-charts/buildfarm/templates/shard-worker/service.yaml new file mode 100644 index 0000000000..135756bd5f --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/shard-worker/service.yaml @@ -0,0 +1,25 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "buildfarm.fullname" . }}-shard-worker + labels: + name: {{ include "buildfarm.fullname" . }}-shard-worker + {{- include "buildfarm.labels" . | nindent 4 }} + {{- with .Values.shardWorker.service.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + type: {{ .Values.shardWorker.service.type }} + ports: + - port: {{ .Values.shardWorker.service.port }} + targetPort: worker-comm + protocol: TCP + name: grpc + - port: 9090 + targetPort: metrics + protocol: TCP + name: metrics + selector: + name: {{ include "buildfarm.fullname" . }}-shard-worker + {{- include "buildfarm.selectorLabels" . | nindent 4 }} diff --git a/kubernetes/helm-charts/buildfarm/templates/shard-worker/servicemonitor.yaml b/kubernetes/helm-charts/buildfarm/templates/shard-worker/servicemonitor.yaml new file mode 100644 index 0000000000..8ff1a59a56 --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/shard-worker/servicemonitor.yaml @@ -0,0 +1,37 @@ +{{- if .Values.shardWorker.serviceMonitor.enabled }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "buildfarm.fullname" . }}-shard-worker + labels: + {{- include "buildfarm.labels" . | nindent 4 }} +spec: + endpoints: + - port: "metrics" + {{- with .Values.shardWorker.serviceMonitor.interval }} + interval: {{ . }} + {{- end }} + {{- with .Values.shardWorker.serviceMonitor.scrapeTimeout }} + scrapeTimeout: {{ . 
}} + {{- end }} + honorLabels: true + path: {{ .Values.shardWorker.serviceMonitor.path }} + scheme: {{ .Values.shardWorker.serviceMonitor.scheme }} + {{- with .Values.shardWorker.serviceMonitor.relabelings }} + relabelings: + {{- toYaml . | nindent 6 }} + {{- end }} + jobLabel: "{{ .Release.Name }}" + selector: + matchLabels: + name: {{ include "buildfarm.fullname" . }}-shard-worker + {{- include "buildfarm.labels" . | nindent 6 }} + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + {{- with .Values.shardWorker.serviceMonitor.targetLabels }} + targetLabels: + {{- toYaml . | nindent 4 }} + {{- end }} + {{- end }} diff --git a/kubernetes/helm-charts/buildfarm/templates/shard-worker/statefulsets.yaml b/kubernetes/helm-charts/buildfarm/templates/shard-worker/statefulsets.yaml new file mode 100644 index 0000000000..62706a61ac --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/shard-worker/statefulsets.yaml @@ -0,0 +1,110 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: {{ include "buildfarm.fullname" . }}-shard-worker + labels: + name: {{ include "buildfarm.fullname" . }}-shard-worker + {{- include "buildfarm.labels" . | nindent 4 }} +spec: + serviceName: {{ include "buildfarm.fullname" . }}-shard-worker + {{- if .Values.shardWorker.autoscaling.enabled }} + replicas: {{ .Values.shardWorker.autoscaling.minReplicas }} + {{- else }} + replicas: {{ .Values.shardWorker.replicaCount }} + {{- end }} + selector: + matchLabels: + name: {{ include "buildfarm.fullname" . }}-shard-worker + {{- include "buildfarm.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + checksum/worker-config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} + {{- with .Values.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + name: {{ include "buildfarm.fullname" . }}-shard-worker + {{- include "buildfarm.selectorLabels" . 
| nindent 8 }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "buildfarm.serviceAccountName" . }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: buildfarm-worker + image: "{{ .Values.shardWorker.image.repository }}:{{ .Values.shardWorker.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.shardWorker.image.pullPolicy }} + args: + - /config/config.yml + - --public_name=$(POD_IP):8982 + command: + - bash + - /app/build_buildfarm/buildfarm-shard-worker.binary + env: + - name: POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + {{- if .Values.shardWorker.extraEnv }} + {{- toYaml .Values.shardWorker.extraEnv | nindent 12 }} + {{- end }} + ports: + - containerPort: 8982 + name: "worker-comm" + - containerPort: 9090 + name: "metrics" + livenessProbe: + httpGet: + path: / + port: metrics + readinessProbe: + httpGet: + path: / + port: metrics + resources: + {{- toYaml .Values.shardWorker.resources | nindent 12 }} + volumeMounts: + - mountPath: /config + name: config + readOnly: true + - mountPath: /tmp/worker + name: {{ include "buildfarm.fullname" . }}-shard-worker-data + {{- with .Values.extraVolumeMounts }} + {{- tpl (toYaml .) $ | nindent 12 -}} + {{- end }} + {{- with .Values.shardWorker.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.shardWorker.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.shardWorker.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + volumes: + - configMap: + defaultMode: 420 + name: {{ include "buildfarm.fullname" . }}-config + name: config + {{- with .Values.shardWorker.extraVolumes }} + {{- tpl (toYaml .) $ | nindent 8 }} + {{- end }} + volumeClaimTemplates: + - metadata: + name: {{ include "buildfarm.fullname" . 
}}-shard-worker-data + spec: + accessModes: ["ReadWriteOnce"] + {{- with .Values.shardWorker.storage.class }} + storageClassName: "{{ . }}" + {{- end }} + resources: + requests: + storage: "{{ .Values.shardWorker.storage.size }}" diff --git a/kubernetes/helm-charts/buildfarm/templates/tests/test-connection.yaml b/kubernetes/helm-charts/buildfarm/templates/tests/test-connection.yaml new file mode 100644 index 0000000000..7aea6f1cfe --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/templates/tests/test-connection.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "buildfarm.fullname" . }}-test-connection" + labels: + {{- include "buildfarm.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: curl + image: appropriate/curl:latest + command: ['curl'] + args: ['--output', '/dev/null', '{{ include "buildfarm.fullname" . }}-server:{{ .Values.server.service.port }}'] + restartPolicy: Never diff --git a/kubernetes/helm-charts/buildfarm/values.yaml b/kubernetes/helm-charts/buildfarm/values.yaml new file mode 100644 index 0000000000..3bb2994b7a --- /dev/null +++ b/kubernetes/helm-charts/buildfarm/values.yaml @@ -0,0 +1,206 @@ +# Default values for buildfarm. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. 
+nameOverride: "" +fullnameOverride: "" + +imagePullSecrets: [] + +config: + # see: https://github.com/bazelbuild/bazel-buildfarm/blob/main/examples/config.yml + digestFunction: SHA256 + defaultActionTimeout: 600 + maximumActionTimeout: 3600 + maxEntrySizeBytes: "2147483648" # 2 * 1024 * 1024 * 1024 + prometheusPort: 9090 + backplane: + queues: + - name: "cpu" + allowUnmatched: true + properties: + - name: "min-cores" + value: "*" + - name: "max-cores" + value: "*" + server: + name: "shard" + recordBesEvents: true + worker: + port: 8982 + publicName: "localhost:8982" + executeStageWidth: 80 + inputFetchStageWidth: 8 + realInputDirectories: + - "external" + root: "/tmp/worker" + storages: + - type: FILESYSTEM + path: "cache" + maxSizeBytes: 536870912000 # 500 * 1024 * 1024 * 1024 + +server: + image: + repository: bazelbuild/buildfarm-server + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "" + replicaCount: 1 + resources: { } + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
+ # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + service: + type: ClusterIP + port: 8980 + + ingress: + enabled: false + className: "" + annotations: { } + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + hosts: + - host: chart-example.local + paths: + - path: / + pathType: ImplementationSpecific + tls: [ ] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + + nodeSelector: {} + tolerations: [] + affinity: {} + extraVolumes: [] + # - name: additionalSecret + # secret: + # secretName: my-secret + # defaultMode: 0600 + + extraVolumeMounts: [] + # - name: customConfig + # mountPath: /mnt/config + # readOnly: true + extraEnv: + - name: JAVABIN + value: "/usr/bin/java" + + serviceMonitor: + ## If true, a ServiceMonitor CRD is created for a prometheus operator + ## https://github.com/coreos/prometheus-operator + ## + enabled: false + path: /metrics + # namespace: monitoring (defaults to use the namespace this chart is deployed to) + labels: {} + interval: 1m + scheme: http + tlsConfig: {} + scrapeTimeout: 30s + relabelings: [] + targetLabels: [] + +shardWorker: + image: + repository: bazelbuild/buildfarm-worker + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "" + replicaCount: 2 + autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 4 + targetCPUUtilizationPercentage: 50 + + resources: { } + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
+ # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + storage: + # the storage class for pv, leave empty will using default + class: "" + size: 50Gi + + service: + type: ClusterIP + port: 8982 + + nodeSelector: {} + tolerations: [] + affinity: {} + extraVolumes: [] + # - name: additionalSecret + # secret: + # secretName: my-secret + # defaultMode: 0600 + + extraVolumeMounts: [] + # - name: customConfig + # mountPath: /mnt/config + # readOnly: true + extraEnv: + - name: JAVABIN + value: "/usr/bin/java" + serviceMonitor: + ## If true, a ServiceMonitor CRD is created for a prometheus operator + ## https://github.com/coreos/prometheus-operator + ## + enabled: false + path: /metrics + # namespace: monitoring (defaults to use the namespace this chart is deployed to) + labels: {} + interval: 1m + scheme: http + tlsConfig: {} + scrapeTimeout: 30s + relabelings: [] + targetLabels: [] + +################################### +## DATABASE | Embedded Redis +################################### +redis: + ## - set to `false` if using `externalRedis.*` + ## + enabled: true + scheme: "redis" + ## See more redis configs: https://github.com/bitnami/charts/blob/main/bitnami/redis/README.md + usePassword: false + ## configs for redis cluster mode + ## + cluster: + ## if redis runs in cluster mode + ## + enabled: false + + ## the number of redis slaves + ## + slaveCount: 1 + +externalRedis: + uri: "redis://localhost:6379" + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. 
+ # If not set and create is true, a name is generated using the fullname template + name: "" diff --git a/src/main/java/build/buildfarm/admin/aws/AwsAdmin.java b/src/main/java/build/buildfarm/admin/aws/AwsAdmin.java index 5b971405cd..f40dd72c95 100644 --- a/src/main/java/build/buildfarm/admin/aws/AwsAdmin.java +++ b/src/main/java/build/buildfarm/admin/aws/AwsAdmin.java @@ -14,6 +14,8 @@ package build.buildfarm.admin.aws; +import static build.buildfarm.common.grpc.Channels.createChannel; + import build.buildfarm.admin.Admin; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.v1test.AdminGrpc; @@ -41,8 +43,6 @@ import com.amazonaws.services.simplesystemsmanagement.model.SendCommandRequest; import com.google.protobuf.util.Timestamps; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; import java.util.ArrayList; import java.util.Calendar; import java.util.Collections; @@ -127,7 +127,7 @@ public GetHostsResult getHosts(String filter, int ageInMinutes, String status) { } resultBuilder.addAllHosts(hosts); resultBuilder.setNumHosts(hosts.size()); - log.log(Level.FINE, String.format("Got %d hosts for filter: %s", hosts.size(), filter)); + log.log(Level.FINER, String.format("Got %d hosts for filter: %s", hosts.size(), filter)); return resultBuilder.build(); } @@ -206,9 +206,7 @@ public void disableHostScaleInProtection(String privateDnsName) { public void disableHostScaleInProtection(String clusterEndpoint, String instanceIp) { ManagedChannel channel = null; try { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(clusterEndpoint).negotiationType(NegotiationType.PLAINTEXT); - channel = builder.build(); + channel = createChannel(clusterEndpoint); AdminGrpc.AdminBlockingStub adminBlockingStub = AdminGrpc.newBlockingStub(channel); adminBlockingStub.disableScaleInProtection( DisableScaleInProtectionRequest.newBuilder().setInstanceName(instanceIp).build()); diff --git 
a/src/main/java/build/buildfarm/admin/aws/BUILD b/src/main/java/build/buildfarm/admin/aws/BUILD index ea544e0b68..a49e431bfd 100644 --- a/src/main/java/build/buildfarm/admin/aws/BUILD +++ b/src/main/java/build/buildfarm/admin/aws/BUILD @@ -6,6 +6,7 @@ java_library( deps = [ "//src/main/java/build/buildfarm/admin", "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_grpc", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", "@googleapis//:google_rpc_code_java_proto", @@ -19,7 +20,6 @@ java_library( "@maven//:com_google_guava_guava", "@maven//:com_google_protobuf_protobuf_java_util", "@maven//:io_grpc_grpc_api", - "@maven//:io_grpc_grpc_netty", "@maven//:org_projectlombok_lombok", "@maven//:org_springframework_spring_beans", "@maven//:org_springframework_spring_context", diff --git a/src/main/java/build/buildfarm/cas/BUILD b/src/main/java/build/buildfarm/cas/BUILD index 301e922e13..146206caee 100644 --- a/src/main/java/build/buildfarm/cas/BUILD +++ b/src/main/java/build/buildfarm/cas/BUILD @@ -25,7 +25,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@maven//:io_prometheus_simpleclient", diff --git a/src/main/java/build/buildfarm/cas/ContentAddressableStorages.java b/src/main/java/build/buildfarm/cas/ContentAddressableStorages.java index ade381ff50..e1df38fae4 100644 --- a/src/main/java/build/buildfarm/cas/ContentAddressableStorages.java +++ b/src/main/java/build/buildfarm/cas/ContentAddressableStorages.java @@ -14,6 +14,7 @@ package build.buildfarm.cas; +import static build.buildfarm.common.grpc.Channels.createChannel; import static build.buildfarm.common.grpc.Retrier.NO_RETRIES; import static com.google.common.collect.Multimaps.synchronizedListMultimap; import static 
com.google.common.util.concurrent.MoreExecutors.directExecutor; @@ -29,8 +30,6 @@ import com.google.common.collect.ListMultimap; import com.google.common.collect.MultimapBuilder; import io.grpc.Channel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; import java.io.IOException; import java.io.InputStream; import java.nio.file.NoSuchFileException; @@ -40,12 +39,6 @@ public final class ContentAddressableStorages { private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); - private static Channel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - public static ContentAddressableStorage createGrpcCAS(Cas cas) { Channel channel = createChannel(cas.getTarget()); ByteStreamUploader byteStreamUploader = diff --git a/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java b/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java index 6cd8c572c5..4a9aac68a8 100644 --- a/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java +++ b/src/main/java/build/buildfarm/cas/cfc/CASFileCache.java @@ -170,7 +170,7 @@ public abstract class CASFileCache implements ContentAddressableStorage { private final Consumer> onExpire; private final Executor accessRecorder; private final ExecutorService expireService; - private Thread prometheusMetricsThread; // TODO make this final, stop on shutdown + private Thread prometheusMetricsThread; private final Map directoryStorage = Maps.newConcurrentMap(); private final DirectoriesIndex directoriesIndex; @@ -552,7 +552,7 @@ private InputStream compressorInputStream(Compressor.Value compressor, InputStre @SuppressWarnings("ResultOfMethodCallIgnored") InputStream newLocalInput(Compressor.Value compressor, Digest digest, long offset) throws IOException { - log.log(Level.FINE, format("getting input stream for %s", DigestUtil.toString(digest))); + log.log(Level.FINER, 
format("getting input stream for %s", DigestUtil.toString(digest))); boolean isExecutable = false; do { String key = getKey(digest, isExecutable); @@ -726,7 +726,7 @@ void invalidateWrite(Digest digest) { public void put(Blob blob, Runnable onExpiration) throws InterruptedException { String key = getKey(blob.getDigest(), false); try { - log.log(Level.FINE, format("put: %s", key)); + log.log(Level.FINER, format("put: %s", key)); OutputStream out = putImpl( Compressor.Value.IDENTITY, @@ -863,8 +863,14 @@ Write newWrite(BlobWriteKey key, ListenableFuture future) { Write write = new Write() { CancellableOutputStream out = null; + + @GuardedBy("this") boolean isReset = false; + + @GuardedBy("this") SettableFuture closedFuture = null; + + @GuardedBy("this") long fileCommittedSize = -1; @Override @@ -945,6 +951,11 @@ public synchronized ListenableFuture getOutputFuture( directExecutor()); } + private synchronized void syncCancelled() { + out = null; + isReset = true; + } + @Override public synchronized FeedbackOutputStream getOutput( long deadlineAfter, TimeUnit deadlineAfterUnits, Runnable onReadyHandler) @@ -953,6 +964,9 @@ public synchronized FeedbackOutputStream getOutput( // will block until it is returned via a close. 
if (closedFuture != null) { try { + while (!closedFuture.isDone()) { + wait(); + } closedFuture.get(); } catch (ExecutionException e) { throw new IOException(e.getCause()); @@ -969,8 +983,7 @@ public synchronized FeedbackOutputStream getOutput( UUID.fromString(key.getIdentifier()), cancelled -> { if (cancelled) { - out = null; - isReset = true; + syncCancelled(); } outClosedFuture.set(null); }, @@ -980,7 +993,11 @@ public synchronized FeedbackOutputStream getOutput( return uniqueOut; } - private void commitOpenState( + private synchronized void syncNotify() { + notify(); + } + + private synchronized void commitOpenState( CancellableOutputStream out, SettableFuture closedFuture) { // transition the Write to an open state, and modify all internal state required // atomically @@ -988,6 +1005,7 @@ private void commitOpenState( this.out = out; this.closedFuture = closedFuture; + closedFuture.addListener(this::syncNotify, directExecutor()); // they will likely write to this, so we can no longer assume isReset. 
// might want to subscribe to a write event on the stream isReset = false; @@ -1052,7 +1070,7 @@ CancellableOutputStream newOutput( String key = getKey(digest, false); final CancellableOutputStream cancellableOut; try { - log.log(Level.FINE, format("getWrite: %s", key)); + log.log(Level.FINER, format("getWrite: %s", key)); cancellableOut = putImpl( compressor, @@ -1260,6 +1278,13 @@ public void initializeRootDirectory() throws IOException { fileStore = Files.getFileStore(root); } + public void stop() throws InterruptedException { + if (prometheusMetricsThread != null) { + prometheusMetricsThread.interrupt(); + prometheusMetricsThread.join(); + } + } + public StartupCacheResults start(boolean skipLoad) throws IOException, InterruptedException { return start(newDirectExecutorService(), skipLoad); } @@ -2159,7 +2184,7 @@ private void removeFilePath(Path path) throws IOException { } if (Files.isDirectory(temp)) { - log.log(Level.FINE, "removing existing directory " + path + " for fetch"); + log.log(Level.FINER, "removing existing directory " + path + " for fetch"); Directories.remove(temp, fileStore); } else { Files.delete(temp); @@ -2302,14 +2327,14 @@ public ListenableFuture putDirectory( // Claim the directory path so no other threads try to create/delete it. Path path = getDirectoryPath(digest); Lock l = locks.acquire(path); - log.log(Level.FINE, format("locking directory %s", path.getFileName())); + log.log(Level.FINER, format("locking directory %s", path.getFileName())); try { l.lockInterruptibly(); } catch (InterruptedException e) { Thread.currentThread().interrupt(); return immediateFailedFuture(e); } - log.log(Level.FINE, format("locked directory %s", path.getFileName())); + log.log(Level.FINER, format("locked directory %s", path.getFileName())); // Now that a lock has been claimed, we can proceed to create the directory. 
ListenableFuture putFuture; @@ -2323,7 +2348,7 @@ public ListenableFuture putDirectory( putFuture.addListener( () -> { l.unlock(); - log.log(Level.FINE, format("directory %s has been unlocked", path.getFileName())); + log.log(Level.FINER, format("directory %s has been unlocked", path.getFileName())); }, service); return putFuture; @@ -2371,7 +2396,7 @@ private boolean directoryEntryExists( private ListenableFuture putDirectorySynchronized( Path path, Digest digest, Map directoriesByDigest, ExecutorService service) throws IOException { - log.log(Level.FINE, format("directory %s has been locked", path.getFileName())); + log.log(Level.FINER, format("directory %s has been locked", path.getFileName())); ListenableFuture expireFuture; synchronized (this) { DirectoryEntry e = directoryStorage.get(digest); @@ -2398,7 +2423,7 @@ private ListenableFuture putDirectorySynchronized( } if (e != null) { - log.log(Level.FINE, format("found existing entry for %s", path.getFileName())); + log.log(Level.FINER, format("found existing entry for %s", path.getFileName())); if (directoryEntryExists(path, e, directoriesByDigest)) { return immediateFuture(path); } @@ -2411,7 +2436,7 @@ private ListenableFuture putDirectorySynchronized( decrementReferencesSynchronized(inputsBuilder.build(), ImmutableList.of()); expireFuture = expireDirectory(digest, service); - log.log(Level.FINE, format("expiring existing entry for %s", path.getFileName())); + log.log(Level.FINER, format("expiring existing entry for %s", path.getFileName())); } } @@ -2433,7 +2458,7 @@ private ListenableFuture putDirectorySynchronized( transformAsync( deindexFuture, result -> { - log.log(Level.FINE, format("expiry complete, fetching %s", path.getFileName())); + log.log(Level.FINER, format("expiry complete, fetching %s", path.getFileName())); ImmutableList.Builder> putFuturesBuilder = ImmutableList.builder(); fetchDirectory( @@ -2507,7 +2532,7 @@ private ListenableFuture putDirectorySynchronized( } } try { - 
log.log(Level.FINE, "removing directory to roll back " + path); + log.log(Level.FINER, "removing directory to roll back " + path); Directories.remove(path, fileStore); } catch (IOException removeException) { log.log( @@ -2523,7 +2548,8 @@ private ListenableFuture putDirectorySynchronized( return transform( rollbackFuture, (results) -> { - log.log(Level.FINE, format("directory fetch complete, inserting %s", path.getFileName())); + log.log( + Level.FINER, format("directory fetch complete, inserting %s", path.getFileName())); DirectoryEntry e = new DirectoryEntry( // might want to have this treatment ahead of this @@ -2575,13 +2601,13 @@ Path putAndCopy(Digest digest, boolean isExecutable) throws IOException, Interru complete = true; } finally { try { - log.log(Level.FINE, format("closing output stream for %s", DigestUtil.toString(digest))); + log.log(Level.FINER, format("closing output stream for %s", DigestUtil.toString(digest))); if (complete) { out.close(); } else { out.cancel(); } - log.log(Level.FINE, format("output stream closed for %s", DigestUtil.toString(digest))); + log.log(Level.FINER, format("output stream closed for %s", DigestUtil.toString(digest))); } catch (IOException e) { if (Thread.interrupted()) { log.log( @@ -2593,7 +2619,7 @@ Path putAndCopy(Digest digest, boolean isExecutable) throws IOException, Interru throw new InterruptedException(); } else { log.log( - Level.FINE, + Level.FINER, format("failed output stream close for %s", DigestUtil.toString(digest)), e); } @@ -2625,7 +2651,7 @@ private static Exception extractStatusException(IOException e) { private void copyExternalInput(Digest digest, CancellableOutputStream out) throws IOException, InterruptedException { Retrier retrier = new Retrier(Backoff.sequential(5), Retrier.DEFAULT_IS_RETRIABLE); - log.log(Level.FINE, format("downloading %s", DigestUtil.toString(digest))); + log.log(Level.FINER, format("downloading %s", DigestUtil.toString(digest))); try { retrier.execute( () -> { @@ -2646,7 
+2672,7 @@ private void copyExternalInput(Digest digest, CancellableOutputStream out) e); // prevent burial by early end of stream during close throw e; } - log.log(Level.FINE, format("download of %s complete", DigestUtil.toString(digest))); + log.log(Level.FINER, format("download of %s complete", DigestUtil.toString(digest))); } @FunctionalInterface @@ -2700,7 +2726,7 @@ private CancellableOutputStream putImpl( if (out == DUPLICATE_OUTPUT_STREAM) { return null; } - log.log(Level.FINE, format("entry %s is missing, downloading and populating", key)); + log.log(Level.FINER, format("entry %s is missing, downloading and populating", key)); return newCancellableOutputStream(out); } @@ -2930,7 +2956,7 @@ private boolean charge(String key, long blobSizeInBytes, AtomicBoolean requiresD return immediateFuture(null); } expiredKeyCounter.inc(); - log.log(Level.INFO, format("expired key %s", expiredKey)); + log.log(Level.FINE, format("expired key %s", expiredKey)); return immediateFuture(fileEntryKey.getDigest()); }, expireService)); @@ -3136,7 +3162,7 @@ void commit() throws IOException { existingEntry = safeStorageInsertion(key, entry); inserted = existingEntry == null; } catch (FileAlreadyExistsException e) { - log.log(Level.FINE, "file already exists for " + key + ", nonexistent entry will fail"); + log.log(Level.FINER, "file already exists for " + key + ", nonexistent entry will fail"); } finally { if (Files.exists(writePath)) { Files.delete(writePath); @@ -3163,20 +3189,20 @@ void commit() throws IOException { } if (existingEntry != null) { - log.log(Level.FINE, "lost the race to insert " + key); + log.log(Level.FINER, "lost the race to insert " + key); if (!referenceIfExists(key)) { // we would lose our accountability and have a presumed reference if we returned throw new IllegalStateException("storage conflict with existing key for " + key); } } else if (writeWinner.get()) { - log.log(Level.FINE, "won the race to insert " + key); + log.log(Level.FINER, "won the race to 
insert " + key); try { onInsert.run(); } catch (RuntimeException e) { throw new IOException(e); } } else { - log.log(Level.FINE, "did not win the race to insert " + key); + log.log(Level.FINER, "did not win the race to insert " + key); } } }; @@ -3230,7 +3256,7 @@ public boolean incrementReference() { "entry " + key + " has " + referenceCount + " references and is being incremented..."); } log.log( - Level.FINER, + Level.FINEST, "incrementing references to " + key + " from " @@ -3260,7 +3286,7 @@ public boolean decrementReference(Entry header) { "entry " + key + " has 0 references and is being decremented..."); } log.log( - Level.FINER, + Level.FINEST, "decrementing references to " + key + " from " diff --git a/src/main/java/build/buildfarm/common/config/Admin.java b/src/main/java/build/buildfarm/common/config/Admin.java index 07deb4ce70..f4f8168225 100644 --- a/src/main/java/build/buildfarm/common/config/Admin.java +++ b/src/main/java/build/buildfarm/common/config/Admin.java @@ -11,5 +11,7 @@ public enum DEPLOYMENT_ENVIRONMENT { private DEPLOYMENT_ENVIRONMENT deploymentEnvironment; private String clusterEndpoint; + // This configuration is deprecated but is left here for backwards compatibility. Use + // worker:gracefulShutdownSeconds instead. 
private boolean enableGracefulShutdown; } diff --git a/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java b/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java index 432a11a90a..68106d56c7 100644 --- a/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java +++ b/src/main/java/build/buildfarm/common/config/BuildfarmConfigs.java @@ -78,6 +78,9 @@ public static BuildfarmConfigs loadServerConfigs(String[] args) throws Configura if (options.port > 0) { buildfarmConfigs.getServer().setPort(options.port); } + if (options.prometheusPort >= 0) { + buildfarmConfigs.setPrometheusPort(options.prometheusPort); + } adjustServerConfigs(buildfarmConfigs); return buildfarmConfigs; } @@ -94,6 +97,9 @@ public static BuildfarmConfigs loadWorkerConfigs(String[] args) throws Configura if (!Strings.isNullOrEmpty(options.publicName)) { buildfarmConfigs.getWorker().setPublicName(options.publicName); } + if (options.prometheusPort >= 0) { + buildfarmConfigs.setPrometheusPort(options.prometheusPort); + } adjustWorkerConfigs(buildfarmConfigs); return buildfarmConfigs; } diff --git a/src/main/java/build/buildfarm/common/config/BuildfarmOptions.java b/src/main/java/build/buildfarm/common/config/BuildfarmOptions.java new file mode 100644 index 0000000000..defeebbf48 --- /dev/null +++ b/src/main/java/build/buildfarm/common/config/BuildfarmOptions.java @@ -0,0 +1,30 @@ +// Copyright 2023 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package build.buildfarm.common.config; + +import com.google.devtools.common.options.Option; +import com.google.devtools.common.options.OptionsBase; + +/** Command-line options definition for Worker. */ +public class BuildfarmOptions extends OptionsBase { + @Option(name = "help", abbrev = 'h', help = "Prints usage info.", defaultValue = "true") + public boolean help; + + @Option( + name = "prometheus_port", + help = "Port for the prometheus service. '0' will disable prometheus hosting", + defaultValue = "-1") + public int prometheusPort; +} diff --git a/src/main/java/build/buildfarm/common/config/GrpcMetrics.java b/src/main/java/build/buildfarm/common/config/GrpcMetrics.java index a028035673..8855ae5fdb 100644 --- a/src/main/java/build/buildfarm/common/config/GrpcMetrics.java +++ b/src/main/java/build/buildfarm/common/config/GrpcMetrics.java @@ -27,7 +27,7 @@ public static void handleGrpcMetricIntercepts( // provide custom latency buckets if (grpcMetrics.getLatencyBuckets() != null) { - grpcConfig.withLatencyBuckets(grpcMetrics.getLatencyBuckets()); + grpcConfig = grpcConfig.withLatencyBuckets(grpcMetrics.getLatencyBuckets()); } // Apply config to create an interceptor and apply it to the GRPC server. diff --git a/src/main/java/build/buildfarm/common/config/ServerOptions.java b/src/main/java/build/buildfarm/common/config/ServerOptions.java index 5c1e00a0d5..35f47d6d13 100644 --- a/src/main/java/build/buildfarm/common/config/ServerOptions.java +++ b/src/main/java/build/buildfarm/common/config/ServerOptions.java @@ -15,13 +15,9 @@ package build.buildfarm.common.config; import com.google.devtools.common.options.Option; -import com.google.devtools.common.options.OptionsBase; /** Command-line options definition for example server. 
*/ -public class ServerOptions extends OptionsBase { - @Option(name = "help", abbrev = 'h', help = "Prints usage info.", defaultValue = "true") - public boolean help; - +public class ServerOptions extends BuildfarmOptions { @Option(name = "port", abbrev = 'p', help = "Port to use.", defaultValue = "-1") public int port; diff --git a/src/main/java/build/buildfarm/common/config/ShardWorkerOptions.java b/src/main/java/build/buildfarm/common/config/ShardWorkerOptions.java index 7671ae0a4f..c116d31673 100644 --- a/src/main/java/build/buildfarm/common/config/ShardWorkerOptions.java +++ b/src/main/java/build/buildfarm/common/config/ShardWorkerOptions.java @@ -15,13 +15,9 @@ package build.buildfarm.common.config; import com.google.devtools.common.options.Option; -import com.google.devtools.common.options.OptionsBase; /** Command-line options definition for Worker. */ -public class ShardWorkerOptions extends OptionsBase { - @Option(name = "help", abbrev = 'h', help = "Prints usage info.", defaultValue = "true") - public boolean help; - +public class ShardWorkerOptions extends BuildfarmOptions { @Option( name = "root", help = "Root base directory for all work being performed.", diff --git a/src/main/java/build/buildfarm/common/config/Worker.java b/src/main/java/build/buildfarm/common/config/Worker.java index 294b70dc2f..4caaff2b02 100644 --- a/src/main/java/build/buildfarm/common/config/Worker.java +++ b/src/main/java/build/buildfarm/common/config/Worker.java @@ -35,6 +35,7 @@ public class Worker { private boolean onlyMulticoreTests = false; private boolean allowBringYourOwnContainer = false; private boolean errorOperationRemainingResources = false; + private int gracefulShutdownSeconds = 0; private ExecutionPolicy[] executionPolicies = {}; private SandboxSettings sandboxSettings = new SandboxSettings(); diff --git a/src/main/java/build/buildfarm/common/grpc/BUILD b/src/main/java/build/buildfarm/common/grpc/BUILD index 7bff874ad8..af68ba7d63 100644 --- 
a/src/main/java/build/buildfarm/common/grpc/BUILD +++ b/src/main/java/build/buildfarm/common/grpc/BUILD @@ -13,6 +13,7 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", + "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@maven//:org_projectlombok_lombok", diff --git a/src/main/java/build/buildfarm/common/grpc/Channels.java b/src/main/java/build/buildfarm/common/grpc/Channels.java new file mode 100644 index 0000000000..0531218f23 --- /dev/null +++ b/src/main/java/build/buildfarm/common/grpc/Channels.java @@ -0,0 +1,40 @@ +// Copyright 2023 The Bazel Authors. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package build.buildfarm.common.grpc; + +import io.grpc.ManagedChannel; +import io.grpc.netty.NegotiationType; +import io.grpc.netty.NettyChannelBuilder; + +public final class Channels { + private static final String GRPCS_URL_PREFIX = "grpcs://"; + private static final String GRPC_URL_PREFIX = "grpc://"; + + private Channels() {} + + public static ManagedChannel createChannel(String target) { + NegotiationType negotiationType = NegotiationType.PLAINTEXT; + if (target.startsWith(GRPCS_URL_PREFIX)) { + target = target.substring(GRPCS_URL_PREFIX.length()); + negotiationType = NegotiationType.TLS; + } else if (target.startsWith(GRPC_URL_PREFIX)) { + target = target.substring(GRPC_URL_PREFIX.length()); + negotiationType = NegotiationType.PLAINTEXT; + } + NettyChannelBuilder builder = + NettyChannelBuilder.forTarget(target).negotiationType(negotiationType); + return builder.build(); + } +} diff --git a/src/main/java/build/buildfarm/common/services/BUILD b/src/main/java/build/buildfarm/common/services/BUILD index f2f643001e..9a885cbafe 100644 --- a/src/main/java/build/buildfarm/common/services/BUILD +++ b/src/main/java/build/buildfarm/common/services/BUILD @@ -23,7 +23,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_services", "@maven//:io_grpc_grpc_stub", diff --git a/src/main/java/build/buildfarm/common/services/ByteStreamService.java b/src/main/java/build/buildfarm/common/services/ByteStreamService.java index f457eee7fe..d2003c32e3 100644 --- a/src/main/java/build/buildfarm/common/services/ByteStreamService.java +++ b/src/main/java/build/buildfarm/common/services/ByteStreamService.java @@ -341,7 +341,7 @@ public void read(ReadRequest request, StreamObserver responseObser long offset = request.getReadOffset(); long limit = request.getReadLimit(); log.log( - Level.FINER, + Level.FINEST, format("read resource_name=%s 
offset=%d limit=%d", resourceName, offset, limit)); try { @@ -356,7 +356,7 @@ public void queryWriteStatus( QueryWriteStatusRequest request, StreamObserver responseObserver) { String resourceName = request.getResourceName(); try { - log.log(Level.FINE, format("queryWriteStatus(%s)", resourceName)); + log.log(Level.FINER, format("queryWriteStatus(%s)", resourceName)); Write write = getWrite(resourceName); responseObserver.onNext( QueryWriteStatusResponse.newBuilder() @@ -365,7 +365,7 @@ public void queryWriteStatus( .build()); responseObserver.onCompleted(); log.log( - Level.FINE, + Level.FINER, format( "queryWriteStatus(%s) => committed_size = %d, complete = %s", resourceName, write.getCommittedSize(), write.isComplete())); diff --git a/src/main/java/build/buildfarm/common/services/ContentAddressableStorageService.java b/src/main/java/build/buildfarm/common/services/ContentAddressableStorageService.java index 9395d63fea..6e00e39f67 100644 --- a/src/main/java/build/buildfarm/common/services/ContentAddressableStorageService.java +++ b/src/main/java/build/buildfarm/common/services/ContentAddressableStorageService.java @@ -109,7 +109,7 @@ public void onSuccess(FindMissingBlobsResponse.Builder builder) { long elapsedMicros = stopwatch.elapsed(MICROSECONDS); missingBlobs.observe(request.getBlobDigestsList().size()); log.log( - Level.FINE, + Level.FINER, "FindMissingBlobs(" + instance.getName() + ") for " diff --git a/src/main/java/build/buildfarm/common/services/WriteStreamObserver.java b/src/main/java/build/buildfarm/common/services/WriteStreamObserver.java index 4e0d28d68b..5bb88bd010 100644 --- a/src/main/java/build/buildfarm/common/services/WriteStreamObserver.java +++ b/src/main/java/build/buildfarm/common/services/WriteStreamObserver.java @@ -111,7 +111,7 @@ public synchronized void onNext(WriteRequest request) { Status status = Status.fromThrowable(e); if (errorResponse(status.asException())) { log.log( - status.getCode() == Status.Code.CANCELLED ? 
Level.FINE : Level.SEVERE, + status.getCode() == Status.Code.CANCELLED ? Level.FINER : Level.SEVERE, format("error writing %s", (name == null ? request.getResourceName() : name)), e); } @@ -156,7 +156,7 @@ synchronized void commitSynchronized(long committedSize) { if (Context.current().isCancelled()) { log.log( - Level.FINER, + Level.FINEST, format("skipped delivering committed_size to %s for cancelled context", name)); } else { try { @@ -178,7 +178,7 @@ synchronized void commitSynchronized(long committedSize) { Status status = Status.fromThrowable(e); if (errorResponse(status.asException())) { log.log( - status.getCode() == Status.Code.CANCELLED ? Level.FINE : Level.SEVERE, + status.getCode() == Status.Code.CANCELLED ? Level.FINER : Level.SEVERE, format( "%s-%s: %s -> %s -> %s: error committing %s", requestMetadata.getToolDetails().getToolName(), @@ -198,7 +198,8 @@ void commitActive(long committedSize) { if (exception.compareAndSet(null, null)) { try { - log.log(Level.FINER, format("delivering committed_size for %s of %d", name, committedSize)); + log.log( + Level.FINEST, format("delivering committed_size for %s of %d", name, committedSize)); responseObserver.onNext(response); responseObserver.onCompleted(); } catch (Exception e) { @@ -218,9 +219,9 @@ private void initialize(WriteRequest request) throws InvalidResourceNameExceptio name = resourceName; try { write = getWrite(resourceName); - if (log.isLoggable(Level.FINER)) { + if (log.isLoggable(Level.FINEST)) { log.log( - Level.FINER, + Level.FINEST, format( "registering callback for %s: committed_size = %d (transient), complete = %s", resourceName, write.getCommittedSize(), write.isComplete())); @@ -287,6 +288,13 @@ private boolean errorResponse(Throwable t) { requestMetadata.getToolInvocationId(), requestMetadata.getActionId(), name)); + } else { + log.log( + Level.WARNING, + format( + "error %s after %d requests and %d bytes at offset %d", + name, requestCount, requestBytes, earliestOffset), + t); } return 
true; } @@ -380,7 +388,7 @@ private void handleWrite(String resourceName, long offset, ByteString data, bool data = data.substring(skipBytes); } log.log( - Level.FINER, + Level.FINEST, format( "writing %d to %s at %d%s", bytesToWrite, name, offset, finishWrite ? " with finish_write" : "")); @@ -396,7 +404,7 @@ private void handleWrite(String resourceName, long offset, ByteString data, bool @GuardedBy("this") private void close() { - log.log(Level.FINER, format("closing stream due to finishWrite for %s", name)); + log.log(Level.FINEST, format("closing stream due to finishWrite for %s", name)); try { getOutput().close(); } catch (DigestMismatchException e) { @@ -484,11 +492,11 @@ private FeedbackOutputStream getOutput() throws IOException { @Override public void onError(Throwable t) { - log.log(Level.FINE, format("write error for %s", name), t); + log.log(Level.FINER, format("write error for %s", name), t); } @Override public void onCompleted() { - log.log(Level.FINE, format("write completed for %s", name)); + log.log(Level.FINER, format("write completed for %s", name)); } } diff --git a/src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java b/src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java index 611a23e9b9..f0f9b4e9b7 100644 --- a/src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java +++ b/src/main/java/build/buildfarm/instance/server/AbstractServerInstance.java @@ -1180,6 +1180,9 @@ void validateCommand( } else { Directory directory = directoriesIndex.get(inputRootDigest); for (String segment : workingDirectory.split("/")) { + if (segment.equals(".")) { + continue; + } Directory nextDirectory = directory; // linear for now for (DirectoryNode dirNode : directory.getDirectoriesList()) { diff --git a/src/main/java/build/buildfarm/instance/server/BUILD b/src/main/java/build/buildfarm/instance/server/BUILD index 601519c867..ecff968e79 100644 --- a/src/main/java/build/buildfarm/instance/server/BUILD +++ 
b/src/main/java/build/buildfarm/instance/server/BUILD @@ -27,7 +27,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@maven//:io_netty_netty_codec_http", diff --git a/src/main/java/build/buildfarm/instance/shard/BUILD b/src/main/java/build/buildfarm/instance/shard/BUILD index 9b11f1543e..38e6992aae 100644 --- a/src/main/java/build/buildfarm/instance/shard/BUILD +++ b/src/main/java/build/buildfarm/instance/shard/BUILD @@ -28,7 +28,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@maven//:io_prometheus_simpleclient", diff --git a/src/main/java/build/buildfarm/instance/shard/JedisClusterFactory.java b/src/main/java/build/buildfarm/instance/shard/JedisClusterFactory.java index 922558b50b..5f85da29f6 100644 --- a/src/main/java/build/buildfarm/instance/shard/JedisClusterFactory.java +++ b/src/main/java/build/buildfarm/instance/shard/JedisClusterFactory.java @@ -30,6 +30,7 @@ import redis.clients.jedis.JedisPoolConfig; import redis.clients.jedis.ScanParams; import redis.clients.jedis.ScanResult; +import redis.clients.jedis.util.JedisURIHelper; /** * @class JedisClusterFactory @@ -43,15 +44,17 @@ public class JedisClusterFactory { * @brief Create a jedis cluster instance. * @details Use proto configuration to connect to a redis cluster server and provide a jedis * client. - * @param config Configuration for connecting to a redis cluster server. + * @param identifier Redis Client name. * @return An established jedis client used to operate on the redis cluster. * @note Suggested return identifier: jedis. 
+ * @link Redis Client name */ - public static Supplier create() throws ConfigurationException { + public static Supplier create(String identifier) throws ConfigurationException { // null password is required to elicit no auth in jedis String[] redisNodes = configs.getBackplane().getRedisNodes(); if (redisNodes != null && redisNodes.length > 0) { return createJedisClusterFactory( + identifier, list2Set(redisNodes), configs.getBackplane().getTimeout(), configs.getBackplane().getMaxAttempts(), @@ -63,6 +66,7 @@ public static Supplier create() throws ConfigurationException { // support "" as redis password. return createJedisClusterFactory( + identifier, parseUri(configs.getBackplane().getRedisUri()), configs.getBackplane().getTimeout(), configs.getBackplane().getMaxAttempts(), @@ -80,7 +84,7 @@ public static Supplier create() throws ConfigurationException { * @note Suggested return identifier: jedis. */ public static JedisCluster createTest() throws Exception { - JedisCluster redis = JedisClusterFactory.create().get(); + JedisCluster redis = JedisClusterFactory.create("test").get(); // use the client to create an empty redis cluster // this will prevent any persistent data across test runs @@ -151,7 +155,12 @@ private static void deleteExistingKeys(Jedis node) { * @note Suggested return identifier: jedis. 
*/ private static Supplier createJedisClusterFactory( - URI redisUri, int timeout, int maxAttempts, String password, JedisPoolConfig poolConfig) { + String identifier, + URI redisUri, + int timeout, + int maxAttempts, + String password, + JedisPoolConfig poolConfig) { return () -> new JedisCluster( new HostAndPort(redisUri.getHost(), redisUri.getPort()), @@ -159,7 +168,9 @@ private static Supplier createJedisClusterFactory( /* soTimeout=*/ Integer.max(2000, timeout), Integer.max(5, maxAttempts), password, - poolConfig); + identifier, + poolConfig, + /* ssl=*/ JedisURIHelper.isRedisSSLScheme(redisUri)); } /** @@ -174,6 +185,7 @@ private static Supplier createJedisClusterFactory( * @note Suggested return identifier: jedis. */ private static Supplier createJedisClusterFactory( + String identifier, Set redisUrisNodes, int timeout, int maxAttempts, @@ -186,12 +198,13 @@ private static Supplier createJedisClusterFactory( /* soTimeout=*/ Integer.max(2000, timeout), Integer.max(5, maxAttempts), password, - poolConfig); + identifier, + poolConfig, + /* ssl=*/ false); } /** * @brief Create a jedis pool config. * @details Use configuration to build the appropriate jedis pool configuration. - * @param config Configuration for connecting to a redis cluster server. * @return A created jedis pool config. * @note Suggested return identifier: poolConfig. 
*/ diff --git a/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java b/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java index cc780707f7..a0d96ea081 100644 --- a/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java +++ b/src/main/java/build/buildfarm/instance/shard/RedisShardBackplane.java @@ -152,7 +152,7 @@ public RedisShardBackplane( Function onPublish, Function onComplete) throws ConfigurationException { - this(source, onPublish, onComplete, JedisClusterFactory.create()); + this(source, onPublish, onComplete, JedisClusterFactory.create(source)); } public RedisShardBackplane( @@ -206,7 +206,7 @@ public void visit(String entry) { JsonFormat.parser().merge(entry, executeEntry); visit(executeEntry.build(), entry); } catch (InvalidProtocolBufferException e) { - log.log(Level.FINE, "invalid ExecuteEntry json: " + entry, e); + log.log(Level.FINER, "invalid ExecuteEntry json: " + entry, e); } } } @@ -330,10 +330,10 @@ private void updateWatchers(JedisCluster jedis) { if (!expiringChannels.isEmpty()) { log.log( - Level.FINE, + Level.FINER, format("Scan %d watches, %s, expiresAt: %s", expiringChannels.size(), now, expiresAt)); - log.log(Level.FINE, "Scan prequeue"); + log.log(Level.FINER, "Scan prequeue"); // scan prequeue, pet watches scanPrequeue(jedis, resetChannel); } @@ -342,7 +342,7 @@ private void updateWatchers(JedisCluster jedis) { scanProcessing(jedis, resetChannel, now); if (!expiringChannels.isEmpty()) { - log.log(Level.FINE, "Scan queue"); + log.log(Level.FINER, "Scan queue"); // scan queue, pet watches scanQueue(jedis, resetChannel); } @@ -351,7 +351,7 @@ private void updateWatchers(JedisCluster jedis) { scanDispatching(jedis, resetChannel, now); if (!expiringChannels.isEmpty()) { - log.log(Level.FINE, "Scan dispatched"); + log.log(Level.FINER, "Scan dispatched"); // scan dispatched pet watches scanDispatched(jedis, resetChannel); } @@ -445,7 +445,7 @@ public void updateWatchedIfDone(JedisCluster jedis) 
{ } subscriber.onOperation(operationChannel(operationName), operation, nextExpiresAt(now)); log.log( - Level.FINE, + Level.FINER, format( "operation %s done due to %s", operationName, operation == null ? "null" : "completed")); @@ -538,24 +538,24 @@ public synchronized void stop() throws InterruptedException { if (failsafeOperationThread != null) { failsafeOperationThread.interrupt(); failsafeOperationThread.join(); - log.log(Level.FINE, "failsafeOperationThread has been stopped"); + log.log(Level.FINER, "failsafeOperationThread has been stopped"); } if (operationSubscription != null) { operationSubscription.stop(); if (subscriptionThread != null) { subscriptionThread.join(); } - log.log(Level.FINE, "subscriptionThread has been stopped"); + log.log(Level.FINER, "subscriptionThread has been stopped"); } if (subscriberService != null) { subscriberService.shutdown(); subscriberService.awaitTermination(10, SECONDS); - log.log(Level.FINE, "subscriberService has been stopped"); + log.log(Level.FINER, "subscriberService has been stopped"); } if (client != null) { client.close(); client = null; - log.log(Level.FINE, "client has been closed"); + log.log(Level.FINER, "client has been closed"); } } diff --git a/src/main/java/build/buildfarm/instance/shard/RedisShardSubscriber.java b/src/main/java/build/buildfarm/instance/shard/RedisShardSubscriber.java index 50357bd30d..cd1d65972e 100644 --- a/src/main/java/build/buildfarm/instance/shard/RedisShardSubscriber.java +++ b/src/main/java/build/buildfarm/instance/shard/RedisShardSubscriber.java @@ -142,7 +142,7 @@ public ListenableFuture watch(String channel, TimedWatcher watcher) { new TimedWatchFuture(watcher) { @Override public void unwatch() { - log.log(Level.FINE, format("unwatching %s", channel)); + log.log(Level.FINER, format("unwatching %s", channel)); RedisShardSubscriber.this.unwatch(channel, this); } }; @@ -204,7 +204,7 @@ private void onOperation( @Nullable Instant expiresAt) { List operationWatchers = 
watchers.get(channel); boolean observe = operation == null || operation.hasMetadata() || operation.getDone(); - log.log(Level.FINE, format("onOperation %s: %s", channel, operation)); + log.log(Level.FINER, format("onOperation %s: %s", channel, operation)); synchronized (watchers) { ImmutableList.Builder> observers = ImmutableList.builder(); for (TimedWatchFuture watchFuture : operationWatchers) { @@ -220,7 +220,7 @@ private void onOperation( executor.execute( () -> { if (observe) { - log.log(Level.FINE, "observing " + operation); + log.log(Level.FINER, "observing " + operation); observer.accept(operation); } }); diff --git a/src/main/java/build/buildfarm/instance/shard/ShardInstance.java b/src/main/java/build/buildfarm/instance/shard/ShardInstance.java index 1ed4c8b41b..cc0ad69737 100644 --- a/src/main/java/build/buildfarm/instance/shard/ShardInstance.java +++ b/src/main/java/build/buildfarm/instance/shard/ShardInstance.java @@ -416,14 +416,14 @@ ListenableFuture iterate() throws IOException, InterruptedException { () -> {}, Deadline.after(5, MINUTES)); try { - log.log(Level.FINE, "queueing " + operationName); + log.log(Level.FINER, "queueing " + operationName); ListenableFuture queueFuture = queue(executeEntry, poller, queueTimeout); addCallback( queueFuture, new FutureCallback() { @Override public void onSuccess(Void result) { - log.log(Level.FINE, "successfully queued " + operationName); + log.log(Level.FINER, "successfully queued " + operationName); // nothing } @@ -437,7 +437,7 @@ public void onFailure(Throwable t) { long operationTransformDispatchUSecs = stopwatch.elapsed(MICROSECONDS) - canQueueUSecs; log.log( - Level.FINE, + Level.FINER, format( "OperationQueuer: Dispatched To Transform %s: %dus in canQueue, %dus in transform dispatch", operationName, canQueueUSecs, operationTransformDispatchUSecs)); @@ -452,7 +452,7 @@ public void onFailure(Throwable t) { @Override public void run() { - log.log(Level.FINE, "OperationQueuer: Running"); + 
log.log(Level.FINER, "OperationQueuer: Running"); try { while (transformTokensQueue.offer(new Object(), 5, MINUTES)) { stopwatch.start(); @@ -485,7 +485,7 @@ public void run() { } catch (Exception t) { log.log(Level.SEVERE, "OperationQueuer: fatal exception during iteration", t); } finally { - log.log(Level.FINE, "OperationQueuer: Exiting"); + log.log(Level.FINER, "OperationQueuer: Exiting"); } operationQueuer = null; try { @@ -570,7 +570,7 @@ public void stop() throws InterruptedException { return; } stopping = true; - log.log(Level.FINE, format("Instance %s is stopping", getName())); + log.log(Level.FINER, format("Instance %s is stopping", getName())); if (operationQueuer != null) { operationQueuer.stop(); } @@ -606,7 +606,7 @@ public void stop() throws InterruptedException { } actionCacheFetchService.shutdownNow(); workerStubs.invalidateAll(); - log.log(Level.FINE, format("Instance %s has been stopped", getName())); + log.log(Level.FINER, format("Instance %s has been stopped", getName())); stopping = false; stopped = true; } @@ -647,81 +647,42 @@ public ListenableFuture> findMissingBlobs( return immediateFailedFuture(Status.fromThrowable(e).asException()); } - // Empty blobs are an exceptional case. Filter them out. - // If the user only requested empty blobs we can immedaitely tell them we already have it. + // Empty blobs are an exceptional case. Filter them out. + // If the user only requested empty blobs we can immediately tell them we already have it. Iterable nonEmptyDigests = Iterables.filter(blobDigests, (digest) -> digest.getSizeBytes() != 0); if (Iterables.isEmpty(nonEmptyDigests)) { return immediateFuture(ImmutableList.of()); } - // This is a faster strategy to check missing blobs which does not require querying the CAS. - // With hundreds of worker machines, it may be too expensive to query all of them for "find - // missing blobs". 
- // Workers register themselves with the backplane for a 30-second window, and if they fail to - // re-register within this time frame, they are automatically removed from the backplane. While - // this alternative strategy for finding missing blobs is faster and more cost-effective than - // the exhaustive approach of querying each worker to find the digest, it comes with a higher - // risk of returning expired workers despite filtering by active workers below. This is because - // the strategy may return workers that have expired in the last 30 seconds. However, checking - // workers directly is not a guarantee either since workers could leave the cluster after being - // queried. Ultimately, it will come down to the client's resiliency if the backplane is - // out-of-date and the server lies about which blobs are actually present. We provide this - // alternative strategy for calculating missing blobs. - if (configs.getServer().isFindMissingBlobsViaBackplane()) { - try { - Set uniqueDigests = new HashSet<>(); - nonEmptyDigests.forEach(uniqueDigests::add); - Map> foundBlobs = backplane.getBlobDigestsWorkers(uniqueDigests); - Set workerSet = backplane.getStorageWorkers(); - Map workersStartTime = backplane.getWorkersStartTimeInEpochSecs(workerSet); - return immediateFuture( - uniqueDigests.stream() - .filter( // best effort to present digests only missing on active workers - digest -> { - try { - Set initialWorkers = - foundBlobs.getOrDefault(digest, Collections.emptySet()); - Set activeWorkers = Sets.intersection(initialWorkers, workerSet); - long insertTime = backplane.getDigestInsertTime(digest); - Set workersStartedBeforeDigestInsertion = - activeWorkers.stream() - .filter( - worker -> - workersStartTime.getOrDefault( - worker, Instant.now().getEpochSecond()) - < insertTime) - .collect(Collectors.toSet()); - Set workersToBeRemoved = - Sets.difference(initialWorkers, workersStartedBeforeDigestInsertion) - .immutableCopy(); - if (!workersToBeRemoved.isEmpty()) 
{ - log.log( - Level.INFO, format("adjusting locations for the digest %s", digest)); - backplane.adjustBlobLocations( - digest, Collections.emptySet(), workersToBeRemoved); - } - return workersStartedBeforeDigestInsertion.isEmpty(); - } catch (IOException e) { - // Treat error as missing digest. - log.log( - Level.WARNING, - format("failed to get digest (%s) insertion time", digest)); - return true; - } - }) - .collect(Collectors.toList())); - } catch (Exception e) { - log.log(Level.SEVERE, "find missing blob via backplane failed", e); - return immediateFailedFuture(Status.fromThrowable(e).asException()); - } + return findMissingBlobsViaBackplane(nonEmptyDigests); } - // A more accurate way to verify missing blobs is to ask the CAS participants directly if they - // have the blobs. To do this, we get all of the worker nodes that are particpating in the CAS - // as a random list to begin our search. If there are no workers avaiable, tell the client all - // blobs are missing. + return findMissingBlobsQueryingEachWorker(nonEmptyDigests, requestMetadata); + } + + class FindMissingResponseEntry { + final String worker; + final long elapsedMicros; + final Throwable exception; + final int stillMissingAfter; + + FindMissingResponseEntry( + String worker, long elapsedMicros, Throwable exception, int stillMissingAfter) { + this.worker = worker; + this.elapsedMicros = elapsedMicros; + this.exception = exception; + this.stillMissingAfter = stillMissingAfter; + } + } + + // A more accurate way to verify missing blobs is to ask the CAS participants directly if they + // have the blobs. To do this, we get all the worker nodes that are participating in the CAS + // as a random list to begin our search. If there are no workers available, tell the client all + // blobs are missing. 
+ private ListenableFuture> findMissingBlobsQueryingEachWorker( + Iterable nonEmptyDigests, RequestMetadata requestMetadata) { Deque workers; try { List workersList = new ArrayList<>(backplane.getStorageWorkers()); @@ -734,7 +695,7 @@ public ListenableFuture> findMissingBlobs( return immediateFuture(nonEmptyDigests); } - // Search through all of the workers to decide how many CAS blobs are missing. + // Search through all of the workers to decide which CAS blobs are missing. SettableFuture> missingDigestsFuture = SettableFuture.create(); findMissingBlobsOnWorker( UUID.randomUUID().toString(), @@ -748,19 +709,80 @@ public ListenableFuture> findMissingBlobs( return missingDigestsFuture; } - class FindMissingResponseEntry { - final String worker; - final long elapsedMicros; - final Throwable exception; - final int stillMissingAfter; + // This is a faster strategy to check missing blobs which does not require querying the CAS. + // With hundreds of worker machines, it may be too expensive to query all of them for "find + // missing blobs". + // Workers register themselves with the backplane for a 30-second window, and if they fail to + // re-register within this time frame, they are automatically removed from the backplane. While + // this alternative strategy for finding missing blobs is faster and more cost-effective than + // the exhaustive approach of querying each worker to find the digest, it comes with a higher + // risk of returning expired workers despite filtering by active workers below. This is because + // the strategy may return workers that have expired in the last 30 seconds. However, checking + // workers directly is not a guarantee either since workers could leave the cluster after being + // queried. Ultimately, it will come down to the client's resiliency if the backplane is + // out-of-date and the server lies about which blobs are actually present. We provide this + // alternative strategy for calculating missing blobs. 
+ private ListenableFuture> findMissingBlobsViaBackplane( + Iterable nonEmptyDigests) { + try { + Set uniqueDigests = new HashSet<>(); + nonEmptyDigests.forEach(uniqueDigests::add); + Map> foundBlobs = backplane.getBlobDigestsWorkers(uniqueDigests); + Set workerSet = backplane.getStorageWorkers(); + Map workersStartTime = backplane.getWorkersStartTimeInEpochSecs(workerSet); + return immediateFuture( + uniqueDigests.stream() + .filter( // best effort to present digests only missing on active workers + digest -> { + Set initialWorkers = + foundBlobs.getOrDefault(digest, Collections.emptySet()); + return filterAndAdjustWorkersForDigest( + digest, initialWorkers, workerSet, workersStartTime) + .isEmpty(); + }) + .collect(Collectors.toList())); + } catch (Exception e) { + log.log(Level.SEVERE, "find missing blob via backplane failed", e); + return immediateFailedFuture(Status.fromThrowable(e).asException()); + } + } - FindMissingResponseEntry( - String worker, long elapsedMicros, Throwable exception, int stillMissingAfter) { - this.worker = worker; - this.elapsedMicros = elapsedMicros; - this.exception = exception; - this.stillMissingAfter = stillMissingAfter; + private Set filterAndAdjustWorkersForDigest( + Digest digest, + Set originalWorkerSetWithDigest, + Set activeWorkers, + Map workersStartTime) { + long insertTime; + try { + insertTime = backplane.getDigestInsertTime(digest); + } catch (IOException e) { + log.log(Level.WARNING, format("failed to get digest (%s) insertion time", digest)); + return Collections.emptySet(); + } + Set activeWorkersWithDigest = + Sets.intersection(originalWorkerSetWithDigest, activeWorkers); + Set workersStartedBeforeDigestInsertion = + activeWorkersWithDigest.stream() + .filter( + worker -> + workersStartTime.getOrDefault(worker, Instant.now().getEpochSecond()) + < insertTime) + .collect(Collectors.toSet()); + Set workersToBeRemoved = + Sets.difference(originalWorkerSetWithDigest, workersStartedBeforeDigestInsertion) + 
.immutableCopy(); + if (!workersToBeRemoved.isEmpty()) { + try { + log.log(Level.INFO, format("adjusting locations for the digest %s", digest)); + backplane.adjustBlobLocations(digest, Collections.emptySet(), workersToBeRemoved); + } catch (IOException e) { + log.log( + Level.WARNING, + format("error adjusting blob location for %s", DigestUtil.toString(digest)), + e); + } } + return workersStartedBeforeDigestInsertion; } private void findMissingBlobsOnWorker( @@ -877,7 +899,7 @@ private void fetchBlobFromWorker( public void onNext(ByteString nextChunk) { blobObserver.onNext(nextChunk); received += nextChunk.size(); - ioMetric.observe(received); + ioMetric.observe(nextChunk.size()); } @Override @@ -890,7 +912,7 @@ public void onError(Throwable t) { } else if (status.getCode() == Code.NOT_FOUND) { casMissCounter.inc(); log.log( - configs.getServer().isEnsureOutputsPresent() ? Level.WARNING : Level.FINE, + configs.getServer().isEnsureOutputsPresent() ? Level.WARNING : Level.FINER, worker + " did not contain " + DigestUtil.toString(blobDigest)); // ignore this, the worker will update the backplane eventually } else if (status.getCode() != Code.DEADLINE_EXCEEDED @@ -901,8 +923,11 @@ public void onError(Throwable t) { log.log( Level.WARNING, format( - "DEADLINE_EXCEEDED: read(%s) on worker %s after %d bytes of content", - DigestUtil.toString(blobDigest), worker, received)); + "%s: read(%s) on worker %s after %d bytes of content", + status.getCode().name(), + DigestUtil.toString(blobDigest), + worker, + received)); blobObserver.onError(t); return; } @@ -1001,7 +1026,7 @@ public void getBlob( final ListenableFuture> populatedWorkerListFuture; if (emptyWorkerList) { log.log( - Level.FINE, + Level.FINER, format( "worker list was initially empty for %s, attempting to correct", DigestUtil.toString(blobDigest))); @@ -1017,7 +1042,7 @@ public void getBlob( RequestMetadata.getDefaultInstance()), (foundOnWorkers) -> { log.log( - Level.FINE, + Level.FINER, format( "worker list was 
corrected for %s to be %s", DigestUtil.toString(blobDigest), foundOnWorkers.toString())); @@ -1047,7 +1072,7 @@ public void onError(Throwable t) { workersList.clear(); final ListenableFuture> workersListFuture; log.log( - Level.FINE, + Level.FINER, format( "worker list was depleted for %s, attempting to correct", DigestUtil.toString(blobDigest))); @@ -1063,7 +1088,7 @@ public void onError(Throwable t) { RequestMetadata.getDefaultInstance()), (foundOnWorkers) -> { log.log( - Level.FINE, + Level.FINER, format( "worker list was corrected after depletion for %s to be %s", DigestUtil.toString(blobDigest), foundOnWorkers.toString())); @@ -1395,7 +1420,7 @@ ListenableFuture expectDirectory( @Override public CompletableFuture apply(Digest digest, Executor executor) { log.log( - Level.FINE, + Level.FINER, format( "transformQueuedOperation(%s): fetching directory %s", reason, DigestUtil.toString(directoryBlobDigest))); @@ -1530,7 +1555,7 @@ private ListenableFuture transformQueuedOperation( expectCommand(commandDigest, requestMetadata), (command) -> { log.log( - Level.FINE, + Level.FINER, format("transformQueuedOperation(%s): fetched command", operationName)); if (command != null) { queuedOperationBuilder.setCommand(command); @@ -2020,7 +2045,7 @@ public ListenableFuture execute( executionSuccess.inc(); log.log( - Level.FINE, + Level.FINER, new StringBuilder() .append("ExecutionSuccess: ") .append(requestMetadata.getToolInvocationId()) @@ -2033,7 +2058,7 @@ public ListenableFuture execute( actionCache.invalidate(DigestUtil.asActionKey(actionDigest)); if (!skipCacheLookup && recentCacheServedExecutions.getIfPresent(requestMetadata) != null) { log.log( - Level.FINE, + Level.FINER, format("Operation %s will have skip_cache_lookup = true due to retry", operationName)); skipCacheLookup = true; } @@ -2258,7 +2283,7 @@ public ListenableFuture queue(ExecuteEntry executeEntry, Poller poller, Du poller.pause(); long checkCacheUSecs = stopwatch.elapsed(MICROSECONDS); log.log( - 
Level.FINE, + Level.FINER, format( "ShardInstance(%s): checkCache(%s): %sus elapsed", getName(), operation.getName(), checkCacheUSecs)); @@ -2285,7 +2310,7 @@ private ListenableFuture transformAndQueue( Digest actionDigest = metadata.getActionDigest(); SettableFuture queueFuture = SettableFuture.create(); log.log( - Level.FINE, + Level.FINER, format( "ShardInstance(%s): queue(%s): fetching action %s", getName(), operation.getName(), actionDigest.getHash())); @@ -2328,7 +2353,7 @@ private ListenableFuture transformAndQueue( actionFuture, (action) -> { log.log( - Level.FINE, + Level.FINER, format( "ShardInstance(%s): queue(%s): fetched action %s transforming queuedOperation", getName(), operation.getName(), actionDigest.getHash())); @@ -2358,7 +2383,7 @@ private ListenableFuture transformAndQueue( queuedFuture, (profiledQueuedMetadata) -> { log.log( - Level.FINE, + Level.FINER, format( "ShardInstance(%s): queue(%s): queuedOperation %s transformed, validating", getName(), @@ -2380,7 +2405,7 @@ private ListenableFuture transformAndQueue( validatedFuture, (profiledQueuedMetadata) -> { log.log( - Level.FINE, + Level.FINER, format( "ShardInstance(%s): queue(%s): queuedOperation %s validated, uploading", getName(), @@ -2432,7 +2457,7 @@ public void onSuccess(ProfiledQueuedOperationMetadata profiledQueuedMetadata) { long elapsedUSecs = stopwatch.elapsed(MICROSECONDS); long queueUSecs = elapsedUSecs - startQueueUSecs; log.log( - Level.FINE, + Level.FINER, format( "ShardInstance(%s): queue(%s): %dus checkCache, %dus transform, %dus validate, %dus upload, %dus queue, %dus elapsed", getName(), diff --git a/src/main/java/build/buildfarm/instance/shard/Util.java b/src/main/java/build/buildfarm/instance/shard/Util.java index 5e3493d0c6..7070097659 100644 --- a/src/main/java/build/buildfarm/instance/shard/Util.java +++ b/src/main/java/build/buildfarm/instance/shard/Util.java @@ -141,7 +141,7 @@ public void onFailure(Throwable t) { } }; log.log( - Level.FINE, + Level.FINER, format( 
"scanning through %d workers to find %s", workerSet.size(), DigestUtil.toString(digest))); @@ -184,7 +184,7 @@ static void checkMissingBlobOnInstance( public void onSuccess(Iterable missingDigests) { boolean found = Iterables.isEmpty(missingDigests); log.log( - Level.FINE, + Level.FINER, format( "check missing response for %s to %s was %sfound", DigestUtil.toString(digest), worker, found ? "" : "not ")); @@ -197,7 +197,7 @@ public void onFailure(Throwable t) { Status status = Status.fromThrowable(t); if (status.getCode() == Code.UNAVAILABLE) { log.log( - Level.FINE, + Level.FINER, format( "check missing response for %s to %s was not found for unavailable", DigestUtil.toString(digest), worker)); diff --git a/src/main/java/build/buildfarm/instance/shard/WorkerStubs.java b/src/main/java/build/buildfarm/instance/shard/WorkerStubs.java index 28a29afd7f..abaf5f71e1 100644 --- a/src/main/java/build/buildfarm/instance/shard/WorkerStubs.java +++ b/src/main/java/build/buildfarm/instance/shard/WorkerStubs.java @@ -14,6 +14,7 @@ package build.buildfarm.instance.shard; +import static build.buildfarm.common.grpc.Channels.createChannel; import static com.google.common.util.concurrent.MoreExecutors.listeningDecorator; import static java.util.concurrent.Executors.newSingleThreadScheduledExecutor; @@ -28,9 +29,6 @@ import com.google.common.cache.RemovalListener; import com.google.common.util.concurrent.ListeningScheduledExecutorService; import com.google.protobuf.Duration; -import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; import java.util.concurrent.TimeUnit; public final class WorkerStubs { @@ -59,17 +57,12 @@ private static Instance newStubInstance(String worker, DigestUtil digestUtil, Du worker, digestUtil, createChannel(worker), + createChannel(worker), // separate write channel timeout, newStubRetrier(), newStubRetryService()); } - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder 
builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - private static Retrier newStubRetrier() { return new Retrier( Backoff.exponential( diff --git a/src/main/java/build/buildfarm/instance/stub/BUILD b/src/main/java/build/buildfarm/instance/stub/BUILD index c8b2b82e77..1f5a2b916d 100644 --- a/src/main/java/build/buildfarm/instance/stub/BUILD +++ b/src/main/java/build/buildfarm/instance/stub/BUILD @@ -23,7 +23,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@maven//:org_projectlombok_lombok", diff --git a/src/main/java/build/buildfarm/instance/stub/StubInstance.java b/src/main/java/build/buildfarm/instance/stub/StubInstance.java index a4dd880c5c..8045dc5db9 100644 --- a/src/main/java/build/buildfarm/instance/stub/StubInstance.java +++ b/src/main/java/build/buildfarm/instance/stub/StubInstance.java @@ -161,6 +161,7 @@ public class StubInstance implements Instance { private final String identifier; private final DigestUtil digestUtil; private final ManagedChannel channel; + private final ManagedChannel writeChannel; private final @Nullable Duration grpcTimeout; private final Retrier retrier; private final @Nullable ListeningScheduledExecutorService retryService; @@ -187,12 +188,24 @@ public StubInstance( this(name, identifier, digestUtil, channel, grpcTimeout, NO_RETRIES, /* retryService=*/ null); } + public StubInstance( + String name, + String identifier, + DigestUtil digestUtil, + ManagedChannel channel, + Duration grpcTimeout, + Retrier retrier, + @Nullable ListeningScheduledExecutorService retryService) { + this(name, identifier, digestUtil, channel, channel, grpcTimeout, retrier, retryService); + } + @SuppressWarnings("NullableProblems") public StubInstance( String name, String identifier, DigestUtil digestUtil, ManagedChannel 
channel, + ManagedChannel writeChannel, Duration grpcTimeout, Retrier retrier, @Nullable ListeningScheduledExecutorService retryService) { @@ -200,6 +213,7 @@ public StubInstance( this.identifier = identifier; this.digestUtil = digestUtil; this.channel = channel; + this.writeChannel = writeChannel; this.grpcTimeout = grpcTimeout; this.retrier = retrier; this.retryService = retryService; @@ -359,8 +373,14 @@ public void start(String publicName) {} @Override public void stop() throws InterruptedException { isStopped = true; - channel.shutdownNow(); - channel.awaitTermination(0, TimeUnit.SECONDS); + if (!channel.isShutdown()) { + channel.shutdownNow(); + channel.awaitTermination(0, TimeUnit.SECONDS); + } + if (!writeChannel.isShutdown()) { + writeChannel.shutdownNow(); + writeChannel.awaitTermination(0, TimeUnit.SECONDS); + } if (retryService != null && !shutdownAndAwaitTermination(retryService, 10, TimeUnit.SECONDS)) { log.log(Level.SEVERE, format("Could not shut down retry service for %s", identifier)); } @@ -662,7 +682,7 @@ Write getWrite( deadlined(bsBlockingStub).withInterceptors(attachMetadataInterceptor(requestMetadata)), Suppliers.memoize( () -> - ByteStreamGrpc.newStub(channel) + ByteStreamGrpc.newStub(writeChannel) .withInterceptors(attachMetadataInterceptor(requestMetadata))), resourceName, exceptionTranslator, diff --git a/src/main/java/build/buildfarm/metrics/AbstractMetricsPublisher.java b/src/main/java/build/buildfarm/metrics/AbstractMetricsPublisher.java index 5f9422a421..7dd09b6d0f 100644 --- a/src/main/java/build/buildfarm/metrics/AbstractMetricsPublisher.java +++ b/src/main/java/build/buildfarm/metrics/AbstractMetricsPublisher.java @@ -51,6 +51,13 @@ public abstract class AbstractMetricsPublisher implements MetricsPublisher { .labelNames("worker_name") .help("Operations per worker.") .register(); + + private static final Gauge operationExitCode = + Gauge.build() + .name("operation_exit_code") + .labelNames("exit_code") + .help("Operation execution 
exit code.") + .register(); private static final Histogram queuedTime = Histogram.build().name("queued_time_ms").help("Queued time in ms.").register(); private static final Histogram outputUploadTime = @@ -97,6 +104,11 @@ protected OperationRequestMetadata populateRequestMetadata( Integer.toString( operationRequestMetadata.getExecuteResponse().getStatus().getCode())) .inc(); + operationExitCode + .labels( + Integer.toString( + operationRequestMetadata.getExecuteResponse().getResult().getExitCode())) + .inc(); if (operationRequestMetadata.getExecuteResponse().hasResult() && operationRequestMetadata.getExecuteResponse().getResult().hasExecutionMetadata()) { operationsPerWorker @@ -172,7 +184,7 @@ protected static String formatRequestMetadataToJson( .usingTypeRegistry(typeRegistry) .omittingInsignificantWhitespace() .print(operationRequestMetadata); - log.log(Level.FINE, "{}", formattedRequestMetadata); + log.log(Level.FINER, "{}", formattedRequestMetadata); return formattedRequestMetadata; } } diff --git a/src/main/java/build/buildfarm/operations/finder/BUILD b/src/main/java/build/buildfarm/operations/finder/BUILD index 5c8342609b..8b6a2cc557 100644 --- a/src/main/java/build/buildfarm/operations/finder/BUILD +++ b/src/main/java/build/buildfarm/operations/finder/BUILD @@ -22,7 +22,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@maven//:org_apache_commons_commons_pool2", diff --git a/src/main/java/build/buildfarm/server/services/AdminService.java b/src/main/java/build/buildfarm/server/services/AdminService.java index 968edc1572..94178fbf27 100644 --- a/src/main/java/build/buildfarm/server/services/AdminService.java +++ b/src/main/java/build/buildfarm/server/services/AdminService.java @@ -14,6 +14,8 @@ package build.buildfarm.server.services; +import static build.buildfarm.common.grpc.Channels.createChannel; + 
import build.buildfarm.admin.Admin; import build.buildfarm.admin.aws.AwsAdmin; import build.buildfarm.admin.gcp.GcpAdmin; @@ -39,8 +41,6 @@ import com.google.rpc.Code; import com.google.rpc.Status; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; import io.grpc.stub.StreamObserver; import java.util.logging.Level; import lombok.extern.java.Log; @@ -191,9 +191,7 @@ public void shutDownWorkerGracefully( private void informWorkerToPrepareForShutdown(String host) { ManagedChannel channel = null; try { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(host).negotiationType(NegotiationType.PLAINTEXT); - channel = builder.build(); + channel = createChannel(host); ShutDownWorkerGrpc.ShutDownWorkerBlockingStub shutDownWorkerBlockingStub = ShutDownWorkerGrpc.newBlockingStub(channel); shutDownWorkerBlockingStub.prepareWorkerForGracefulShutdown( diff --git a/src/main/java/build/buildfarm/server/services/BUILD b/src/main/java/build/buildfarm/server/services/BUILD index aff642ed7a..54b7af993c 100644 --- a/src/main/java/build/buildfarm/server/services/BUILD +++ b/src/main/java/build/buildfarm/server/services/BUILD @@ -30,7 +30,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_services", "@maven//:io_grpc_grpc_stub", diff --git a/src/main/java/build/buildfarm/tools/Ac.java b/src/main/java/build/buildfarm/tools/Ac.java index adfca598ff..9515fe0ec4 100644 --- a/src/main/java/build/buildfarm/tools/Ac.java +++ b/src/main/java/build/buildfarm/tools/Ac.java @@ -14,6 +14,8 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; + import build.bazel.remote.execution.v2.ActionResult; import build.buildfarm.common.DigestUtil; import build.buildfarm.common.DigestUtil.HashFunction; @@ -21,18 +23,10 @@ import 
build.buildfarm.instance.stub.StubInstance; import com.google.protobuf.ByteString; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; // This tool can be used to interact directly with the Action Cache API. // ./tool shard SHA256 class Ac { - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - public static void main(String[] args) throws Exception { // get arguments for establishing an instance String host = args[0]; diff --git a/src/main/java/build/buildfarm/tools/BUILD b/src/main/java/build/buildfarm/tools/BUILD index c2f9d94ece..37c6bdb58a 100644 --- a/src/main/java/build/buildfarm/tools/BUILD +++ b/src/main/java/build/buildfarm/tools/BUILD @@ -5,6 +5,7 @@ java_binary( visibility = ["//visibility:public"], deps = [ "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/stub", "//src/main/java/build/buildfarm/worker", @@ -15,7 +16,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", ], @@ -57,6 +57,7 @@ java_binary( visibility = ["//visibility:public"], deps = [ "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/instance/stub", "@googleapis//:google_bytestream_bytestream_java_grpc", "@googleapis//:google_bytestream_bytestream_java_proto", @@ -67,7 +68,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", 
"@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_grpc", @@ -90,7 +90,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", @@ -108,6 +107,7 @@ java_binary( deps = [ ":worker-profiler-printer", "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/stub", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", @@ -119,7 +119,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", ], @@ -137,6 +136,7 @@ java_binary( ":worker-profiler-printer", "//src/main/java/build/buildfarm/common", "//src/main/java/build/buildfarm/common/config", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/common/redis", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/shard", @@ -151,7 +151,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", ], @@ -168,6 +167,7 @@ java_binary( deps = [ ":worker-profiler-printer", "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/stub", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", @@ -179,7 +179,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", 
"@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", ], @@ -191,10 +190,10 @@ java_binary( main_class = "build.buildfarm.tools.GracefulShutdownTest", visibility = ["//visibility:public"], deps = [ + "//src/main/java/build/buildfarm/common/grpc", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_grpc", "//src/main/protobuf:build_buildfarm_v1test_buildfarm_java_proto", "@maven//:io_grpc_grpc_api", - "@maven//:io_grpc_grpc_netty", ], ) @@ -205,6 +204,7 @@ java_binary( visibility = ["//visibility:public"], deps = [ "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/stub", "@googleapis//:google_longrunning_operations_java_proto", @@ -214,7 +214,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", @@ -228,6 +227,7 @@ java_binary( visibility = ["//visibility:public"], deps = [ "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/stub", "@googleapis//:google_longrunning_operations_java_proto", @@ -237,7 +237,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", @@ -251,6 +250,7 @@ java_binary( visibility = ["//visibility:public"], deps = [ "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/stub", 
"@googleapis//:google_longrunning_operations_java_proto", @@ -260,7 +260,6 @@ java_binary( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@remote_apis//:build_bazel_remote_execution_v2_remote_execution_java_proto", @@ -281,12 +280,12 @@ java_binary( visibility = ["//visibility:public"], deps = [ "//src/main/java/build/buildfarm/common", + "//src/main/java/build/buildfarm/common/grpc", "//src/main/java/build/buildfarm/instance", "//src/main/java/build/buildfarm/instance/stub", "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", ], @@ -313,7 +312,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", ], diff --git a/src/main/java/build/buildfarm/tools/Cancel.java b/src/main/java/build/buildfarm/tools/Cancel.java index 5945df708e..24805034dc 100644 --- a/src/main/java/build/buildfarm/tools/Cancel.java +++ b/src/main/java/build/buildfarm/tools/Cancel.java @@ -14,20 +14,14 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; + import build.buildfarm.common.DigestUtil; import build.buildfarm.instance.Instance; import build.buildfarm.instance.stub.StubInstance; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; class Cancel { - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - public static void main(String[] args) throws Exception { String host = args[0]; String instanceName = 
args[1]; diff --git a/src/main/java/build/buildfarm/tools/Cat.java b/src/main/java/build/buildfarm/tools/Cat.java index 399ca7f1fb..7c8e561454 100644 --- a/src/main/java/build/buildfarm/tools/Cat.java +++ b/src/main/java/build/buildfarm/tools/Cat.java @@ -14,6 +14,7 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; import static build.buildfarm.instance.Utils.getBlob; import static com.google.common.util.concurrent.MoreExecutors.shutdownAndAwaitTermination; import static java.lang.String.format; @@ -68,8 +69,6 @@ import io.grpc.Context; import io.grpc.ManagedChannel; import io.grpc.Status; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -86,12 +85,6 @@ import java.util.stream.StreamSupport; class Cat { - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - private static void printCapabilities(ServerCapabilities capabilities) { System.out.println(capabilities); } diff --git a/src/main/java/build/buildfarm/tools/Executor.java b/src/main/java/build/buildfarm/tools/Executor.java index 77ca4002c1..4f2f9f3d6f 100644 --- a/src/main/java/build/buildfarm/tools/Executor.java +++ b/src/main/java/build/buildfarm/tools/Executor.java @@ -15,6 +15,7 @@ package build.buildfarm.tools; import static build.bazel.remote.execution.v2.ExecutionStage.Value.EXECUTING; +import static build.buildfarm.common.grpc.Channels.createChannel; import static build.buildfarm.common.io.Utils.stat; import static build.buildfarm.instance.stub.ByteStreamUploader.uploadResourceName; import static com.google.common.base.Preconditions.checkState; @@ -53,8 +54,6 @@ import com.google.rpc.Code; import io.grpc.Channel; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; 
-import io.grpc.netty.NettyChannelBuilder; import io.grpc.stub.StreamObserver; import java.io.IOException; import java.io.InputStream; @@ -223,12 +222,6 @@ static void executeActions( shutdownAndAwaitTermination(service, 1, SECONDS); } - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - private static void loadFilesIntoCAS(String instanceName, Channel channel, Path blobsDir) throws Exception { ContentAddressableStorageBlockingStub casStub = diff --git a/src/main/java/build/buildfarm/tools/Extract.java b/src/main/java/build/buildfarm/tools/Extract.java index e4de193ae2..fed81ac267 100644 --- a/src/main/java/build/buildfarm/tools/Extract.java +++ b/src/main/java/build/buildfarm/tools/Extract.java @@ -14,6 +14,7 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; import static com.google.common.util.concurrent.MoreExecutors.listeningDecorator; import static java.util.concurrent.Executors.newSingleThreadExecutor; import static java.util.concurrent.Executors.newSingleThreadScheduledExecutor; @@ -41,8 +42,6 @@ import io.grpc.ManagedChannel; import io.grpc.Status; import io.grpc.Status.Code; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; import io.grpc.stub.StreamObserver; import java.io.IOException; import java.io.InputStream; @@ -61,12 +60,6 @@ import java.util.concurrent.atomic.AtomicLong; class Extract { - static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - public static void main(String[] args) throws Exception { String host = args[0]; String instanceName = args[1]; diff --git a/src/main/java/build/buildfarm/tools/FindOperations.java 
b/src/main/java/build/buildfarm/tools/FindOperations.java index c858f121f6..f1d8494dff 100644 --- a/src/main/java/build/buildfarm/tools/FindOperations.java +++ b/src/main/java/build/buildfarm/tools/FindOperations.java @@ -14,14 +14,14 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; + import build.buildfarm.common.DigestUtil; import build.buildfarm.instance.Instance; import build.buildfarm.instance.stub.StubInstance; import com.google.common.collect.ImmutableList; import com.google.longrunning.Operation; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; // This tool can be used to find Operations based on their particular properties. // For example, it could find all of the operations executed by a particular user or particular @@ -29,12 +29,6 @@ // ./tool shard SHA256 // The operations that match the query will be printed. class FindOperations { - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - public static void main(String[] args) throws Exception { // get arguments for establishing an instance String host = args[0]; diff --git a/src/main/java/build/buildfarm/tools/GracefulShutdownTest.java b/src/main/java/build/buildfarm/tools/GracefulShutdownTest.java index f98cddbde4..85ae5ab78d 100644 --- a/src/main/java/build/buildfarm/tools/GracefulShutdownTest.java +++ b/src/main/java/build/buildfarm/tools/GracefulShutdownTest.java @@ -14,22 +14,16 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; + import build.buildfarm.v1test.AdminGrpc; import build.buildfarm.v1test.DisableScaleInProtectionRequest; import build.buildfarm.v1test.PrepareWorkerForGracefulShutDownRequest; import build.buildfarm.v1test.ShutDownWorkerGracefullyRequest; import 
build.buildfarm.v1test.ShutDownWorkerGrpc; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; class GracefulShutdownTest { - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - /** * Example command: GracefulShutdownTest ShutDown workerIp buildfarm-endpoint * diff --git a/src/main/java/build/buildfarm/tools/Hist.java b/src/main/java/build/buildfarm/tools/Hist.java index c8ec6c2bfa..2abdf55f7d 100644 --- a/src/main/java/build/buildfarm/tools/Hist.java +++ b/src/main/java/build/buildfarm/tools/Hist.java @@ -14,6 +14,8 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; + import build.bazel.remote.execution.v2.ExecuteOperationMetadata; import build.bazel.remote.execution.v2.ExecutionStage; import build.buildfarm.common.DigestUtil; @@ -23,16 +25,8 @@ import com.google.longrunning.Operation; import com.google.protobuf.InvalidProtocolBufferException; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; class Hist { - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - @SuppressWarnings("ConstantConditions") private static void printHistogramValue(int executing) { StringBuilder s = new StringBuilder(); diff --git a/src/main/java/build/buildfarm/tools/IndexWorker.java b/src/main/java/build/buildfarm/tools/IndexWorker.java index a36e3f9217..317a5ff637 100644 --- a/src/main/java/build/buildfarm/tools/IndexWorker.java +++ b/src/main/java/build/buildfarm/tools/IndexWorker.java @@ -14,25 +14,19 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; + import 
build.buildfarm.common.CasIndexResults; import build.buildfarm.common.DigestUtil; import build.buildfarm.instance.Instance; import build.buildfarm.instance.stub.StubInstance; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; // This tool can be used to remove worker entries from the CAS. // This is usually done via the admin service when a worker is departing from the cluster. // ./tool shard SHA256 // The results of the removal are printed after the CAS entries have been removed. class IndexWorker { - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - public static void main(String[] args) throws Exception { String host = args[0]; String instanceName = args[1]; diff --git a/src/main/java/build/buildfarm/tools/Mount.java b/src/main/java/build/buildfarm/tools/Mount.java index 43061d12bc..a0a4528d22 100644 --- a/src/main/java/build/buildfarm/tools/Mount.java +++ b/src/main/java/build/buildfarm/tools/Mount.java @@ -14,6 +14,7 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; import static build.buildfarm.instance.Utils.getBlob; import static com.google.common.base.Preconditions.checkArgument; @@ -27,8 +28,6 @@ import build.buildfarm.worker.FuseCAS; import com.google.protobuf.ByteString; import io.grpc.ManagedChannel; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; import java.io.IOException; import java.io.InputStream; import java.nio.file.Path; @@ -37,12 +36,6 @@ import java.util.Map; class Mount { - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - @SuppressWarnings("BusyWait") public static void main(String[] 
args) throws Exception { String host = args[0]; diff --git a/src/main/java/build/buildfarm/tools/WorkerProfile.java b/src/main/java/build/buildfarm/tools/WorkerProfile.java index d820446a53..69dbb8cdee 100644 --- a/src/main/java/build/buildfarm/tools/WorkerProfile.java +++ b/src/main/java/build/buildfarm/tools/WorkerProfile.java @@ -14,6 +14,8 @@ package build.buildfarm.tools; +import static build.buildfarm.common.grpc.Channels.createChannel; + import build.buildfarm.common.DigestUtil; import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.common.config.ShardWorkerOptions; @@ -30,10 +32,7 @@ import com.google.protobuf.InvalidProtocolBufferException; import com.google.protobuf.util.Durations; import com.google.protobuf.util.JsonFormat; -import io.grpc.ManagedChannel; import io.grpc.StatusRuntimeException; -import io.grpc.netty.NegotiationType; -import io.grpc.netty.NettyChannelBuilder; import java.io.IOException; import java.nio.file.Paths; import java.util.HashMap; @@ -46,12 +45,6 @@ class WorkerProfile { private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); - private static ManagedChannel createChannel(String target) { - NettyChannelBuilder builder = - NettyChannelBuilder.forTarget(target).negotiationType(NegotiationType.PLAINTEXT); - return builder.build(); - } - /** * Transform worker string from "ip-10-135-31-210.ec2:8981" to "10.135.31.210". * @@ -116,7 +109,7 @@ private static Set getWorkers(String[] args) throws ConfigurationExcepti } catch (IOException e) { System.out.println("Could not parse yml configuration file." 
+ e); } - RedisClient client = new RedisClient(JedisClusterFactory.create().get()); + RedisClient client = new RedisClient(JedisClusterFactory.create("worker-profile").get()); return client.call(jedis -> fetchWorkers(jedis, System.currentTimeMillis())); } diff --git a/src/main/java/build/buildfarm/worker/BUILD b/src/main/java/build/buildfarm/worker/BUILD index 417d530e9a..c25b9e74d6 100644 --- a/src/main/java/build/buildfarm/worker/BUILD +++ b/src/main/java/build/buildfarm/worker/BUILD @@ -32,7 +32,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", "@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_stub", "@maven//:io_prometheus_simpleclient", diff --git a/src/main/java/build/buildfarm/worker/Executor.java b/src/main/java/build/buildfarm/worker/Executor.java index 588d38c208..0416c2354d 100644 --- a/src/main/java/build/buildfarm/worker/Executor.java +++ b/src/main/java/build/buildfarm/worker/Executor.java @@ -199,7 +199,7 @@ private long executePolled( Stopwatch stopwatch) throws InterruptedException { /* execute command */ - log.log(Level.FINE, "Executor: Operation " + operation.getName() + " Executing command"); + log.log(Level.FINER, "Executor: Operation " + operation.getName() + " Executing command"); ActionResult.Builder resultBuilder = operationContext.executeResponse.getResultBuilder(); resultBuilder @@ -291,7 +291,7 @@ private long executePolled( long executeUSecs = stopwatch.elapsed(MICROSECONDS); log.log( - Level.FINE, + Level.FINER, String.format( "Executor::executeCommand(%s): Completed command: exit code %d", operationName, resultBuilder.getExitCode())); @@ -309,7 +309,7 @@ private long executePolled( throw e; } } else { - log.log(Level.FINE, "Executor: Operation " + operationName + " Failed to claim output"); + log.log(Level.FINER, "Executor: Operation " + operationName + " Failed to claim output"); boolean wasInterrupted = Thread.interrupted(); try { 
putError(); diff --git a/src/main/java/build/buildfarm/worker/InputFetcher.java b/src/main/java/build/buildfarm/worker/InputFetcher.java index 1c99df6a91..7be7a29e96 100644 --- a/src/main/java/build/buildfarm/worker/InputFetcher.java +++ b/src/main/java/build/buildfarm/worker/InputFetcher.java @@ -168,7 +168,7 @@ static String getExecutablePath( @VisibleForTesting long fetchPolled(Stopwatch stopwatch) throws InterruptedException { String operationName = operationContext.queueEntry.getExecuteEntry().getOperationName(); - log.log(Level.FINE, format("fetching inputs: %s", operationName)); + log.log(Level.FINER, format("fetching inputs: %s", operationName)); ExecutedActionMetadata.Builder executedAction = operationContext @@ -278,7 +278,7 @@ private void proceedToOutput(Action action, Command command, Path execDir) } } else { String operationName = operationContext.queueEntry.getExecuteEntry().getOperationName(); - log.log(Level.FINE, "InputFetcher: Operation " + operationName + " Failed to claim output"); + log.log(Level.FINER, "InputFetcher: Operation " + operationName + " Failed to claim output"); owner.error().put(operationContext); } diff --git a/src/main/java/build/buildfarm/worker/OperationContext.java b/src/main/java/build/buildfarm/worker/OperationContext.java index e07649d03a..71b1975783 100644 --- a/src/main/java/build/buildfarm/worker/OperationContext.java +++ b/src/main/java/build/buildfarm/worker/OperationContext.java @@ -74,11 +74,6 @@ private Builder( this.queueEntry = queueEntry; } - public Builder setExecuteResponseBuilder(ExecuteResponse.Builder executeResponse) { - this.executeResponse = executeResponse; - return this; - } - public Builder setOperation(Operation operation) { this.operation = operation; return this; diff --git a/src/main/java/build/buildfarm/worker/Pipeline.java b/src/main/java/build/buildfarm/worker/Pipeline.java index 0ed4e7b40a..4198b537e6 100644 --- a/src/main/java/build/buildfarm/worker/Pipeline.java +++ 
b/src/main/java/build/buildfarm/worker/Pipeline.java @@ -143,7 +143,7 @@ private void join(boolean closeStage) throws InterruptedException { } } if (stageToClose != null && !stageToClose.isClosed()) { - log.log(Level.FINE, "Closing stage at priority " + maxPriority); + log.log(Level.FINER, "Closing stage at priority " + maxPriority); stageToClose.close(); } } @@ -166,7 +166,7 @@ private void join(boolean closeStage) throws InterruptedException { if (!thread.isAlive()) { log.log( - Level.FINE, + Level.FINER, "Stage " + stage.name() + " has exited at priority " diff --git a/src/main/java/build/buildfarm/worker/PipelineStage.java b/src/main/java/build/buildfarm/worker/PipelineStage.java index da34172a2a..f40de3c54c 100644 --- a/src/main/java/build/buildfarm/worker/PipelineStage.java +++ b/src/main/java/build/buildfarm/worker/PipelineStage.java @@ -141,7 +141,7 @@ protected void logStart(String operationName) { } protected void logStart(String operationName, String message) { - getLogger().log(Level.FINE, String.format("%s: %s", logIterateId(operationName), message)); + getLogger().log(Level.FINER, String.format("%s: %s", logIterateId(operationName), message)); } protected void logComplete(String operationName, long usecs, long stallUSecs, boolean success) { @@ -151,7 +151,7 @@ protected void logComplete(String operationName, long usecs, long stallUSecs, bo protected void logComplete(String operationName, long usecs, long stallUSecs, String status) { getLogger() .log( - Level.FINE, + Level.FINER, String.format( "%s: %g ms (%g ms stalled) %s", logIterateId(operationName), usecs / 1000.0f, stallUSecs / 1000.0f, status)); diff --git a/src/main/java/build/buildfarm/worker/shard/BUILD b/src/main/java/build/buildfarm/worker/shard/BUILD index 3df1ab7e77..beeaaae918 100644 --- a/src/main/java/build/buildfarm/worker/shard/BUILD +++ b/src/main/java/build/buildfarm/worker/shard/BUILD @@ -32,7 +32,6 @@ java_library( "@maven//:io_grpc_grpc_api", "@maven//:io_grpc_grpc_context", 
"@maven//:io_grpc_grpc_core", - "@maven//:io_grpc_grpc_netty", "@maven//:io_grpc_grpc_protobuf", "@maven//:io_grpc_grpc_services", "@maven//:io_grpc_grpc_stub", diff --git a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java index e5371894c4..500dcb2f6d 100644 --- a/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java +++ b/src/main/java/build/buildfarm/worker/shard/CFCExecFileSystem.java @@ -142,7 +142,8 @@ public void start(Consumer> onDigests, boolean skipLoad) } @Override - public void stop() { + public void stop() throws InterruptedException { + fileCache.stop(); if (!shutdownAndAwaitTermination(fetchService, 1, MINUTES)) { log.log(Level.SEVERE, "could not terminate fetchService"); } @@ -374,6 +375,8 @@ public Path createExecDir( ImmutableList.Builder inputFiles = new ImmutableList.Builder<>(); ImmutableList.Builder inputDirectories = new ImmutableList.Builder<>(); + log.log( + Level.FINER, "ExecFileSystem::createExecDir(" + operationName + ") calling fetchInputs"); // Get lock keys so we can increment them prior to downloading // and no other threads can to create/delete during // eviction or the invocation of fetchInputs @@ -440,7 +443,7 @@ public Path createExecDir( rootInputDirectories.put(execDir, inputDirectories.build()); log.log( - Level.FINE, + Level.FINER, "ExecFileSystem::createExecDir(" + operationName + ") stamping output directories"); boolean stamped = false; try { diff --git a/src/main/java/build/buildfarm/worker/shard/ExecFileSystem.java b/src/main/java/build/buildfarm/worker/shard/ExecFileSystem.java index 916b43cef0..b55601d598 100644 --- a/src/main/java/build/buildfarm/worker/shard/ExecFileSystem.java +++ b/src/main/java/build/buildfarm/worker/shard/ExecFileSystem.java @@ -30,7 +30,7 @@ public interface ExecFileSystem extends InputStreamFactory { void start(Consumer> onDigests, boolean skipLoad) throws IOException, InterruptedException; - void 
stop(); + void stop() throws InterruptedException; Path root(); diff --git a/src/main/java/build/buildfarm/worker/shard/RemoteCasWriter.java b/src/main/java/build/buildfarm/worker/shard/RemoteCasWriter.java index d7d5f0a776..7ec1aa1bcf 100644 --- a/src/main/java/build/buildfarm/worker/shard/RemoteCasWriter.java +++ b/src/main/java/build/buildfarm/worker/shard/RemoteCasWriter.java @@ -76,7 +76,7 @@ private void insertFileToCasMember(Digest digest, DigestFunction.Value digestFun Throwable cause = e.getCause(); Throwables.throwIfInstanceOf(cause, IOException.class); Throwables.throwIfUnchecked(cause); - throw new RuntimeException(cause); + throw new IOException(cause); } } @@ -93,7 +93,7 @@ private long writeToCasMember(Digest digest, DigestFunction.Value digestFunction Throwables.throwIfInstanceOf(cause, IOException.class); // prevent a discard of this frame Status status = Status.fromThrowable(cause); - throw status.asRuntimeException(); + throw new IOException(status.asException()); } } @@ -123,14 +123,14 @@ private void insertBlobToCasMember(Digest digest, DigestFunction.Value digestFun Throwable cause = e.getCause(); Throwables.throwIfInstanceOf(cause, IOException.class); Throwables.throwIfUnchecked(cause); - throw new RuntimeException(cause); + throw new IOException(cause); } } private String getRandomWorker() throws IOException { synchronized (workerSet) { if (workerSet.isEmpty()) { - throw new RuntimeException("no available workers"); + throw new IOException("no available workers"); } Random rand = new Random(); int index = rand.nextInt(workerSet.size()); diff --git a/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java b/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java index 27e169960a..2516791662 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java +++ b/src/main/java/build/buildfarm/worker/shard/ShardWorkerContext.java @@ -240,12 +240,12 @@ public void resumePoller( } else { 
operationPollerCounter.inc(); log.log( - Level.INFO, format("%s: poller: Completed Poll for %s: OK", name, operationName)); + Level.FINE, format("%s: poller: Completed Poll for %s: OK", name, operationName)); } return success; }, () -> { - log.log(Level.INFO, format("%s: poller: Deadline expired for %s", name, operationName)); + log.log(Level.FINE, format("%s: poller: Deadline expired for %s", name, operationName)); onFailure.run(); }, deadline); @@ -485,7 +485,7 @@ private void uploadOutputFile( throws IOException, InterruptedException { String outputFile = actionRoot.relativize(outputPath).toString(); if (!Files.exists(outputPath)) { - log.log(Level.FINE, "ReportResultStage: " + outputFile + " does not exist..."); + log.log(Level.FINER, "ReportResultStage: " + outputFile + " does not exist..."); return; } @@ -493,7 +493,7 @@ private void uploadOutputFile( String message = String.format( "ReportResultStage: %s is a directory but it should have been a file", outputPath); - log.log(Level.FINE, message); + log.log(Level.FINER, message); preconditionFailure .addViolationsBuilder() .setType(VIOLATION_TYPE_INVALID) @@ -574,12 +574,12 @@ private void uploadOutputDirectory( throws IOException, InterruptedException { String outputDir = actionRoot.relativize(outputDirPath).toString(); if (!Files.exists(outputDirPath)) { - log.log(Level.FINE, "ReportResultStage: " + outputDir + " does not exist..."); + log.log(Level.FINER, "ReportResultStage: " + outputDir + " does not exist..."); return; } if (!Files.isDirectory(outputDirPath)) { - log.log(Level.FINE, "ReportResultStage: " + outputDir + " is not a directory..."); + log.log(Level.FINER, "ReportResultStage: " + outputDir + " is not a directory..."); preconditionFailure .addViolationsBuilder() .setType(VIOLATION_TYPE_INVALID) @@ -702,7 +702,7 @@ public boolean putOperation(Operation operation) throws IOException, Interrupted boolean success = createBackplaneRetrier().execute(() -> instance.putOperation(operation)); if (success 
&& operation.getDone()) { completedOperations.inc(); - log.log(Level.FINE, "CompletedOperation: " + operation.getName()); + log.log(Level.FINER, "CompletedOperation: " + operation.getName()); } return success; } diff --git a/src/main/java/build/buildfarm/worker/shard/ShardWorkerInstance.java b/src/main/java/build/buildfarm/worker/shard/ShardWorkerInstance.java index 04c11d315b..a891af7faa 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShardWorkerInstance.java +++ b/src/main/java/build/buildfarm/worker/shard/ShardWorkerInstance.java @@ -56,6 +56,7 @@ import io.grpc.Status; import io.grpc.Status.Code; import io.grpc.stub.ServerCallStreamObserver; +import io.prometheus.client.Counter; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; @@ -68,6 +69,9 @@ @Log public class ShardWorkerInstance extends AbstractServerInstance { + private static final Counter IO_METRIC = + Counter.build().name("io_bytes_read").help("Read I/O (bytes)").register(); + private final Backplane backplane; public ShardWorkerInstance( @@ -132,6 +136,7 @@ public void getBlob( @Override public void onNext(ByteString data) { blobObserver.onNext(data); + IO_METRIC.inc(data.size()); } void removeBlobLocation() { diff --git a/src/main/java/build/buildfarm/worker/shard/ShutDownWorkerGracefully.java b/src/main/java/build/buildfarm/worker/shard/ShutDownWorkerGracefully.java index 63b1b205cb..711c3cf0d3 100644 --- a/src/main/java/build/buildfarm/worker/shard/ShutDownWorkerGracefully.java +++ b/src/main/java/build/buildfarm/worker/shard/ShutDownWorkerGracefully.java @@ -14,9 +14,6 @@ package build.buildfarm.worker.shard; -import static java.util.logging.Level.WARNING; - -import build.buildfarm.common.config.BuildfarmConfigs; import build.buildfarm.v1test.PrepareWorkerForGracefulShutDownRequest; import build.buildfarm.v1test.PrepareWorkerForGracefulShutDownRequestResults; import build.buildfarm.v1test.ShutDownWorkerGrpc; @@ -26,7 +23,6 @@ @Log public class 
ShutDownWorkerGracefully extends ShutDownWorkerGrpc.ShutDownWorkerImplBase { - private static BuildfarmConfigs configs = BuildfarmConfigs.getInstance(); private final Worker worker; public ShutDownWorkerGracefully(Worker worker) { @@ -44,33 +40,6 @@ public ShutDownWorkerGracefully(Worker worker) { public void prepareWorkerForGracefulShutdown( PrepareWorkerForGracefulShutDownRequest request, StreamObserver responseObserver) { - String clusterId = configs.getServer().getClusterId(); - String clusterEndpoint = configs.getServer().getAdmin().getClusterEndpoint(); - if (clusterId == null - || clusterId.equals("") - || clusterEndpoint == null - || clusterEndpoint.equals("")) { - String errorMessage = - String.format( - "Current AdminConfig doesn't have cluster_id or cluster_endpoint set, " - + "the worker %s won't be shut down.", - configs.getWorker().getPublicName()); - log.log(WARNING, errorMessage); - responseObserver.onError(new RuntimeException(errorMessage)); - return; - } - - if (!configs.getServer().getAdmin().isEnableGracefulShutdown()) { - String errorMessage = - String.format( - "Current AdminConfig doesn't support shut down worker gracefully, " - + "the worker %s won't be shut down.", - configs.getWorker().getPublicName()); - log.log(WARNING, errorMessage); - responseObserver.onError(new RuntimeException(errorMessage)); - return; - } - try { CompletableFuture.runAsync(worker::prepareWorkerForGracefulShutdown); responseObserver.onNext(PrepareWorkerForGracefulShutDownRequestResults.newBuilder().build()); diff --git a/src/main/java/build/buildfarm/worker/shard/Worker.java b/src/main/java/build/buildfarm/worker/shard/Worker.java index 9d99018688..d740dd7341 100644 --- a/src/main/java/build/buildfarm/worker/shard/Worker.java +++ b/src/main/java/build/buildfarm/worker/shard/Worker.java @@ -28,7 +28,6 @@ import build.bazel.remote.execution.v2.Compressor; import build.bazel.remote.execution.v2.Digest; -import build.buildfarm.admin.aws.AwsAdmin; import 
build.buildfarm.backplane.Backplane; import build.buildfarm.cas.ContentAddressableStorage; import build.buildfarm.cas.ContentAddressableStorage.Blob; @@ -42,6 +41,7 @@ import build.buildfarm.common.config.GrpcMetrics; import build.buildfarm.common.grpc.Retrier; import build.buildfarm.common.grpc.Retrier.Backoff; +import build.buildfarm.common.grpc.TracingMetadataUtils.ServerHeadersInterceptor; import build.buildfarm.common.services.ByteStreamService; import build.buildfarm.common.services.ContentAddressableStorageService; import build.buildfarm.instance.Instance; @@ -144,53 +144,52 @@ public class Worker { private Pipeline pipeline; private Backplane backplane; private LoadingCache workerStubs; - @Autowired private AwsAdmin awsAdmin; @Autowired private ApplicationContext springContext; /** - * The method will prepare the worker for graceful shutdown and send out grpc request to disable - * scale in protection when the worker is ready. If unexpected errors happened, it will cancel the - * graceful shutdown progress make the worker available again. + * The method will prepare the worker for graceful shutdown when the worker is ready. Note on + * using stderr here instead of log. By the time this is called in PreDestroy, the log is no + * longer available and is not logging messages. 
*/ public void prepareWorkerForGracefulShutdown() { - inGracefulShutdown = true; - log.log( - Level.INFO, - "The current worker will not be registered again and should be shutdown gracefully!"); - pipeline.stopMatchingOperations(); - int scanRate = 30; // check every 30 seconds - int timeWaited = 0; - int timeOut = 60 * 15; // 15 minutes - - try { - while (!pipeline.isEmpty() && timeWaited < timeOut) { - SECONDS.sleep(scanRate); - timeWaited += scanRate; - log.log(INFO, String.format("Pipeline is still not empty after %d seconds.", timeWaited)); - } - } catch (InterruptedException e) { - log.log(Level.SEVERE, "The worker gracefully shutdown is interrupted: " + e.getMessage()); - } finally { - // make a grpc call to disable scale protection - String clusterEndpoint = configs.getServer().getAdmin().getClusterEndpoint(); - log.log( - INFO, + if (configs.getWorker().getGracefulShutdownSeconds() == 0) { + System.err.println( String.format( - "It took the worker %d seconds to %s", - timeWaited, - pipeline.isEmpty() ? "finish all actions" : "but still cannot finish all actions")); + "Graceful Shutdown is not enabled. 
Worker is shutting down without finishing executions in progress.")); + } else { + inGracefulShutdown = true; + System.err.println( + "Graceful Shutdown - The current worker will not be registered again and should be shutdown gracefully!"); + pipeline.stopMatchingOperations(); + int scanRate = 30; // check every 30 seconds + int timeWaited = 0; + int timeOut = configs.getWorker().getGracefulShutdownSeconds(); + try { - awsAdmin.disableHostScaleInProtection(clusterEndpoint, configs.getWorker().getPublicName()); - } catch (Exception e) { - log.log( - SEVERE, + if (pipeline.isEmpty()) { + System.err.println("Graceful Shutdown - no work in the pipeline."); + } else { + System.err.println( + String.format("Graceful Shutdown - waiting for executions to finish.")); + } + while (!pipeline.isEmpty() && timeWaited < timeOut) { + SECONDS.sleep(scanRate); + timeWaited += scanRate; + System.err.println( + String.format( + "Graceful Shutdown - Pipeline is still not empty after %d seconds.", timeWaited)); + } + } catch (InterruptedException e) { + System.err.println( + "Graceful Shutdown - The worker gracefully shutdown is interrupted: " + e.getMessage()); + } finally { + System.err.println( String.format( - "gRPC call to AdminService to disable scale in protection failed with exception: %s and stacktrace %s", - e.getMessage(), Arrays.toString(e.getStackTrace()))); - // Gracefully shutdown cannot be performed successfully because of error in - // AdminService side. Under this scenario, the worker has to be added back to the worker - // pool. - inGracefulShutdown = false; + "Graceful Shutdown - It took the worker %d seconds to %s", + timeWaited, + pipeline.isEmpty() + ? 
"finish all actions" + : "gracefully shutdown but still cannot finish all actions")); } } } @@ -276,6 +275,7 @@ private Server createServer( storage, inputFetchStage, executeActionStage, context, completeStage, backplane)); } GrpcMetrics.handleGrpcMetricIntercepts(serverBuilder, configs.getWorker().getGrpcMetrics()); + serverBuilder.intercept(new ServerHeadersInterceptor()); return serverBuilder.build(); } @@ -684,6 +684,7 @@ public void start() throws ConfigurationException, InterruptedException, IOExcep @PreDestroy public void stop() throws InterruptedException { System.err.println("*** shutting down gRPC server since JVM is shutting down"); + prepareWorkerForGracefulShutdown(); PrometheusPublisher.stopHttpServer(); boolean interrupted = Thread.interrupted(); if (pipeline != null) { diff --git a/third_party/docker_go_toolchain.patch b/third_party/docker_go_toolchain.patch new file mode 100644 index 0000000000..3b00ff333c --- /dev/null +++ b/third_party/docker_go_toolchain.patch @@ -0,0 +1,11 @@ +--- repositories/go_repositories.bzl.orig 2023-09-23 08:36:00.148468653 -0400 ++++ repositories/go_repositories.bzl 2023-09-23 08:33:22.502127476 -0400 +@@ -37,7 +37,7 @@ + go_repository_default_config (str, optional): A file used to determine the root of the workspace. 
+ """ + go_rules_dependencies() +- go_register_toolchains() ++ go_register_toolchains("1.21.0") + gazelle_dependencies(go_repository_default_config = go_repository_default_config) + excludes = native.existing_rules().keys() + if "com_github_google_go_containerregistry" not in excludes: diff --git a/tools/buildfarm-indexer.py b/tools/buildfarm-indexer.py index 824763dd21..b020cb91d5 100755 --- a/tools/buildfarm-indexer.py +++ b/tools/buildfarm-indexer.py @@ -1,5 +1,5 @@ from redis.client import Pipeline -from rediscluster import StrictRedisCluster +from rediscluster import RedisCluster import sys def get_cas_page(r, cursor, count): @@ -15,7 +15,7 @@ def get_cas_page(r, cursor, count): print ("usage: buildfarm-indexer.py ") sys.exit(1) -r = StrictRedisCluster(startup_nodes=[{"host": redis_host, "port": 6379}], skip_full_coverage_check=True) +r = RedisCluster(startup_nodes=[{"host": redis_host, "port": 6379}], skip_full_coverage_check=True) nodes = r.connection_pool.nodes @@ -30,14 +30,15 @@ def get_cas_page(r, cursor, count): slots.remove(slot) node_keys[slot] = str(node_key) -workers = r.hkeys("Workers") +# config f"{backplane.workersHashName}_storage" +workers = r.hkeys("Workers_storage") worker_count = len(workers) print ("%d workers" % worker_count) p = r.pipeline() -for node_key in node_keys.viewvalues(): +for node_key in node_keys.values(): p.delete("{%s}:intersecting-workers" % node_key) p.sadd("{%s}:intersecting-workers" % node_key, *workers) p.execute() @@ -101,8 +102,9 @@ def process(self, cas_names, conn): count = len(cas_names) p = self.pipeline(conn) for i in range(count): - name = cas_names[i] - node_key = node_keys[nodes.keyslot(str(name))] + name = cas_names[i].decode() + keyslot = nodes.keyslot(name) + node_key = node_keys[keyslot] set_key = "{%s}:intersecting-workers" % node_key p.sinterstore(name, set_key, name) p.execute() @@ -116,8 +118,8 @@ def process(self, cas_names, conn): map_cas_page(r, 10000, indexer.process) p = r.pipeline() -for node_key 
in node_keys.viewvalues(): +for node_key in node_keys.values(): p.delete("{%s}:intersecting-workers" % node_key) p.execute() -print("\n%d processed" % (indexer.processed)) \ No newline at end of file +print("\n%d processed" % (indexer.processed))