From 7e3a34427fd66cf03e4f16f64ba22b9e46d4d89d Mon Sep 17 00:00:00 2001 From: Nathan Na Date: Mon, 10 Nov 2025 19:16:40 +0000 Subject: [PATCH 1/2] Fix version parsing logic of ofi nccl plugin Signed-off-by: Nathan Na --- 4.validation_and_observability/efa-versions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/4.validation_and_observability/efa-versions.py b/4.validation_and_observability/efa-versions.py index bd7af39b5..cd2a806c7 100644 --- a/4.validation_and_observability/efa-versions.py +++ b/4.validation_and_observability/efa-versions.py @@ -41,12 +41,12 @@ def get_nccl_version(container=[]): def get_aws_ofi_nccl_version(container=[]): try: - version = subprocess.check_output(container + ['strings', '/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/libnccl-net.so']) + lib_path = subprocess.check_output(container + ['cat', '/etc/ld.so.conf.d/100_ofinccl.conf']).decode().strip() + version = subprocess.check_output(container + ['strings', f'{lib_path}/libnccl-net.so']) version = version.decode('utf-8') version = re.search(r'NET/OFI Initializing aws-ofi-nccl (\d+\.\d+\.\d+)', version).group(1) return version except Exception as e: - print(f'Error: {e}') return None From 1b9de22f7839106fdd8deaa86fba04f5eb4c1b94 Mon Sep 17 00:00:00 2001 From: Nathan Na Date: Tue, 18 Nov 2025 22:07:27 +0000 Subject: [PATCH 2/2] docs: fix typos and improve formatting consistency --- .../3.efa-node-exporter/Dockerfile | 6 +++--- .../3.efa-node-exporter/buildspec.yaml | 1 - .../3.efa-node-exporter/docker-compose.yml | 11 +++++------ 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/4.validation_and_observability/3.efa-node-exporter/Dockerfile b/4.validation_and_observability/3.efa-node-exporter/Dockerfile index 6f42168e7..4df585027 100644 --- a/4.validation_and_observability/3.efa-node-exporter/Dockerfile +++ b/4.validation_and_observability/3.efa-node-exporter/Dockerfile @@ -5,7 +5,7 @@ ARG PROCFS_EXPORTER_VERSION=v0.19.2 # Install ProcFS RUN git clone -b $PROCFS_EXPORTER_VERSION https://github.com/prometheus/procfs.git /workspace/procfs -COPY class_amazon_efa.go /workspace/procfs/sysfs/ +COPY class_amazon_efa.go /workspace/procfs/sysfs/ RUN cd /workspace/procfs && make test # Install Node Exporter @@ -13,8 +13,8 @@ RUN git clone -b $NODE_EXPORTER_VERSION https://github.com/prometheus/node_expor COPY amazon_efa_linux.go /workspace/node_exporter/collector/ WORKDIR /workspace/node_exporter -RUN go mod edit --replace=github.com/prometheus/procfs=/workspace/procfs -RUN go mod tidy && CGO_ENABLED=0 go build -o /go/bin/node_exporter +RUN go mod edit --replace=github.com/prometheus/procfs=/workspace/procfs +RUN go mod tidy && CGO_ENABLED=0 go build -o /go/bin/node_exporter FROM gcr.io/distroless/static-debian12 COPY --from=build /go/bin/node_exporter /workspace/ diff --git a/4.validation_and_observability/3.efa-node-exporter/buildspec.yaml b/4.validation_and_observability/3.efa-node-exporter/buildspec.yaml index eca452511..5792fd47f 100644 --- a/4.validation_and_observability/3.efa-node-exporter/buildspec.yaml +++ b/4.validation_and_observability/3.efa-node-exporter/buildspec.yaml @@ -34,4 +34,3 @@ phases: - docker image tag ${REPO_URI}:${TAG} public.ecr.aws/hpc-cloud/${ECR_REPOSITORY_NAME}:latest - docker push public.ecr.aws/hpc-cloud/${ECR_REPOSITORY_NAME}:${TAG} - docker push public.ecr.aws/hpc-cloud/${ECR_REPOSITORY_NAME}:latest - diff --git a/4.validation_and_observability/3.efa-node-exporter/docker-compose.yml b/4.validation_and_observability/3.efa-node-exporter/docker-compose.yml index 4960062c9..0ad7a098f 100644 --- a/4.validation_and_observability/3.efa-node-exporter/docker-compose.yml +++ b/4.validation_and_observability/3.efa-node-exporter/docker-compose.yml @@ -1,7 +1,6 @@ -version: '2.1' +version: "2.1" services: - node_exporter_efa: build: . container_name: node_exporter_efa @@ -10,9 +9,9 @@ services: - /sys:/host/sys:ro - /:/rootfs:ro command: - - '--path.procfs=/host/proc' - - '--path.rootfs=/rootfs' - - '--path.sysfs=/host/sys' - - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' + - "--path.procfs=/host/proc" + - "--path.rootfs=/rootfs" + - "--path.sysfs=/host/sys" + - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)" restart: unless-stopped network_mode: host