From 86ceb5d666c4388db3f8215e58b37c35120485e7 Mon Sep 17 00:00:00 2001 From: Brian Roland Date: Sun, 31 Aug 2025 21:22:52 +0000 Subject: [PATCH 01/12] br: emacs Signed-off-by: Brian Roland --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 59ef90009..fba97c10a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -49,6 +49,7 @@ apt-get install -qyy \ curl \ pre-commit \ sudo \ + emacs-nox \ gnupg \ unzip \ libsqlite3-dev From 3d9e8b615ac6d14ac6e5b5f2f26c5ae386b40347 Mon Sep 17 00:00:00 2001 From: Brian Roland Date: Sun, 31 Aug 2025 22:51:45 +0000 Subject: [PATCH 02/12] br: printing output Signed-off-by: Brian Roland --- 3rdparty/NeMo | 2 +- sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/3rdparty/NeMo b/3rdparty/NeMo index 7ccb0d4c5..654437102 160000 --- a/3rdparty/NeMo +++ b/3rdparty/NeMo @@ -1 +1 @@ -Subproject commit 7ccb0d4c5544dbcc454930acb3a1fe29d9db5090 +Subproject commit 654437102f3ac09cd0ba69ae78d8f5c0576d8239 diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py index 4be3a8d0b..f04725669 100644 --- a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py +++ b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py @@ -630,7 +630,7 @@ def train(args: argparse.Namespace) -> nl.Trainer: lora_transform = None if args.lora_finetune: lora_transform = Evo2LoRA(peft_ckpt_path=args.lora_checkpoint_path) - + print("********************train: init llm.HyenaModel*******") model = llm.HyenaModel(model_config, tokenizer=data_module.tokenizer, model_transform=lora_transform) else: # mamba if args.no_weight_decay_embeddings: @@ -855,6 +855,7 @@ def train(args: argparse.Namespace) -> nl.Trainer: ), val_check_interval=args.val_check_interval, enable_checkpointing=args.create_checkpoint_callback, + enable_progress_bar=True, ) # Logger setup @@ -892,15 +893,18 @@ def train(args: argparse.Namespace) -> nl.Trainer: opt = MegatronOptimizerModule(opt_config, sched, no_weight_decay_cond=model_config.hyena_no_weight_decay_cond_fn) opt.connect(model) # Start training + print("*******************train: before trainer.fit") trainer.fit(model, data_module) + print("*******************train: after trainer.fit") return trainer def main(): """Parsing args and running evo2 training.""" args = parse_args() + print("*******************main: before train") train(args=args) - + print("*******************main: after train") if __name__ == "__main__": main() From 46b299059fced9f5a689e3c250b8cadb5cab89bb Mon Sep 17 00:00:00 2001 From: Brian Roland Date: Mon, 1 Sep 2025 17:34:52 +0000 Subject: [PATCH 03/12] br: added scripts [skip ci] Signed-off-by: Brian Roland --- internal/scripts_br/build_dev_image_br.sh | 37 +++++ internal/scripts_br/install_tap.sh | 14 ++ internal/scripts_br/run_dev_br.sh | 146 ++++++++++++++++++ internal/scripts_br/run_evo2_train.sh | 89 +++++++++++ internal/scripts_br/run_mig_br.sh | 46 ++++++ .../scripts_br/run_nsys_with_evo2_train.sh | 108 +++++++++++++ internal/scripts_br/run_precommit.sh | 43 ++++++ internal/scripts_br/run_pytest.sh | 51 ++++++ internal/scripts_br/run_simple_torch_app.py | 45 ++++++ .../scripts_br/run_tap_with_evo2_train.sh | 122 +++++++++++++++ .../scripts_br/run_update_git_submodules.sh | 2 + .../scripts_br/show_git_submodule_config.sh | 2 + internal/scripts_br/venv_create.sh | 67 ++++++++ 13 files changed, 772 insertions(+) create mode 100755 internal/scripts_br/build_dev_image_br.sh create mode 100755 internal/scripts_br/install_tap.sh create mode 100755 internal/scripts_br/run_dev_br.sh create mode 100755 internal/scripts_br/run_evo2_train.sh create mode 100755 internal/scripts_br/run_mig_br.sh create mode 100755 internal/scripts_br/run_nsys_with_evo2_train.sh create mode 100755 internal/scripts_br/run_precommit.sh create mode 100755 internal/scripts_br/run_pytest.sh create mode 100644 internal/scripts_br/run_simple_torch_app.py create mode 100755 internal/scripts_br/run_tap_with_evo2_train.sh create mode 100755 internal/scripts_br/run_update_git_submodules.sh create mode 100755 internal/scripts_br/show_git_submodule_config.sh create mode 100755 internal/scripts_br/venv_create.sh diff --git a/internal/scripts_br/build_dev_image_br.sh b/internal/scripts_br/build_dev_image_br.sh new file mode 100755 index 000000000..03eaa9aae --- /dev/null +++ b/internal/scripts_br/build_dev_image_br.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +# ------------------------------------------------------------------------ +# (0) preamble +# ------------------------------------------------------------------------ +MESSAGE_TEMPLATE='********build_dev_image_br.sh: %s\n' +DATE_OF_SCRIPT=$(date +'%Y%m%dT%H%M%S') +SCRIPT_DIR="$(dirname "$(realpath "$BASH_SOURCE")")" +printf "${MESSAGE_TEMPLATE}" "SCRIPT_DIR=${SCRIPT_DIR}" +printf "${MESSAGE_TEMPLATE}" "hostname=$(hostname)" +printf "${MESSAGE_TEMPLATE}" "whoami=$(whoami)" +printf "${MESSAGE_TEMPLATE}" "uid=$(id -u)" +printf "${MESSAGE_TEMPLATE}" "gid=$(id -g)" + +set -euo pipefail + +BRANCH=$(git rev-parse --abbrev-ref HEAD) +COMMIT=$(git rev-parse --short HEAD) +DATE=$(date --iso-8601=seconds -u) + +set -x +DOCKER_BUILDKIT=1 docker buildx build \ + -t "nvcr.io/nvidian/cvai_bnmo_trng/bionemo:dev-${BRANCH}-${DATE_OF_SCRIPT}-${COMMIT}" \ + --ulimit 'nofile=65535:65535' \ + --target="development" \ + --load \ + --cache-from nvcr.io/nvidia/clara/bionemo-framework:nightly \ + --cache-to type=inline \ + --label com.nvidia.bionemo.git_sha=${COMMIT} \ + --label com.nvidia.bionemo.created_at=${DATE} \ + -f ./Dockerfile \ + . + +# ---------------------- +# (-1) post-amble +# -------------------------- +printf "${MESSAGE_TEMPLATE}" "end script" \ No newline at end of file diff --git a/internal/scripts_br/install_tap.sh b/internal/scripts_br/install_tap.sh new file mode 100755 index 000000000..b8771128f --- /dev/null +++ b/internal/scripts_br/install_tap.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# prelim required by release version of TAP +mkdir -p /workspace/bionemo2/.local/lib/python3.12/site-packages/ +touch /workspace/bionemo2/.local/lib/python3.12/site-packages/usercustomize.py + +# install from gitlab server +pip install git+https://gitlab-master.nvidia.com/dl/gwe/torch_automated_profiler@release + +# RUN --mount=type=ssh cd /opt && git clone ssh://git@gitlab-master.nvidia.com:12051/dl/gwe/torch_automated_profiler.git\ +# && cd torch_automated_profiler\ +# && git fetch origin br_max_depth_1\ +# && git checkout -b br_max_depth_1 origin/br_max_depth_1\ +# && pip install -e . -v \ No newline at end of file diff --git a/internal/scripts_br/run_dev_br.sh b/internal/scripts_br/run_dev_br.sh new file mode 100755 index 000000000..3e66b1995 --- /dev/null +++ b/internal/scripts_br/run_dev_br.sh @@ -0,0 +1,146 @@ +#!/usr/bin/env bash + +# ------------------------------------------------------------------------ +# (0) preamble +# ------------------------------------------------------------------------ +MESSAGE_TEMPLATE='********run_dev_br.sh: %s\n' +DATE_OF_SCRIPT=$(date +'%Y%m%dT%H%M%S') +SCRIPT_DIR="$(dirname "$(realpath "$BASH_SOURCE")")" +printf "${MESSAGE_TEMPLATE}" "SCRIPT_DIR=${SCRIPT_DIR}" +printf "${MESSAGE_TEMPLATE}" "hostname=$(hostname)" +printf "${MESSAGE_TEMPLATE}" "whoami=$(whoami)" +printf "${MESSAGE_TEMPLATE}" "uid=$(id -u)" +printf "${MESSAGE_TEMPLATE}" "gid=$(id -g)" + + +#set -euo pipefail + +source .env + + +# ----------------------------------------------------- +# (1) user paramerters +# ----------------------------------------------------- +USER_IN_CTR=root # if profiling, run as root +HOME_IN_CTR=/opt/${USER_IN_CTR} + +#GPU_ARG='--gpus "\"device=0,1,2,3,4,5,6,7\""' +GPU_ARG='--gpus all' +LOCAL_RESULTS_PATH="/home/scratch.broland_sw_1/data_for_projects/evo2/results/bionemo2_results" +LOCAL_DATA_PATH="./data" +LOCAL_MODELS_PATH="./models" + +COMMIT_AT_START=$(git rev-parse --short HEAD) +BRANCH_AT_START=$(git rev-parse --abbrev-ref HEAD) +IMAGE_REPO='nvcr.io/nvidian/cvai_bnmo_trng/bionemo' +IMAGE_TAG='dev-br_bnm2532_dlsim_val_in_fw_a-20250831T164028-a29272f1' +IMAGE_NAME="${IMAGE_REPO}:${IMAGE_TAG}" + +DOCKER_REPO_PATH="/workspace/bionemo2" +DOCKER_RESULTS_PATH="/workspace/bionemo2/results" +DOCKER_MODELS_PATH="/workspace/bionemo2/models" +DOCKER_DATA_PATH="/workspace/bionemo2/data" + +# ----------------------------------------------------- +# (2) santity checks +# ---------------------------------------------------- +LOCAL_REPO_PATH="$(realpath $(pwd))" +if [[ "$(basename ${LOCAL_REPO_PATH})" != *"bionemo-framework"* ]]; then + echo "ERROR: must run this script from the bionemo repository root!" + exit 1 +fi + +# --------------------------------------------------------------------- +# (3) make expected directories in external filesystem as user, not as docker +# ---------------------------------------------------- +expected_local_dirs=("${LOCAL_RESULTS_PATH}" "${LOCAL_DATA_PATH}" "${LOCAL_MODELS_PATH}" "./htmlcov") +for expected_local_dir in "${expected_local_dirs[@]}"; do + printf "${MESSAGE_TEMPLATE}" "expected_local_dir=${expected_local_dir}" + mkdir -p "${expected_local_dir}" + chmod -R a+rw "${expected_local_dir}" +done + +# --------------------------------------------------------------------- +# (4) delete external directories with state +# ---------------------------------------------------- +sudo rm -rf ${LOCAL_RESULTS_PATH}/evo2 + +# ----------------------------------------------------- +# (5) assemble docker run command +# ---------------------------------------------------- + +printf "${MESSAGE_TEMPLATE}" "create DOCKER_RUN_COMMAND" + +read -r -d '' DOCKER_RUN_OPTIONS_FOR_PROFILING <&1 | tee -a ${LOG_FILE} +EOF + +printf "${MESSAGE_TEMPLATE}" "PY_COMMAND=${PY_COMMAND}" +eval "${PY_COMMAND}" + +# ---------------------------------------- +# (-1) post-amble +# ---------------------------------------- +printf "${MESSAGE_TEMPLATE}" "LOG_FILE=${LOG_FILE}" +printf "${MESSAGE_TEMPLATE}" "end with success" \ No newline at end of file diff --git a/internal/scripts_br/run_mig_br.sh b/internal/scripts_br/run_mig_br.sh new file mode 100755 index 000000000..446615479 --- /dev/null +++ b/internal/scripts_br/run_mig_br.sh @@ -0,0 +1,46 @@ +#!/bin/bash + + +# profiles are specified to each GPU, e.g. profile 15 can be used to divide into 4 devices of size 20gb +# NVIDIA H100 80GB HBM3 +#| => sudo nvidia-smi mig -i 3 -cgi 15 -C +#Successfully created GPU instance ID 5 on GPU 3 using profile MIG 1g.20gb (ID 15) +#Successfully created compute instance ID 0 on GPU 3 GPU instance ID 5 using profile MIG 1g.20gb (ID 7) + + + + +# GPU 0: NVIDIA H100 80GB HBM3 (UUID: GPU-afddd1b4-4464-96c8-a712-aaeb0acf1170) # cudo 0 on torch +# GPU 1: NVIDIA H100 80GB HBM3 (UUID: GPU-6faf0136-7870-5767-10be-a0827a158829) +# GPU 2: NVIDIA H100 80GB HBM3 (UUID: GPU-20d20fc3-bcc7-e715-32d6-ffd646ea062f) +# GPU 3: NVIDIA H100 80GB HBM3 (UUID: GPU-182e6bd5-b7ac-e0a6-48cf-96e198063dd3) +# MIG 1g.20gb Device 0: (UUID: MIG-56679450-0984-50db-83a3-7e549eb60883) # cudo 4 on torch +# MIG 1g.20gb Device 1: (UUID: MIG-a155b8d5-2484-52fc-a2ed-e47dc89996cd) +# MIG 1g.20gb Device 2: (UUID: MIG-9dc27b3c-b567-5802-a2a7-27ad657ab079) +# MIG 1g.20gb Device 3: (UUID: MIG-f6102e7f-bbf5-5db4-abea-156619dd4ce2) + + + +# Split into to 40gb device sudo nvidia-smi mig -i 5 -cgi 5,5 + +# (0) choose a device +DEVICE_INDEX_FOR_MIG=1 +PROFILE=15 +PROFILE=9 # + +# (1) show all gpu indices, uuids, and product names +nvidia-smi -L + +# (2) list all MIG instances +sudo nvidia-smi mig -lgi + +# (3) activate multi-instance gpu for +sudo nvidia-smi --id ${DEVICE_INDEX_FOR_MIG} -mig 1 + +# split device with index 0 into 3 compute instances +for i in {0..3}; do + sudo nvidia-smi mig --id ${DEVICE_INDEX_FOR_MIG} -cgi ${PROFILE} -C +done + +# show all gpu indices, uuids, and produce names +nvidia-smi -L diff --git a/internal/scripts_br/run_nsys_with_evo2_train.sh b/internal/scripts_br/run_nsys_with_evo2_train.sh new file mode 100755 index 000000000..e59cf0fb8 --- /dev/null +++ b/internal/scripts_br/run_nsys_with_evo2_train.sh @@ -0,0 +1,108 @@ +#!/bin/bash +# +# +# nsys option like --pytorch function-trace: +# nvtx markers for common torch operations at the pytorch level like torch.Tensor.to +# +# nsys option like --pytorch autograd-shapes-nvtx: +# nvtx markers for common torch operations at the kernel level like "to", "to_copy" +# + + +# ---------------------------------------- +# (0) preamble +# ---------------------------------------- +MESSAGE_TEMPLATE='********run_evo2_train.sh: %s\n' +DATE_OF_SCRIPT=$(date +'%Y%m%dT%H%M') +SCRIPT_DIR="$(dirname "$(realpath "$BASH_SOURCE")")" +printf "${MESSAGE_TEMPLATE}" "begin" +printf "${MESSAGE_TEMPLATE}" "DATE_OF_SCRIPT=${DATE_OF_SCRIPT}" + +# ---------------------------------------- +# (1) set some user parameters +# ---------------------------------------- +RESULTS_DIR="./results/run_nsys_with_evo2_train" + +read -r -d '' NSYS_PROFILE_OPTIONS < ReLU -> Linear +class SimpleModel(nn.Module): + def __init__(self, input_size=10, hidden_size=20, output_size=5): + super(SimpleModel, self).__init__() + self.fc1 = nn.Linear(input_size, hidden_size) + self.fc2 = nn.Linear(hidden_size, output_size) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = self.fc2(x) + return x + +# Instantiate the model +model = SimpleModel() + +# Generate random input data (batch_size=4, input_size=10) +x = torch.randn(4, 10) + + +with torch.profiler.profile( + activities=[torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA], + on_trace_ready=torch.profiler.tensorboard_trace_handler("./log"), + record_shapes=True, + with_stack=True +) as prof: + for _ in range(5): + output = model(x) + + print("Input:", x) + print("Output:", output) + +print( + prof.key_averages(group_by_input_shape=True).table( + sort_by="cuda_time_total", row_limit=10 + ) +) diff --git a/internal/scripts_br/run_tap_with_evo2_train.sh b/internal/scripts_br/run_tap_with_evo2_train.sh new file mode 100755 index 000000000..f22a8fb36 --- /dev/null +++ b/internal/scripts_br/run_tap_with_evo2_train.sh @@ -0,0 +1,122 @@ +#!/bin/bash +# +# +# nsys option like --pytorch function-trace: +# nvtx markers for common torch operations at the pytorch level like torch.Tensor.to +# +# nsys option like --pytorch autograd-shapes-nvtx: +# nvtx markers for common torch operations at the kernel level like "to", "to_copy" +# + + +# ---------------------------------------- +# (0) preamble +# ---------------------------------------- +MESSAGE_TEMPLATE='********run_evo2_train.sh: %s\n' +DATE_OF_SCRIPT=$(date +'%Y%m%dT%H%M') +SCRIPT_DIR="$(dirname "$(realpath "$BASH_SOURCE")")" +printf "${MESSAGE_TEMPLATE}" "begin" +printf "${MESSAGE_TEMPLATE}" "DATE_OF_SCRIPT=${DATE_OF_SCRIPT}" + +# ---------------------------------------- +# (1) set some user parameters +# ---------------------------------------- +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export TAP_NSIGHT_LOCATION='/usr/local/cuda/bin/nsys' +export TAP_LOG_LEVEL='debug' +export TAP_MODE='nsight' # '', nsight, or anna..... '' means ignore all profiling +export TAP_NVTX='pytorch' # pytorch, apex, python +export TAP_BACKWARD_NVTX='false' # true or false +export TAP_PROFILE_MEMORY='false' +export TAP_WAIT_STEPS='1' # 2 is my default +export TAP_WARMUP_STEPS='1' # 12 is my default, +export TAP_ACTIVE_STEPS='4' # 1 is my default +export TAP_WAIT_EPOCHS='1' +#!/usr/bin/env sh + + +export TAP_EXIT_ON_STOP=true + +#export APP_NVTX_CATEGORIES='main,lit_module,dataset' +export TAP_NSIGHT_FLAGS='--trace nvtx,cuda' +#export TAP_MAX_DEPTH=14 # minimal is 4, since torch compile adds a ldevel, default is 14 + + +RESULTS_DIR="./results/run_tap_with_evo2_train" + +TRAIN_ARGS_ARRAY=( + "--mock-data" + "--seq-length" + "256" + "--micro-batch-size" + "1" + "--model-size" + "test" + "--max-steps" + "30" + "--context-parallel-size" + "1" + "--devices" + "1" + "--val-check-interval" "0" +) + +RUN_LABEL_PREFIX="tap_bionemo_evo2_train" +PYTHON_SCRIPT_PATH=sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py + +# ---------------------------------------- +# (2) dump parameters +# ---------------------------------------- +printf "${MESSAGE_TEMPLATE}" "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}" + +# ---------------------------------------- +# (3) create output dir names and file names +# - create log file name and report filename +# ---------------------------------------- +run_label_arr=( + ${RUN_LABEL_PREFIX} + "mock-data" + ${BRANCH_AT_START} + ${DATE_OF_SCRIPT} + ${COMMIT_AT_START} +) +RUN_LABEL="$(IFS='_'; echo "${run_label_arr[*]}")" + +RESULTS_SUBDIR="${RESULTS_DIR}/${RUN_LABEL}" +export TAP_SAVE_DIR="${RESULTS_SUBDIR}" +mkdir -p "${RESULTS_SUBDIR}" +chmod a+r "${RESULTS_SUBDIR}" + +LOG_FILE="${RESULTS_SUBDIR}/${RUN_LABEL}.log" +REPORT_FILE="${RESULTS_SUBDIR}/${RUN_LABEL}.nsys-rep" + +# ---------------------------------------- +# (4) create command +# ---------------------------------------- +APPLICATION_TO_PROFILE="python ${PYTHON_SCRIPT_PATH} ${TRAIN_ARGS_ARRAY[@]}" +#APPLICATION_TO_PROFILE="python -c 'import torch; x= torch.ones(500)'" + +read -r -d '' TAP_PROFILE_CMD <&1 | tee -a ${LOG_FILE} +EOF +# ---------------------------------------- +# (5) run command +# ---------------------------------------- +printf "\n" +printf "${MESSAGE_TEMPLATE}" "nsys version: $(nsys --version)" + +printf "\n" +printf "${MESSAGE_TEMPLATE}" "APPLICATION_TO_PROFILE=${APPLICATION_TO_PROFILE}" +printf "\n" +printf "${MESSAGE_TEMPLATE}" "TAP_PROFILE_CMD" +echo "${TAP_PROFILE_CMD}" +eval "${TAP_PROFILE_CMD}" + +if [[ -f '/tmp/.tap_dummy_nsight_report.nsys-rep' ]]; then + cp /tmp/.tap_dummy_nsight_report.nsys-rep ${RESULTS_SUBDIR}/tap_dummy_nsight_report.nsys-rep +fi +# ---------------------------------------- +# (-1) post-amble +# ---------------------------------------- +printf "${MESSAGE_TEMPLATE}" "TAP_SAVE_DIR=${TAP_SAVE_DIR}" +printf "${MESSAGE_TEMPLATE}" "end script" \ No newline at end of file diff --git a/internal/scripts_br/run_update_git_submodules.sh b/internal/scripts_br/run_update_git_submodules.sh new file mode 100755 index 000000000..1b0c74c98 --- /dev/null +++ b/internal/scripts_br/run_update_git_submodules.sh @@ -0,0 +1,2 @@ +#!/bin/bash +git submodule update --init --recursive \ No newline at end of file diff --git a/internal/scripts_br/show_git_submodule_config.sh b/internal/scripts_br/show_git_submodule_config.sh new file mode 100755 index 000000000..1289082cd --- /dev/null +++ b/internal/scripts_br/show_git_submodule_config.sh @@ -0,0 +1,2 @@ +#!/bin/bash +git config --local --get-regexp submodule \ No newline at end of file diff --git a/internal/scripts_br/venv_create.sh b/internal/scripts_br/venv_create.sh new file mode 100755 index 000000000..80a20d63b --- /dev/null +++ b/internal/scripts_br/venv_create.sh @@ -0,0 +1,67 @@ +#!/bin/bash +# +# title: virtual_env_create.sh +# usage: +# cd ; ./scripts/virtual_env_create.sh +# +# create a virtual environment for the benchmarking repo +# +MESSAGE_TEMPLATE='********virtual_env_create.sh: %s\n' +DATE_OF_SCRIPT=$(date +'%Y%m%dT%H%M%S') +SCRIPT_DIR="$(dirname "$(realpath "$BASH_SOURCE")")" +GIT_BRANCH=$(git branch --show-current) + +# ----------------------------------------------- +# (1) set script-level parameters +# ------------------------------------------------ +ENV_DIR=./venv_bionemo_fw + +# ----------------------------------------------- +# (2) create venv +# ------------------------------------------------ +printf "${MESSAGE_TEMPLATE}" "attempt to created a virtual env in directory ${ENV_DIR}" + +# -------------------------------------------------- +# on computelab run +# (1) cannot run as sudo +# ------------------------------------------------- +#apt update +#apt install -y python3 python3-pip python3.10-venv + + +sudo add-apt-repository ppa:deadsnakes/ppa -y +sudo apt update +sudo apt install python3.13 python3.13-venv python3.13-dev + + +python3 -m venv ${ENV_DIR} + + +# ----------------------------------------------- +# (3) enter venv +# ------------------------------------------------ +source ${ENV_DIR}/bin/activate +printf "${MESSAGE_TEMPLATE}" "you are in virtual env in directory ${ENV_DIR}" + +# ----------------------------------------------- +# (3) install pip to virtual environment +# ------------------------------------------------ +if [[ "$(hostname)" == *viking-prod* ]]; then + printf "${MESSAGE_TEMPLATE}" "installing pip inside virtual-environment on viking host" + sudo apt update + sudo apt install -y python3-pip +else + printf "${MESSAGE_TEMPLATE}" "installing pip inside virtual-environment" + apt update python3-pip python3.10-venv + apt install -y python3-pip +fi + + +# ----------------------------------------------- +# (4) pip install +# ------------------------------------------------ +pip install pre-commit==4.1.0 + +printf "${MESSAGE_TEMPLATE}" "to enter this virtual env, source ${ENV_DIR}/bin/activate" +printf "${MESSAGE_TEMPLATE}" "to exit this virtual env, deactivate" +printf "${MESSAGE_TEMPLATE}" "exiting" \ No newline at end of file From 860b557b0a2bf2c36d7380b6239e5e587b3661df Mon Sep 17 00:00:00 2001 From: Brian Roland Date: Mon, 1 Sep 2025 21:19:50 +0000 Subject: [PATCH 04/12] br: [skip ci] Signed-off-by: Brian Roland --- 3rdparty/NeMo | 2 +- ..._train.sh => run_evo2_train_to_profile.sh} | 20 +++++++++++++++---- .../src/bionemo/evo2/run/train.py | 6 ++++++ 3 files changed, 23 insertions(+), 5 deletions(-) rename internal/scripts_br/{run_evo2_train.sh => run_evo2_train_to_profile.sh} (77%) diff --git a/3rdparty/NeMo b/3rdparty/NeMo index 654437102..ea54a221a 160000 --- a/3rdparty/NeMo +++ b/3rdparty/NeMo @@ -1 +1 @@ -Subproject commit 654437102f3ac09cd0ba69ae78d8f5c0576d8239 +Subproject commit ea54a221a495b4dd8afae3993a18167bac42ba3f diff --git a/internal/scripts_br/run_evo2_train.sh b/internal/scripts_br/run_evo2_train_to_profile.sh similarity index 77% rename from internal/scripts_br/run_evo2_train.sh rename to internal/scripts_br/run_evo2_train_to_profile.sh index 93b057f20..1edbd1365 100755 --- a/internal/scripts_br/run_evo2_train.sh +++ b/internal/scripts_br/run_evo2_train_to_profile.sh @@ -7,15 +7,21 @@ MESSAGE_TEMPLATE='********run_evo2_train.sh: %s\n' DATE_OF_SCRIPT=$(date +'%Y%m%dT%H%M') WHOAMI="$(whoami)" SCRIPT_DIR="$(dirname "$(realpath "$BASH_SOURCE")")" +LIT_VERSION=$(pip show lightning | grep Version) +TORCH_VERSION=$(pip show torch | grep Version) +PYTHON_VERSION=$(python --version | grep Python) printf "${MESSAGE_TEMPLATE}" "begin" printf "${MESSAGE_TEMPLATE}" "DATE_OF_SCRIPT=${DATE_OF_SCRIPT}" printf "${MESSAGE_TEMPLATE}" "WHOAMI=${WHOAMI}" +printf "${MESSAGE_TEMPLATE}" "LIT_VERSION=${LIT_VERSION}" +printf "${MESSAGE_TEMPLATE}" "TORCH_VERSION=${TORCH_VERSION}" +printf "${MESSAGE_TEMPLATE}" "PYTHON_VERSION=${PYTHON_VERSION}" # ---------------------------------------- # (1) set some user parameters # ---------------------------------------- RESULTS_DIR="./results" # i.e. /workspace/bionemo2/results -RESULTS_THIS_APP_DIR="${RESULTS_DIR}/run_evo2_train" +RESULTS_THIS_APP_DIR="${RESULTS_DIR}/run_evo2_train_to_profile" RUN_LABEL_PREFIX="bionemo_evo2_train" PYTHON_SCRIPT_PATH=sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py @@ -29,7 +35,7 @@ TRAIN_ARGS_ARRAY=( "--model-size" "test" "--max-steps" - "40" + "1" "--context-parallel-size" "1" "--devices" @@ -66,12 +72,13 @@ RUN_LABEL=$(IFS='_'; echo "${run_label_arr[*]}") printf "${MESSAGE_TEMPLATE}" "RUN_LABEL=${RUN_LABEL}" RESULTS_THIS_APP_THIS_RUN_DIR="${RESULTS_THIS_APP_DIR}/${RUN_LABEL}" +export RESULTS_THIS_APP_THIS_RUN_DIR=${RESULTS_THIS_APP_THIS_RUN_DIR} + mkdir -p ${RESULTS_THIS_APP_THIS_RUN_DIR} chmod a+rw ${RESULTS_THIS_APP_THIS_RUN_DIR} LOG_FILE="${RESULTS_THIS_APP_THIS_RUN_DIR}/${RUN_LABEL}.log" - - +export BNM_MODULE_HOOK_MANAGER_RESULTS_DIR=${RESULTS_THIS_APP_THIS_RUN_DIR} # ---------------------------------------- # (5) create python training script comman # --------------------------------------- @@ -85,5 +92,10 @@ eval "${PY_COMMAND}" # ---------------------------------------- # (-1) post-amble # ---------------------------------------- +printf "${MESSAGE_TEMPLATE}" "script summary:" printf "${MESSAGE_TEMPLATE}" "LOG_FILE=${LOG_FILE}" +printf "${MESSAGE_TEMPLATE}" "LIT_VERSION=${LIT_VERSION}" +printf "${MESSAGE_TEMPLATE}" "TORCH_VERSION=${TORCH_VERSION}" +printf "${MESSAGE_TEMPLATE}" "PYTHON_VERSION=${PYTHON_VERSION}" + printf "${MESSAGE_TEMPLATE}" "end with success" \ No newline at end of file diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py index f04725669..b6716514c 100644 --- a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py +++ b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py @@ -55,6 +55,8 @@ from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger +from bionemo.evo2.utils.logging.bnm_module_hook_manager import BNM_MODULE_HOOK_HANDLES + torch._dynamo.config.suppress_errors = True @@ -894,7 +896,11 @@ def train(args: argparse.Namespace) -> nl.Trainer: opt.connect(model) # Start training print("*******************train: before trainer.fit") + print(f"*************type(model.modules)={type(model.modules())}*********") trainer.fit(model, data_module) + for h in BNM_MODULE_HOOK_HANDLES: + h.remove() + print(f"*************type(model.module)={type(model.module)}*********") print("*******************train: after trainer.fit") return trainer From 32d5345961b9484ace0af7dc0c4865850fa4a683 Mon Sep 17 00:00:00 2001 From: Brian Roland Date: Tue, 2 Sep 2025 15:47:38 +0000 Subject: [PATCH 05/12] br: [skip ci] Signed-off-by: Brian Roland --- internal/scripts_br/build_dev_image_br.sh | 37 ---- internal/scripts_br/install_tap.sh | 14 -- internal/scripts_br/run_dev_br.sh | 146 --------------- .../scripts_br/run_evo2_train_to_profile.sh | 101 ----------- internal/scripts_br/run_mig_br.sh | 46 ----- .../scripts_br/run_nsys_with_evo2_train.sh | 108 ----------- internal/scripts_br/run_precommit.sh | 43 ----- internal/scripts_br/run_pytest.sh | 51 ------ internal/scripts_br/run_simple_torch_app.py | 45 ----- .../scripts_br/run_tap_with_evo2_train.sh | 122 ------------- .../scripts_br/run_update_git_submodules.sh | 2 - .../scripts_br/show_git_submodule_config.sh | 2 - internal/scripts_br/venv_create.sh | 67 ------- .../utils/logging/bnm_module_hook_manager.py | 171 ++++++++++++++++++ 14 files changed, 171 insertions(+), 784 deletions(-) delete mode 100755 internal/scripts_br/build_dev_image_br.sh delete mode 100755 internal/scripts_br/install_tap.sh delete mode 100755 internal/scripts_br/run_dev_br.sh delete mode 100755 internal/scripts_br/run_evo2_train_to_profile.sh delete mode 100755 internal/scripts_br/run_mig_br.sh delete mode 100755 internal/scripts_br/run_nsys_with_evo2_train.sh delete mode 100755 internal/scripts_br/run_precommit.sh delete mode 100755 internal/scripts_br/run_pytest.sh delete mode 100644 internal/scripts_br/run_simple_torch_app.py delete mode 100755 internal/scripts_br/run_tap_with_evo2_train.sh delete mode 100755 internal/scripts_br/run_update_git_submodules.sh delete mode 100755 internal/scripts_br/show_git_submodule_config.sh delete mode 100755 internal/scripts_br/venv_create.sh create mode 100644 sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py diff --git a/internal/scripts_br/build_dev_image_br.sh b/internal/scripts_br/build_dev_image_br.sh deleted file mode 100755 index 03eaa9aae..000000000 --- a/internal/scripts_br/build_dev_image_br.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env bash - -# ------------------------------------------------------------------------ -# (0) preamble -# ------------------------------------------------------------------------ -MESSAGE_TEMPLATE='********build_dev_image_br.sh: %s\n' -DATE_OF_SCRIPT=$(date +'%Y%m%dT%H%M%S') -SCRIPT_DIR="$(dirname "$(realpath "$BASH_SOURCE")")" -printf "${MESSAGE_TEMPLATE}" "SCRIPT_DIR=${SCRIPT_DIR}" -printf "${MESSAGE_TEMPLATE}" "hostname=$(hostname)" -printf "${MESSAGE_TEMPLATE}" "whoami=$(whoami)" -printf "${MESSAGE_TEMPLATE}" "uid=$(id -u)" -printf "${MESSAGE_TEMPLATE}" "gid=$(id -g)" - -set -euo pipefail - -BRANCH=$(git rev-parse --abbrev-ref HEAD) -COMMIT=$(git rev-parse --short HEAD) -DATE=$(date --iso-8601=seconds -u) - -set -x -DOCKER_BUILDKIT=1 docker buildx build \ - -t "nvcr.io/nvidian/cvai_bnmo_trng/bionemo:dev-${BRANCH}-${DATE_OF_SCRIPT}-${COMMIT}" \ - --ulimit 'nofile=65535:65535' \ - --target="development" \ - --load \ - --cache-from nvcr.io/nvidia/clara/bionemo-framework:nightly \ - --cache-to type=inline \ - --label com.nvidia.bionemo.git_sha=${COMMIT} \ - --label com.nvidia.bionemo.created_at=${DATE} \ - -f ./Dockerfile \ - . - -# ---------------------- -# (-1) post-amble -# -------------------------- -printf "${MESSAGE_TEMPLATE}" "end script" \ No newline at end of file diff --git a/internal/scripts_br/install_tap.sh b/internal/scripts_br/install_tap.sh deleted file mode 100755 index b8771128f..000000000 --- a/internal/scripts_br/install_tap.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# prelim required by release version of TAP -mkdir -p /workspace/bionemo2/.local/lib/python3.12/site-packages/ -touch /workspace/bionemo2/.local/lib/python3.12/site-packages/usercustomize.py - -# install from gitlab server -pip install git+https://gitlab-master.nvidia.com/dl/gwe/torch_automated_profiler@release - -# RUN --mount=type=ssh cd /opt && git clone ssh://git@gitlab-master.nvidia.com:12051/dl/gwe/torch_automated_profiler.git\ -# && cd torch_automated_profiler\ -# && git fetch origin br_max_depth_1\ -# && git checkout -b br_max_depth_1 origin/br_max_depth_1\ -# && pip install -e . -v \ No newline at end of file diff --git a/internal/scripts_br/run_dev_br.sh b/internal/scripts_br/run_dev_br.sh deleted file mode 100755 index 3e66b1995..000000000 --- a/internal/scripts_br/run_dev_br.sh +++ /dev/null @@ -1,146 +0,0 @@ -#!/usr/bin/env bash - -# ------------------------------------------------------------------------ -# (0) preamble -# ------------------------------------------------------------------------ -MESSAGE_TEMPLATE='********run_dev_br.sh: %s\n' -DATE_OF_SCRIPT=$(date +'%Y%m%dT%H%M%S') -SCRIPT_DIR="$(dirname "$(realpath "$BASH_SOURCE")")" -printf "${MESSAGE_TEMPLATE}" "SCRIPT_DIR=${SCRIPT_DIR}" -printf "${MESSAGE_TEMPLATE}" "hostname=$(hostname)" -printf "${MESSAGE_TEMPLATE}" "whoami=$(whoami)" -printf "${MESSAGE_TEMPLATE}" "uid=$(id -u)" -printf "${MESSAGE_TEMPLATE}" "gid=$(id -g)" - - -#set -euo pipefail - -source .env - - -# ----------------------------------------------------- -# (1) user paramerters -# ----------------------------------------------------- -USER_IN_CTR=root # if profiling, run as root -HOME_IN_CTR=/opt/${USER_IN_CTR} - -#GPU_ARG='--gpus "\"device=0,1,2,3,4,5,6,7\""' -GPU_ARG='--gpus all' -LOCAL_RESULTS_PATH="/home/scratch.broland_sw_1/data_for_projects/evo2/results/bionemo2_results" -LOCAL_DATA_PATH="./data" -LOCAL_MODELS_PATH="./models" - -COMMIT_AT_START=$(git rev-parse --short HEAD) -BRANCH_AT_START=$(git rev-parse --abbrev-ref HEAD) -IMAGE_REPO='nvcr.io/nvidian/cvai_bnmo_trng/bionemo' -IMAGE_TAG='dev-br_bnm2532_dlsim_val_in_fw_a-20250831T164028-a29272f1' -IMAGE_NAME="${IMAGE_REPO}:${IMAGE_TAG}" - -DOCKER_REPO_PATH="/workspace/bionemo2" -DOCKER_RESULTS_PATH="/workspace/bionemo2/results" -DOCKER_MODELS_PATH="/workspace/bionemo2/models" -DOCKER_DATA_PATH="/workspace/bionemo2/data" - -# ----------------------------------------------------- -# (2) santity checks -# ---------------------------------------------------- -LOCAL_REPO_PATH="$(realpath $(pwd))" -if [[ "$(basename ${LOCAL_REPO_PATH})" != *"bionemo-framework"* ]]; then - echo "ERROR: must run this script from the bionemo repository root!" - exit 1 -fi - -# --------------------------------------------------------------------- -# (3) make expected directories in external filesystem as user, not as docker -# ---------------------------------------------------- -expected_local_dirs=("${LOCAL_RESULTS_PATH}" "${LOCAL_DATA_PATH}" "${LOCAL_MODELS_PATH}" "./htmlcov") -for expected_local_dir in "${expected_local_dirs[@]}"; do - printf "${MESSAGE_TEMPLATE}" "expected_local_dir=${expected_local_dir}" - mkdir -p "${expected_local_dir}" - chmod -R a+rw "${expected_local_dir}" -done - -# --------------------------------------------------------------------- -# (4) delete external directories with state -# ---------------------------------------------------- -sudo rm -rf ${LOCAL_RESULTS_PATH}/evo2 - -# ----------------------------------------------------- -# (5) assemble docker run command -# ---------------------------------------------------- - -printf "${MESSAGE_TEMPLATE}" "create DOCKER_RUN_COMMAND" - -read -r -d '' DOCKER_RUN_OPTIONS_FOR_PROFILING <&1 | tee -a ${LOG_FILE} -EOF - -printf "${MESSAGE_TEMPLATE}" "PY_COMMAND=${PY_COMMAND}" -eval "${PY_COMMAND}" - -# ---------------------------------------- -# (-1) post-amble -# ---------------------------------------- -printf "${MESSAGE_TEMPLATE}" "script summary:" -printf "${MESSAGE_TEMPLATE}" "LOG_FILE=${LOG_FILE}" -printf "${MESSAGE_TEMPLATE}" "LIT_VERSION=${LIT_VERSION}" -printf "${MESSAGE_TEMPLATE}" "TORCH_VERSION=${TORCH_VERSION}" -printf "${MESSAGE_TEMPLATE}" "PYTHON_VERSION=${PYTHON_VERSION}" - -printf "${MESSAGE_TEMPLATE}" "end with success" \ No newline at end of file diff --git a/internal/scripts_br/run_mig_br.sh b/internal/scripts_br/run_mig_br.sh deleted file mode 100755 index 446615479..000000000 --- a/internal/scripts_br/run_mig_br.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - - -# profiles are specified to each GPU, e.g. profile 15 can be used to divide into 4 devices of size 20gb -# NVIDIA H100 80GB HBM3 -#| => sudo nvidia-smi mig -i 3 -cgi 15 -C -#Successfully created GPU instance ID 5 on GPU 3 using profile MIG 1g.20gb (ID 15) -#Successfully created compute instance ID 0 on GPU 3 GPU instance ID 5 using profile MIG 1g.20gb (ID 7) - - - - -# GPU 0: NVIDIA H100 80GB HBM3 (UUID: GPU-afddd1b4-4464-96c8-a712-aaeb0acf1170) # cudo 0 on torch -# GPU 1: NVIDIA H100 80GB HBM3 (UUID: GPU-6faf0136-7870-5767-10be-a0827a158829) -# GPU 2: NVIDIA H100 80GB HBM3 (UUID: GPU-20d20fc3-bcc7-e715-32d6-ffd646ea062f) -# GPU 3: NVIDIA H100 80GB HBM3 (UUID: GPU-182e6bd5-b7ac-e0a6-48cf-96e198063dd3) -# MIG 1g.20gb Device 0: (UUID: MIG-56679450-0984-50db-83a3-7e549eb60883) # cudo 4 on torch -# MIG 1g.20gb Device 1: (UUID: MIG-a155b8d5-2484-52fc-a2ed-e47dc89996cd) -# MIG 1g.20gb Device 2: (UUID: MIG-9dc27b3c-b567-5802-a2a7-27ad657ab079) -# MIG 1g.20gb Device 3: (UUID: MIG-f6102e7f-bbf5-5db4-abea-156619dd4ce2) - - - -# Split into to 40gb device sudo nvidia-smi mig -i 5 -cgi 5,5 - -# (0) choose a device -DEVICE_INDEX_FOR_MIG=1 -PROFILE=15 -PROFILE=9 # - -# (1) show all gpu indices, uuids, and product names -nvidia-smi -L - -# (2) list all MIG instances -sudo nvidia-smi mig -lgi - -# (3) activate multi-instance gpu for -sudo nvidia-smi --id ${DEVICE_INDEX_FOR_MIG} -mig 1 - -# split device with index 0 into 3 compute instances -for i in {0..3}; do - sudo nvidia-smi mig --id ${DEVICE_INDEX_FOR_MIG} -cgi ${PROFILE} -C -done - -# show all gpu indices, uuids, and produce names -nvidia-smi -L diff --git a/internal/scripts_br/run_nsys_with_evo2_train.sh b/internal/scripts_br/run_nsys_with_evo2_train.sh deleted file mode 100755 index e59cf0fb8..000000000 --- a/internal/scripts_br/run_nsys_with_evo2_train.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/bash -# -# -# nsys option like --pytorch function-trace: -# nvtx markers for common torch operations at the pytorch level like torch.Tensor.to -# -# nsys option like --pytorch autograd-shapes-nvtx: -# nvtx markers for common torch operations at the kernel level like "to", "to_copy" -# - - -# ---------------------------------------- -# (0) preamble -# ---------------------------------------- -MESSAGE_TEMPLATE='********run_evo2_train.sh: %s\n' -DATE_OF_SCRIPT=$(date +'%Y%m%dT%H%M') -SCRIPT_DIR="$(dirname "$(realpath "$BASH_SOURCE")")" -printf "${MESSAGE_TEMPLATE}" "begin" -printf "${MESSAGE_TEMPLATE}" "DATE_OF_SCRIPT=${DATE_OF_SCRIPT}" - -# ---------------------------------------- -# (1) set some user parameters -# ---------------------------------------- -RESULTS_DIR="./results/run_nsys_with_evo2_train" - -read -r -d '' NSYS_PROFILE_OPTIONS < ReLU -> Linear -class SimpleModel(nn.Module): - def __init__(self, input_size=10, hidden_size=20, output_size=5): - super(SimpleModel, self).__init__() - self.fc1 = nn.Linear(input_size, hidden_size) - self.fc2 = nn.Linear(hidden_size, output_size) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = self.fc2(x) - return x - -# Instantiate the model -model = SimpleModel() - -# Generate random input data (batch_size=4, input_size=10) -x = torch.randn(4, 10) - - -with torch.profiler.profile( - activities=[torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA], - on_trace_ready=torch.profiler.tensorboard_trace_handler("./log"), - record_shapes=True, - with_stack=True -) as prof: - for _ in range(5): - output = model(x) - - print("Input:", x) - print("Output:", output) - -print( - prof.key_averages(group_by_input_shape=True).table( - sort_by="cuda_time_total", row_limit=10 - ) -) diff --git a/internal/scripts_br/run_tap_with_evo2_train.sh b/internal/scripts_br/run_tap_with_evo2_train.sh deleted file mode 100755 index f22a8fb36..000000000 --- a/internal/scripts_br/run_tap_with_evo2_train.sh +++ /dev/null @@ -1,122 +0,0 @@ -#!/bin/bash -# -# -# nsys option like --pytorch function-trace: -# nvtx markers for common torch operations at the pytorch level like torch.Tensor.to -# -# nsys option like --pytorch autograd-shapes-nvtx: -# nvtx markers for common torch operations at the kernel level like "to", "to_copy" -# - - -# ---------------------------------------- -# (0) preamble -# ---------------------------------------- -MESSAGE_TEMPLATE='********run_evo2_train.sh: %s\n' -DATE_OF_SCRIPT=$(date +'%Y%m%dT%H%M') -SCRIPT_DIR="$(dirname "$(realpath "$BASH_SOURCE")")" -printf "${MESSAGE_TEMPLATE}" "begin" -printf "${MESSAGE_TEMPLATE}" "DATE_OF_SCRIPT=${DATE_OF_SCRIPT}" - -# ---------------------------------------- -# (1) set some user parameters -# ---------------------------------------- -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -export TAP_NSIGHT_LOCATION='/usr/local/cuda/bin/nsys' -export TAP_LOG_LEVEL='debug' -export TAP_MODE='nsight' # '', nsight, or anna..... '' means ignore all profiling -export TAP_NVTX='pytorch' # pytorch, apex, python -export TAP_BACKWARD_NVTX='false' # true or false -export TAP_PROFILE_MEMORY='false' -export TAP_WAIT_STEPS='1' # 2 is my default -export TAP_WARMUP_STEPS='1' # 12 is my default, -export TAP_ACTIVE_STEPS='4' # 1 is my default -export TAP_WAIT_EPOCHS='1' -#!/usr/bin/env sh - - -export TAP_EXIT_ON_STOP=true - -#export APP_NVTX_CATEGORIES='main,lit_module,dataset' -export TAP_NSIGHT_FLAGS='--trace nvtx,cuda' -#export TAP_MAX_DEPTH=14 # minimal is 4, since torch compile adds a ldevel, default is 14 - - -RESULTS_DIR="./results/run_tap_with_evo2_train" - -TRAIN_ARGS_ARRAY=( - "--mock-data" - "--seq-length" - "256" - "--micro-batch-size" - "1" - "--model-size" - "test" - "--max-steps" - "30" - "--context-parallel-size" - "1" - "--devices" - "1" - "--val-check-interval" "0" -) - -RUN_LABEL_PREFIX="tap_bionemo_evo2_train" -PYTHON_SCRIPT_PATH=sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py - -# ---------------------------------------- -# (2) dump parameters -# ---------------------------------------- -printf "${MESSAGE_TEMPLATE}" "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}" - -# ---------------------------------------- -# (3) create output dir names and file names -# - create log file name and report filename -# ---------------------------------------- -run_label_arr=( - ${RUN_LABEL_PREFIX} - "mock-data" - ${BRANCH_AT_START} - ${DATE_OF_SCRIPT} - ${COMMIT_AT_START} -) -RUN_LABEL="$(IFS='_'; echo "${run_label_arr[*]}")" - -RESULTS_SUBDIR="${RESULTS_DIR}/${RUN_LABEL}" -export TAP_SAVE_DIR="${RESULTS_SUBDIR}" -mkdir -p "${RESULTS_SUBDIR}" -chmod a+r "${RESULTS_SUBDIR}" - -LOG_FILE="${RESULTS_SUBDIR}/${RUN_LABEL}.log" -REPORT_FILE="${RESULTS_SUBDIR}/${RUN_LABEL}.nsys-rep" - -# ---------------------------------------- -# (4) create command -# ---------------------------------------- -APPLICATION_TO_PROFILE="python ${PYTHON_SCRIPT_PATH} ${TRAIN_ARGS_ARRAY[@]}" -#APPLICATION_TO_PROFILE="python -c 'import torch; x= torch.ones(500)'" - -read -r -d '' TAP_PROFILE_CMD <&1 | tee -a ${LOG_FILE} -EOF -# ---------------------------------------- -# (5) run command -# ---------------------------------------- -printf "\n" -printf "${MESSAGE_TEMPLATE}" "nsys version: $(nsys --version)" - -printf "\n" -printf "${MESSAGE_TEMPLATE}" "APPLICATION_TO_PROFILE=${APPLICATION_TO_PROFILE}" -printf "\n" -printf "${MESSAGE_TEMPLATE}" "TAP_PROFILE_CMD" -echo "${TAP_PROFILE_CMD}" -eval "${TAP_PROFILE_CMD}" - -if [[ -f '/tmp/.tap_dummy_nsight_report.nsys-rep' ]]; then - cp /tmp/.tap_dummy_nsight_report.nsys-rep ${RESULTS_SUBDIR}/tap_dummy_nsight_report.nsys-rep -fi -# ---------------------------------------- -# (-1) post-amble -# ---------------------------------------- -printf "${MESSAGE_TEMPLATE}" "TAP_SAVE_DIR=${TAP_SAVE_DIR}" -printf "${MESSAGE_TEMPLATE}" "end script" \ No newline at end of file diff --git a/internal/scripts_br/run_update_git_submodules.sh b/internal/scripts_br/run_update_git_submodules.sh deleted file mode 100755 index 1b0c74c98..000000000 --- a/internal/scripts_br/run_update_git_submodules.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -git submodule update --init --recursive \ No newline at end of file diff --git a/internal/scripts_br/show_git_submodule_config.sh b/internal/scripts_br/show_git_submodule_config.sh deleted file mode 100755 index 1289082cd..000000000 --- a/internal/scripts_br/show_git_submodule_config.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/bash -git config --local --get-regexp submodule \ No newline at end of file diff --git a/internal/scripts_br/venv_create.sh b/internal/scripts_br/venv_create.sh deleted file mode 100755 index 80a20d63b..000000000 --- a/internal/scripts_br/venv_create.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash -# -# title: virtual_env_create.sh -# usage: -# cd ; ./scripts/virtual_env_create.sh -# -# create a virtual environment for the benchmarking repo -# -MESSAGE_TEMPLATE='********virtual_env_create.sh: %s\n' -DATE_OF_SCRIPT=$(date +'%Y%m%dT%H%M%S') -SCRIPT_DIR="$(dirname "$(realpath "$BASH_SOURCE")")" -GIT_BRANCH=$(git branch --show-current) - -# ----------------------------------------------- -# (1) set script-level parameters -# ------------------------------------------------ -ENV_DIR=./venv_bionemo_fw - -# ----------------------------------------------- -# (2) create venv -# ------------------------------------------------ -printf "${MESSAGE_TEMPLATE}" "attempt to created a virtual env in directory ${ENV_DIR}" - -# -------------------------------------------------- -# on computelab run -# (1) cannot run as sudo -# ------------------------------------------------- -#apt update -#apt install -y python3 python3-pip python3.10-venv - - -sudo add-apt-repository ppa:deadsnakes/ppa -y -sudo apt update -sudo apt install python3.13 python3.13-venv python3.13-dev - - -python3 -m venv ${ENV_DIR} - - -# ----------------------------------------------- -# (3) enter venv -# ------------------------------------------------ -source ${ENV_DIR}/bin/activate -printf "${MESSAGE_TEMPLATE}" "you are in virtual env in directory ${ENV_DIR}" - -# ----------------------------------------------- -# (3) install pip to virtual environment -# ------------------------------------------------ -if [[ "$(hostname)" == *viking-prod* ]]; then - printf "${MESSAGE_TEMPLATE}" "installing pip inside virtual-environment on viking host" - sudo apt update - sudo apt install -y python3-pip -else - printf "${MESSAGE_TEMPLATE}" "installing pip inside virtual-environment" - apt update python3-pip python3.10-venv - apt install -y python3-pip -fi - - -# ----------------------------------------------- -# (4) pip install -# ------------------------------------------------ -pip install pre-commit==4.1.0 - -printf "${MESSAGE_TEMPLATE}" "to enter this virtual env, source ${ENV_DIR}/bin/activate" -printf "${MESSAGE_TEMPLATE}" "to exit this virtual env, deactivate" -printf "${MESSAGE_TEMPLATE}" "exiting" \ No newline at end of file diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py new file mode 100644 index 000000000..7e9791867 --- /dev/null +++ b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py @@ -0,0 +1,171 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Callable +import os +from threading import stack_size +from torch import nn +from torch import Tensor + +BNM_MODULE_HOOK_HANDLES = [] + + +class BnmModuleHookManager(): + + def configure_hooks( + self, + root_module: nn.Module, + results_dir: str | None = None, + forward_pre_hook_types: list[str] | None = None, + forward_hook_types: list[str] | None = None, + level_max: int | None = None, + ): + """Configure hooks. + + Args: + root_module: The module ancestor to all submodules which should have hooks. + forward_pre_hook_types: The types of forward pre hooks to configure # ["input_shapes"] + forward_hook_types: The types of forward hooks to configure. #["output_shapes"], + """ + print(f"BnmModuleHookManager,configure_hooks,type(self.module)={type(root_module)}") + self.root_module = root_module + self.level_max = os.getenv("BNM_MODULE_HOOK_MANAGER_LEVEL_MAX", level_max) # str or None or int + if isinstance(self.level_max, str): + self.level_max = int(self.level_max) + + self.results_dir = os.getenv("BNM_MODULE_HOOK_MANAGER_RESULTS_DIR", results_dir) # str or None + self.bnm_module_hook_output_filename = None if self.results_dir is None else os.path.join(str(self.results_dir), f"bnm_module_hook_output_lvl{level_max}.txt") + + + self.forward_pre_hook_types = forward_pre_hook_types + self.forward_hook_types = forward_hook_types + + header_with_column_names = ";".join([ + "class_to_collect_metrics", + "method_name", + "level", + "hooked_pytorch_module_name", + "hooked_function_name", + "metric_name", + "metric_value", + ]) + self.write_line_to_file(header_with_column_names) + + BnmModuleHookManager.do_for_each_submodule_bfs( + func=self.configure_hooks_for_submodule, + module=root_module, + level=0, + level_max=level_max, + ) + + def configure_hooks_for_submodule(self, module: nn.Module, level: int | None = None): + """ + Args: + module: A submodule + level: The level of the submodule in the subtree of the root module + """ + + + if isinstance(self.forward_pre_hook_types,list) and "input_shapes" in self.forward_pre_hook_types: + + def forward_pre_hook_for_input_shapes( + module: nn.Module, + input: tuple[Tensor] + ): + message = BnmModuleHookManager.bnm_forward_pre_hook_for_input_shapes_helper(module, input, level) + self.write_line_to_file(message) + + BNM_MODULE_HOOK_HANDLES.append( + module.register_forward_pre_hook(forward_pre_hook_for_input_shapes) + ) + + if isinstance(self.forward_hook_types,list) and "output_shapes" in self.forward_hook_types: + + def forward_hook_for_output_shapes( + module: nn.Module, + input: tuple[Tensor], + output: tuple[Tensor] | Tensor, + ): + message = BnmModuleHookManager.bnm_forward_hook_for_output_shapes_helper(module, input, output, level) + self.write_line_to_file(message) + + BNM_MODULE_HOOK_HANDLES.append( + module.register_forward_hook(forward_hook_for_output_shapes) + ) + + def write_line_to_file(self, line: str): + if self.bnm_module_hook_output_filename is not None: + with open(self.bnm_module_hook_output_filename, "a") as f: + f.write(line + "\n") + + @staticmethod + def do_for_each_submodule_bfs( + func: Callable, + module: nn.Module, + level: int = 0, + level_max: int | None = None + ): + + func(module, level) + if level_max is None or level+1 <= level_max: + for _, child in module.named_children(): + BnmModuleHookManager.do_for_each_submodule_bfs( + func=func, module=child, level=level + 1, level_max=level_max + ) + + @staticmethod + def bnm_forward_pre_hook_for_input_shapes_helper( + module: nn.Module, input: tuple[Tensor] | Tensor, level: int | None = None + ) -> str: + some_list_of_strings = ["NA"] + if isinstance(input, Tensor): + some_list_of_strings = [str(tuple(input.shape))] + elif isinstance(input, tuple): + some_list_of_strings = [ + "NA" if not isinstance(input_component, Tensor) else str(tuple(input_component.shape)) + for input_component in input + ] + + input_names_and_shapes = "|".join(some_list_of_strings) + message = ";".join([ + "BnmModuleHookManager", + "bnm_forward_pre_hook_for_input_shapes_helper", + f"{level}", + f"{module.__class__.__name__}", + "forward", + "input_shapes", + f"{input_names_and_shapes}", + ]) + return message + + @staticmethod + def bnm_forward_hook_for_output_shapes_helper( + module: nn.Module, input: tuple[Tensor], output: tuple[Tensor] | Tensor, level: int | None = None + ) -> str: + some_list_of_strings = ["NA"] + if isinstance(output, Tensor): + some_list_of_strings = [str(tuple(output.shape))] + elif isinstance(output, tuple): + some_list_of_strings = ["NA" if not isinstance(output_component, Tensor) else str(tuple(output_component.shape)) for output_component in output] + + output_names_and_shapes = "|".join(some_list_of_strings) + message = ";".join([ + "BnmModuleHookManager", + "bnm_forward_hook_for_output_shapes_helper", + f"{level}", + f"{module.__class__.__name__}", + "forward", + "output_shapes", + f"{output_names_and_shapes}", + ]) + return message From 31b39fe245d27044ee2232548fdc01fc831349c1 Mon Sep 17 00:00:00 2001 From: Brian Roland Date: Tue, 2 Sep 2025 15:51:01 +0000 Subject: [PATCH 06/12] br: update to bnm_module_hook_manager in nemo [skip ci] Signed-off-by: Brian Roland --- 3rdparty/NeMo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/NeMo b/3rdparty/NeMo index ea54a221a..7aecc748b 160000 --- a/3rdparty/NeMo +++ b/3rdparty/NeMo @@ -1 +1 @@ -Subproject commit ea54a221a495b4dd8afae3993a18167bac42ba3f +Subproject commit 7aecc748bfd1e941e53c4efd5ec09a2f1c0f872b From f6121c38f957f66341a79430ea18346434e45a17 Mon Sep 17 00:00:00 2001 From: Brian Roland Date: Tue, 2 Sep 2025 16:15:24 +0000 Subject: [PATCH 07/12] br: [skip ci] Signed-off-by: Brian Roland --- .../src/bionemo/evo2/run/train.py | 28 +++++++++++++++++-- .../utils/logging/bnm_module_hook_manager.py | 10 +++++-- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py index b6716514c..792b081bd 100644 --- a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py +++ b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py @@ -54,12 +54,32 @@ from bionemo.llm.utils.datamodule_utils import infer_global_batch_size from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger - -from bionemo.evo2.utils.logging.bnm_module_hook_manager import BNM_MODULE_HOOK_HANDLES +from bionemo.evo2.utils.logging.bnm_module_hook_manager import BnmModuleHookManager, BNM_MODULE_HOOK_HANDLES torch._dynamo.config.suppress_errors = True +from bionemo.evo2.utils.logging.bnm_module_hook_manager import BnmModuleHookManager + + +class HyenaModelWithCustomMetrics(llm.HyenaModel): + + def configure_model(self, vp_stage: Optional[int] = None) -> None: + """Add additional configuration for HyenaModel(GPTModel), after GPTModel.configure_model(). + + When this method is called, self.module is the HyenaModel(LanguageModule(MegatronModel)) + + """ + super(llm.HyenaModel, self).configure_model(vp_stage=vp_stage) + self.bnm_module_hook_manager = BnmModuleHookManager() + + self.bnm_module_hook_manager.configure_hooks( + root_module=self.module, + forward_pre_hook_types=["input_shapes"], + forward_hook_types=["output_shapes"], + ) + + def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace: """Parse arguments for Evo2 model training.""" parser = argparse.ArgumentParser( @@ -633,7 +653,9 @@ def train(args: argparse.Namespace) -> nl.Trainer: if args.lora_finetune: lora_transform = Evo2LoRA(peft_ckpt_path=args.lora_checkpoint_path) print("********************train: init llm.HyenaModel*******") - model = llm.HyenaModel(model_config, tokenizer=data_module.tokenizer, model_transform=lora_transform) + #model = llm.HyenaModel(model_config, tokenizer=data_module.tokenizer, model_transform=lora_transform) + model = HyenaModelWithCustomMetrics(model_config, tokenizer=data_module.tokenizer, model_transform=lora_transform) + else: # mamba if args.no_weight_decay_embeddings: config_modifiers_init["hyena_no_weight_decay_cond_fn"] = mamba_no_weight_decay_cond_with_embeddings diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py index 7e9791867..6db97d43d 100644 --- a/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py +++ b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py @@ -39,14 +39,18 @@ def configure_hooks( """ print(f"BnmModuleHookManager,configure_hooks,type(self.module)={type(root_module)}") self.root_module = root_module + import pdb; pdb.set_trace() self.level_max = os.getenv("BNM_MODULE_HOOK_MANAGER_LEVEL_MAX", level_max) # str or None or int if isinstance(self.level_max, str): self.level_max = int(self.level_max) self.results_dir = os.getenv("BNM_MODULE_HOOK_MANAGER_RESULTS_DIR", results_dir) # str or None - self.bnm_module_hook_output_filename = None if self.results_dir is None else os.path.join(str(self.results_dir), f"bnm_module_hook_output_lvl{level_max}.txt") + self.bnm_module_hook_output_filename = None + if self.results_dir is not None: + self.bnm_module_hook_output_filename = os.path.join( + str(self.results_dir), f"bnm_module_hook_output_lvl{self.level_max}.txt" + ) - self.forward_pre_hook_types = forward_pre_hook_types self.forward_hook_types = forward_hook_types @@ -65,7 +69,7 @@ def configure_hooks( func=self.configure_hooks_for_submodule, module=root_module, level=0, - level_max=level_max, + level_max=self.level_max, ) def configure_hooks_for_submodule(self, module: nn.Module, level: int | None = None): From df85fafa30e637d7e2d83ad9b265ae2ecc1d6f08 Mon Sep 17 00:00:00 2001 From: Brian Roland Date: Wed, 17 Sep 2025 18:14:37 +0000 Subject: [PATCH 08/12] br: module_hook_manager Signed-off-by: Brian Roland --- .../evo2/utils/logging/bnm_module_hook_manager.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py index 6db97d43d..e83ad3816 100644 --- a/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py +++ b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py @@ -39,7 +39,6 @@ def configure_hooks( """ print(f"BnmModuleHookManager,configure_hooks,type(self.module)={type(root_module)}") self.root_module = root_module - import pdb; pdb.set_trace() self.level_max = os.getenv("BNM_MODULE_HOOK_MANAGER_LEVEL_MAX", level_max) # str or None or int if isinstance(self.level_max, str): self.level_max = int(self.level_max) @@ -111,6 +110,7 @@ def write_line_to_file(self, line: str): if self.bnm_module_hook_output_filename is not None: with open(self.bnm_module_hook_output_filename, "a") as f: f.write(line + "\n") + f.flush() @staticmethod def do_for_each_submodule_bfs( @@ -140,12 +140,16 @@ def bnm_forward_pre_hook_for_input_shapes_helper( for input_component in input ] - input_names_and_shapes = "|".join(some_list_of_strings) + input_names_and_shapes = "|".join(some_list_of_strings) + + module_name = f"{module.__class__.__name__}" + if hasattr(module, "operator_type"): + module_name += f"-{module.operator_type}" message = ";".join([ "BnmModuleHookManager", "bnm_forward_pre_hook_for_input_shapes_helper", f"{level}", - f"{module.__class__.__name__}", + module_name, "forward", "input_shapes", f"{input_names_and_shapes}", @@ -163,11 +167,14 @@ def bnm_forward_hook_for_output_shapes_helper( some_list_of_strings = ["NA" if not isinstance(output_component, Tensor) else str(tuple(output_component.shape)) for output_component in output] output_names_and_shapes = "|".join(some_list_of_strings) + module_name = f"{module.__class__.__name__}" + if hasattr(module, "operator_type"): + module_name += f"-{module.operator_type}" message = ";".join([ "BnmModuleHookManager", "bnm_forward_hook_for_output_shapes_helper", f"{level}", - f"{module.__class__.__name__}", + module_name, "forward", "output_shapes", f"{output_names_and_shapes}", From caf2ab601ebe54b8f2ad615a1b8bd3525b456dc5 Mon Sep 17 00:00:00 2001 From: Brian Roland Date: Wed, 17 Sep 2025 18:31:33 +0000 Subject: [PATCH 09/12] br: after merge from main Signed-off-by: Brian Roland --- sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py index 8a9f6cff9..009a71dbb 100644 --- a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py +++ b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py @@ -818,17 +818,9 @@ def train(args: argparse.Namespace) -> nl.Trainer: lora_transform = None if args.lora_finetune: lora_transform = Evo2LoRA(peft_ckpt_path=args.lora_checkpoint_path) -<<<<<<< HEAD - print("********************train: init llm.HyenaModel*******") - #model = llm.HyenaModel(model_config, tokenizer=data_module.tokenizer, model_transform=lora_transform) - model = HyenaModelWithCustomMetrics(model_config, tokenizer=data_module.tokenizer, model_transform=lora_transform) - - else: # mamba -======= model = llm.HyenaModel(model_config, tokenizer=data_module.tokenizer, model_transform=lora_transform) elif model_type == "mamba": # mamba ->>>>>>> main if args.no_weight_decay_embeddings: config_modifiers_init["hyena_no_weight_decay_cond_fn"] = mamba_no_weight_decay_cond_with_embeddings config_modifiers_init["lowercase_loss_reweighting"] = args.mamba_lowercase_loss_weight From 1b9cf7cc509bfb1e794d68a8e25a60e6a203de64 Mon Sep 17 00:00:00 2001 From: Brian Roland Date: Wed, 17 Sep 2025 21:58:57 +0000 Subject: [PATCH 10/12] br: put in rearrange decorator Signed-off-by: Brian Roland --- .../src/bionemo/evo2/run/train.py | 28 +---- .../utils/logging/bnm_module_hook_manager.py | 108 ++++++++++++++---- 2 files changed, 93 insertions(+), 43 deletions(-) diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py index 009a71dbb..722960f85 100644 --- a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py +++ b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py @@ -57,31 +57,11 @@ from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger from bionemo.evo2.utils.logging.bnm_module_hook_manager import BnmModuleHookManager, BNM_MODULE_HOOK_HANDLES +from bionemo.evo2.utils.logging.hyena_model_with_custom_metrics import HyenaModelWithCustomMetrics torch._dynamo.config.suppress_errors = True -from bionemo.evo2.utils.logging.bnm_module_hook_manager import BnmModuleHookManager - -class HyenaModelWithCustomMetrics(llm.HyenaModel): - - def configure_model(self, vp_stage: Optional[int] = None) -> None: - """Add additional configuration for HyenaModel(GPTModel), after GPTModel.configure_model(). - - When this method is called, self.module is the HyenaModel(LanguageModule(MegatronModel)) - - """ - super(llm.HyenaModel, self).configure_model(vp_stage=vp_stage) - - self.bnm_module_hook_manager = BnmModuleHookManager() - - self.bnm_module_hook_manager.configure_hooks( - root_module=self.module, - forward_pre_hook_types=["input_shapes"], - forward_hook_types=["output_shapes"], - ) - - def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace: """Parse arguments for Evo2 model training.""" parser = argparse.ArgumentParser( @@ -819,7 +799,11 @@ def train(args: argparse.Namespace) -> nl.Trainer: if args.lora_finetune: lora_transform = Evo2LoRA(peft_ckpt_path=args.lora_checkpoint_path) - model = llm.HyenaModel(model_config, tokenizer=data_module.tokenizer, model_transform=lora_transform) + import os + if os.getenv("BNM_MODULE_HOOK_MANAGER_LEVEL_MAX","") != "": + model = HyenaModelWithCustomMetrics(model_config, tokenizer=data_module.tokenizer, model_transform=lora_transform) + else: + model = llm.HyenaModel(model_config, tokenizer=data_module.tokenizer, model_transform=lora_transform) elif model_type == "mamba": # mamba if args.no_weight_decay_embeddings: config_modifiers_init["hyena_no_weight_decay_cond_fn"] = mamba_no_weight_decay_cond_with_embeddings diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py index e83ad3816..3a3c21c7e 100644 --- a/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py +++ b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py @@ -11,15 +11,20 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Callable +from typing import Callable, Optional import os from threading import stack_size from torch import nn from torch import Tensor +import functools +from einops import rearrange BNM_MODULE_HOOK_HANDLES = [] +_original_rearrange = rearrange + + class BnmModuleHookManager(): def configure_hooks( @@ -49,7 +54,8 @@ def configure_hooks( self.bnm_module_hook_output_filename = os.path.join( str(self.results_dir), f"bnm_module_hook_output_lvl{self.level_max}.txt" ) - + global BNM_MODULE_HOOK_OUTPUT_FILENAME + BNM_MODULE_HOOK_OUTPUT_FILENAME = self.bnm_module_hook_output_filename self.forward_pre_hook_types = forward_pre_hook_types self.forward_hook_types = forward_hook_types @@ -62,14 +68,17 @@ def configure_hooks( "metric_name", "metric_value", ]) - self.write_line_to_file(header_with_column_names) - + BnmModuleHookManager.write_line_to_file( + filename=self.bnm_module_hook_output_filename, + line=header_with_column_names, + ) BnmModuleHookManager.do_for_each_submodule_bfs( func=self.configure_hooks_for_submodule, module=root_module, level=0, level_max=self.level_max, ) + def configure_hooks_for_submodule(self, module: nn.Module, level: int | None = None): """ @@ -86,7 +95,10 @@ def forward_pre_hook_for_input_shapes( input: tuple[Tensor] ): message = BnmModuleHookManager.bnm_forward_pre_hook_for_input_shapes_helper(module, input, level) - self.write_line_to_file(message) + BnmModuleHookManager.write_line_to_file( + filename=self.bnm_module_hook_output_filename, + line=message, + ) BNM_MODULE_HOOK_HANDLES.append( module.register_forward_pre_hook(forward_pre_hook_for_input_shapes) @@ -100,15 +112,18 @@ def forward_hook_for_output_shapes( output: tuple[Tensor] | Tensor, ): message = BnmModuleHookManager.bnm_forward_hook_for_output_shapes_helper(module, input, output, level) - self.write_line_to_file(message) + BnmModuleHookManager.write_line_to_file( + filename=self.bnm_module_hook_output_filename, + line=message, + ) BNM_MODULE_HOOK_HANDLES.append( module.register_forward_hook(forward_hook_for_output_shapes) ) - - def write_line_to_file(self, line: str): - if self.bnm_module_hook_output_filename is not None: - with open(self.bnm_module_hook_output_filename, "a") as f: + @staticmethod + def write_line_to_file(filename: str, line: str): + if filename is not None: + with open(filename, "a") as f: f.write(line + "\n") f.flush() @@ -126,11 +141,9 @@ def do_for_each_submodule_bfs( BnmModuleHookManager.do_for_each_submodule_bfs( func=func, module=child, level=level + 1, level_max=level_max ) - + @staticmethod - def bnm_forward_pre_hook_for_input_shapes_helper( - module: nn.Module, input: tuple[Tensor] | Tensor, level: int | None = None - ) -> str: + def arg_names_and_shapes_as_str(input: Tensor | tuple[Tensor]): some_list_of_strings = ["NA"] if isinstance(input, Tensor): some_list_of_strings = [str(tuple(input.shape))] @@ -141,6 +154,14 @@ def bnm_forward_pre_hook_for_input_shapes_helper( ] input_names_and_shapes = "|".join(some_list_of_strings) + return input_names_and_shapes + + + @staticmethod + def bnm_forward_pre_hook_for_input_shapes_helper( + module: nn.Module, input: tuple[Tensor] | Tensor, level: int | None = None + ) -> str: + input_names_and_shapes = BnmModuleHookManager.arg_names_and_shapes_as_str(input) module_name = f"{module.__class__.__name__}" if hasattr(module, "operator_type"): @@ -160,13 +181,8 @@ def bnm_forward_pre_hook_for_input_shapes_helper( def bnm_forward_hook_for_output_shapes_helper( module: nn.Module, input: tuple[Tensor], output: tuple[Tensor] | Tensor, level: int | None = None ) -> str: - some_list_of_strings = ["NA"] - if isinstance(output, Tensor): - some_list_of_strings = [str(tuple(output.shape))] - elif isinstance(output, tuple): - some_list_of_strings = ["NA" if not isinstance(output_component, Tensor) else str(tuple(output_component.shape)) for output_component in output] - - output_names_and_shapes = "|".join(some_list_of_strings) + + output_names_and_shapes = BnmModuleHookManager.arg_names_and_shapes_as_str(output) module_name = f"{module.__class__.__name__}" if hasattr(module, "operator_type"): module_name += f"-{module.operator_type}" @@ -180,3 +196,53 @@ def bnm_forward_hook_for_output_shapes_helper( f"{output_names_and_shapes}", ]) return message + + + @staticmethod + def shape_logger(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + # first argument is usually the tensor/array + input_ = args[0] + + input_names_and_shapes = BnmModuleHookManager.arg_names_and_shapes_as_str(input_) + message = ";".join([ + "BnmModuleHookManager", + "bnm_forward_hook_for_output_shapes_helper", + "?", + "rearrange", + "forward", + "input_shapes", + f"{input_names_and_shapes}", + ]) + + BnmModuleHookManager.write_line_to_file( + filename=BNM_MODULE_HOOK_OUTPUT_FILENAME, + line=message, + ) + + + result = func(*args, **kwargs) + + result_names_and_shapes = BnmModuleHookManager.arg_names_and_shapes_as_str(result) + result_message = ";".join([ + "BnmModuleHookManager", + "bnm_forward_hook_for_output_shapes_helper", + "?", + "rearrange", + "forward", + "output_shapes", + f"{result_names_and_shapes}", + ]) + + BnmModuleHookManager.write_line_to_file( + filename=BNM_MODULE_HOOK_OUTPUT_FILENAME, + line=result_message, + ) + + + return result + return wrapper + + +rearrange = BnmModuleHookManager.shape_logger(_original_rearrange) From 6691d8465287f272a21cb0865e7d4adb991c4168 Mon Sep 17 00:00:00 2001 From: Brian Roland Date: Tue, 23 Sep 2025 17:25:01 +0000 Subject: [PATCH 11/12] br: before merge from main Signed-off-by: Brian Roland --- sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py | 9 ++++++--- .../evo2/utils/logging/bnm_module_hook_manager.py | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py index 722960f85..481e1a2eb 100644 --- a/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py +++ b/sub-packages/bionemo-evo2/src/bionemo/evo2/run/train.py @@ -57,7 +57,8 @@ from bionemo.llm.utils.logger_utils import WandbConfig, setup_nemo_lightning_logger from bionemo.evo2.utils.logging.bnm_module_hook_manager import BnmModuleHookManager, BNM_MODULE_HOOK_HANDLES -from bionemo.evo2.utils.logging.hyena_model_with_custom_metrics import HyenaModelWithCustomMetrics +#from bionemo.evo2.utils.logging.hyena_model_with_custom_metrics import HyenaModelWithCustomMetrics +from bionemo.evo2.utils.logging.hyena_model_with_call_stack_monitor import HyenaModelWithCallStackMonitor torch._dynamo.config.suppress_errors = True @@ -800,8 +801,10 @@ def train(args: argparse.Namespace) -> nl.Trainer: lora_transform = Evo2LoRA(peft_ckpt_path=args.lora_checkpoint_path) import os - if os.getenv("BNM_MODULE_HOOK_MANAGER_LEVEL_MAX","") != "": - model = HyenaModelWithCustomMetrics(model_config, tokenizer=data_module.tokenizer, model_transform=lora_transform) + if os.getenv("BNM_CALL_STACK_MONITOR_LEVEL_MAX","") != "": + model = HyenaModelWithCallStackMonitor(model_config, tokenizer=data_module.tokenizer, model_transform=lora_transform) + # elif os.getenv("BNM_MODULE_HOOK_MANAGER_LEVEL_MAX","") != "": + # model = HyenaModelWithCustomMetrics(model_config, tokenizer=data_module.tokenizer, model_transform=lora_transform) else: model = llm.HyenaModel(model_config, tokenizer=data_module.tokenizer, model_transform=lora_transform) elif model_type == "mamba": # mamba diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py index 3a3c21c7e..0ab26001a 100644 --- a/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py +++ b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_module_hook_manager.py @@ -245,4 +245,4 @@ def wrapper(*args, **kwargs): return wrapper -rearrange = BnmModuleHookManager.shape_logger(_original_rearrange) +# rearrange = BnmModuleHookManager.shape_logger(_original_rearrange) From c46d0e7ce7072bb79c7da6b5c644b968d6490941 Mon Sep 17 00:00:00 2001 From: Brian Roland Date: Tue, 23 Sep 2025 17:27:23 +0000 Subject: [PATCH 12/12] br: call-stack-monitor tools Signed-off-by: Brian Roland --- .../utils/logging/bnm_call_stack_monitor.py | 283 ++++++++++++++++++ .../hyena_model_with_call_stack_monitor.py | 43 +++ .../hyena_model_with_custom_metrics.py | 21 ++ .../logging/run_bnm_call_stack_monitor.py | 26 ++ 4 files changed, 373 insertions(+) create mode 100644 sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_call_stack_monitor.py create mode 100644 sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/hyena_model_with_call_stack_monitor.py create mode 100644 sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/hyena_model_with_custom_metrics.py create mode 100644 sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/run_bnm_call_stack_monitor.py diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_call_stack_monitor.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_call_stack_monitor.py new file mode 100644 index 000000000..ac97a3644 --- /dev/null +++ b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/bnm_call_stack_monitor.py @@ -0,0 +1,283 @@ +import os +import sys +import inspect +from torch import Tensor + +EVENT_TYPE = "event_type" +LEVEL_OF_CALL_FRAME = "level_of_call_frame" + + +class BnmCallStackMonitor(): + def __init__(self, results_dir: str | None = None,): + + self.level_max = os.getenv("BNM_CALL_STACK_MONITOR_LEVEL_MAX", 9) # str or None or int + if isinstance(self.level_max, str): + self.level_max = int(self.level_max) + + self.num_events_max = None + + self.results_dir = os.getenv("BNM_CALL_STACK_MONITOR_RESULTS_DIR", results_dir) # str or None + self.results_filename = None + if self.results_dir is not None: + self.results_filename = os.path.join( + str(self.results_dir), f"bnm_call_stack_monitor_output.txt" + ) + global BNM_CALL_STACK_MONITOR_OUTPUT_FILENAME + BNM_CALL_STACK_MONITOR_OUTPUT_FILENAME = self.results_filename + + def start_monitoring(self): + global CALL_STACK_EVENTS + CALL_STACK_EVENTS = [] + + prof = create_profiler_with_function_io_metrics(CALL_STACK_EVENTS, level_max = self.level_max, num_events_max=self.num_events_max) + sys.setprofile(prof) + + def stop_monitoring(self): + sys.setprofile(None) + + @property + def call_stack_events(self): + return CALL_STACK_EVENTS + + def delete_call_stack_events(self): + del CALL_STACK_EVENTS + + def write_events_to_file(self): + if self.results_filename is None: + raise Exception + + header_with_column_names = ";".join([ + "class_to_collect_metrics", + "level", + "module_or_class_name_short", + "func_name", + "frame_id", + EVENT_TYPE, + "event_id", + "metric_name", + "metric_value", + "is_class_name_in_black_list", + "is_function_name_in_blacklist", + "class_name_long", + "location", + ]) + BnmCallStackMonitor.write_line_to_file( + filename=self.results_filename, + line=header_with_column_names, + ) + + for event in self.call_stack_events: + message_as_line = ";".join([str(x) for x in [ + "BnmCallStackMonitor", + event[LEVEL_OF_CALL_FRAME], + event["class_name_short"], + event["func_name"], + event["frame_id"], + event[EVENT_TYPE], + event["event_id"], + event["metric_name"], + event["metric_value"], + event["is_class_name_in_black_list"], + event["is_function_name_in_blacklist"], + event["class_name_long"], + event["location"], + ]]) + + BnmCallStackMonitor.write_line_to_file( + filename=self.results_filename, + line=message_as_line, + ) + + @staticmethod + def write_line_to_file(filename: str, line: str): + if filename is not None: + with open(filename, "a") as f: + f.write(line + "\n") + f.flush() + + +def create_brief_module_name(frame): + + frame_code_filename = f"{frame.f_code.co_filename}" + for x in ["dist-packages/", "3rdparty/"]: + if x in frame_code_filename: + frame_code_filename = frame_code_filename.split(x)[-1] + break + + frame_code_filename = frame_code_filename.rstrip(".py") + split_result = frame_code_filename.split("/") + + if len(split_result) <= 2: + out = ".".join(split_result) + else: + out = "...".join([split_result[0], split_result[-2] ]) + return out + + +def create_profiler_with_function_io_metrics(call_stack_events: list, num_events_max: int= 50, level_max: int = 9): + """ + Returns a profiling function that logs inputs and outputs of every function call. + + Use the returned function like: + + prof = create_profiler_with_function_io_metrics(CALL_STACK_EVENTS) + sys.setprofile(prof) + + """ + + def profiler(frame, event_type, arg): + + if isinstance(num_events_max, int) and len(call_stack_events) >= num_events_max: + return + + func_name = frame.f_code.co_name + func_loc = f"{frame.f_code.co_filename}:{frame.f_lineno}" + args, _, _, values = inspect.getargvalues(frame) + frame_args_as_dict = {k: values[k] for k in args} + + brief_module_name = create_brief_module_name(frame) + + is_an_input_a_tensor = any([isinstance(v, Tensor) for v in frame_args_as_dict.values()]) + + if not is_an_input_a_tensor: + return + + # FILEPATH_KEY_WHITELIST = ["NeMo", "Megatron", "evo2", "einops"] + # does_func_loc_contain_key_from_whitelist = any([x in func_loc for x in FILEPATH_KEY_WHITELIST]) + # if not does_func_loc_contain_key_from_whitelist: + # return + + FUNCTION_NAME_BLACKLIST = [ + "nvtx_range_push", + "nvtx_range_pop", + "__hash__", + "maybe_contiguous", + "cast_if_needed", + "cast", + "shape", + "", + "reset_swizzled_inputs", "swizzle_inputs", "set_activation_dtype", + "is_appropriate_type", + "convert_tensor", + "get_backend", + "_apply_recipe", + "_check_single_tensor", + "make_viewless_tensor", + "make_upper_case", + "reduce_from_tensor_model_parallel_region", + "fused_apply_rotary_pos_emb", + "reduce_from_tensor_model_parallel_region", + "copy_to_tensor_model_parallel_region", + + ] + is_function_name_in_blacklist = any([x in func_name for x in FUNCTION_NAME_BLACKLIST]) + + is_class_method, class_name_long, _ = frame_is_class_method(frame) + brief_module_name = create_brief_module_name(frame) + class_name_short = brief_module_name if class_name_long is None else ".".join(class_name_long.split(".")[-1:]) + + CLASS_NAME_BLACKLIST = [ + "SymNumberMemoDescriptor", + "MetaTensorDescriber", + "WeakIdRef", + "WeakIdKeyDictionary", + "FakeTensor", + "OperationFuser", + "IdentityOp", + ] + is_class_name_in_black_list = any([class_name_short==x for x in CLASS_NAME_BLACKLIST]) + + + level_of_call_frame = None + metric_name = None + metric_value = None + if event_type not in ["call", "return"]: + return + + elif event_type == "call": + if len(call_stack_events) == 0: + level_of_call_frame = 0 + elif call_stack_events[-1][EVENT_TYPE] == "call": + level_of_call_frame = call_stack_events[-1][LEVEL_OF_CALL_FRAME] + 1 + if level_of_call_frame > level_max: + # do not create event + return + elif call_stack_events[-1][EVENT_TYPE] == "return": + level_of_call_frame = call_stack_events[-1][LEVEL_OF_CALL_FRAME] + + metric_name ="input_shapes" + metric_value = "|".join([ + f"{k}={tuple(v.shape)}" for k, v in frame_args_as_dict.items() if isinstance(v, Tensor) + ]) + + elif event_type == "return": + + if len(call_stack_events) == 0: + # return from function containing sys.profiler(prof) will trigger + return + elif call_stack_events[-1][EVENT_TYPE] == "call": + level_of_call_frame = call_stack_events[-1][LEVEL_OF_CALL_FRAME] + elif call_stack_events[-1][EVENT_TYPE] == "return": + level_of_call_frame = call_stack_events[-1][LEVEL_OF_CALL_FRAME] - 1 + + metric_name = "output_shapes" + metric_value = f"NA" + if isinstance(arg, Tensor): + metric_value = f"{tuple(arg.shape)}" + elif isinstance(arg, tuple): + metric_value = "|".join([f"{tuple(v.shape)}" for v in arg if isinstance(v, Tensor)]) + + frame_id = str(id(frame)) + event_dict = { + LEVEL_OF_CALL_FRAME: level_of_call_frame, + "class_name_short": class_name_short, + "func_name": func_name, + EVENT_TYPE: event_type, + "frame_id": frame_id, + "event_id": "|".join([class_name_short, func_name, frame_id, event_type]), + "metric_name": metric_name, + "metric_value": metric_value, + "is_class_name_in_black_list": is_class_name_in_black_list, + "is_function_name_in_blacklist": is_function_name_in_blacklist, + "class_name_long": class_name_long, + "location": func_loc, + + } + call_stack_events.append(event_dict) + #print(f"{event_dict}") + + return profiler + + +def frame_is_class_method(frame=None): + """ + Returns (is_method: bool, class, function_name) + is_method = True if frame is an instance or class method + class = the class object if available, else None + function_name = name of the function in the frame + """ + + + if frame is None: + return False, None, None + else: + locals_ = frame.f_locals + func_name = frame.f_code.co_name + + # Check for instance method (has 'self') + if 'self' in locals_: + cls = type(locals_['self']) + class_name_long = str(cls).split("\'")[1] + + return True, class_name_long, func_name + + # Check for class method (has 'cls') + if 'cls' in locals_: + cls = locals_['cls'] + class_name_long = str(cls).split("\'")[1] + + return True, class_name_long, func_name + + # Static method or free function + return False, None, func_name + \ No newline at end of file diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/hyena_model_with_call_stack_monitor.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/hyena_model_with_call_stack_monitor.py new file mode 100644 index 000000000..b8c0853de --- /dev/null +++ b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/hyena_model_with_call_stack_monitor.py @@ -0,0 +1,43 @@ +from typing import Optional +from torch import nn, Tensor +from nemo.collections import llm +from bionemo.evo2.utils.logging.bnm_call_stack_monitor import BnmCallStackMonitor + + +class HyenaModelWithCallStackMonitor(llm.HyenaModel): + + def configure_model(self, vp_stage: Optional[int] = None) -> None: + """Add additional configuration for HyenaModel(GPTModel), after GPTModel.configure_model(). + + When this method is called, self.module is the HyenaModel(LanguageModule(MegatronModel)) + + """ + super(llm.HyenaModel, self).configure_model(vp_stage=vp_stage) + + global BNM_CALL_STACK_MONITOR_HOOKS + BNM_CALL_STACK_MONITOR_HOOKS = [] + + def forward_pre_hook(module: nn.Module, input: Tensor | tuple[Tensor]): + if not hasattr(module, "bnm_call_stack_monitor"): + module.bnm_call_stack_monitor = BnmCallStackMonitor() + module.bnm_call_stack_monitor.start_monitoring() + + def forward_hook(module: nn.Module, input: Tensor | tuple[Tensor], output: Tensor | tuple[Tensor]): + if hasattr(module, "bnm_call_stack_monitor"): + module.bnm_call_stack_monitor.stop_monitoring() + module.bnm_call_stack_monitor.write_events_to_file() + + BNM_CALL_STACK_MONITOR_HOOKS.append( + self.module.register_forward_pre_hook(forward_pre_hook) + ) + BNM_CALL_STACK_MONITOR_HOOKS.append( + self.module.register_forward_hook(forward_hook) + ) + + + + + + + + diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/hyena_model_with_custom_metrics.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/hyena_model_with_custom_metrics.py new file mode 100644 index 000000000..fa4fddddd --- /dev/null +++ b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/hyena_model_with_custom_metrics.py @@ -0,0 +1,21 @@ +from typing import Optional +from nemo.collections import llm +from bionemo.evo2.utils.logging.bnm_module_hook_manager import BnmModuleHookManager + +class HyenaModelWithCustomMetrics(llm.HyenaModel): + + def configure_model(self, vp_stage: Optional[int] = None) -> None: + """Add additional configuration for HyenaModel(GPTModel), after GPTModel.configure_model(). + + When this method is called, self.module is the HyenaModel(LanguageModule(MegatronModel)) + + """ + super(llm.HyenaModel, self).configure_model(vp_stage=vp_stage) + + self.bnm_module_hook_manager = BnmModuleHookManager() + + self.bnm_module_hook_manager.configure_hooks( + root_module=self.module, + forward_pre_hook_types=["input_shapes"], + forward_hook_types=["output_shapes"], + ) diff --git a/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/run_bnm_call_stack_monitor.py b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/run_bnm_call_stack_monitor.py new file mode 100644 index 000000000..043fc791b --- /dev/null +++ b/sub-packages/bionemo-evo2/src/bionemo/evo2/utils/logging/run_bnm_call_stack_monitor.py @@ -0,0 +1,26 @@ +import sys +from bionemo.evo2.utils.logging.bnm_call_stack_monitor import BnmCallStackMonitor + + +# Example usage +def foo(x, y): + return bar(x) + y + + +def bar(z): + return z * 2 + +def main(): + + monitor = BnmCallStackMonitor() + monitor.start_monitoring() + + result = foo(3, 4) + + monitor.stop_monitoring() + + monitor.write_events_to_file() + + +if __name__ == "__main__": + main() \ No newline at end of file