Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions sdk/python/v1beta1/kubeflow/katib/api/katib_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,14 @@ class name in this argument.
"Trainer parameters must be an instance of HuggingFaceTrainerParams."
)

if not trainer_parameters.training_parameters:
raise ValueError(
"trainer_parameters.training_parameters must be provided. "
"Please use transformers.TrainingArguments(...) to configure "
"training. See: https://www.kubeflow.org/docs/components/katib/"
"user-guides/hp-tuning/configure-experiment/#tune-api"
)

# Iterate over input parameters and do substitutions.
experiment_parameters = []
trial_parameters = []
Expand Down Expand Up @@ -665,9 +673,17 @@ class name in this argument.
"--dataset_dir",
VOLUME_PATH_DATASET,
"--lora_config",
f"'{lora_config}'",
(
json.dumps(lora_config)
if isinstance(lora_config, dict)
else lora_config
),
"--training_parameters",
f"'{training_args}'",
(
json.dumps(training_args)
if isinstance(training_args, dict)
else training_args
),
],
volume_mounts=[STORAGE_INITIALIZER_VOLUME_MOUNT],
resources=(
Expand Down
47 changes: 47 additions & 0 deletions sdk/python/v1beta1/kubeflow/katib/api/katib_client_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,30 @@ def create_experiment(
},
ValueError,
),
(
"missing training_parameters in HuggingFaceTrainerParams - issue #2587",
{
"name": "tune_test",
"model_provider_parameters": HuggingFaceModelParams(
model_uri="hf://google-bert/bert-base-cased",
transformer_type=transformers.AutoModelForSequenceClassification,
num_labels=5,
),
"dataset_provider_parameters": HuggingFaceDatasetParams(
repo_id="yelp_review_full",
split="train[:3000]",
),
"trainer_parameters": HuggingFaceTrainerParams(
training_parameters=None,
),
"resources_per_trial": types.TrainerResources(
num_workers=2,
num_procs_per_worker=2,
resources_per_worker={"gpu": "2"},
),
},
ValueError,
),
(
"pvc creation failed",
{
Expand Down Expand Up @@ -881,6 +905,29 @@ def test_tune(katib_client, test_name, kwargs, expected_output):
additional_metric_names=[],
)

# Verify that the values following --training_parameters and --lora_config
# are not wrapped in an extra pair of literal single quotes,
# e.g. '{"key": "value"}' instead of {"key": "value"}.
container_args = (
experiment.spec.trial_template.trial_spec.spec.pytorch_replica_specs[
"Master"
]
.template.spec.containers[0]
.args
)
for i, arg in enumerate(container_args):
if arg in ("--training_parameters", "--lora_config"):
assert i + 1 < len(
container_args
), f"Missing value for {arg} in container args"
next_arg = container_args[i + 1]
# The value must not both start and end with a single quote.
assert not (
next_arg.startswith("'") and next_arg.endswith("'")
), (
f"{arg} value should not be wrapped with extra quotes. "
f"Got: {next_arg[:50]}..."
)

elif test_name == "valid flow with pip_index_urls":
# Verify pip install command in container args.
args_content = "".join(
Expand Down