diff --git a/api/common/config.py b/api/common/config.py
index 571a9408..9ed88853 100644
--- a/api/common/config.py
+++ b/api/common/config.py
@@ -1,3 +1,7 @@
+# Copyright (c) MLCommons and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
 # Copyright (c) Facebook, Inc. and its affiliates.
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
@@ -42,4 +46,5 @@
     "ssl_org_pem_file_path": os.environ["SSL_ORG_PEM_FILE"],
     "trial_jwtexp": 900,
     "frontend_ip": os.environ["FRONTEND_IP"],
+    "runpod_api_key": os.environ.get("RUNPOD_API_KEY", ""),
 }
diff --git a/api/controllers/models.py b/api/controllers/models.py
index d4e70ce2..c08403ce 100644
--- a/api/controllers/models.py
+++ b/api/controllers/models.py
@@ -348,9 +348,12 @@ def do_upload_via_train_files(credentials, tid, model_name):
         current_upload = json.loads(upload.file.read().decode("utf-8"))
         upload.file.seek(0)
         payload = {
-            "id_json": current_upload,
-            "bucket_name": task.s3_bucket,
-            "key": name,
+            "input": {
+                "id_json": current_upload,
+                "bucket_name": task.s3_bucket,
+                "key": name,
+                "model_id": model[1],  # Add model_id for backend processing
+            }
         }
         s3_client.upload_fileobj(
             upload.file,
         )
@@ -359,19 +362,24 @@
         light_model_endpoint = task.lambda_model
-        r = requests.post(light_model_endpoint, json=payload)
-        try:
-            score = r.json()["score"]
-        except Exception as ex:
-            logger.exception(ex)
-            subject = f"Model {model_name} failed training as {r.json()['detail']}"
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {config['runpod_api_key']}",
+        }
+
+        r = requests.post(light_model_endpoint, json=payload, headers=headers)
+
+        if r.status_code != 200:
+            logger.error(
+                f"RunPod request failed with status {r.status_code}: {r.text}"
+            )
             Email().send(
                 contact=user.email,
                 cc_contact="dynabench-site@mlcommons.org",
                 template_name="model_train_failed.txt",
-                msg_dict={"name": model_name},
-                subject=subject,
+                msg_dict={"name": model_name, "model_id": model[1]},
+                subject=f"Model {model_name} submission failed",
             )
             for idx2, (rem_name, rem_upload) in enumerate(train_items[idx + 1 :]):
                 s3_client.upload_fileobj(
                 )
             bottle.abort(400)
@@ -381,47 +389,74 @@
+        score = None  # Will be set by the async evaluation
+
         did = dm.getByName(name).id
         r_realid = rm.getByTid(tid)[0].rid
         if isinstance(task_config.get("perf_metric"), list):
             metric = task_config.get("perf_metric")[0].get("type")
         elif isinstance(task_config.get("perf_metric"), dict):
             metric = task_config.get("perf_metric").get("type")
-        new_score = {
-            metric: score,
-            "perf": score,
-            "perf_std": 0.0,
-            "perf_by_tag": [
-                {
-                    "tag": str(name),
-                    "pretty_perf": f"{score} %",
-                    "perf": score,
-                    "perf_std": 0.0,
-                    "perf_dict": {metric: score},
-                }
-            ],
-        }
-        new_score_string = json.dumps(new_score)
+        if score is not None:
+            new_score = {
+                metric: score,
+                "perf": score,
+                "perf_std": 0.0,
+                "perf_by_tag": [
+                    {
+                        "tag": str(name),
+                        "pretty_perf": f"{score} %",
+                        "perf": score,
+                        "perf_std": 0.0,
+                        "perf_dict": {metric: score},
+                    }
+                ],
+            }
+
+            new_score_string = json.dumps(new_score)
+
+            sm.create(
+                model_id=model[1],
+                r_realid=r_realid,
+                did=did,
+                pretty_perf=f"{score} %",
+                perf=score,
+                metadata_json=new_score_string,
+            )
 
-        sm.create(
-            model_id=model[1],
-            r_realid=r_realid,
-            did=did,
-            pretty_perf=f"{score} %",
-            perf=score,
-            metadata_json=new_score_string,
+    if any(upload.content_type != "text/plain" for upload in train_files.values()):
+        Email().send(
+            contact=user.email,
+            cc_contact="dynabench-site@mlcommons.org",
+            template_name="model_train_successful.txt",
+            msg_dict={"name": model_name, "model_id": model[1]},
+            subject=f"Model {model_name} submitted for evaluation.",
         )
-    Email().send(
-        contact=user.email,
-        cc_contact="dynabench-site@mlcommons.org",
-        template_name="model_train_successful.txt",
-        msg_dict={"name": model_name, "model_id": model[1]},
-        subject=f"Model {model_name} training succeeded.",
-    )
+        return util.json_encode(
+            {
+                "success": "ok",
+                "model_id": model[1],
+                "message": "Model submitted for evaluation. You will receive an email when evaluation is complete.",
+            }
+        )
+    else:
+        Email().send(
+            contact=user.email,
+            cc_contact="dynabench-site@mlcommons.org",
+            template_name="model_train_successful.txt",
+            msg_dict={"name": model_name, "model_id": model[1]},
+            subject=f"Model {model_name} evaluation completed.",
+        )
 
-    return util.json_encode({"success": "ok", "model_id": model[1]})
+        return util.json_encode(
+            {
+                "success": "ok",
+                "model_id": model[1],
+                "message": "Model evaluation completed successfully.",
+            }
+        )
 
 
 @bottle.post("/models/upload_predictions//")
diff --git a/backend/app/domain/services/base/score.py b/backend/app/domain/services/base/score.py
index 8218d6b4..55541124 100644
--- a/backend/app/domain/services/base/score.py
+++ b/backend/app/domain/services/base/score.py
@@ -401,7 +401,7 @@ def add_scores_and_update_model(
             self.email_helper.send(
                 contact=user.email,
                 cc_contact=self.email_sender,
-                template_name="model_inference_failed.txt",
+                template_name="model_evaluation_failed.txt",
                 msg_dict={"name": model["name"], "message": message},
                 subject=f"Model {model['name']} evaluation failed.",
             )
@@ -418,11 +418,57 @@ def add_scores_and_update_model(
             round_info = self.round_repository.get_round_info_by_round_and_task(
                 model["tid"], round_id
            )
+
+            # Get task configuration to determine score handling
+            task_config = self.task_repository.get_config_file_by_task_id(
+                model["tid"]
+            )[0]
+            task_config = yaml.safe_load(task_config)
+
             metadata_json = dict(scores)
+
+            # Determine the main performance metric based on task configuration
+            perf_metric = task_config.get("perf_metric", {})
+            if isinstance(perf_metric, list):
+                main_metric = perf_metric[0].get("type", "score")
+            elif isinstance(perf_metric, dict):
+                main_metric = perf_metric.get("type", "score")
+            else:
+                main_metric = "score"  # Default fallback
+
+            # Extract the score value - handle different formats
+            score_value = None
+
+            # First, try to extract from nested results (for RunPod format)
+            if "results" in metadata_json and "score" in metadata_json["results"]:
+                score_value = metadata_json["results"]["score"]
+            elif "score" in metadata_json:
+                score_value = metadata_json["score"]
+            elif main_metric in metadata_json:
+                score_value = metadata_json[main_metric]
+            elif "Standard_CER_15_WORSE" in metadata_json:
+                # Backward compatibility for speech tasks
+                score_value = metadata_json["Standard_CER_15_WORSE"]
+                main_metric = "Standard_CER_15_WORSE"
+            else:
+                raise ValueError(f"No score found in metadata: {metadata_json}")
+
+            # Format the score appropriately based on metric type
+            if main_metric == "Standard_CER_15_WORSE":
+                # Speech recognition - percentage format
+                pretty_perf = f"{100 * score_value:.2f}%"
+            else:
+                # Other tasks - decimal format
+                pretty_perf = f"{score_value:.4f}"
+
+            # Store the main metric type in metadata for reference
+            metadata_json["main_metric"] = main_metric
+            metadata_json["task_perf_metric"] = perf_metric
+
+            # Build the score structure with proper metric information
             new_score = {
-                "perf": metadata_json["Standard_CER_15_WORSE"],
-                "pretty_perf": f"{100*metadata_json['Standard_CER_15_WORSE']:.2f}%",
+                "perf": score_value,
+                "pretty_perf": pretty_perf,
                 "mid": model_id,
                 "r_realid": round_info.id,
                 "did": datasets[0]["id"],
@@ -437,7 +483,7 @@
                 cc_contact=self.email_sender,
                 template_name="model_evaluation_sucessful.txt",
                 msg_dict={"name": model["name"], "model_id": model["id"]},
-                subject=f"Model {model['name']} evaluation succeeded.",
+                subject=f"Model {model['name']} evaluation completed successfully.",
             )
             print(
                 f"sent email evaluation sucessful to {user.email} model {model['name']} "
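Note on the flow introduced above (a sketch, not part of the patch): the upload handler now wraps the evaluation payload in an "input" object, authenticates with a Bearer token read from RUNPOD_API_KEY, and no longer waits for a synchronous score; the backend later extracts the score from the asynchronous results in add_scores_and_update_model. The minimal client-side sketch below mirrors that request shape. The endpoint URL, the returned job "id" field, and the submit_for_evaluation helper are illustrative assumptions, not code from this diff.

# Illustrative sketch only: mirrors the {"input": {...}} payload and Bearer-auth
# headers added in api/controllers/models.py. The endpoint URL and response
# fields are assumptions about a RunPod-style serverless API.
import os

import requests

RUNPOD_ENDPOINT = "https://api.runpod.ai/v2/<endpoint-id>/run"  # hypothetical URL


def submit_for_evaluation(id_json: dict, bucket_name: str, key: str, model_id: int) -> str:
    """Submit one uploaded train file for asynchronous evaluation; return the job id."""
    payload = {
        "input": {
            "id_json": id_json,
            "bucket_name": bucket_name,
            "key": key,
            "model_id": model_id,
        }
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ.get('RUNPOD_API_KEY', '')}",
    }
    r = requests.post(RUNPOD_ENDPOINT, json=payload, headers=headers, timeout=30)
    r.raise_for_status()
    # Async endpoints of this style typically return a job id rather than a score;
    # the score arrives later and is handled by add_scores_and_update_model.
    return r.json().get("id", "")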