[wwb] Fix reranker tests (#2890)

sbalandi · web-flow · commit 84da0d1ad9fb · 2025-10-24T22:34:04.000Z
## Description - split wwb reranker tests - skip reranker GenAI tests on macOS, task created ## Checklist: - [ ] Tests have been updated or added to cover the new code  - [ ] This patch fully addresses the ticket.  - [ ] I have made corresponding changes to the documentation
diff --git a/tools/who_what_benchmark/tests/test_cli_embeddings.py b/tools/who_what_benchmark/tests/test_cli_embeddings.py
@@ -17,7 +17,7 @@
 )
 def test_embeddings_basic(model_id, model_type, tmp_path):
     GT_FILE = tmp_path / "gt.csv"
-    MODEL_PATH = tmp_path / model_id.replace("/", "--")
+    MODEL_PATH = tmp_path / model_id.replace("/", "_")
 
     result = subprocess.run(["optimum-cli", "export",
                              "openvino", "-m", model_id,
diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py
@@ -38,7 +38,7 @@ def run_wwb(args):
 
 def setup_module():
     for model_id in OV_IMAGE_MODELS:
-        MODEL_PATH = os.path.join(MODEL_CACHE, model_id.replace("/", "--"))
+        MODEL_PATH = os.path.join(MODEL_CACHE, model_id.replace("/", "_"))
         subprocess.run(["optimum-cli", "export", "openvino", "--model", model_id, MODEL_PATH], capture_output=True, text=True)
 
 
@@ -121,7 +121,7 @@ def test_image_model_genai(model_id, model_type, tmp_path):
         pytest.xfail("Ticket 173169")
 
     GT_FILE = tmp_path / "gt.csv"
-    MODEL_PATH = os.path.join(MODEL_CACHE, model_id.replace("/", "--"))
+    MODEL_PATH = os.path.join(MODEL_CACHE, model_id.replace("/", "_"))
 
     run_wwb([
         "--base-model",
diff --git a/tools/who_what_benchmark/tests/test_cli_reranking.py b/tools/who_what_benchmark/tests/test_cli_reranking.py
@@ -1,64 +1,94 @@
 import subprocess  # nosec B404
+import sys
 import pytest
+import shutil
 import logging
+import tempfile
 from test_cli_image import run_wwb
+from pathlib import Path
 
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+tmp_dir = tempfile.mkdtemp()
 
 
-@pytest.mark.parametrize(
-    ("model_id", "model_type"),
-    [
-        ("cross-encoder/ms-marco-TinyBERT-L2-v2", "text-reranking"),
-    ],
-)
-def test_reranking_basic(model_id, model_type, tmp_path):
-    GT_FILE = tmp_path / "gt.csv"
-    MODEL_PATH = tmp_path / model_id.replace("/", "--")
-
-    result = subprocess.run(["optimum-cli", "export",
-                             "openvino", "-m", model_id,
-                             MODEL_PATH, "--task",
-                             "text-classification",
-                             "--trust-remote-code"],
-                            capture_output=True,
-                            text=True,
-                            )
-    assert result.returncode == 0
+OV_RERANK_MODELS = {
+    ("cross-encoder/ms-marco-TinyBERT-L2-v2", "text-classification"),
+    ("Qwen/Qwen3-Reranker-0.6B", "text-generation"),
+}
 
-    # Collect reference with HF model
+
+def setup_module():
+    for model_info in OV_RERANK_MODELS:
+        model_id = model_info[0]
+        task = model_info[1]
+        MODEL_PATH = Path(tmp_dir, model_id.replace("/", "_"))
+        subprocess.run(["optimum-cli", "export", "openvino", "--model", model_id, MODEL_PATH, "--task", task, "--trust-remote-code"],
+                       capture_output=True,
+                       text=True)
+
+
+def teardown_module():
+    logger.info("Remove models")
+    shutil.rmtree(tmp_dir)
+
+
+@pytest.mark.parametrize(("model_info"), OV_RERANK_MODELS)
+def test_reranking_genai(model_info, tmp_path):
+    if sys.platform == 'darwin':
+        pytest.xfail("Ticket 175534")
+
+    GT_FILE = Path(tmp_dir) / "gt.csv"
+    model_id = model_info[0]
+    MODEL_PATH = Path(tmp_dir) / model_id.replace("/", "_")
+
+    # test GenAI
     run_wwb([
         "--base-model",
-        model_id,
+        MODEL_PATH,
         "--num-samples",
         "1",
         "--gt-data",
         GT_FILE,
         "--device",
         "CPU",
         "--model-type",
-        model_type,
-        "--hf",
+        "text-reranking",
+        "--genai"
     ])
 
-    # test Optimum
+    assert Path(tmp_dir, "reference").exists()
+
+
+@pytest.mark.parametrize(
+    ("model_info"), OV_RERANK_MODELS
+)
+def test_reranking_optimum(model_info, tmp_path):
+    GT_FILE = Path(tmp_dir) / "gt.csv"
+    model_id = model_info[0]
+    MODEL_PATH = Path(tmp_dir, model_id.replace("/", "_"))
+
+    # Collect reference with HF model
     run_wwb([
-        "--target-model",
-        MODEL_PATH,
+        "--base-model",
+        model_id,
         "--num-samples",
         "1",
         "--gt-data",
         GT_FILE,
         "--device",
         "CPU",
         "--model-type",
-        model_type,
+        "text-reranking",
+        "--hf",
     ])
 
-    # test GenAI
-    run_wwb([
+    assert GT_FILE.exists()
+    assert Path(tmp_dir, "reference").exists()
+
+    # test Optimum
+    outpus = run_wwb([
         "--target-model",
         MODEL_PATH,
         "--num-samples",
@@ -68,12 +98,17 @@ def test_reranking_basic(model_id, model_type, tmp_path):
         "--device",
         "CPU",
         "--model-type",
-        model_type,
-        "--genai",
+        "text-reranking",
         "--output",
         tmp_path,
     ])
 
+    assert (tmp_path / "target").exists()
+    assert (tmp_path / "target.csv").exists()
+    assert (tmp_path / "metrics_per_question.csv").exists()
+    assert (tmp_path / "metrics.csv").exists()
+    assert "Metrics for model" in outpus
+
     # test w/o models
     run_wwb([
         "--target-data",
@@ -85,56 +120,6 @@ def test_reranking_basic(model_id, model_type, tmp_path):
         "--device",
         "CPU",
         "--model-type",
-        model_type,
-        "--genai",
-    ])
-
-
-@pytest.mark.parametrize(
-    ("model_id", "model_type"),
-    [
-        ("Qwen/Qwen3-Reranker-0.6B", "text-reranking"),
-    ],
-)
-def test_reranking_qwen(model_id, model_type, tmp_path):
-    GT_FILE = tmp_path / "gt.csv"
-    MODEL_PATH = tmp_path / model_id.replace("/", "--")
-
-    result = subprocess.run(["optimum-cli", "export",
-                             "openvino", "-m", model_id,
-                             MODEL_PATH, "--task",
-                             "text-generation",
-                             "--trust-remote-code"],
-                            capture_output=True,
-                            text=True,
-                            )
-    assert result.returncode == 0
-
-    # Collect reference with HF model
-    run_wwb([
-        "--base-model",
-        model_id,
-        "--num-samples",
-        "1",
-        "--gt-data",
-        GT_FILE,
-        "--device",
-        "CPU",
-        "--model-type",
-        model_type,
-        "--hf",
-    ])
-
-    # test Optimum
-    run_wwb([
-        "--target-model",
-        MODEL_PATH,
-        "--num-samples",
-        "1",
-        "--gt-data",
-        GT_FILE,
-        "--device",
-        "CPU",
-        "--model-type",
-        model_type,
+        "text-reranking",
+        "--genai"
     ])
diff --git a/tools/who_what_benchmark/tests/test_cli_vlm.py b/tools/who_what_benchmark/tests/test_cli_vlm.py
@@ -13,7 +13,7 @@ def run_test(model_id, model_type, optimum_threshold, genai_threshold, tmp_path)
     if sys.platform == 'darwin':
         pytest.xfail("Ticket 173169")
     GT_FILE = tmp_path / "gt.csv"
-    MODEL_PATH = tmp_path / model_id.replace("/", "--")
+    MODEL_PATH = tmp_path / model_id.replace("/", "_")
 
     result = subprocess.run(["optimum-cli", "export",
                              "openvino", "-m", model_id,

Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@`
`17`	`17`	`)`
`18`	`18`	`def test_embeddings_basic(model_id, model_type, tmp_path):`
`19`	`19`	`GT_FILE = tmp_path / "gt.csv"`
`20`		`- MODEL_PATH = tmp_path / model_id.replace("/", "--")`
	`20`	`+ MODEL_PATH = tmp_path / model_id.replace("/", "_")`
`21`	`21`
`22`	`22`	`result = subprocess.run(["optimum-cli", "export",`
`23`	`23`	`"openvino", "-m", model_id,`