Skip to content

Commit 5bb04e3

Browse files
authored
chore(weave): test flake fix for windows (#6272)
1 parent b01b3ff commit 5bb04e3

File tree

4 files changed

+25
-14
lines changed

4 files changed

+25
-14
lines changed

tests/trace/test_evaluate.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import asyncio
22
import os
3+
import sys
34
import time
45
from unittest.mock import patch
56

@@ -9,14 +10,16 @@
910
import weave
1011
from weave import Dataset, Evaluation, Model
1112

13+
_LATENCY_TOL = 10 if sys.platform == "win32" else 1
14+
1215
dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]
1316
dataset = Dataset(rows=dataset_rows)
1417

1518

1619
expected_eval_result = {
1720
"output": {"mean": 9.5},
1821
"score": {"true_count": 1, "true_fraction": 0.5},
19-
"model_latency": {"mean": pytest.approx(0, abs=1)},
22+
"model_latency": {"mean": pytest.approx(0, abs=_LATENCY_TOL)},
2023
}
2124

2225

@@ -239,7 +242,7 @@ def model(col_a, col_b):
239242
return col_a + col_b
240243

241244
result = await evaluation.evaluate(model)
242-
assert result.pop("model_latency").get("mean") == pytest.approx(0, abs=1)
245+
assert result.pop("model_latency").get("mean") == pytest.approx(0, abs=_LATENCY_TOL)
243246

244247
# Build expected result dynamically
245248
expected_result = {
@@ -260,7 +263,9 @@ def model(col_a, col_b):
260263
predict_and_score_calls = list(evaluation.predict_and_score.calls())
261264
assert len(predict_and_score_calls) == 3
262265
outputs = [c.output for c in predict_and_score_calls]
263-
assert all(o.pop("model_latency") == pytest.approx(0, abs=1) for o in outputs)
266+
assert all(
267+
o.pop("model_latency") == pytest.approx(0, abs=_LATENCY_TOL) for o in outputs
268+
)
264269

265270
# Build expected output dynamically
266271
expected_output = {
@@ -324,9 +329,9 @@ def score(output):
324329
assert result == {
325330
"output": {"mean": 5.5},
326331
"score": {"mean": 1.0},
327-
"model_latency": {"mean": pytest.approx(1, abs=1)},
332+
"model_latency": {"mean": pytest.approx(1, abs=_LATENCY_TOL)},
328333
}
329-
assert time.time() - now < 5
334+
assert time.time() - now < (15 if sys.platform == "win32" else 5)
330335

331336

332337
def test_evaluation_from_weaveobject_missing_evaluation_name(client):

tests/trace/test_evaluate_oldstyle.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,22 @@
11
import asyncio
2+
import sys
23

34
import pytest
45

56
import weave
67
from weave import Dataset, Evaluation, Model
78
from weave.scorers import MultiTaskBinaryClassificationF1
89

10+
_LATENCY_TOL = 10 if sys.platform == "win32" else 1
11+
912
dataset_rows = [{"input": "1 + 2", "target": 3}, {"input": "2**4", "target": 15}]
1013
dataset = Dataset(rows=dataset_rows)
1114

1215

1316
expected_eval_result = {
1417
"model_output": {"mean": 9.5},
1518
"score_oldstyle": {"true_count": 1, "true_fraction": 0.5},
16-
"model_latency": {"mean": pytest.approx(0, abs=1)},
19+
"model_latency": {"mean": pytest.approx(0, abs=_LATENCY_TOL)},
1720
}
1821

1922

@@ -109,7 +112,7 @@ def test_evaluate_both_styles(client):
109112
"model_output": {"mean": 9.5},
110113
"score_oldstyle": {"true_count": 1, "true_fraction": 0.5},
111114
"score_newstyle": {"true_count": 1, "true_fraction": 0.5},
112-
"model_latency": {"mean": pytest.approx(0, abs=1)},
115+
"model_latency": {"mean": pytest.approx(0, abs=_LATENCY_TOL)},
113116
}
114117

115118

tests/trace/test_evaluations.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import dataclasses
22
import random
3+
import sys
34
from typing import Any
45

56
import pydantic
@@ -13,6 +14,8 @@
1314
from weave.trace.refs import CallRef
1415
from weave.trace_server import trace_server_interface as tsi
1516

17+
_LATENCY_TOL = 10 if sys.platform == "win32" else 1
18+
1619

1720
def flatten_calls(
1821
calls: list[tsi.CallSchema], parent_id: str | None = None, depth: int = 0
@@ -535,7 +538,7 @@ async def test_evaluation_data_topology(client):
535538
"nested": {"bool_avg": 0.5},
536539
"reason": "This is a custom test reason",
537540
}
538-
model_latency = {"mean": pytest.approx(0, abs=1)}
541+
model_latency = {"mean": pytest.approx(0, abs=_LATENCY_TOL)}
539542
predict_usage_summary = {
540543
"usage": {
541544
"gpt-4o-2024-05-13": {
@@ -733,7 +736,7 @@ def function_score(scorer_res, output) -> dict:
733736
assert res == {
734737
"output": {"a": {"mean": 3.0}, "b": {"c": {"mean": 2.0}}},
735738
"function_score": {"a": {"mean": 3.0}, "b": {"c": {"mean": 2.0}}},
736-
"model_latency": {"mean": pytest.approx(0, abs=2)},
739+
"model_latency": {"mean": pytest.approx(0, abs=max(2, _LATENCY_TOL))},
737740
}
738741

739742

tests/trace_server/test_trace_server_evaluation_apis.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
from weave.trace_server.workers.evaluate_model_worker import evaluate_model_worker
2626
from weave.utils.project_id import from_project_id, to_project_id
2727

28+
_LATENCY_TOL = 10 if sys.platform == "win32" else 1
29+
2830

2931
@pytest.mark.asyncio
3032
async def test_evaluation_status(client):
@@ -84,7 +86,7 @@ def generate_id_side_effect():
8486
output={
8587
"output": {"mean": 3.0},
8688
"scorer": {"mean": 1.0},
87-
"model_latency": {"mean": pytest.approx(0, abs=1)},
89+
"model_latency": {"mean": pytest.approx(0, abs=_LATENCY_TOL)},
8890
}
8991
)
9092

@@ -374,9 +376,7 @@ def evaluate_model_wrapped(req: EvaluateModelReq):
374376
assert eval_call.summary["weave"]["status"] == TraceStatus.DESCENDANT_ERROR
375377
assert eval_call.output == {
376378
"LLMAsAJudgeScorer": None,
377-
"model_latency": {"mean": pytest.approx(0, abs=2)}
378-
if sys.platform != "win32"
379-
else {"mean": pytest.approx(0, abs=10)},
379+
"model_latency": {"mean": pytest.approx(0, abs=max(2, _LATENCY_TOL))},
380380
}
381381
else:
382382
assert eval_call.summary["status_counts"] == {
@@ -387,5 +387,5 @@ def evaluate_model_wrapped(req: EvaluateModelReq):
387387
assert eval_call.output == {
388388
"output": {"score": {"mean": 9.0}},
389389
"LLMAsAJudgeScorer": {"score": {"mean": 9.0}},
390-
"model_latency": {"mean": pytest.approx(0, abs=1)},
390+
"model_latency": {"mean": pytest.approx(0, abs=_LATENCY_TOL)},
391391
}

0 commit comments

Comments (0)