Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -175,4 +175,7 @@ data/

# output/
output/
temp/
temp/

# cache/
lettucedetect/cache/
13 changes: 13 additions & 0 deletions docs/EVALUATION.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Evaluation

## Use LLM baselines

```bash
python scripts/evaluate_llm.py --model "gpt-4o-mini" --data_path "data/translated/ragtruth-de-translated-300sample.json" --evaluation_type "example_level"
```

## Use HallucinationDetector

```bash
python scripts/evaluate.py --model_path "output/hallucination_detection_de_210m" --data_path "data/translated/ragtruth-de-translated-300sample.json" --evaluation_type "example_level"
```
50 changes: 7 additions & 43 deletions lettucedetect/models/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

from lettucedetect.datasets.hallucination_dataset import HallucinationData, HallucinationSample
from lettucedetect.datasets.hallucination_dataset import HallucinationSample
from lettucedetect.models.inference import HallucinationDetector


Expand Down Expand Up @@ -224,8 +224,6 @@ def create_sample_llm(sample, labels):
def evaluate_detector_char_level(
detector: HallucinationDetector,
samples: list[HallucinationSample],
samples_llm: list[HallucinationSample] = None,
baseline_file_exists: bool = False,
) -> dict[str, float]:
"""Evaluate the HallucinationDetector at the character level.

Expand All @@ -241,24 +239,17 @@ def evaluate_detector_char_level(

:param detector: The detector to evaluate.
:param samples: A list of samples to evaluate.
:param samples_llm : A list of samples containing LLM generated labels, is used if baseline file exists.
:param baseline_file_exists: Gives information if baseline file exists or should be created.
:return: A dictionary with global metrics: {"char_precision": ..., "char_recall": ..., "char_f1": ...}
"""
total_overlap = 0
total_predicted = 0
total_gold = 0
hallucination_data_llm = HallucinationData(samples=[])

for i, sample in enumerate(tqdm(samples, desc="Evaluating", leave=False)):
for sample in tqdm(samples, desc="Evaluating", leave=False):
prompt = sample.prompt
answer = sample.answer
gold_spans = sample.labels
predicted_spans = (
samples_llm[i].labels
if baseline_file_exists
else detector.predict_prompt(prompt, answer, output_format="spans")
)
predicted_spans = detector.predict_prompt(prompt, answer, output_format="spans")

# Compute total predicted span length for this sample.
sample_predicted_length = sum(pred["end"] - pred["start"] for pred in predicted_spans)
Expand All @@ -278,22 +269,16 @@ def evaluate_detector_char_level(
sample_overlap += overlap_end - overlap_start
total_overlap += sample_overlap

if not baseline_file_exists:
sample_llm = create_sample_llm(sample, predicted_spans)
hallucination_data_llm.samples.append(sample_llm)

precision = total_overlap / total_predicted if total_predicted > 0 else 0
recall = total_overlap / total_gold if total_gold > 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

return {"precision": precision, "recall": recall, "f1": f1}, hallucination_data_llm
return {"precision": precision, "recall": recall, "f1": f1}


def evaluate_detector_example_level(
detector: HallucinationDetector,
samples: list[HallucinationSample],
samples_llm: list[HallucinationSample] = None,
baseline_file_exists: bool = None,
verbose: bool = True,
) -> dict[str, dict[str, float]]:
"""Evaluate the HallucinationDetector at the example level.
Expand All @@ -319,27 +304,18 @@ def evaluate_detector_example_level(
"""
example_preds: list[int] = []
example_labels: list[int] = []
hallucination_data_llm = HallucinationData(samples=[])

for i, sample in enumerate(tqdm(samples, desc="Evaluating", leave=False)):
for sample in tqdm(samples, desc="Evaluating", leave=False):
prompt = sample.prompt
answer = sample.answer
gold_spans = sample.labels
predicted_spans = (
samples_llm.__getitem__(i).labels
if baseline_file_exists
else detector.predict_prompt(prompt, answer, output_format="spans")
)
predicted_spans = detector.predict_prompt(prompt, answer, output_format="spans")
true_example_label = 1 if gold_spans else 0
pred_example_label = 1 if predicted_spans else 0

example_labels.append(true_example_label)
example_preds.append(pred_example_label)

if not baseline_file_exists:
sample_llm = create_sample_llm(sample, predicted_spans)
hallucination_data_llm.samples.append(sample_llm)

precision, recall, f1, _ = precision_recall_fscore_support(
example_labels, example_preds, labels=[0, 1], average=None, zero_division=0
)
Expand All @@ -361,18 +337,6 @@ def evaluate_detector_example_level(
fpr, tpr, _ = roc_curve(example_labels, example_preds)
auroc = auc(fpr, tpr)

results: dict[str, dict[str, float]] = {
"supported": { # Class 0
"precision": float(precision[0]),
"recall": float(recall[0]),
"f1": float(f1[0]),
},
"hallucinated": { # Class 1
"precision": float(precision[1]),
"recall": float(recall[1]),
"f1": float(f1[1]),
},
}
results["auroc"] = auroc

if verbose:
Expand All @@ -387,4 +351,4 @@ def evaluate_detector_example_level(
print(report)
results["classification_report"] = report

return results, hallucination_data_llm
return results
Loading