diff --git a/doc/api.rst b/doc/api.rst
index 0bd549e97..4e7dbfbcd 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -126,6 +126,7 @@ API Reference
     fetch_examples
     fetch_forbidden_questions_dataset
     fetch_harmbench_dataset
+    fetch_harmbench_multimodal_dataset_async
     fetch_librAI_do_not_answer_dataset
     fetch_llm_latent_adversarial_training_harmful_dataset
     fetch_jbb_behaviors_by_harm_category
diff --git a/pyrit/datasets/__init__.py b/pyrit/datasets/__init__.py
index f99d1922b..b95459558 100644
--- a/pyrit/datasets/__init__.py
+++ b/pyrit/datasets/__init__.py
@@ -10,6 +10,7 @@ from pyrit.datasets.dataset_helper import fetch_examples
 from pyrit.datasets.forbidden_questions_dataset import fetch_forbidden_questions_dataset
 from pyrit.datasets.harmbench_dataset import fetch_harmbench_dataset
+from pyrit.datasets.harmbench_multimodal_dataset import fetch_harmbench_multimodal_dataset_async
 from pyrit.datasets.librAI_do_not_answer_dataset import fetch_librAI_do_not_answer_dataset
 from pyrit.datasets.llm_latent_adversarial_training_harmful_dataset import (
     fetch_llm_latent_adversarial_training_harmful_dataset,
@@ -47,6 +48,7 @@
     "fetch_examples",
     "fetch_forbidden_questions_dataset",
     "fetch_harmbench_dataset",
+    "fetch_harmbench_multimodal_dataset_async",
     "fetch_librAI_do_not_answer_dataset",
     "fetch_llm_latent_adversarial_training_harmful_dataset",
     "fetch_many_shot_jailbreaking_dataset",
diff --git a/pyrit/datasets/harmbench_multimodal_dataset.py b/pyrit/datasets/harmbench_multimodal_dataset.py
new file mode 100644
index 000000000..f6d7c1b81
--- /dev/null
+++ b/pyrit/datasets/harmbench_multimodal_dataset.py
@@ -0,0 +1,200 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import logging
+import uuid
+from enum import Enum
+from pathlib import Path
+from typing import List, Literal, Optional
+
+from pyrit.common.net_utility import make_request_and_raise_if_error_async
+from pyrit.datasets.dataset_helper import fetch_examples
+from pyrit.models import SeedPromptDataset, data_serializer_factory
+from pyrit.models.seed_prompt import SeedPrompt
+
+logger = logging.getLogger(__name__)
+
+
+class SemanticCategory(Enum):
+    CYBERCRIME_INTRUSION = "cybercrime_intrusion"  # n=54
+    ILLEGAL = "illegal"  # n=36
+    HARMFUL = "harmful"  # n=9
+    CHEMICAL_BIOLOGICAL = "chemical_biological"  # n=4
+    HARASSMENT_BULLYING = "harassment_bullying"  # n=4
+    MISINFORMATION_DISINFORMATION = "misinformation_disinformation"  # n=3
+
+
+async def fetch_harmbench_multimodal_dataset_async(
+    *,
+    source: str = (
+        "https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9/data/behavior_datasets/"
+        "harmbench_behaviors_multimodal_all.csv"
+    ),
+    source_type: Literal["public_url", "file"] = "public_url",
+    cache: bool = True,
+    data_home: Optional[Path] = None,
+    categories: Optional[List[SemanticCategory]] = None,
+) -> SeedPromptDataset:
+    """
+    Fetch HarmBench multimodal examples and create a SeedPromptDataset.
+
+    The HarmBench multimodal dataset contains 110 harmful behaviors.
+    Each example consists of an image prompt ("image_path") and a behavior text prompt ("text") that
+    references the image. Text and image prompts belonging to the same example are linked by a shared
+    ``prompt_group_id``; the grouped pairs can be extracted with the ``group_seed_prompts_by_prompt_group_id`` method.
+
+    Note: The first call may be slow because the images must be downloaded from the remote repository.
+    Subsequent calls are faster since the images are cached locally and are not re-downloaded.
+
+    Args:
+        source (str): The source from which to fetch examples. Defaults to the HarmBench repository.
+        source_type (Literal["public_url", "file"]): The type of source. Defaults to "public_url".
+        cache (bool): Whether to cache the fetched examples. Defaults to True.
+        data_home (Optional[Path]): Directory to store cached data. Defaults to None.
+        categories (Optional[List[SemanticCategory]]): List of semantic categories used to filter
+            examples. If None (the default), all categories are included.
+
+    Returns:
+        SeedPromptDataset: A SeedPromptDataset containing the multimodal examples.
+
+    Raises:
+        ValueError: If any of the specified categories are invalid.
+
+    Note:
+        For more information on the HarmBench project and the original dataset, visit:
+        https://www.harmbench.org/ \n
+        Paper: https://arxiv.org/abs/2402.04249 \n
+        Authors:
+        Mantas Mazeika & Long Phan & Xuwang Yin & Andy Zou & Zifan Wang & Norman Mu & Elham Sakhaee
+        & Nathaniel Li & Steven Basart & Bo Li & David Forsyth & Dan Hendrycks
+    """
+    if categories is not None:
+        valid_categories = {category.value for category in SemanticCategory}
+        # Normalize once so validation and the per-example filter below share the same set
+        category_values = {cat.value if isinstance(cat, SemanticCategory) else cat for cat in categories}
+        invalid_categories = category_values - valid_categories
+        if invalid_categories:
+            raise ValueError(f"Invalid semantic categories: {', '.join(invalid_categories)}")
+
+    required_keys = {"Behavior", "BehaviorID", "FunctionalCategory", "SemanticCategory", "ImageFileName"}
+    examples = fetch_examples(source, source_type, cache, data_home)
+    prompts = []
+    failed_image_count = 0
+
+    for example in examples:
+        missing_keys = required_keys - example.keys()
+        if missing_keys:
+            raise ValueError(f"Missing keys in example: {', '.join(missing_keys)}")
+
+        if example["FunctionalCategory"] != "multimodal":
+            continue
+
+        semantic_category = example["SemanticCategory"]
+
+        if categories is not None:
+            # Membership test against the set normalized before the loop
+            if semantic_category not in category_values:
+                continue
+
+        behavior_text = example["Behavior"]
+        behavior_id = example["BehaviorID"]
+        image_filename = example["ImageFileName"]
+        image_description = example.get("ImageDescription", "")
+        redacted_description = example.get("RedactedImageDescription", "")
+
+        # A unique group ID to link the text and image prompts since they are part of the same example
+        group_id = uuid.uuid4()
+
+        # Note: All images in the HarmBench dataset are stored as .png files, even if the ImageFileName
+        # field specifies a different extension (.jpg or .jpeg). Hence we always use the .png extension here.
+        image_url = (
+            "https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9/data/multimodal_behavior_images/"
+            f"{image_filename.rsplit('.', 1)[0]}.png"
+        )
+
+        try:
+            # Only include examples for which the image fetch succeeds
+            local_image_path = await _fetch_and_save_image_async(image_url, behavior_id)
+
+            image_prompt = SeedPrompt(
+                value=local_image_path,
+                data_type="image_path",
+                name=f"HarmBench Multimodal Image - {behavior_id}",
+                dataset_name="HarmBench Multimodal Examples",
+                harm_categories=[semantic_category],
+                description=f"An image prompt from the HarmBench multimodal dataset, BehaviorID: {behavior_id}",
+                source=source,
+                prompt_group_id=group_id,
+                sequence=0,
+                metadata={
+                    "behavior_id": behavior_id,
+                    "image_description": image_description,
+                    "redacted_image_description": redacted_description,
+                    "original_image_url": image_url,
+                },
+            )
+            prompts.append(image_prompt)
+        except Exception as e:
+            failed_image_count += 1
+            logger.warning(f"Failed to fetch image for behavior {behavior_id}: {e}. Skipping this example.")
+        else:
+            text_prompt = SeedPrompt(
+                value=behavior_text,
+                data_type="text",
+                name=f"HarmBench Multimodal Text - {behavior_id}",
+                dataset_name="HarmBench Multimodal Examples",
+                harm_categories=[semantic_category],
+                description=f"A text prompt from the HarmBench multimodal dataset, BehaviorID: {behavior_id}",
+                source=source,
+                prompt_group_id=group_id,
+                sequence=0,
+                metadata={
+                    "behavior_id": behavior_id,
+                },
+                authors=[
+                    "Mantas Mazeika",
+                    "Long Phan",
+                    "Xuwang Yin",
+                    "Andy Zou",
+                    "Zifan Wang",
+                    "Norman Mu",
+                    "Elham Sakhaee",
+                    "Nathaniel Li",
+                    "Steven Basart",
+                    "Bo Li",
+                    "David Forsyth",
+                    "Dan Hendrycks",
+                ],
+                groups=[
+                    "University of Illinois Urbana-Champaign",
+                    "Center for AI Safety",
+                    "Carnegie Mellon University",
+                    "UC Berkeley",
+                    "Microsoft",
+                ],
+            )
+            prompts.append(text_prompt)
+
+    if failed_image_count > 0:
+        logger.warning(f"Total skipped examples: {failed_image_count} (image fetch failures)")
+
+    seed_prompt_dataset = SeedPromptDataset(prompts=prompts)
+    return seed_prompt_dataset
+
+
+async def _fetch_and_save_image_async(image_url: str, behavior_id: str) -> str:
+    filename = f"harmbench_{behavior_id}.png"
+    serializer = data_serializer_factory(category="seed-prompt-entries", data_type="image_path", extension="png")
+
+    # Return the existing path if an image was already saved for this BehaviorID
+    serializer.value = str(serializer._memory.results_path + serializer.data_sub_directory + f"/{filename}")
+    try:
+        if await serializer._memory.results_storage_io.path_exists(serializer.value):
+            return serializer.value
+    except Exception as e:
+        logger.warning(f"Failed to check whether image for {behavior_id} already exists: {e}")
+
+    response = await make_request_and_raise_if_error_async(endpoint_uri=image_url, method="GET")
+    await serializer.save_data(data=response.content, output_filename=filename.replace(".png", ""))
+
+    return str(serializer.value)
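
For reviewers, a minimal usage sketch of the new fetcher (illustrative only, not part of the diff; it assumes PyRIT's central memory has been initialized, e.g. via ``initialize_pyrit``, since downloaded images are written through the database's results storage, and it imports ``SemanticCategory`` directly from the new module because it is not re-exported from the package root):

    import asyncio

    from pyrit.common import IN_MEMORY, initialize_pyrit
    from pyrit.datasets import fetch_harmbench_multimodal_dataset_async
    from pyrit.datasets.harmbench_multimodal_dataset import SemanticCategory

    async def main() -> None:
        # The first call downloads the behavior images; later calls reuse the local cache.
        dataset = await fetch_harmbench_multimodal_dataset_async(
            categories=[SemanticCategory.CYBERCRIME_INTRUSION],
        )
        # Two prompts (text + image) per behavior, so an even count is expected.
        print(len(dataset.prompts))

    initialize_pyrit(memory_db_type=IN_MEMORY)
    asyncio.run(main())
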
diff --git a/tests/integration/datasets/test_fetch_datasets.py b/tests/integration/datasets/test_fetch_datasets.py
index 6d5fadaf3..3d898f61c 100644
--- a/tests/integration/datasets/test_fetch_datasets.py
+++ b/tests/integration/datasets/test_fetch_datasets.py
@@ -13,6 +13,7 @@
     fetch_equitymedqa_dataset_unique_values,
     fetch_forbidden_questions_dataset,
     fetch_harmbench_dataset,
+    fetch_harmbench_multimodal_dataset_async,
     fetch_jbb_behaviors_by_harm_category,
     fetch_jbb_behaviors_by_jbb_category,
     fetch_jbb_behaviors_dataset,
@@ -72,6 +73,21 @@ def test_fetch_datasets(fetch_function, is_seed_prompt_dataset):
     assert len(data.prompts) > 0
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "fetch_function, number_of_prompts",
+    [
+        (fetch_harmbench_multimodal_dataset_async, 110 * 2),  # one text and one image prompt per behavior
+    ],
+)
+async def test_fetch_multimodal_datasets(fetch_function, number_of_prompts):
+    data = await fetch_function()
+
+    assert data is not None
+    assert isinstance(data, SeedPromptDataset)
+    assert len(data.prompts) == number_of_prompts
+
+
 @pytest.mark.integration
 def test_fetch_jbb_behaviors_by_harm_category():
     """Integration test for filtering by harm category with real data."""
diff --git a/tests/unit/datasets/test_harmbench_multimodal_dataset.py b/tests/unit/datasets/test_harmbench_multimodal_dataset.py
new file mode 100644
index 000000000..bbc3a6748
--- /dev/null
+++ b/tests/unit/datasets/test_harmbench_multimodal_dataset.py
@@ -0,0 +1,276 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from unittest.mock import patch
+
+import pytest
+
+from pyrit.datasets.harmbench_multimodal_dataset import (
+    SemanticCategory,
+    fetch_harmbench_multimodal_dataset_async,
+)
+from pyrit.models import SeedPromptDataset
+
+
+@pytest.mark.asyncio
+@pytest.mark.usefixtures("patch_central_database")
+class TestHarmBenchMultimodalDataset:
+    @patch("pyrit.datasets.harmbench_multimodal_dataset._fetch_and_save_image_async")
+    @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples")
+    async def test_fetching_with_valid_inputs(self, mock_fetch, mock_fetch_image):
+        mock_examples = [
+            {
+                "Behavior": "Test harmful behavior with image",
+                "BehaviorID": "test_behavior_1",
+                "SemanticCategory": "harmful_category",
+                "FunctionalCategory": "multimodal",
+                "ImageFileName": "test_image.png",
+                "ImageDescription": "A test image description",
+                "RedactedImageDescription": "A redacted test image description",
+            },
+            {
+                "Behavior": "Another harmful behavior",
+                "BehaviorID": "test_behavior_2",
+                "SemanticCategory": "illegal_activity",
+                "FunctionalCategory": "multimodal",
+                "ImageFileName": "another_image.jpg",
+                "ImageDescription": "Another image description",
+                "RedactedImageDescription": "Another redacted description",
+            },
+        ]
+
+        mock_fetch.return_value = mock_examples
+        mock_fetch_image.side_effect = [
+            "/dbdata/seed-prompt-entries/images/harmbench_test_behavior_1.png",
+            "/dbdata/seed-prompt-entries/images/harmbench_test_behavior_2.png",
+        ]
+
+        result = await fetch_harmbench_multimodal_dataset_async()
+
+        assert isinstance(result, SeedPromptDataset)
+        assert len(result.prompts) == 4  # 2 text + 2 image prompts
+
+        text_prompts = [p for p in result.prompts if p.data_type == "text"]
+        image_prompts = [p for p in result.prompts if p.data_type == "image_path"]
+        assert len(text_prompts) == 2
+        assert len(image_prompts) == 2
+
+        # Verify each text prompt has a corresponding image prompt with the same group_id
+        for text_prompt in text_prompts:
+            group_id = text_prompt.prompt_group_id
+            matching_image = next((p for p in image_prompts if p.prompt_group_id == group_id), None)
+            assert matching_image is not None
+
+        # Test that the prompts have correct properties set
+
+        # Text properties
+        assert text_prompts[0].value == "Test harmful behavior with image"
+        assert text_prompts[0].data_type == "text"
+        assert text_prompts[0].name == "HarmBench Multimodal Text - test_behavior_1"
+        assert text_prompts[0].dataset_name == "HarmBench Multimodal Examples"
+        assert text_prompts[0].harm_categories == ["harmful_category"]
+        assert (
+            text_prompts[0].description
+            == "A text prompt from the HarmBench multimodal dataset, BehaviorID: test_behavior_1"
+        )
+        assert text_prompts[0].sequence == 0
+        assert text_prompts[0].metadata == {"behavior_id": "test_behavior_1"}
+
+        # Image properties
+        assert image_prompts[1].value == "/dbdata/seed-prompt-entries/images/harmbench_test_behavior_2.png"
+        assert image_prompts[1].data_type == "image_path"
+        assert image_prompts[1].name == "HarmBench Multimodal Image - test_behavior_2"
+        assert image_prompts[1].dataset_name == "HarmBench Multimodal Examples"
+        assert image_prompts[1].harm_categories == ["illegal_activity"]
+        assert (
+            image_prompts[1].description
+            == "An image prompt from the HarmBench multimodal dataset, BehaviorID: test_behavior_2"
+        )
+        assert image_prompts[1].sequence == 0
+        expected_metadata = {
+            "behavior_id": "test_behavior_2",
+            "image_description": "Another image description",
+            "redacted_image_description": "Another redacted description",
+            "original_image_url": "https://raw.githubusercontent.com/centerforaisafety/HarmBench/c0423b9"
+            + "/data/multimodal_behavior_images/another_image.png",
+        }
+        assert image_prompts[1].metadata == expected_metadata
+
+    @patch("pyrit.datasets.harmbench_multimodal_dataset._fetch_and_save_image_async")
+    @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples")
+    async def test_fetching_with_missing_required_keys(self, mock_fetch, mock_fetch_image):
+        mock_examples = [
+            {
+                "Behavior": "Test behavior",
+                "BehaviorID": "test_id",
+                "FunctionalCategory": "multimodal",
+                # Missing SemanticCategory and ImageFileName
+            }
+        ]
+
+        mock_fetch.return_value = mock_examples
+        with pytest.raises(ValueError, match="Missing keys in example"):
+            await fetch_harmbench_multimodal_dataset_async()
+
+    @patch("pyrit.datasets.harmbench_multimodal_dataset._fetch_and_save_image_async")
+    @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples")
+    async def test_fetching_with_missing_optional_fields(self, mock_fetch, mock_fetch_image):
+        mock_examples = [
+            {
+                "Behavior": "Test behavior",
+                "BehaviorID": "test_optional",
+                "SemanticCategory": "test_category",
+                "FunctionalCategory": "multimodal",
+                "ImageFileName": "test_optional.png",
+                # Missing optional fields: ImageDescription, RedactedImageDescription
+            }
+        ]
+
+        mock_fetch.return_value = mock_examples
+        mock_fetch_image.return_value = "/dbdata/seed-prompt-entries/images/harmbench_test_optional.png"
+
+        result = await fetch_harmbench_multimodal_dataset_async()
+
+        assert isinstance(result, SeedPromptDataset)
+        assert len(result.prompts) == 2
+
+        # Verify image prompt handles missing optional fields
+        image_prompt = next(p for p in result.prompts if p.data_type == "image_path")
+        assert image_prompt.metadata["image_description"] == ""
+        assert image_prompt.metadata["redacted_image_description"] == ""
+
+    @patch("pyrit.datasets.harmbench_multimodal_dataset._fetch_and_save_image_async")
+    @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples")
+    async def test_fetching_with_empty_examples(self, mock_fetch, mock_fetch_image):
+        mock_fetch.return_value = []
+
+        with pytest.raises(ValueError, match="SeedPromptDataset cannot be empty"):
+            await fetch_harmbench_multimodal_dataset_async()
+
+    @patch("pyrit.datasets.harmbench_multimodal_dataset._fetch_and_save_image_async")
+    @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples")
+    async def test_filtering_out_non_multimodal_examples(self, mock_fetch, mock_fetch_image):
+        mock_examples = [
+            {
+                "Behavior": "Text only behavior",
+                "BehaviorID": "text_only",
+                "SemanticCategory": "harmful",
+                "FunctionalCategory": "text_generation",  # Non-multimodal
+                "ImageFileName": "unused.png",
+            },
+            {
+                "Behavior": "Multimodal behavior",
+                "BehaviorID": "multimodal_id",
+                "SemanticCategory": "harmful",
+                "FunctionalCategory": "multimodal",  # This should be included
+                "ImageFileName": "valid.png",
+            },
+        ]
+
+        mock_fetch.return_value = mock_examples
+        mock_fetch_image.return_value = "/dbdata/seed-prompt-entries/images/harmbench_multimodal_id.png"
+
+        result = await fetch_harmbench_multimodal_dataset_async()
+
+        assert len(result.prompts) == 2  # one example (1 text + 1 image)
+        assert all(p.metadata["behavior_id"] == "multimodal_id" for p in result.prompts)
+
+    @patch("pyrit.datasets.harmbench_multimodal_dataset._fetch_and_save_image_async")
+    @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples")
+    async def test_filtering_by_semantic_categories(self, mock_fetch, mock_fetch_image):
+        mock_examples = [
+            {
+                "Behavior": "Illegal behavior",
+                "BehaviorID": "illegal_behavior",
+                "SemanticCategory": "illegal",
+                "FunctionalCategory": "multimodal",
+                "ImageFileName": "illegal.png",
+            },
+            {
+                "Behavior": "Harmful behavior",
+                "BehaviorID": "harmful_behavior",
+                "SemanticCategory": "harmful",
+                "FunctionalCategory": "multimodal",
+                "ImageFileName": "harmful.png",
+            },
+            {
+                "Behavior": "Cybercrime behavior",
+                "BehaviorID": "cybercrime_behavior",
+                "SemanticCategory": "cybercrime_intrusion",
+                "FunctionalCategory": "multimodal",
+                "ImageFileName": "cybercrime.png",
+            },
+            {
+                "Behavior": "Harassment behavior",
+                "BehaviorID": "harassment_behavior",
+                "SemanticCategory": "harassment_bullying",
+                "FunctionalCategory": "multimodal",
+                "ImageFileName": "harassment.png",
+            },
+        ]
+        mock_fetch.return_value = mock_examples
+
+        mock_fetch_image.side_effect = [  # one path per image fetch, in iteration order across both calls below
+            "/dbdata/seed-prompt-entries/images/harmbench_illegal_behavior.png",
+            "/dbdata/seed-prompt-entries/images/harmbench_harmful_behavior.png",
+            "/dbdata/seed-prompt-entries/images/harmbench_cybercrime_behavior.png",
+        ]
+
+        # Filter by single category
+        result = await fetch_harmbench_multimodal_dataset_async(categories=[SemanticCategory.ILLEGAL])
+        assert isinstance(result, SeedPromptDataset)
+        assert len(result.prompts) == 2  # 1 text + 1 image prompt for illegal category
+        assert all(p.metadata["behavior_id"] == "illegal_behavior" for p in result.prompts)
+        assert all(p.harm_categories == ["illegal"] for p in result.prompts)
+
+        # Filter by multiple categories
+        result = await fetch_harmbench_multimodal_dataset_async(
+            categories=[SemanticCategory.CYBERCRIME_INTRUSION, SemanticCategory.HARMFUL]
+        )
+        assert isinstance(result, SeedPromptDataset)
+        assert len(result.prompts) == 4  # 2 examples × 2 prompts each
+        behavior_ids = {p.metadata["behavior_id"] for p in result.prompts}
+        assert behavior_ids == {"cybercrime_behavior", "harmful_behavior"}
+
+        # Filter with an empty list
+        with pytest.raises(ValueError, match="SeedPromptDataset cannot be empty"):
+            await fetch_harmbench_multimodal_dataset_async(categories=[])
+
+    @patch("pyrit.datasets.harmbench_multimodal_dataset._fetch_and_save_image_async")
+    @patch("pyrit.datasets.harmbench_multimodal_dataset.fetch_examples")
+    async def test_skips_example_when_image_fetch_fails(self, mock_fetch, mock_fetch_image, caplog):
+        mock_examples = [
+            {
+                "Behavior": "Successful behavior",
+                "BehaviorID": "success_behavior",
+                "SemanticCategory": "harmful",
+                "FunctionalCategory": "multimodal",
+                "ImageFileName": "success.png",
+            },
+            {
+                "Behavior": "Failed behavior",
+                "BehaviorID": "failed_behavior",
+                "SemanticCategory": "harmful",
+                "FunctionalCategory": "multimodal",
+                "ImageFileName": "failed.png",
+            },
+        ]
+
+        mock_fetch.return_value = mock_examples
+        mock_fetch_image.side_effect = [  # first call succeeds, second call fails
+            "/dbdata/seed-prompt-entries/images/harmbench_success_behavior.png",
+            Exception("Network error - image not found"),
+        ]
+
+        result = await fetch_harmbench_multimodal_dataset_async()
+
+        # Only the successful example should be included (1 text + 1 image = 2 prompts)
+        assert isinstance(result, SeedPromptDataset)
+        assert len(result.prompts) == 2
+        behavior_ids = {p.metadata["behavior_id"] for p in result.prompts}
+        assert behavior_ids == {"success_behavior"}
+
+        warning_messages = [record.message for record in caplog.records if record.levelname == "WARNING"]
+        assert len(warning_messages) == 2
+        assert any("Failed to fetch image for behavior failed_behavior" in msg for msg in warning_messages)
+        assert any("Total skipped examples: 1 (image fetch failures)" in msg for msg in warning_messages)