facebookresearch · chevalierNoir · Dec 30, 2025 · Dec 30, 2025 · Dec 30, 2025 · Dec 30, 2025
diff --git a/README.md b/README.md
@@ -71,9 +71,9 @@ torchaudio.save("residual.wav", result.residual.cpu(), sample_rate)  # Everythin
 
 SAM-Audio supports three types of prompts:
 
-1. **Text Prompting**: Describe the sound you want to isolate using natural language
+1. **Text Prompting**: Describe the sound you want to isolate using natural language. To match the format used during training, write descriptions as lowercase noun phrases or verb phrases (NP/VP); for example, use "thunder" instead of "Thunder can be heard in the background".
    ```python
-   processor(audios=[audio], descriptions=["A man speaking"])
+   processor(audios=[audio], descriptions=["man speaking"])
    ```
 
 2. **Visual Prompting**: Use video frames and masks to isolate sounds associated with visual objects
@@ -83,7 +83,7 @@ SAM-Audio supports three types of prompts:
 
 3. **Span Prompting**: Specify time ranges where the target sound occurs
    ```python
-   processor(audios=[audio], descriptions=["A horn honking"], anchors=[[["+", 6.3, 7.0]]])
+   processor(audios=[audio], descriptions=["car honking"], anchors=[[["+", 6.3, 7.0]]])
    ```
 
 See the [examples](examples) directory for more detailed examples

diff --git a/examples/span_prompting.ipynb b/examples/span_prompting.ipynb
@@ -74,7 +74,7 @@
     ")\n",
     "wav = wav.mean(0, keepdim=True)\n",
     "inputs = processor(\n",
-    "    audios=[wav], descriptions=[\"A horn honking\"], anchors=[[[\"+\", 6.3, 7.0]]]\n",
+    "    audios=[wav], descriptions=[\"horn honking\"], anchors=[[[\"+\", 6.3, 7.0]]]\n",
     ").to(device)\n",
     "with torch.inference_mode():\n",
     "    result = model.separate(inputs)"

diff --git a/examples/text_prompting.ipynb b/examples/text_prompting.ipynb
@@ -71,7 +71,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "inputs = processor(audios=[video_file], descriptions=[\"A man speaking\"]).to(device)\n",
+    "inputs = processor(audios=[video_file], descriptions=[\"man speaking\"]).to(device)\n",
     "with torch.inference_mode():\n",
     "    result = model.separate(inputs)"
    ]