diff --git a/.riot/requirements/1fe8dd2.txt b/.riot/requirements/1900591.txt
similarity index 54%
rename from .riot/requirements/1fe8dd2.txt
rename to .riot/requirements/1900591.txt
index 72ea102ea03..ed309c651a9 100644
--- a/.riot/requirements/1fe8dd2.txt
+++ b/.riot/requirements/1900591.txt
@@ -2,33 +2,35 @@
 # This file is autogenerated by pip-compile with Python 3.13
 # by the following command:
 #
-#    pip-compile --allow-unsafe --no-annotate .riot/requirements/1fe8dd2.in
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/1900591.in
 #
-aiohappyeyeballs==2.4.8
-aiohttp==3.11.13
-aiosignal==1.3.2
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.14
+aiosignal==1.4.0
 annotated-types==0.7.0
-anyio==4.8.0
+anyio==4.9.0
 appdirs==1.4.4
-attrs==25.1.0
-certifi==2025.1.31
-charset-normalizer==3.4.1
-coverage[toml]==7.6.12
+attrs==25.3.0
+certifi==2025.7.14
+charset-normalizer==3.4.2
+coverage[toml]==7.9.2
 dataclasses-json==0.6.7
-datasets==3.3.2
+datasets==4.0.0
 dill==0.3.8
 distro==1.9.0
-filelock==3.17.0
-frozenlist==1.5.0
-fsspec[http]==2024.12.0
-h11==0.14.0
-httpcore==1.0.7
+filelock==3.18.0
+frozenlist==1.7.0
+fsspec[http]==2025.3.0
+greenlet==3.2.3
+h11==0.16.0
+hf-xet==1.1.5
+httpcore==1.0.9
 httpx==0.28.1
-huggingface-hub==0.29.2
+huggingface-hub==0.33.4
 hypothesis==6.45.0
 idna==3.10
-iniconfig==2.0.0
-jiter==0.8.2
+iniconfig==2.1.0
+jiter==0.10.0
 jsonpatch==1.33
 jsonpointer==3.0.0
 langchain==0.2.17
@@ -39,45 +41,47 @@ langchain-text-splitters==0.2.4
 langsmith==0.1.147
 marshmallow==3.26.1
 mock==5.2.0
-multidict==6.1.0
+multidict==6.6.3
 multiprocess==0.70.16
-mypy-extensions==1.0.0
+mypy-extensions==1.1.0
 nest-asyncio==1.6.0
 numpy==1.26.4
-openai==1.65.3
+openai==1.97.1
 opentracing==2.4.0
-orjson==3.10.15
+orjson==3.11.0
 packaging==24.2
-pandas==2.2.3
-pluggy==1.5.0
-propcache==0.3.0
-pyarrow==19.0.1
-pydantic==2.10.6
-pydantic-core==2.27.2
+pandas==2.3.1
+pluggy==1.6.0
+propcache==0.3.2
+pyarrow==21.0.0
+pydantic==2.11.7
+pydantic-core==2.33.2
+pygments==2.19.2
 pysbd==0.3.4
-pytest==8.3.5
+pytest==8.4.1
 pytest-asyncio==0.21.1
-pytest-cov==6.0.0
-pytest-mock==3.14.0
+pytest-cov==6.2.1
+pytest-mock==3.14.1
 python-dateutil==2.9.0.post0
-pytz==2025.1
+pytz==2025.2
 pyyaml==6.0.2
 ragas==0.1.21
 regex==2024.11.6
-requests==2.32.3
+requests==2.32.4
 requests-toolbelt==1.0.0
 six==1.17.0
 sniffio==1.3.1
 sortedcontainers==2.4.0
-sqlalchemy==2.0.38
+sqlalchemy==2.0.41
 tenacity==8.5.0
 tiktoken==0.9.0
 tqdm==4.67.1
-typing-extensions==4.12.2
+typing-extensions==4.14.1
 typing-inspect==0.9.0
-tzdata==2025.1
-urllib3==2.3.0
+typing-inspection==0.4.1
+tzdata==2025.2
+urllib3==2.5.0
 vcrpy==7.0.0
 wrapt==1.17.2
 xxhash==3.5.0
-yarl==1.18.3
+yarl==1.20.1
diff --git a/.riot/requirements/1687eab.txt b/.riot/requirements/46e9996.txt
similarity index 50%
rename from .riot/requirements/1687eab.txt
rename to .riot/requirements/46e9996.txt
index 009a5ede488..6c8fc1bdc4c 100644
--- a/.riot/requirements/1687eab.txt
+++ b/.riot/requirements/46e9996.txt
@@ -2,33 +2,35 @@
 # This file is autogenerated by pip-compile with Python 3.11
 # by the following command:
 #
-#    pip-compile --allow-unsafe --no-annotate --resolver=backtracking .riot/requirements/1687eab.in
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/46e9996.in
 #
-aiohappyeyeballs==2.4.8
-aiohttp==3.11.13
-aiosignal==1.3.2
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.14
+aiosignal==1.4.0
 annotated-types==0.7.0
-anyio==4.8.0
+anyio==4.9.0
 appdirs==1.4.4
-attrs==25.1.0
-certifi==2025.1.31
-charset-normalizer==3.4.1
-coverage[toml]==7.6.12
+attrs==25.3.0
+certifi==2025.7.14
+charset-normalizer==3.4.2
+coverage[toml]==7.9.2
 dataclasses-json==0.6.7
-datasets==3.3.2
+datasets==4.0.0
 dill==0.3.8
 distro==1.9.0
-filelock==3.17.0
-frozenlist==1.5.0
-fsspec[http]==2024.12.0
-h11==0.14.0
-httpcore==1.0.7
+filelock==3.18.0
+frozenlist==1.7.0
+fsspec[http]==2025.3.0
+greenlet==3.2.3
+h11==0.16.0
+hf-xet==1.1.5
+httpcore==1.0.9
 httpx==0.28.1
-huggingface-hub==0.29.2
+huggingface-hub==0.33.4
 hypothesis==6.45.0
 idna==3.10
-iniconfig==2.0.0
-jiter==0.8.2
+iniconfig==2.1.0
+jiter==0.10.0
 jsonpatch==1.33
 jsonpointer==3.0.0
 langchain==0.2.17
@@ -39,45 +41,47 @@ langchain-text-splitters==0.2.4
 langsmith==0.1.147
 marshmallow==3.26.1
 mock==5.2.0
-multidict==6.1.0
+multidict==6.6.3
 multiprocess==0.70.16
-mypy-extensions==1.0.0
+mypy-extensions==1.1.0
 nest-asyncio==1.6.0
 numpy==1.26.4
-openai==1.65.3
+openai==1.97.1
 opentracing==2.4.0
-orjson==3.10.15
+orjson==3.11.0
 packaging==24.2
-pandas==2.2.3
-pluggy==1.5.0
-propcache==0.3.0
-pyarrow==19.0.1
-pydantic==2.10.6
-pydantic-core==2.27.2
+pandas==2.3.1
+pluggy==1.6.0
+propcache==0.3.2
+pyarrow==21.0.0
+pydantic==2.11.7
+pydantic-core==2.33.2
+pygments==2.19.2
 pysbd==0.3.4
-pytest==8.3.5
+pytest==8.4.1
 pytest-asyncio==0.21.1
-pytest-cov==6.0.0
-pytest-mock==3.14.0
+pytest-cov==6.2.1
+pytest-mock==3.14.1
 python-dateutil==2.9.0.post0
-pytz==2025.1
+pytz==2025.2
 pyyaml==6.0.2
 ragas==0.1.21
 regex==2024.11.6
-requests==2.32.3
+requests==2.32.4
 requests-toolbelt==1.0.0
 six==1.17.0
 sniffio==1.3.1
 sortedcontainers==2.4.0
-sqlalchemy==2.0.38
+sqlalchemy==2.0.41
 tenacity==8.5.0
 tiktoken==0.9.0
 tqdm==4.67.1
-typing-extensions==4.12.2
+typing-extensions==4.14.1
 typing-inspect==0.9.0
-tzdata==2025.1
-urllib3==2.3.0
+typing-inspection==0.4.1
+tzdata==2025.2
+urllib3==2.5.0
 vcrpy==7.0.0
 wrapt==1.17.2
 xxhash==3.5.0
-yarl==1.18.3
+yarl==1.20.1
diff --git a/.riot/requirements/771848b.txt b/.riot/requirements/5908834.txt
similarity index 52%
rename from .riot/requirements/771848b.txt
rename to .riot/requirements/5908834.txt
index cd804c107c2..443cdf7b385 100644
--- a/.riot/requirements/771848b.txt
+++ b/.riot/requirements/5908834.txt
@@ -2,35 +2,37 @@
 # This file is autogenerated by pip-compile with Python 3.9
 # by the following command:
 #
-#    pip-compile --allow-unsafe --no-annotate --resolver=backtracking .riot/requirements/771848b.in
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/5908834.in
 #
-aiohappyeyeballs==2.4.8
-aiohttp==3.11.13
-aiosignal==1.3.2
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.14
+aiosignal==1.4.0
 annotated-types==0.7.0
-anyio==4.8.0
+anyio==4.9.0
 appdirs==1.4.4
 async-timeout==4.0.3
-attrs==25.1.0
-certifi==2025.1.31
-charset-normalizer==3.4.1
-coverage[toml]==7.6.12
+attrs==25.3.0
+certifi==2025.7.14
+charset-normalizer==3.4.2
+coverage[toml]==7.9.2
 dataclasses-json==0.6.7
-datasets==3.3.2
+datasets==4.0.0
 dill==0.3.8
 distro==1.9.0
-exceptiongroup==1.2.2
-filelock==3.17.0
-frozenlist==1.5.0
-fsspec[http]==2024.12.0
-h11==0.14.0
-httpcore==1.0.7
+exceptiongroup==1.3.0
+filelock==3.18.0
+frozenlist==1.7.0
+fsspec[http]==2025.3.0
+greenlet==3.2.3
+h11==0.16.0
+hf-xet==1.1.5
+httpcore==1.0.9
 httpx==0.28.1
-huggingface-hub==0.29.2
+huggingface-hub==0.33.4
 hypothesis==6.45.0
 idna==3.10
-iniconfig==2.0.0
-jiter==0.8.2
+iniconfig==2.1.0
+jiter==0.10.0
 jsonpatch==1.33
 jsonpointer==3.0.0
 langchain==0.2.17
@@ -41,46 +43,48 @@ langchain-text-splitters==0.2.4
 langsmith==0.1.147
 marshmallow==3.26.1
 mock==5.2.0
-multidict==6.1.0
+multidict==6.6.3
 multiprocess==0.70.16
-mypy-extensions==1.0.0
+mypy-extensions==1.1.0
 nest-asyncio==1.6.0
 numpy==1.26.4
-openai==1.65.3
+openai==1.97.1
 opentracing==2.4.0
-orjson==3.10.15
+orjson==3.11.0
 packaging==24.2
-pandas==2.2.3
-pluggy==1.5.0
-propcache==0.3.0
-pyarrow==19.0.1
-pydantic==2.10.6
-pydantic-core==2.27.2
+pandas==2.3.1
+pluggy==1.6.0
+propcache==0.3.2
+pyarrow==21.0.0
+pydantic==2.11.7
+pydantic-core==2.33.2
+pygments==2.19.2
 pysbd==0.3.4
-pytest==8.3.5
+pytest==8.4.1
 pytest-asyncio==0.21.1
-pytest-cov==6.0.0
-pytest-mock==3.14.0
+pytest-cov==6.2.1
+pytest-mock==3.14.1
 python-dateutil==2.9.0.post0
-pytz==2025.1
+pytz==2025.2
 pyyaml==6.0.2
 ragas==0.1.21
 regex==2024.11.6
-requests==2.32.3
+requests==2.32.4
 requests-toolbelt==1.0.0
 six==1.17.0
 sniffio==1.3.1
 sortedcontainers==2.4.0
-sqlalchemy==2.0.38
+sqlalchemy==2.0.41
 tenacity==8.5.0
 tiktoken==0.9.0
 tomli==2.2.1
 tqdm==4.67.1
-typing-extensions==4.12.2
+typing-extensions==4.14.1
 typing-inspect==0.9.0
-tzdata==2025.1
+typing-inspection==0.4.1
+tzdata==2025.2
 urllib3==1.26.20
 vcrpy==7.0.0
 wrapt==1.17.2
 xxhash==3.5.0
-yarl==1.18.3
+yarl==1.20.1
diff --git a/.riot/requirements/146f2d8.txt b/.riot/requirements/97f1328.txt
similarity index 74%
rename from .riot/requirements/146f2d8.txt
rename to .riot/requirements/97f1328.txt
index e552a5620a3..68875c403e1 100644
--- a/.riot/requirements/146f2d8.txt
+++ b/.riot/requirements/97f1328.txt
@@ -2,7 +2,7 @@
 # This file is autogenerated by pip-compile with Python 3.8
 # by the following command:
 #
-#    pip-compile --allow-unsafe --no-annotate --resolver=backtracking .riot/requirements/146f2d8.in
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/97f1328.in
 #
 aiohappyeyeballs==2.4.4
 aiohttp==3.10.11
@@ -11,26 +11,28 @@ annotated-types==0.7.0
 anyio==4.5.2
 appdirs==1.4.4
 async-timeout==4.0.3
-attrs==25.1.0
-certifi==2025.1.31
-charset-normalizer==3.4.1
+attrs==25.3.0
+certifi==2025.7.14
+charset-normalizer==3.4.2
 coverage[toml]==7.6.1
 dataclasses-json==0.6.7
 datasets==3.1.0
 dill==0.3.8
 distro==1.9.0
-exceptiongroup==1.2.2
+exceptiongroup==1.3.0
 filelock==3.16.1
 frozenlist==1.5.0
 fsspec[http]==2024.9.0
-h11==0.14.0
-httpcore==1.0.7
+greenlet==3.1.1
+h11==0.16.0
+hf-xet==1.1.5
+httpcore==1.0.9
 httpx==0.28.1
-huggingface-hub==0.29.2
+huggingface-hub==0.33.4
 hypothesis==6.45.0
 idna==3.10
-iniconfig==2.0.0
-jiter==0.8.2
+iniconfig==2.1.0
+jiter==0.9.1
 jsonpatch==1.33
 jsonpointer==3.0.0
 langchain==0.2.17
@@ -43,10 +45,10 @@ marshmallow==3.22.0
 mock==5.2.0
 multidict==6.1.0
 multiprocess==0.70.16
-mypy-extensions==1.0.0
+mypy-extensions==1.1.0
 nest-asyncio==1.6.0
 numpy==1.24.4
-openai==1.65.3
+openai==1.97.1
 opentracing==2.4.0
 orjson==3.10.15
 packaging==24.2
@@ -60,25 +62,25 @@ pysbd==0.3.4
 pytest==8.3.5
 pytest-asyncio==0.21.1
 pytest-cov==5.0.0
-pytest-mock==3.14.0
+pytest-mock==3.14.1
 python-dateutil==2.9.0.post0
-pytz==2025.1
+pytz==2025.2
 pyyaml==6.0.2
 ragas==0.1.21
 regex==2024.11.6
-requests==2.32.3
+requests==2.32.4
 requests-toolbelt==1.0.0
 six==1.17.0
 sniffio==1.3.1
 sortedcontainers==2.4.0
-sqlalchemy==2.0.38
+sqlalchemy==2.0.41
 tenacity==8.5.0
 tiktoken==0.7.0
 tomli==2.2.1
 tqdm==4.67.1
-typing-extensions==4.12.2
+typing-extensions==4.13.2
 typing-inspect==0.9.0
-tzdata==2025.1
+tzdata==2025.2
 urllib3==1.26.20
 vcrpy==6.0.2
 wrapt==1.17.2
diff --git a/.riot/requirements/12c5529.txt b/.riot/requirements/f37741b.txt
similarity index 54%
rename from .riot/requirements/12c5529.txt
rename to .riot/requirements/f37741b.txt
index 40afeea9f40..f258baee359 100644
--- a/.riot/requirements/12c5529.txt
+++ b/.riot/requirements/f37741b.txt
@@ -2,35 +2,37 @@
 # This file is autogenerated by pip-compile with Python 3.10
 # by the following command:
 #
-#    pip-compile --allow-unsafe --no-annotate .riot/requirements/12c5529.in
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/f37741b.in
 #
-aiohappyeyeballs==2.4.8
-aiohttp==3.11.13
-aiosignal==1.3.2
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.14
+aiosignal==1.4.0
 annotated-types==0.7.0
-anyio==4.8.0
+anyio==4.9.0
 appdirs==1.4.4
 async-timeout==4.0.3
-attrs==25.1.0
-certifi==2025.1.31
-charset-normalizer==3.4.1
-coverage[toml]==7.6.12
+attrs==25.3.0
+certifi==2025.7.14
+charset-normalizer==3.4.2
+coverage[toml]==7.9.2
 dataclasses-json==0.6.7
-datasets==3.3.2
+datasets==4.0.0
 dill==0.3.8
 distro==1.9.0
-exceptiongroup==1.2.2
-filelock==3.17.0
-frozenlist==1.5.0
-fsspec[http]==2024.12.0
-h11==0.14.0
-httpcore==1.0.7
+exceptiongroup==1.3.0
+filelock==3.18.0
+frozenlist==1.7.0
+fsspec[http]==2025.3.0
+greenlet==3.2.3
+h11==0.16.0
+hf-xet==1.1.5
+httpcore==1.0.9
 httpx==0.28.1
-huggingface-hub==0.29.2
+huggingface-hub==0.33.4
 hypothesis==6.45.0
 idna==3.10
-iniconfig==2.0.0
-jiter==0.8.2
+iniconfig==2.1.0
+jiter==0.10.0
 jsonpatch==1.33
 jsonpointer==3.0.0
 langchain==0.2.17
@@ -41,46 +43,48 @@ langchain-text-splitters==0.2.4
 langsmith==0.1.147
 marshmallow==3.26.1
 mock==5.2.0
-multidict==6.1.0
+multidict==6.6.3
 multiprocess==0.70.16
-mypy-extensions==1.0.0
+mypy-extensions==1.1.0
 nest-asyncio==1.6.0
 numpy==1.26.4
-openai==1.65.3
+openai==1.97.1
 opentracing==2.4.0
-orjson==3.10.15
+orjson==3.11.0
 packaging==24.2
-pandas==2.2.3
-pluggy==1.5.0
-propcache==0.3.0
-pyarrow==19.0.1
-pydantic==2.10.6
-pydantic-core==2.27.2
+pandas==2.3.1
+pluggy==1.6.0
+propcache==0.3.2
+pyarrow==21.0.0
+pydantic==2.11.7
+pydantic-core==2.33.2
+pygments==2.19.2
 pysbd==0.3.4
-pytest==8.3.5
+pytest==8.4.1
 pytest-asyncio==0.21.1
-pytest-cov==6.0.0
-pytest-mock==3.14.0
+pytest-cov==6.2.1
+pytest-mock==3.14.1
 python-dateutil==2.9.0.post0
-pytz==2025.1
+pytz==2025.2
 pyyaml==6.0.2
 ragas==0.1.21
 regex==2024.11.6
-requests==2.32.3
+requests==2.32.4
 requests-toolbelt==1.0.0
 six==1.17.0
 sniffio==1.3.1
 sortedcontainers==2.4.0
-sqlalchemy==2.0.38
+sqlalchemy==2.0.41
 tenacity==8.5.0
 tiktoken==0.9.0
 tomli==2.2.1
 tqdm==4.67.1
-typing-extensions==4.12.2
+typing-extensions==4.14.1
 typing-inspect==0.9.0
-tzdata==2025.1
-urllib3==2.3.0
+typing-inspection==0.4.1
+tzdata==2025.2
+urllib3==2.5.0
 vcrpy==7.0.0
 wrapt==1.17.2
 xxhash==3.5.0
-yarl==1.18.3
+yarl==1.20.1
diff --git a/.riot/requirements/4102ef5.txt b/.riot/requirements/ffd66c1.txt
similarity index 54%
rename from .riot/requirements/4102ef5.txt
rename to .riot/requirements/ffd66c1.txt
index d0518848787..89c4f02b050 100644
--- a/.riot/requirements/4102ef5.txt
+++ b/.riot/requirements/ffd66c1.txt
@@ -2,33 +2,35 @@
 # This file is autogenerated by pip-compile with Python 3.12
 # by the following command:
 #
-#    pip-compile --allow-unsafe --no-annotate .riot/requirements/4102ef5.in
+#    pip-compile --allow-unsafe --no-annotate .riot/requirements/ffd66c1.in
 #
-aiohappyeyeballs==2.4.8
-aiohttp==3.11.13
-aiosignal==1.3.2
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.14
+aiosignal==1.4.0
 annotated-types==0.7.0
-anyio==4.8.0
+anyio==4.9.0
 appdirs==1.4.4
-attrs==25.1.0
-certifi==2025.1.31
-charset-normalizer==3.4.1
-coverage[toml]==7.6.12
+attrs==25.3.0
+certifi==2025.7.14
+charset-normalizer==3.4.2
+coverage[toml]==7.9.2
 dataclasses-json==0.6.7
-datasets==3.3.2
+datasets==4.0.0
 dill==0.3.8
 distro==1.9.0
-filelock==3.17.0
-frozenlist==1.5.0
-fsspec[http]==2024.12.0
-h11==0.14.0
-httpcore==1.0.7
+filelock==3.18.0
+frozenlist==1.7.0
+fsspec[http]==2025.3.0
+greenlet==3.2.3
+h11==0.16.0
+hf-xet==1.1.5
+httpcore==1.0.9
 httpx==0.28.1
-huggingface-hub==0.29.2
+huggingface-hub==0.33.4
 hypothesis==6.45.0
 idna==3.10
-iniconfig==2.0.0
-jiter==0.8.2
+iniconfig==2.1.0
+jiter==0.10.0
 jsonpatch==1.33
 jsonpointer==3.0.0
 langchain==0.2.17
@@ -39,45 +41,47 @@ langchain-text-splitters==0.2.4
 langsmith==0.1.147
 marshmallow==3.26.1
 mock==5.2.0
-multidict==6.1.0
+multidict==6.6.3
 multiprocess==0.70.16
-mypy-extensions==1.0.0
+mypy-extensions==1.1.0
 nest-asyncio==1.6.0
 numpy==1.26.4
-openai==1.65.3
+openai==1.97.1
 opentracing==2.4.0
-orjson==3.10.15
+orjson==3.11.0
 packaging==24.2
-pandas==2.2.3
-pluggy==1.5.0
-propcache==0.3.0
-pyarrow==19.0.1
-pydantic==2.10.6
-pydantic-core==2.27.2
+pandas==2.3.1
+pluggy==1.6.0
+propcache==0.3.2
+pyarrow==21.0.0
+pydantic==2.11.7
+pydantic-core==2.33.2
+pygments==2.19.2
 pysbd==0.3.4
-pytest==8.3.5
+pytest==8.4.1
 pytest-asyncio==0.21.1
-pytest-cov==6.0.0
-pytest-mock==3.14.0
+pytest-cov==6.2.1
+pytest-mock==3.14.1
 python-dateutil==2.9.0.post0
-pytz==2025.1
+pytz==2025.2
 pyyaml==6.0.2
 ragas==0.1.21
 regex==2024.11.6
-requests==2.32.3
+requests==2.32.4
 requests-toolbelt==1.0.0
 six==1.17.0
 sniffio==1.3.1
 sortedcontainers==2.4.0
-sqlalchemy==2.0.38
+sqlalchemy==2.0.41
 tenacity==8.5.0
 tiktoken==0.9.0
 tqdm==4.67.1
-typing-extensions==4.12.2
+typing-extensions==4.14.1
 typing-inspect==0.9.0
-tzdata==2025.1
-urllib3==2.3.0
+typing-inspection==0.4.1
+tzdata==2025.2
+urllib3==2.5.0
 vcrpy==7.0.0
 wrapt==1.17.2
 xxhash==3.5.0
-yarl==1.18.3
+yarl==1.20.1
diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py
index 10c81842e4c..e28956390be 100644
--- a/ddtrace/llmobs/_constants.py
+++ b/ddtrace/llmobs/_constants.py
@@ -56,6 +56,7 @@
 EVP_PAYLOAD_SIZE_LIMIT = 5 << 20  # 5MB (actual limit is 5.1MB)
 EVP_EVENT_SIZE_LIMIT = (1 << 20) - 1024  # 999KB (actual limit is 1MB)
 
+EXPERIMENT_CSV_FIELD_MAX_SIZE = 10 * 1024 * 1024  # 10MB; passed to csv.field_size_limit() when loading a dataset CSV
 
 DROPPED_IO_COLLECTION_ERROR = "dropped_io"
 DROPPED_VALUE_TEXT = "[This value has been dropped because this span's size exceeds the 1MB size limit.]"
diff --git a/ddtrace/llmobs/_experiment.py b/ddtrace/llmobs/_experiment.py
index 8fcf6706065..996af3cd00d 100644
--- a/ddtrace/llmobs/_experiment.py
+++ b/ddtrace/llmobs/_experiment.py
@@ -177,12 +177,56 @@ def __len__(self) -> int:
     def __iter__(self) -> Iterator[DatasetRecord]:
         return iter(self._records)
 
+    def as_dataframe(self) -> "Any":
+        """Return the dataset records as a pandas DataFrame with two-level ``(section, key)`` columns.
+
+        ``section`` is ``input_data``, ``expected_output`` or ``metadata``; ``key`` is the key inside that
+        section, or ``""`` when ``input_data``/``expected_output`` is not a dict. Columns keep first-seen
+        order, cells a record does not define are ``None``, and no columns at all gives an empty DataFrame.
+
+        :returns: a ``pandas.DataFrame``; annotated as ``Any`` because pandas is imported lazily.
+        :raises ImportError: if pandas is not installed.
+        """
+        try:
+            import pandas as pd
+        except ImportError as e:
+            raise ImportError(
+                "pandas is required to convert dataset to DataFrame. Please install via `pip install pandas`"
+            ) from e
+
+        # A dict (insertion-ordered) instead of a set, so the column order is stable across runs.
+        column_tuples = {}  # type: Dict[Tuple[str, str], None]
+        data_rows = []
+        for record in self._records:
+            flat_record = {}  # type: Dict[Union[str, Tuple[str, str]], Any]
+
+            for section in ("input_data", "expected_output"):
+                value = record.get(section, {})
+                # A dict fans out to one column per key; any other value gets a single ``""`` column.
+                flat_values = value if isinstance(value, dict) else {"": value}
+                for key, item in flat_values.items():
+                    flat_record[(section, key)] = item
+                    column_tuples.setdefault((section, key))
+
+            for metadata_col, metadata_val in record.get("metadata", {}).items():
+                flat_record[("metadata", metadata_col)] = metadata_val
+                column_tuples.setdefault(("metadata", metadata_col))
+
+            data_rows.append(flat_record)
+
+        columns = list(column_tuples)
+        if not columns:
+            # MultiIndex.from_tuples cannot infer the number of levels from an empty list.
+            return pd.DataFrame()
+        records_list = [[flat_record.get(col) for col in columns] for flat_record in data_rows]
+        return pd.DataFrame(data=records_list, columns=pd.MultiIndex.from_tuples(columns))
+
 
 class Experiment:
     def __init__(
         self,
         name: str,
-        task: Callable[[DatasetRecordInputType], JSONType],
+        task: Callable[[DatasetRecordInputType, Optional[ExperimentConfigType]], JSONType],
         dataset: Dataset,
         evaluators: List[Callable[[DatasetRecordInputType, JSONType, JSONType], JSONType]],
         project_name: str,
@@ -224,7 +268,8 @@ def run(
         if not self._llmobs_instance.enabled:
             logger.warning(
                 "Skipping experiment as LLMObs is not enabled. "
-                "Ensure LLM Observability is enabled via `LLMObs.enable(...)` or set `DD_LLMOBS_ENABLED=1`."
+                "Ensure LLM Observability is enabled via `LLMObs.enable(...)` "
+                "or set `DD_LLMOBS_ENABLED=1` and use `ddtrace-run` to run your application."
             )
             return []
 
@@ -273,7 +318,7 @@ def _process_record(self, idx_record: Tuple[int, DatasetRecord]) -> Optional[Tas
             }
             output_data = None
             try:
-                output_data = self._task(input_data)
+                output_data = self._task(input_data, self._config)
             except Exception:
                 span.set_exc_info(*sys.exc_info())
             self._llmobs_instance.annotate(span, input_data=input_data, output_data=output_data, tags=tags)
diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index d7a6db680cd..74fdd935944 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -1,3 +1,4 @@
+import csv
 from dataclasses import dataclass
 from dataclasses import field
 import inspect
@@ -49,6 +50,7 @@
 from ddtrace.llmobs._constants import DISPATCH_ON_LLM_TOOL_CHOICE
 from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL
 from ddtrace.llmobs._constants import DISPATCH_ON_TOOL_CALL_OUTPUT_USED
+from ddtrace.llmobs._constants import EXPERIMENT_CSV_FIELD_MAX_SIZE
 from ddtrace.llmobs._constants import EXPERIMENT_EXPECTED_OUTPUT
 from ddtrace.llmobs._constants import EXPERIMENT_ID_KEY
 from ddtrace.llmobs._constants import INPUT_DOCUMENTS
@@ -78,8 +80,8 @@
 from ddtrace.llmobs._context import LLMObsContextProvider
 from ddtrace.llmobs._evaluators.runner import EvaluatorRunner
 from ddtrace.llmobs._experiment import Dataset
+from ddtrace.llmobs._experiment import DatasetRecord
 from ddtrace.llmobs._experiment import DatasetRecordInputType
-from ddtrace.llmobs._experiment import DatasetRecordRaw as DatasetRecord
 from ddtrace.llmobs._experiment import Experiment
 from ddtrace.llmobs._experiment import ExperimentConfigType
 from ddtrace.llmobs._experiment import JSONType
@@ -601,6 +603,67 @@ def create_dataset(cls, name: str, description: str, records: List[DatasetRecord
             ds.push()
         return ds
 
+    @classmethod
+    def create_dataset_from_csv(
+        cls,
+        csv_path: str,
+        dataset_name: str,
+        input_data_columns: List[str],
+        expected_output_columns: List[str],
+        metadata_columns: Optional[List[str]] = None,
+        csv_delimiter: str = ",",
+        description="",
+    ) -> Dataset:
+        """Create a dataset from the rows of a CSV file whose first row is the header.
+
+        :param csv_path: Path of the CSV file to read.
+        :param dataset_name: Name of the dataset to create.
+        :param input_data_columns: Header columns copied into each record's ``input_data``.
+        :param expected_output_columns: Header columns copied into each record's ``expected_output``.
+        :param metadata_columns: Header columns copied into each record's ``metadata`` (default: none).
+        :param csv_delimiter: Field delimiter of the CSV file.
+        :param description: Description of the dataset to create.
+        :raises ValueError: if the file has no header or a requested column is missing from it.
+        """
+        metadata_columns = metadata_columns or []
+        labels = ("Input", "Expected output", "Metadata")
+        # csv.field_size_limit is process-wide: raise it for this read only, then restore it.
+        original_field_size_limit = csv.field_size_limit()
+        csv.field_size_limit(EXPERIMENT_CSV_FIELD_MAX_SIZE)  # 10mb
+        try:
+            # newline="" leaves line endings inside quoted fields to the csv module.
+            with open(csv_path, mode="r", newline="") as csvfile:
+                if not csvfile.readline().strip():
+                    raise ValueError("CSV file appears to be empty or header is missing.")
+                csvfile.seek(0)
+                rows = csv.DictReader(csvfile, delimiter=csv_delimiter)
+                header_columns = rows.fieldnames
+                if header_columns is None:
+                    raise ValueError("CSV file appears to be empty or header is missing.")
+                for label, wanted in zip(labels, (input_data_columns, expected_output_columns, metadata_columns)):
+                    missing = [col for col in wanted if col not in header_columns]
+                    if missing:
+                        raise ValueError(f"{label} columns not found in CSV header: {missing}")
+                records = [
+                    DatasetRecord(
+                        input_data={col: row[col] for col in input_data_columns},
+                        expected_output={col: row[col] for col in expected_output_columns},
+                        metadata={col: row[col] for col in metadata_columns},
+                        record_id="",
+                    )
+                    for row in rows
+                ]
+        finally:
+            csv.field_size_limit(original_field_size_limit)
+
+        # Create the dataset only after the CSV parsed cleanly, so a bad file leaves nothing behind.
+        ds = cls._instance._dne_client.dataset_create(dataset_name, description)
+        for record in records:
+            ds.append(record)
+        if len(ds) > 0:
+            ds.push()
+        return ds
+
     @classmethod
     def _delete_dataset(cls, dataset_id: str) -> None:
         return cls._instance._dne_client.dataset_delete(dataset_id)
@@ -609,7 +672,7 @@ def _delete_dataset(cls, dataset_id: str) -> None:
     def experiment(
         cls,
         name: str,
-        task: Callable[[DatasetRecordInputType], JSONType],
+        task: Callable[[DatasetRecordInputType, Optional[ExperimentConfigType]], JSONType],
         dataset: Dataset,
         evaluators: List[Callable[[DatasetRecordInputType, JSONType, JSONType], JSONType]],
         description: str = "",
@@ -619,7 +682,7 @@ def experiment(
         """Initializes an Experiment to run a task on a Dataset and evaluators.
 
         :param name: The name of the experiment.
-        :param task: The task function to run. Must accept a parameter ``input_data`` and optionally ``config``.
+        :param task: The task function to run. Must accept parameters ``input_data`` and ``config``.
         :param dataset: The dataset to run the experiment on, created with LLMObs.pull/create_dataset().
         :param evaluators: A list of evaluator functions to evaluate the task output.
                            Must accept parameters ``input_data``, ``output_data``, and ``expected_output``.
@@ -631,8 +694,8 @@ def experiment(
             raise TypeError("task must be a callable function.")
         sig = inspect.signature(task)
         params = sig.parameters
-        if "input_data" not in params:
-            raise TypeError("Task function must accept 'input_data' parameters.")
+        if "input_data" not in params or "config" not in params:
+            raise TypeError("Task function must have 'input_data' and 'config' parameters.")
         if not isinstance(dataset, Dataset):
             raise TypeError("Dataset must be an LLMObs Dataset object.")
         if not evaluators or not all(callable(evaluator) for evaluator in evaluators):
diff --git a/riotfile.py b/riotfile.py
index 790dea22d29..ae6ceead39f 100644
--- a/riotfile.py
+++ b/riotfile.py
@@ -3134,6 +3134,7 @@ def select_pys(min_version: str = MIN_PYTHON_VERSION, max_version: str = MAX_PYT
                 "pytest-asyncio": "==0.21.1",
                 "ragas": "==0.1.21",
                 "langchain": latest,
+                "pandas": latest,
             },
             pys=select_pys(min_version="3.8"),
         ),
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_34c161e3.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_34c161e3.yaml
new file mode 100644
index 00000000000..3e4ea62376f
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_34c161e3.yaml
@@ -0,0 +1,49 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "id": "8dbba503-cf48-4e82-805e-1d9cb46e5ed3",
+      "attributes": {"insert_records": [{"input": {"in0": "r0v1", "in1": "r0v2", "in2":
+      "r0v3"}, "expected_output": {"out0": "r0v4", "out1": "r0v5"}, "metadata": {"m0":
+      "r0v6"}}, {"input": {"in0": "r1v1", "in1": "r1v2", "in2": "r1v3"}, "expected_output":
+      {"out0": "r1v4", "out1": "r1v5"}, "metadata": {"m0": "r1v6"}}], "update_records":
+      [], "delete_records": []}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '434'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/8dbba503-cf48-4e82-805e-1d9cb46e5ed3/batch_update
+  response:
+    body:
+      string: '{"data":[]}'
+    headers:
+      content-length:
+      - '11'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 16:27:16 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_466da69c.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_466da69c.yaml
new file mode 100644
index 00000000000..81adf342dcc
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_batch_update_post_466da69c.yaml
@@ -0,0 +1,49 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "id": "8dbba503-cf48-4e82-805e-1d9cb46e5ed3",
+      "attributes": {"insert_records": [{"input": {"in0": "r0v1", "in1": "r0v2", "in2":
+      "r0v3"}, "expected_output": {"out0": "r0v4", "out1": "r0v5"}, "metadata": {}},
+      {"input": {"in0": "r1v1", "in1": "r1v2", "in2": "r1v3"}, "expected_output":
+      {"out0": "r1v4", "out1": "r1v5"}, "metadata": {}}], "update_records": [], "delete_records":
+      []}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '410'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/8dbba503-cf48-4e82-805e-1d9cb46e5ed3/batch_update
+  response:
+    body:
+      string: '{"data":[{"id":"79dc4c72-e4fd-4de7-b46e-419b8072a207","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:48.152449869Z","dataset_id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","expected_output":{"out0":"r0v4","out1":"r0v5"},"input":{"in0":"r0v1","in1":"r0v2","in2":"r0v3"},"metadata":{},"updated_at":"2025-07-23T02:50:48.152449869Z","version":1}},{"id":"f0f3fe26-1060-452f-a292-30b1293dd5a1","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:48.152449869Z","dataset_id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","expected_output":{"out0":"r1v4","out1":"r1v5"},"input":{"in0":"r1v1","in1":"r1v2","in2":"r1v3"},"metadata":{},"updated_at":"2025-07-23T02:50:48.152449869Z","version":1}}]}'
+    headers:
+      content-length:
+      - '812'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:48 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_records_get_46238641.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_records_get_46238641.yaml
new file mode 100644
index 00000000000..f8b4fdefe78
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_8dbba503-cf48-4e82-805e-1d9cb46e5ed3_records_get_46238641.yaml
@@ -0,0 +1,45 @@
+interactions:
+- request:
+    body: null
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Length
+      : - '0'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: GET
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/8dbba503-cf48-4e82-805e-1d9cb46e5ed3/records
+  response:
+    body:
+      string: '{"data":[{"id":"79dc4c72-e4fd-4de7-b46e-419b8072a207","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:48.152449Z","dataset_id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","expected_output":{"out0":"r0v4","out1":"r0v5"},"input":{"in0":"r0v1","in1":"r0v2","in2":"r0v3"},"metadata":{},"updated_at":"2025-07-23T02:50:48.152449Z"}},{"id":"f0f3fe26-1060-452f-a292-30b1293dd5a1","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:48.152449Z","dataset_id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","expected_output":{"out0":"r1v4","out1":"r1v5"},"input":{"in0":"r1v1","in1":"r1v2","in2":"r1v3"},"metadata":{},"updated_at":"2025-07-23T02:50:48.152449Z"}}],"meta":{"after":""}}'
+    headers:
+      content-length:
+      - '796'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:52 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_batch_update_post_b8c84073.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_batch_update_post_b8c84073.yaml
new file mode 100644
index 00000000000..eb8a50f0d92
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_batch_update_post_b8c84073.yaml
@@ -0,0 +1,49 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "id": "acf19ca4-8062-4548-abbf-95b33f55d51d",
+      "attributes": {"insert_records": [{"input": {"in0": "r0v1", "in1": "r0v2", "in2":
+      "r0v3"}, "expected_output": {"out0": "r0v4", "out1": "r0v5"}, "metadata": {"m0":
+      "r0v6"}}, {"input": {"in0": "r1v1", "in1": "r1v2", "in2": "r1v3"}, "expected_output":
+      {"out0": "r1v4", "out1": "r1v5"}, "metadata": {"m0": "r1v6"}}], "update_records":
+      [], "delete_records": []}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '434'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/acf19ca4-8062-4548-abbf-95b33f55d51d/batch_update
+  response:
+    body:
+      string: '{"data":[{"id":"016350fb-8b44-4d5d-af57-0c8408bb5d25","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.795124332Z","dataset_id":"acf19ca4-8062-4548-abbf-95b33f55d51d","expected_output":{"out0":"r0v4","out1":"r0v5"},"input":{"in0":"r0v1","in1":"r0v2","in2":"r0v3"},"metadata":{"m0":"r0v6"},"updated_at":"2025-07-23T02:50:52.795124332Z","version":1}},{"id":"91fcfd23-040c-40ad-a0b0-f497f10b41e3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.795124332Z","dataset_id":"acf19ca4-8062-4548-abbf-95b33f55d51d","expected_output":{"out0":"r1v4","out1":"r1v5"},"input":{"in0":"r1v1","in1":"r1v2","in2":"r1v3"},"metadata":{"m0":"r1v6"},"updated_at":"2025-07-23T02:50:52.795124332Z","version":1}}]}'
+    headers:
+      content-length:
+      - '834'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:52 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_records_get_d7e731ad.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_records_get_d7e731ad.yaml
new file mode 100644
index 00000000000..51d0723acfa
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_acf19ca4-8062-4548-abbf-95b33f55d51d_records_get_d7e731ad.yaml
@@ -0,0 +1,45 @@
+interactions:
+- request:
+    body: null
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Length
+      : - '0'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: GET
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/acf19ca4-8062-4548-abbf-95b33f55d51d/records
+  response:
+    body:
+      string: '{"data":[{"id":"016350fb-8b44-4d5d-af57-0c8408bb5d25","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.795124Z","dataset_id":"acf19ca4-8062-4548-abbf-95b33f55d51d","expected_output":{"out0":"r0v4","out1":"r0v5"},"input":{"in0":"r0v1","in1":"r0v2","in2":"r0v3"},"metadata":{"m0":"r0v6"},"updated_at":"2025-07-23T02:50:52.795124Z"}},{"id":"91fcfd23-040c-40ad-a0b0-f497f10b41e3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.795124Z","dataset_id":"acf19ca4-8062-4548-abbf-95b33f55d51d","expected_output":{"out0":"r1v4","out1":"r1v5"},"input":{"in0":"r1v1","in1":"r1v2","in2":"r1v3"},"metadata":{"m0":"r1v6"},"updated_at":"2025-07-23T02:50:52.795124Z"}}],"meta":{"after":""}}'
+    headers:
+      content-length:
+      - '818'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:55 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_d7976483.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_d7976483.yaml
new file mode 100644
index 00000000000..698ca4baa2d
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_d7976483.yaml
@@ -0,0 +1,46 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "attributes": {"type": "soft", "dataset_ids":
+      ["8dbba503-cf48-4e82-805e-1d9cb46e5ed3"]}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '119'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/delete
+  response:
+    body:
+      string: '{"data":[{"id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:46.680898Z","current_version":1,"deleted_at":"2025-07-23T02:50:52.35158Z","description":"A
+        good csv dataset","name":"test-dataset-good-csv","updated_at":"2025-07-23T02:50:48.30675Z"}}]}'
+    headers:
+      content-length:
+      - '357'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:52 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_f8dc510e.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_f8dc510e.yaml
new file mode 100644
index 00000000000..10725a15b17
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_delete_post_f8dc510e.yaml
@@ -0,0 +1,46 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "attributes": {"type": "soft", "dataset_ids":
+      ["acf19ca4-8062-4548-abbf-95b33f55d51d"]}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '119'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets/delete
+  response:
+    body:
+      string: '{"data":[{"id":"acf19ca4-8062-4548-abbf-95b33f55d51d","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.663543Z","current_version":1,"deleted_at":"2025-07-23T02:50:55.361839Z","description":"A
+        good pipe separated csv dataset","name":"test-dataset-good-csv-pipe","updated_at":"2025-07-23T02:50:52.943101Z"}}]}'
+    headers:
+      content-length:
+      - '379'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:55 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv-pipe_get_bcb704ce.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv-pipe_get_bcb704ce.yaml
new file mode 100644
index 00000000000..032e3c997f3
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv-pipe_get_bcb704ce.yaml
@@ -0,0 +1,46 @@
+interactions:
+- request:
+    body: null
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Length
+      : - '0'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: GET
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets?filter%5Bname%5D=test-dataset-good-csv-pipe
+  response:
+    body:
+      string: '{"data":[{"id":"acf19ca4-8062-4548-abbf-95b33f55d51d","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.663543Z","current_version":1,"description":"A
+        good pipe separated csv dataset","name":"test-dataset-good-csv-pipe","updated_at":"2025-07-23T02:50:52.943101Z"}}],"meta":{"after":""}}'
+    headers:
+      content-length:
+      - '356'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:55 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv_get_989b2028.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv_get_989b2028.yaml
new file mode 100644
index 00000000000..02e7ba102ae
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_filter_name__test-dataset-good-csv_get_989b2028.yaml
@@ -0,0 +1,46 @@
+interactions:
+- request:
+    body: null
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Length
+      : - '0'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: GET
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets?filter%5Bname%5D=test-dataset-good-csv
+  response:
+    body:
+      string: '{"data":[{"id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:46.680898Z","current_version":1,"description":"A
+        good csv dataset","name":"test-dataset-good-csv","updated_at":"2025-07-23T02:50:48.30675Z"}}],"meta":{"after":""}}'
+    headers:
+      content-length:
+      - '335'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:51 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_027be704.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_027be704.yaml
new file mode 100644
index 00000000000..232be655b93
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_027be704.yaml
@@ -0,0 +1,46 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "attributes": {"name": "test-dataset-good-csv-pipe",
+      "description": "A good pipe separated csv dataset"}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '136'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets
+  response:
+    body:
+      string: '{"data":{"id":"acf19ca4-8062-4548-abbf-95b33f55d51d","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:52.663543594Z","current_version":0,"description":"A
+        good pipe separated csv dataset","name":"test-dataset-good-csv-pipe","updated_at":"2025-07-23T02:50:52.663543594Z"}}}'
+    headers:
+      content-length:
+      - '340'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:52 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_c1d4ae31.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_c1d4ae31.yaml
new file mode 100644
index 00000000000..de032063c0c
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_c1d4ae31.yaml
@@ -0,0 +1,46 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "attributes": {"name": "test-dataset-good-csv",
+      "description": "A good csv dataset"}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '116'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets
+  response:
+    body:
+      string: '{"data":{"id":"8dbba503-cf48-4e82-805e-1d9cb46e5ed3","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-23T02:50:46.680898186Z","current_version":0,"description":"A
+        good csv dataset","name":"test-dataset-good-csv","updated_at":"2025-07-23T02:50:46.680898186Z"}}}'
+    headers:
+      content-length:
+      - '320'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:46 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_d59b5313.yaml b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_d59b5313.yaml
new file mode 100644
index 00000000000..07a85cd5c60
--- /dev/null
+++ b/tests/llmobs/llmobs_cassettes/datadog/datadog_api_unstable_llm-obs_v1_datasets_post_d59b5313.yaml
@@ -0,0 +1,46 @@
+interactions:
+- request:
+    body: '{"data": {"type": "datasets", "attributes": {"name": "test-dataset-bad-csv",
+      "description": "not a real csv dataset"}}}'
+    headers:
+      Accept:
+      - '*/*'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Accept-Encoding
+      : - identity
+      Connection:
+      - keep-alive
+      Content-Length:
+      - '119'
+      ? !!python/object/apply:multidict._multidict.istr
+      - Content-Type
+      : - application/json
+      User-Agent:
+      - python-requests/2.32.3
+    method: POST
+    uri: https://api.datadoghq.com/api/unstable/llm-obs/v1/datasets
+  response:
+    body:
+      string: '{"data":{"id":"6890ba6e-8023-414c-bdb1-662c63f9f489","type":"datasets","attributes":{"author":{"id":"de473b30-eb9f-11e9-a77a-c7405862b8bd"},"created_at":"2025-07-22T23:10:06.524865Z","current_version":0,"description":"not
+        a real csv dataset","name":"test-dataset-bad-csv","updated_at":"2025-07-22T23:10:06.524865Z"}}}'
+    headers:
+      content-length:
+      - '317'
+      content-security-policy:
+      - frame-ancestors 'self'; report-uri https://logs.browser-intake-datadoghq.com/api/v2/logs?dd-api-key=pube4f163c23bbf91c16b8f57f56af9fc58&dd-evp-origin=content-security-policy&ddsource=csp-report&ddtags=site%3Adatadoghq.com
+      content-type:
+      - application/vnd.api+json
+      date:
+      - Wed, 23 Jul 2025 02:50:47 GMT
+      strict-transport-security:
+      - max-age=31536000; includeSubDomains; preload
+      vary:
+      - Accept-Encoding
+      x-content-type-options:
+      - nosniff
+      x-frame-options:
+      - SAMEORIGIN
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/llmobs/static_files/empty.csv b/tests/llmobs/static_files/empty.csv
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/llmobs/static_files/good_dataset.csv b/tests/llmobs/static_files/good_dataset.csv
new file mode 100644
index 00000000000..be4f03d552c
--- /dev/null
+++ b/tests/llmobs/static_files/good_dataset.csv
@@ -0,0 +1,3 @@
+in0,in1,in2,out0,out1,m0
+r0v1,r0v2,r0v3,r0v4,r0v5,r0v6
+r1v1,r1v2,r1v3,r1v4,r1v5,r1v6
diff --git a/tests/llmobs/static_files/good_dataset_pipe_separated.csv b/tests/llmobs/static_files/good_dataset_pipe_separated.csv
new file mode 100644
index 00000000000..7a5a698efe1
--- /dev/null
+++ b/tests/llmobs/static_files/good_dataset_pipe_separated.csv
@@ -0,0 +1,3 @@
+in0|in1|in2|out0|out1|m0
+r0v1|r0v2|r0v3|r0v4|r0v5|r0v6
+r1v1|r1v2|r1v3|r1v4|r1v5|r1v6
diff --git a/tests/llmobs/test_experiments.py b/tests/llmobs/test_experiments.py
index 8a2747bcd41..57409feae5e 100644
--- a/tests/llmobs/test_experiments.py
+++ b/tests/llmobs/test_experiments.py
@@ -28,11 +28,11 @@ def wait_for_backend():
         time.sleep(2)
 
 
-def dummy_task(input_data):
+def dummy_task(input_data, config):
     return input_data
 
 
-def faulty_task(input_data):
+def faulty_task(input_data, config):
     raise ValueError("This is a test error")
 
 
@@ -86,6 +86,177 @@ def test_dataset_create_delete(llmobs):
     llmobs._delete_dataset(dataset_id=dataset._id)
 
 
+def test_dataset_as_dataframe(llmobs, test_dataset_one_record):
+    dataset = test_dataset_one_record
+    df = dataset.as_dataframe()
+    assert len(df.columns) == 2
+    assert df.size == 2  # assuming a pandas DataFrame: size is rows * columns, i.e. one row here
+
+
+def test_csv_dataset_as_dataframe(llmobs):
+    test_path = os.path.dirname(__file__)
+    csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
+    dataset_id = None  # only set once creation succeeds; guards the cleanup in finally
+    try:
+        dataset = llmobs.create_dataset_from_csv(
+            csv_path=csv_path,
+            dataset_name="test-dataset-good-csv",
+            description="A good csv dataset",
+            input_data_columns=["in0", "in1", "in2"],
+            expected_output_columns=["out0", "out1"],
+            metadata_columns=["m0"],
+        )
+        dataset_id = dataset._id
+        assert len(dataset) == 2  # one record per data row in good_dataset.csv
+
+        df = dataset.as_dataframe()
+        assert len(df.columns) == 6  # 3 input + 2 expected_output + 1 metadata column
+        assert sorted(df.columns) == [  # columns are (section, csv_column) pairs
+            ("expected_output", "out0"),
+            ("expected_output", "out1"),
+            ("input_data", "in0"),
+            ("input_data", "in1"),
+            ("input_data", "in2"),
+            ("metadata", "m0"),
+        ]
+    finally:
+        if dataset_id:  # skip the delete call when the dataset was never created
+            llmobs._delete_dataset(dataset_id=dataset_id)
+
+
+def test_dataset_csv_missing_input_col(llmobs):
+    test_path = os.path.dirname(__file__)
+    csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
+    with pytest.raises(ValueError, match=re.escape("Input columns not found in CSV header: ['in998', 'in999']")):
+        llmobs.create_dataset_from_csv(
+            csv_path=csv_path,
+            dataset_name="test-dataset-good-csv",
+            description="A good csv dataset",
+            input_data_columns=["in998", "in999"],  # neither name is in the CSV header row
+            expected_output_columns=["out0", "out1"],
+        )
+
+
+def test_dataset_csv_missing_output_col(llmobs):
+    test_path = os.path.dirname(__file__)
+    csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
+    with pytest.raises(ValueError, match=re.escape("Expected output columns not found in CSV header: ['out999']")):
+        llmobs.create_dataset_from_csv(
+            csv_path=csv_path,
+            dataset_name="test-dataset-good-csv",
+            description="A good csv dataset",
+            input_data_columns=["in0", "in1", "in2"],  # valid, so only the output check can fail
+            expected_output_columns=["out999"],  # not in the CSV header row
+        )
+
+
+def test_dataset_csv_empty_csv(llmobs):
+    test_path = os.path.dirname(__file__)
+    csv_path = os.path.join(test_path, "static_files/empty.csv")  # zero-byte fixture, no header row
+    with pytest.raises(ValueError, match=re.escape("CSV file appears to be empty or header is missing.")):
+        llmobs.create_dataset_from_csv(
+            csv_path=csv_path,
+            dataset_name="test-dataset-bad-csv",  # NOTE(review): a POST cassette exists for this name; leak? confirm
+            description="not a real csv dataset",
+            input_data_columns=["in0", "in1", "in2"],
+            expected_output_columns=["out0"],
+        )
+
+
+def test_dataset_csv(llmobs):
+    test_path = os.path.dirname(__file__)
+    csv_path = os.path.join(test_path, "static_files/good_dataset.csv")
+    dataset_id = None  # only set once creation succeeds; guards the cleanup in finally
+    try:
+        dataset = llmobs.create_dataset_from_csv(
+            csv_path=csv_path,
+            dataset_name="test-dataset-good-csv",
+            description="A good csv dataset",
+            input_data_columns=["in0", "in1", "in2"],
+            expected_output_columns=["out0", "out1"],  # no metadata_columns: m0 is not mapped here
+        )
+        dataset_id = dataset._id
+        assert len(dataset) == 2  # one record per data row in good_dataset.csv
+        assert len(dataset[0]["input_data"]) == 3
+        assert dataset[0]["input_data"]["in0"] == "r0v1"
+        assert dataset[0]["input_data"]["in1"] == "r0v2"
+        assert dataset[0]["input_data"]["in2"] == "r0v3"
+        assert dataset[1]["input_data"]["in0"] == "r1v1"
+        assert dataset[1]["input_data"]["in1"] == "r1v2"
+        assert dataset[1]["input_data"]["in2"] == "r1v3"
+
+        assert len(dataset[0]["expected_output"]) == 2
+        assert dataset[0]["expected_output"]["out0"] == "r0v4"
+        assert dataset[0]["expected_output"]["out1"] == "r0v5"
+        assert dataset[1]["expected_output"]["out0"] == "r1v4"
+        assert dataset[1]["expected_output"]["out1"] == "r1v5"
+
+        assert dataset.description == "A good csv dataset"
+
+        assert dataset._id is not None
+
+        wait_for_backend()  # NOTE(review): presumably lets the backend settle before the pull; confirm
+        ds = llmobs.pull_dataset(name=dataset.name)  # round-trip: fetch the same dataset back by name
+
+        assert len(ds) == len(dataset)
+        assert ds.name == dataset.name
+        assert ds.description == dataset.description
+        assert ds._version == 1
+    finally:
+        if dataset_id:  # skip the delete call when the dataset was never created
+            llmobs._delete_dataset(dataset_id=dataset_id)
+
+
+def test_dataset_csv_pipe_separated(llmobs):
+    test_path = os.path.dirname(__file__)
+    csv_path = os.path.join(test_path, "static_files/good_dataset_pipe_separated.csv")
+    dataset_id = None  # only set once creation succeeds; guards the cleanup in finally
+    try:
+        dataset = llmobs.create_dataset_from_csv(
+            csv_path=csv_path,
+            dataset_name="test-dataset-good-csv-pipe",
+            description="A good pipe separated csv dataset",
+            input_data_columns=["in0", "in1", "in2"],
+            expected_output_columns=["out0", "out1"],
+            metadata_columns=["m0"],
+            csv_delimiter="|",  # the fixture separates fields with '|' instead of ','
+        )
+        dataset_id = dataset._id
+        assert len(dataset) == 2  # one record per data row in the fixture
+        assert len(dataset[0]["input_data"]) == 3
+        assert dataset[0]["input_data"]["in0"] == "r0v1"
+        assert dataset[0]["input_data"]["in1"] == "r0v2"
+        assert dataset[0]["input_data"]["in2"] == "r0v3"
+        assert dataset[1]["input_data"]["in0"] == "r1v1"
+        assert dataset[1]["input_data"]["in1"] == "r1v2"
+        assert dataset[1]["input_data"]["in2"] == "r1v3"
+
+        assert len(dataset[0]["expected_output"]) == 2
+        assert dataset[0]["expected_output"]["out0"] == "r0v4"
+        assert dataset[0]["expected_output"]["out1"] == "r0v5"
+        assert dataset[1]["expected_output"]["out0"] == "r1v4"
+        assert dataset[1]["expected_output"]["out1"] == "r1v5"
+
+        assert len(dataset[0]["metadata"]) == 1
+        assert dataset[0]["metadata"]["m0"] == "r0v6"
+        assert dataset[1]["metadata"]["m0"] == "r1v6"
+
+        assert dataset.description == "A good pipe separated csv dataset"
+
+        assert dataset._id is not None
+
+        wait_for_backend()  # NOTE(review): presumably lets the backend settle before the pull; confirm
+        ds = llmobs.pull_dataset(name=dataset.name)  # round-trip: fetch the same dataset back by name
+
+        assert len(ds) == len(dataset)
+        assert ds.name == dataset.name
+        assert ds.description == dataset.description
+        assert ds._version == 1
+    finally:
+        if dataset_id:  # skip the delete call when the dataset was never created
+            llmobs._delete_dataset(dataset_id=dataset_id)  # use the captured id, as the sibling tests do
+
+
 def test_dataset_pull_non_existent(llmobs):
     with pytest.raises(ValueError):
         llmobs.pull_dataset(name="test-dataset-non-existent")
@@ -218,12 +389,18 @@ def test_experiment_invalid_task_type_raises(llmobs, test_dataset_one_record):
 
 
 def test_experiment_invalid_task_signature_raises(llmobs, test_dataset_one_record):
-    with pytest.raises(TypeError, match="Task function must accept 'input_data' parameters."):
+    with pytest.raises(TypeError, match="Task function must have 'input_data' and 'config' parameters."):
 
         def my_task(not_input):
             pass
 
         llmobs.experiment("test_experiment", my_task, test_dataset_one_record, [dummy_evaluator])
+    with pytest.raises(TypeError, match="Task function must have 'input_data' and 'config' parameters."):
+
+        def my_task(input_data, not_config):
+            pass
+
+        llmobs.experiment("test_experiment", my_task, test_dataset_one_record, [dummy_evaluator])
 
 
 def test_experiment_invalid_dataset_raises(llmobs):