Skip to content

Commit dd283b0

Browse files
feat(studio): multi-file unstructured seed upload with better backend extraction (#4468)
* fix(recipe-studio): prevent fitView from zooming to wrong location on recipe load * feat: add pymupdf/python-docx deps and unstructured uploads storage root * feat: add POST /seed/upload-unstructured-file endpoint * feat: add multi-file chunking with source_file column * feat: update frontend types and API layer for multi-file upload * feat: round-robin preview rows across source files Ensures every uploaded file is represented in the preview table by cycling through sources instead of just taking the first N rows. * fix: disable OCR, fix auto-load timing, fix persistence on reload - Disable pymupdf4llm OCR with write_images=False, show_progress=False - Replace onAllUploaded callback with useEffect that detects uploading→done transition (avoids stale closure reading empty file IDs) - Fix importer to preserve file IDs from saved recipes instead of clearing (clearing only happens at share time via sanitizeSeedForShare) * fix: harden unstructured upload with input validation and state fixes Validate block_id/file_id with alphanumeric regex to prevent path traversal, use exact stem match for file deletion, add error handling for metadata writes and empty files, fix React stale closures and object mutations in upload loop, and correct validation logic for unstructured seed resolved_paths. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix: address PR review - legacy path import, share sanitizer, sync effect Promote legacy source.path into resolved_paths for old unstructured recipes, clear source.paths in share sanitizer to prevent leaking local filesystem paths, and gate file sync effect to dialog open transition so users can actually delete all uploaded files. 
* fix: CSV column fix (BOM + whitespace + unnamed index re-save) for #4470 * fix: harden unstructured upload flow and polish dialog UX * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent f113f35 commit dd283b0

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

49 files changed

+1217
-316
lines changed

studio/backend/models/data_recipe.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from typing import Any
1111

12-
from pydantic import BaseModel, Field
12+
from pydantic import BaseModel, Field, model_validator
1313

1414

1515
class RecipePayload(BaseModel):
@@ -76,13 +76,41 @@ class SeedInspectRequest(BaseModel):
7676

7777

7878
class SeedInspectUploadRequest(BaseModel):
    """Request to inspect an uploaded seed dataset.

    Exactly one of two flows must be supplied:
    - legacy single-file: ``filename`` + ``content_base64``
    - multi-file: ``block_id`` + ``file_ids`` + ``file_names``
    """

    # Legacy single-file flow (mutually exclusive with file_ids)
    filename: str | None = None
    content_base64: str | None = None
    # Multi-file flow (mutually exclusive with content_base64)
    block_id: str | None = None
    file_ids: list[str] | None = None
    file_names: list[str] | None = None
    # Shared fields
    preview_size: int = Field(default=10, ge=1, le=50)
    seed_source_type: str | None = None
    unstructured_chunk_size: int | None = Field(default=None, ge=1, le=20000)
    unstructured_chunk_overlap: int | None = Field(default=None, ge=0, le=20000)

    @model_validator(mode="after")
    def _check_mutual_exclusivity(self) -> "SeedInspectUploadRequest":
        """Reject requests that mix (or omit) the legacy and multi-file flows."""
        uses_legacy = self.content_base64 is not None
        uses_multi = self.file_ids is not None
        if uses_legacy and uses_multi:
            raise ValueError("Provide either content_base64 or file_ids, not both")
        if not (uses_legacy or uses_multi):
            raise ValueError("Provide either content_base64 or file_ids")
        if uses_multi:
            # file_ids is non-None here, so truthiness == non-empty.
            if not self.file_ids:
                raise ValueError("file_ids must not be empty")
            if not self.block_id:
                raise ValueError("block_id is required when using file_ids")
            if self.file_names is None or len(self.file_names) != len(self.file_ids):
                raise ValueError(
                    "file_names must be provided and same length as file_ids"
                )
        if uses_legacy and not self.filename:
            raise ValueError("filename is required when using content_base64")
        return self
113+
86114

87115
class SeedInspectResponse(BaseModel):
88116
dataset_name: str
@@ -91,6 +119,15 @@ class SeedInspectResponse(BaseModel):
91119
preview_rows: list[dict[str, Any]] = Field(default_factory = list)
92120
split: str | None = None
93121
subset: str | None = None
122+
resolved_paths: list[str] | None = None
123+
124+
125+
class UnstructuredFileUploadResponse(BaseModel):
    """Per-file result returned by the unstructured seed upload endpoint."""

    file_id: str
    filename: str
    size_bytes: int
    # "ok" on successful extraction, "error" otherwise (details in ``error``).
    status: str
    error: str | None = None
94131

95132

96133
class McpToolsListRequest(BaseModel):

studio/backend/plugins/data-designer-unstructured-seed/pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ requires-python = ">=3.11"
1313
dependencies = [
1414
"data-designer-engine>=0.5.1,<0.6",
1515
"pandas>=2,<3",
16+
"pymupdf>=1.24.0",
17+
"pymupdf4llm>=0.0.17",
18+
"mammoth>=1.8.0",
1619
]
1720

1821
[project.entry-points."data_designer.plugins"]

studio/backend/plugins/data-designer-unstructured-seed/src/data_designer_unstructured_seed/chunking.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from pathlib import Path
99
from typing import Any
1010

11+
import pandas as pd
12+
1113
from utils.paths import ensure_dir, unstructured_seed_cache_root
1214

1315
DEFAULT_CHUNK_SIZE = 1200
@@ -59,6 +61,59 @@ def build_unstructured_preview_rows(
5961
]
6062

6163

64+
def build_multi_file_preview_rows(
    *,
    file_entries: list[tuple[Path, str]],
    preview_size: int,
    chunk_size: int | None,
    chunk_overlap: int | None,
) -> list[dict[str, str]]:
    """Chunk every uploaded file and return up to ``preview_size`` preview rows.

    Rows are sampled round-robin across source files so each uploaded file
    shows up in the preview table. ``None`` chunking params fall back to the
    module defaults.
    """
    resolved_size = _to_int(chunk_size, DEFAULT_CHUNK_SIZE)
    resolved_overlap = _to_int(chunk_overlap, DEFAULT_CHUNK_OVERLAP)
    _, all_rows = materialize_multi_file_unstructured_seed(
        file_entries=file_entries,
        chunk_size=resolved_size,
        chunk_overlap=resolved_overlap,
    )
    return _round_robin_preview(all_rows, preview_size)
79+
80+
81+
def _round_robin_preview(
82+
rows: list[dict[str, str]],
83+
preview_size: int,
84+
) -> list[dict[str, str]]:
85+
"""Pick preview rows round-robin across source files so every file is represented."""
86+
if not rows or preview_size <= 0:
87+
return []
88+
89+
# Group rows by source_file, preserving order of first appearance
90+
from collections import OrderedDict
91+
92+
grouped: OrderedDict[str, list[dict[str, str]]] = OrderedDict()
93+
for row in rows:
94+
key = row.get("source_file", "")
95+
if key not in grouped:
96+
grouped[key] = []
97+
grouped[key].append(row)
98+
99+
result: list[dict[str, str]] = []
100+
iterators = [iter(chunks) for chunks in grouped.values()]
101+
while len(result) < preview_size and iterators:
102+
exhausted: list[int] = []
103+
for i, it in enumerate(iterators):
104+
if len(result) >= preview_size:
105+
break
106+
val = next(it, None)
107+
if val is not None:
108+
result.append(val)
109+
else:
110+
exhausted.append(i)
111+
for i in reversed(exhausted):
112+
iterators.pop(i)
113+
114+
return result
115+
116+
62117
def materialize_unstructured_seed_dataset(
63118
*,
64119
source_path: Path,
@@ -103,6 +158,43 @@ def materialize_unstructured_seed_dataset(
103158
return parquet_path, rows
104159

105160

161+
def materialize_multi_file_unstructured_seed(
    *,
    file_entries: list[tuple[Path, str]],  # (extracted_txt_path, original_filename)
    chunk_size: int,
    chunk_overlap: int,
) -> tuple[Path, list[dict[str, str]]]:
    """Chunk multiple files and combine into one parquet dataset with source_file column.

    Returns the parquet path and the row dicts. Results are cached on disk,
    keyed by file identity and the (resolved) chunking parameters.
    """
    chunk_size, chunk_overlap = resolve_chunking(chunk_size, chunk_overlap)
    cache_key = _compute_multi_file_cache_key(file_entries, chunk_size, chunk_overlap)
    parquet_path = _CACHE_DIR / f"{cache_key}.parquet"

    # Cache hit: reload the rows from the previously materialized parquet.
    if parquet_path.exists():
        cached_df = pd.read_parquet(parquet_path)
        return parquet_path, cached_df.to_dict(orient="records")

    all_rows: list[dict[str, str]] = [
        {"chunk_text": chunk, "source_file": original_name}
        for txt_path, original_name in file_entries
        for chunk in split_text_into_chunks(
            text=load_unstructured_text_file(txt_path),
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    ]
    if not all_rows:
        raise ValueError("No text found in any uploaded files.")

    ensure_dir(_CACHE_DIR)
    # Write to a temp file, then rename, so readers never see a partial parquet.
    tmp_path = _CACHE_DIR / f"{cache_key}.tmp.parquet"
    pd.DataFrame(all_rows).to_parquet(tmp_path, index=False)
    tmp_path.replace(parquet_path)
    return parquet_path, all_rows
196+
197+
106198
def load_unstructured_text_file(path: Path) -> str:
107199
ext = path.suffix.lower()
108200
if ext not in {".txt", ".md"}:
@@ -193,3 +285,17 @@ def _compute_cache_key(
193285
]
194286
).encode("utf-8")
195287
return hashlib.sha256(payload).hexdigest()
288+
289+
290+
def _compute_multi_file_cache_key(
291+
file_entries: list[tuple[Path, str]],
292+
chunk_size: int,
293+
chunk_overlap: int,
294+
) -> str:
295+
parts: list[str] = []
296+
for path, name in sorted(file_entries, key = lambda e: e[1]):
297+
st = path.stat()
298+
parts.append(f"{path}|{st.st_size}|{st.st_mtime_ns}|{name}")
299+
parts.append(f"cs={chunk_size}|co={chunk_overlap}")
300+
raw = "\n".join(parts)
301+
return hashlib.sha256(raw.encode()).hexdigest()

studio/backend/plugins/data-designer-unstructured-seed/src/data_designer_unstructured_seed/config.py

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from pathlib import Path
77
from typing import Literal
88

9-
from pydantic import Field, field_validator
9+
from pydantic import Field, field_validator, model_validator
1010

1111
from data_designer.config.seed_source import SeedSource
1212

@@ -15,27 +15,37 @@
1515

1616
class UnstructuredSeedSource(SeedSource):
    """Seed source backed by one or more extracted text files on disk."""

    seed_type: Literal["unstructured"] = "unstructured"
    paths: list[str] = Field(min_length=1)
    chunk_size: int = DEFAULT_CHUNK_SIZE
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP

    @model_validator(mode="before")
    @classmethod
    def _normalize_legacy_path(cls, data):
        # Old recipes stored a single "path"; promote it into the "paths" list
        # without mutating the caller's dict.
        if isinstance(data, dict) and "paths" not in data and data.get("path"):
            data = {**data, "paths": [data["path"]]}
        return data

    @field_validator("paths")
    @classmethod
    def _validate_paths(cls, v: list[str]) -> list[str]:
        # Every entry must point at an existing file (after ~ expansion).
        for raw in v:
            candidate = Path(raw).expanduser()
            if not candidate.is_file():
                raise ValueError(f"Seed file does not exist: {candidate}")
        return v

    @field_validator("chunk_size")
    @classmethod
    def _resolve_chunk_size(cls, v: int) -> int:
        resolved, _ = resolve_chunking(v, 0)
        return resolved

    @field_validator("chunk_overlap")
    @classmethod
    def _resolve_chunk_overlap(cls, v: int, info) -> int:
        # chunk_size may be absent from info.data if its own validation failed;
        # fall back to the module default in that case.
        size = info.data.get("chunk_size", DEFAULT_CHUNK_SIZE)
        _, resolved = resolve_chunking(size, v)
        return resolved

studio/backend/plugins/data-designer-unstructured-seed/src/data_designer_unstructured_seed/impl.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import data_designer.lazy_heavy_imports as lazy
99
from data_designer.engine.resources.seed_reader import SeedReader
1010

11-
from .chunking import materialize_unstructured_seed_dataset
1211
from .config import UnstructuredSeedSource
1312

1413

@@ -17,8 +16,25 @@ def create_duckdb_connection(self):
1716
return lazy.duckdb.connect()
1817

1918
def get_dataset_uri(self) -> str:
20-
path, _ = materialize_unstructured_seed_dataset(
21-
source_path = Path(self.source.path),
19+
from .chunking import materialize_multi_file_unstructured_seed
20+
import json as json_mod
21+
22+
file_entries: list[tuple[Path, str]] = []
23+
for p in self.source.paths:
24+
path_obj = Path(p)
25+
file_id = path_obj.name.replace(".extracted.txt", "")
26+
meta_path = path_obj.parent / f"{file_id}.meta.json"
27+
orig_name = path_obj.name
28+
if meta_path.exists():
29+
try:
30+
meta = json_mod.loads(meta_path.read_text())
31+
orig_name = meta.get("original_filename", path_obj.name)
32+
except (json_mod.JSONDecodeError, OSError):
33+
pass
34+
file_entries.append((path_obj, orig_name))
35+
36+
path, _ = materialize_multi_file_unstructured_seed(
37+
file_entries = file_entries,
2238
chunk_size = self.source.chunk_size,
2339
chunk_overlap = self.source.chunk_overlap,
2440
)

studio/backend/requirements/single-env/data-designer-deps.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,6 @@ ruff<1,>=0.14.10
1717
scipy<2,>=1.11.0
1818
sqlfluff<4,>=3.2.0
1919
tiktoken<1,>=0.8.0
20+
pymupdf>=1.24.0
21+
pymupdf4llm>=0.0.17
22+
mammoth>=1.8.0

0 commit comments

Comments
 (0)