Skip to content

Commit dd283b0

Browse files
feat(studio): multi-file unstructured seed upload with better backend extraction (#4468)
* fix(recipe-studio): prevent fitView from zooming to wrong location on recipe load * feat: add pymupdf/python-docx deps and unstructured uploads storage root * feat: add POST /seed/upload-unstructured-file endpoint * feat: add multi-file chunking with source_file column * feat: update frontend types and API layer for multi-file upload * feat: round-robin preview rows across source files Ensures every uploaded file is represented in the preview table by cycling through sources instead of just taking the first N rows. * fix: disable OCR, fix auto-load timing, fix persistence on reload - Disable pymupdf4llm OCR with write_images=False, show_progress=False - Replace onAllUploaded callback with useEffect that detects uploading→done transition (avoids stale closure reading empty file IDs) - Fix importer to preserve file IDs from saved recipes instead of clearing (clearing only happens at share time via sanitizeSeedForShare) * fix: harden unstructured upload with input validation and state fixes Validate block_id/file_id with alphanumeric regex to prevent path traversal, use exact stem match for file deletion, add error handling for metadata writes and empty files, fix React stale closures and object mutations in upload loop, and correct validation logic for unstructured seed resolved_paths. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix: address PR review - legacy path import, share sanitizer, sync effect Promote legacy source.path into resolved_paths for old unstructured recipes, clear source.paths in share sanitizer to prevent leaking local filesystem paths, and gate file sync effect to dialog open transition so users can actually delete all uploaded files. 
* fix: CSV column fix (BOM + whitespace + unnamed index re-save) for #4470 * fix: harden unstructured upload flow and polish dialog UX * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent f113f35 commit dd283b0

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

49 files changed

+1217
-316
lines changed

studio/backend/models/data_recipe.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from typing import Any
1111

12-
from pydantic import BaseModel, Field
12+
from pydantic import BaseModel, Field, model_validator
1313

1414

1515
class RecipePayload(BaseModel):
@@ -76,13 +76,41 @@ class SeedInspectRequest(BaseModel):
7676

7777

7878
class SeedInspectUploadRequest(BaseModel):
    """Request to inspect an uploaded seed dataset.

    Exactly one of two flows must be supplied:
    - legacy single-file: ``filename`` + ``content_base64``
    - multi-file: ``block_id`` + ``file_ids`` + ``file_names``
    """

    # Legacy single-file flow (mutually exclusive with file_ids)
    filename: str | None = None
    content_base64: str | None = None
    # Multi-file flow (mutually exclusive with content_base64)
    block_id: str | None = None
    file_ids: list[str] | None = None
    file_names: list[str] | None = None
    # Shared fields
    preview_size: int = Field(default=10, ge=1, le=50)
    seed_source_type: str | None = None
    unstructured_chunk_size: int | None = Field(default=None, ge=1, le=20000)
    unstructured_chunk_overlap: int | None = Field(default=None, ge=0, le=20000)

    @model_validator(mode="after")
    def _check_mutual_exclusivity(self) -> "SeedInspectUploadRequest":
        """Reject requests that mix (or omit) the legacy and multi-file flows."""
        uses_legacy = self.content_base64 is not None
        uses_multi = self.file_ids is not None
        if uses_legacy and uses_multi:
            raise ValueError("Provide either content_base64 or file_ids, not both")
        if not (uses_legacy or uses_multi):
            raise ValueError("Provide either content_base64 or file_ids")
        if uses_multi:
            # file_ids is non-None here, so truthiness == non-empty.
            if not self.file_ids:
                raise ValueError("file_ids must not be empty")
            if not self.block_id:
                raise ValueError("block_id is required when using file_ids")
            if self.file_names is None or len(self.file_names) != len(self.file_ids):
                raise ValueError(
                    "file_names must be provided and same length as file_ids"
                )
        if uses_legacy and not self.filename:
            raise ValueError("filename is required when using content_base64")
        return self
113+
86114

87115
class SeedInspectResponse(BaseModel):
88116
dataset_name: str
@@ -91,6 +119,15 @@ class SeedInspectResponse(BaseModel):
91119
preview_rows: list[dict[str, Any]] = Field(default_factory = list)
92120
split: str | None = None
93121
subset: str | None = None
122+
resolved_paths: list[str] | None = None
123+
124+
125+
class UnstructuredFileUploadResponse(BaseModel):
    """Per-file result returned by the unstructured seed upload endpoint."""

    file_id: str
    filename: str
    size_bytes: int
    # "ok" on successful extraction, "error" otherwise (details in ``error``).
    status: str
    error: str | None = None
94131

95132

96133
class McpToolsListRequest(BaseModel):

studio/backend/plugins/data-designer-unstructured-seed/pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ requires-python = ">=3.11"
1313
dependencies = [
1414
"data-designer-engine>=0.5.1,<0.6",
1515
"pandas>=2,<3",
16+
"pymupdf>=1.24.0",
17+
"pymupdf4llm>=0.0.17",
18+
"mammoth>=1.8.0",
1619
]
1720

1821
[project.entry-points."data_designer.plugins"]

studio/backend/plugins/data-designer-unstructured-seed/src/data_designer_unstructured_seed/chunking.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from pathlib import Path
99
from typing import Any
1010

11+
import pandas as pd
12+
1113
from utils.paths import ensure_dir, unstructured_seed_cache_root
1214

1315
DEFAULT_CHUNK_SIZE = 1200
@@ -59,6 +61,59 @@ def build_unstructured_preview_rows(
5961
]
6062

6163

64+
def build_multi_file_preview_rows(
    *,
    file_entries: list[tuple[Path, str]],
    preview_size: int,
    chunk_size: int | None,
    chunk_overlap: int | None,
) -> list[dict[str, str]]:
    """Chunk every uploaded file and return up to ``preview_size`` preview rows.

    Rows are sampled round-robin across source files so each uploaded file
    shows up in the preview table. ``None`` chunking params fall back to the
    module defaults.
    """
    resolved_size = _to_int(chunk_size, DEFAULT_CHUNK_SIZE)
    resolved_overlap = _to_int(chunk_overlap, DEFAULT_CHUNK_OVERLAP)
    _, all_rows = materialize_multi_file_unstructured_seed(
        file_entries=file_entries,
        chunk_size=resolved_size,
        chunk_overlap=resolved_overlap,
    )
    return _round_robin_preview(all_rows, preview_size)
79+
80+
81+
def _round_robin_preview(
82+
rows: list[dict[str, str]],
83+
preview_size: int,
84+
) -> list[dict[str, str]]:
85+
"""Pick preview rows round-robin across source files so every file is represented."""
86+
if not rows or preview_size <= 0:
87+
return []
88+
89+
# Group rows by source_file, preserving order of first appearance
90+
from collections import OrderedDict
91+
92+
grouped: OrderedDict[str, list[dict[str, str]]] = OrderedDict()
93+
for row in rows:
94+
key = row.get("source_file", "")
95+
if key not in grouped:
96+
grouped[key] = []
97+
grouped[key].append(row)
98+
99+
result: list[dict[str, str]] = []
100+
iterators = [iter(chunks) for chunks in grouped.values()]
101+
while len(result) < preview_size and iterators:
102+
exhausted: list[int] = []
103+
for i, it in enumerate(iterators):
104+
if len(result) >= preview_size:
105+
break
106+
val = next(it, None)
107+
if val is not None:
108+
result.append(val)
109+
else:
110+
exhausted.append(i)
111+
for i in reversed(exhausted):
112+
iterators.pop(i)
113+
114+
return result
115+
116+
62117
def materialize_unstructured_seed_dataset(
63118
*,
64119
source_path: Path,
@@ -103,6 +158,43 @@ def materialize_unstructured_seed_dataset(
103158
return parquet_path, rows
104159

105160

161+
def materialize_multi_file_unstructured_seed(
    *,
    file_entries: list[tuple[Path, str]],  # (extracted_txt_path, original_filename)
    chunk_size: int,
    chunk_overlap: int,
) -> tuple[Path, list[dict[str, str]]]:
    """Chunk multiple files and combine into one parquet dataset with source_file column.

    Returns the parquet path and the row dicts. Results are cached on disk,
    keyed by file identity and the (resolved) chunking parameters.
    """
    chunk_size, chunk_overlap = resolve_chunking(chunk_size, chunk_overlap)
    cache_key = _compute_multi_file_cache_key(file_entries, chunk_size, chunk_overlap)
    parquet_path = _CACHE_DIR / f"{cache_key}.parquet"

    # Cache hit: reload the rows from the previously materialized parquet.
    if parquet_path.exists():
        cached_df = pd.read_parquet(parquet_path)
        return parquet_path, cached_df.to_dict(orient="records")

    all_rows: list[dict[str, str]] = [
        {"chunk_text": chunk, "source_file": original_name}
        for txt_path, original_name in file_entries
        for chunk in split_text_into_chunks(
            text=load_unstructured_text_file(txt_path),
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
    ]
    if not all_rows:
        raise ValueError("No text found in any uploaded files.")

    ensure_dir(_CACHE_DIR)
    # Write to a temp file, then rename, so readers never see a partial parquet.
    tmp_path = _CACHE_DIR / f"{cache_key}.tmp.parquet"
    pd.DataFrame(all_rows).to_parquet(tmp_path, index=False)
    tmp_path.replace(parquet_path)
    return parquet_path, all_rows
196+
197+
106198
def load_unstructured_text_file(path: Path) -> str:
107199
ext = path.suffix.lower()
108200
if ext not in {".txt", ".md"}:
@@ -193,3 +285,17 @@ def _compute_cache_key(
193285
]
194286
).encode("utf-8")
195287
return hashlib.sha256(payload).hexdigest()
288+
289+
290+
def _compute_multi_file_cache_key(
291+
file_entries: list[tuple[Path, str]],
292+
chunk_size: int,
293+
chunk_overlap: int,
294+
) -> str:
295+
parts: list[str] = []
296+
for path, name in sorted(file_entries, key = lambda e: e[1]):
297+
st = path.stat()
298+
parts.append(f"{path}|{st.st_size}|{st.st_mtime_ns}|{name}")
299+
parts.append(f"cs={chunk_size}|co={chunk_overlap}")
300+
raw = "\n".join(parts)
301+
return hashlib.sha256(raw.encode()).hexdigest()

studio/backend/plugins/data-designer-unstructured-seed/src/data_designer_unstructured_seed/config.py

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from pathlib import Path
77
from typing import Literal
88

9-
from pydantic import Field, field_validator
9+
from pydantic import Field, field_validator, model_validator
1010

1111
from data_designer.config.seed_source import SeedSource
1212

@@ -15,27 +15,37 @@
1515

1616
class UnstructuredSeedSource(SeedSource):
    """Seed source backed by one or more extracted text files on disk."""

    seed_type: Literal["unstructured"] = "unstructured"
    paths: list[str] = Field(min_length=1)
    chunk_size: int = DEFAULT_CHUNK_SIZE
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP

    @model_validator(mode="before")
    @classmethod
    def _normalize_legacy_path(cls, data):
        # Old recipes stored a single "path"; promote it into the "paths" list
        # without mutating the caller's dict.
        if isinstance(data, dict) and "paths" not in data and data.get("path"):
            data = {**data, "paths": [data["path"]]}
        return data

    @field_validator("paths")
    @classmethod
    def _validate_paths(cls, v: list[str]) -> list[str]:
        # Every entry must point at an existing file (after ~ expansion).
        for raw in v:
            candidate = Path(raw).expanduser()
            if not candidate.is_file():
                raise ValueError(f"Seed file does not exist: {candidate}")
        return v

    @field_validator("chunk_size")
    @classmethod
    def _resolve_chunk_size(cls, v: int) -> int:
        resolved, _ = resolve_chunking(v, 0)
        return resolved

    @field_validator("chunk_overlap")
    @classmethod
    def _resolve_chunk_overlap(cls, v: int, info) -> int:
        # chunk_size may be absent from info.data if its own validation failed;
        # fall back to the module default in that case.
        size = info.data.get("chunk_size", DEFAULT_CHUNK_SIZE)
        _, resolved = resolve_chunking(size, v)
        return resolved

studio/backend/plugins/data-designer-unstructured-seed/src/data_designer_unstructured_seed/impl.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import data_designer.lazy_heavy_imports as lazy
99
from data_designer.engine.resources.seed_reader import SeedReader
1010

11-
from .chunking import materialize_unstructured_seed_dataset
1211
from .config import UnstructuredSeedSource
1312

1413

@@ -17,8 +16,25 @@ def create_duckdb_connection(self):
1716
return lazy.duckdb.connect()
1817

1918
def get_dataset_uri(self) -> str:
20-
path, _ = materialize_unstructured_seed_dataset(
21-
source_path = Path(self.source.path),
19+
from .chunking import materialize_multi_file_unstructured_seed
20+
import json as json_mod
21+
22+
file_entries: list[tuple[Path, str]] = []
23+
for p in self.source.paths:
24+
path_obj = Path(p)
25+
file_id = path_obj.name.replace(".extracted.txt", "")
26+
meta_path = path_obj.parent / f"{file_id}.meta.json"
27+
orig_name = path_obj.name
28+
if meta_path.exists():
29+
try:
30+
meta = json_mod.loads(meta_path.read_text())
31+
orig_name = meta.get("original_filename", path_obj.name)
32+
except (json_mod.JSONDecodeError, OSError):
33+
pass
34+
file_entries.append((path_obj, orig_name))
35+
36+
path, _ = materialize_multi_file_unstructured_seed(
37+
file_entries = file_entries,
2238
chunk_size = self.source.chunk_size,
2339
chunk_overlap = self.source.chunk_overlap,
2440
)

studio/backend/requirements/single-env/data-designer-deps.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,6 @@ ruff<1,>=0.14.10
1717
scipy<2,>=1.11.0
1818
sqlfluff<4,>=3.2.0
1919
tiktoken<1,>=0.8.0
20+
pymupdf>=1.24.0
21+
pymupdf4llm>=0.0.17
22+
mammoth>=1.8.0

0 commit comments

Comments
 (0)