Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,23 @@ runs, but application code should prefer importing `gather(...)` directly.
```python
import syft_ingest

report = syft_ingest.ingest_jsonl(
"./output/creator_social_posts.jsonl",
destination=syft_ingest.QdrantDestination(
collection_name="katy-stevens",
url="http://127.0.0.1:6333",
),
embedding=syft_ingest.EmbeddingSpec(
backend="fastembed",
model="BAAI/bge-small-en-v1.5",
),
)
```

By default, `ingest_jsonl(...)` does not chunk documents. Each input record is
ingested as a single vector point unless you explicitly opt into chunking:

```python
report = syft_ingest.ingest_jsonl(
"./output/creator_social_posts.jsonl",
destination=syft_ingest.QdrantDestination(
Expand Down
13 changes: 8 additions & 5 deletions syft_ingest/core/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,16 +381,19 @@ def _enrich_text(doc: _Doc) -> str:
return f"{header}{desc_line}\n\n{doc.text}"


def to_rag(docs: list[_Doc], chunking: ChunkingSpec) -> list[_Doc]:
def to_rag(docs: list[_Doc], chunking: ChunkingSpec | None) -> list[_Doc]:
"""Chunk documents for RAG ingestion.

This is the single place where chunking happens, per architecture rule:
'to_rag() chunks. export() does NOT chunk.'

When ``chunking`` is ``None``, chunking is disabled and each input document
is emitted as a single RAG document.
"""
chunked: list[_Doc] = []
for doc in docs:
enriched = _enrich_text(doc)
chunks = _chunk_text(enriched, chunking)
chunks = [enriched] if chunking is None else _chunk_text(enriched, chunking)
raw_hash = hashlib.sha256(doc.text.encode("utf-8")).hexdigest()
source_id = doc.payload.get("url") or doc.payload.get("title") or "doc"
stable_doc_key = f"{source_id}::{doc.payload.get('source', '')}::{raw_hash}"
Expand Down Expand Up @@ -554,7 +557,7 @@ def _ingest_docs(
*,
destination: QdrantDestination,
embedding: EmbeddingSpec,
chunking: ChunkingSpec,
chunking: ChunkingSpec | None,
) -> IngestReport:
if not docs:
raise NoDocumentsError("No documents available for ingestion")
Expand Down Expand Up @@ -633,7 +636,7 @@ def ingest_jsonl(
_iter_docs_from_jsonl(manifest_jsonl),
destination=destination,
embedding=embedding or EmbeddingSpec(),
chunking=chunking or ChunkingSpec(),
chunking=chunking,
)


Expand All @@ -648,5 +651,5 @@ def ingest_corpus(
_iter_docs_from_corpus(corpus),
destination=destination,
embedding=embedding or EmbeddingSpec(),
chunking=chunking or ChunkingSpec(),
chunking=chunking,
)
29 changes: 29 additions & 0 deletions tests/test_ingest_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,35 @@ def test_ingest_jsonl_upserts_points(tmp_path, patch_ingest_runtime):
assert report.embedding_contract["embedding_backend"] == "fastembed"


def test_ingest_jsonl_defaults_to_no_chunking(tmp_path, patch_ingest_runtime):
    """Omitting ``chunking`` ingests each JSONL record as exactly one point."""
    record = {
        "title": "Long post",
        "author": "Katy Stevens",
        "url": "https://www.instagram.com/p/long123/",
        "text": "word " * 600,
        "site": "instagram.com",
        "source_type": "social_media_post",
        "metadata": {"platform": "instagram", "extractor": "brightdata"},
    }
    manifest_path = tmp_path / "manifest.jsonl"
    manifest_path.write_text(json.dumps(record) + "\n", encoding="utf-8")

    report = ingest_jsonl(
        manifest_path,
        destination=QdrantDestination(collection_name="katy-stevens-default"),
        embedding=EmbeddingSpec(),
    )

    # One input record should yield one document, one chunk, one upserted point.
    assert report.documents_total == 1
    assert report.chunks_total == 1
    assert len(report.point_ids) == 1


def test_ingest_corpus_supports_source_spec_metadata(tmp_path, patch_ingest_runtime):
brightdata_dir = tmp_path / "brightdata-ig"
brightdata_dir.mkdir(parents=True, exist_ok=True)
Expand Down
16 changes: 16 additions & 0 deletions tests/test_ingest_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,22 @@ def test_chunk_text_empty_returns_empty():
assert _chunk_text("", spec) == []


def test_to_rag_without_chunking_emits_single_doc():
    """``to_rag`` with ``chunking=None`` passes each doc through as one chunk."""
    source_doc = _Doc(
        text="Hello world, this is a test document with enough text.",
        payload={"url": "https://example.com/page1", "source": "test"},
    )

    rag_docs = to_rag([source_doc], None)

    assert len(rag_docs) == 1
    out_payload = rag_docs[0].payload
    # A pass-through doc is chunk 0 of 1 and keeps the original raw text.
    assert out_payload["chunk_index"] == 0
    assert out_payload["chunk_count"] == 1
    assert out_payload["raw_text"] == source_doc.text
    assert rag_docs[0].text


# ---------------------------------------------------------------------------
# 7. test_gather_unsupported_source_spec_kind
# ---------------------------------------------------------------------------
Expand Down