Skip to content

Commit 4541de2

Browse files
authored
Create Document node even from text input (#413)
* Create Document node even from text input * Update doc * Ruff * Save document_type * Fix type * Reuse file_path instead of introducing another document_path parameter * Address comments * Fix CI
1 parent b34419b commit 4541de2

File tree

12 files changed

+107
-50
lines changed

12 files changed

+107
-50
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@
1515
- Fixed an edge case where the LLM can output a property with type 'map', which was causing errors during import as it is not a valid property type in Neo4j.
1616

1717

18+
### Added
19+
20+
- Document node is now always created when running SimpleKGPipeline, even if `from_pdf=False`.
21+
- Document metadata is exposed in SimpleKGPipeline run method.
22+
23+
1824
## 1.9.1
1925

2026
### Fixed

docs/source/user_guide_kg_builder.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,15 @@ chunk overlap in the text splitter component:
219219
)
220220
221221
222+
Run Parameters
223+
--------------
224+
225+
SimpleKGPipeline also accepts additional runtime parameters:
226+
227+
- ``document_metadata`` (dict): each item will be saved as a property attached to the ``Document`` node.
228+
229+
230+
222231
Using a Config file
223232
===================
224233

examples/build_graph/simple_kg_builder_from_pdf.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,12 @@ async def define_and_run_pipeline(
5454
},
5555
neo4j_database=DATABASE,
5656
)
57-
return await kg_builder.run_async(file_path=str(file_path))
57+
return await kg_builder.run_async(
58+
file_path=str(file_path),
59+
# optional, add document metadata, each item will
60+
# be saved as a property of the Document node
61+
# document_metadata={"author": "J. K. Rowling"},
62+
)
5863

5964

6065
async def main() -> PipelineResult:

examples/build_graph/simple_kg_builder_from_text.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,15 @@ async def define_and_run_pipeline(
7979
from_pdf=False,
8080
neo4j_database=DATABASE,
8181
)
82-
return await kg_builder.run_async(text=TEXT)
82+
return await kg_builder.run_async(
83+
text=TEXT,
84+
# optional, specify file path for the Document node
85+
# if not, the path defaults to "document.txt"
86+
# file_path="my_document.txt",
87+
# optional, add document metadata, each item will
88+
# be saved as a property of the Document node
89+
# document_metadata={"author": "Frank Herbert"},
90+
)
8391

8492

8593
async def main() -> PipelineResult:

src/neo4j_graphrag/experimental/components/lexical_graph.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ def create_document_node(self, document_info: DocumentInfo) -> Neo4jNode:
113113
properties={
114114
"path": document_info.path,
115115
"createdAt": datetime.datetime.now(datetime.timezone.utc).isoformat(),
116+
"document_type": document_info.document_type,
116117
**document_metadata,
117118
},
118119
)

src/neo4j_graphrag/experimental/components/pdf_loader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,5 +89,6 @@ async def run(
8989
document_info=DocumentInfo(
9090
path=filepath,
9191
metadata=self.get_document_metadata(text, metadata),
92+
document_type="pdf",
9293
),
9394
)

src/neo4j_graphrag/experimental/components/types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ class DocumentInfo(DataModel):
3838
path: str
3939
metadata: Optional[Dict[str, str]] = None
4040
uid: str = Field(default_factory=lambda: str(uuid.uuid4()))
41+
document_type: Optional[str] = None
4142

4243
@property
4344
def document_id(self) -> str:

src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
# limitations under the License.
1515
from __future__ import annotations
1616

17+
from collections import defaultdict
1718
from typing import (
1819
Any,
1920
ClassVar,
@@ -336,37 +337,40 @@ def _get_connections(self) -> list[ConnectionDefinition]:
336337
return connections
337338

338339
def get_run_params(self, user_input: dict[str, Any]) -> dict[str, Any]:
339-
run_params = {}
340-
if self.lexical_graph_config:
341-
run_params["extractor"] = {
342-
"lexical_graph_config": self.lexical_graph_config,
343-
}
344-
run_params["writer"] = {
345-
"lexical_graph_config": self.lexical_graph_config,
346-
}
347-
run_params["pruner"] = {
348-
"lexical_graph_config": self.lexical_graph_config,
349-
}
350340
text = user_input.get("text")
351341
file_path = user_input.get("file_path")
352-
if not ((text is None) ^ (file_path is None)):
353-
# exactly one of text or user_input must be set
342+
if text is None and file_path is None:
343+
# user must provide either text or file_path or both
354344
raise PipelineDefinitionError(
355-
"Use either 'text' (when from_pdf=False) or 'file_path' (when from_pdf=True) argument."
345+
"At least one of `text` (when from_pdf=False) or `file_path` (when from_pdf=True) argument must be provided."
356346
)
347+
run_params: dict[str, dict[str, Any]] = defaultdict(dict)
348+
if self.lexical_graph_config:
349+
run_params["extractor"]["lexical_graph_config"] = self.lexical_graph_config
350+
run_params["writer"]["lexical_graph_config"] = self.lexical_graph_config
351+
run_params["pruner"]["lexical_graph_config"] = self.lexical_graph_config
357352
if self.from_pdf:
358353
if not file_path:
359354
raise PipelineDefinitionError(
360355
"Expected 'file_path' argument when 'from_pdf' is True."
361356
)
362-
run_params["pdf_loader"] = {"filepath": file_path}
357+
run_params["pdf_loader"]["filepath"] = file_path
358+
run_params["pdf_loader"]["metadata"] = user_input.get("document_metadata")
363359
else:
364360
if not text:
365361
raise PipelineDefinitionError(
366362
"Expected 'text' argument when 'from_pdf' is False."
367363
)
368-
run_params["splitter"] = {"text": text}
364+
run_params["splitter"]["text"] = text
369365
# Add full text to schema component for automatic schema extraction
370366
if not self.has_user_provided_schema():
371-
run_params["schema"] = {"text": text}
367+
run_params["schema"]["text"] = text
368+
run_params["extractor"]["document_info"] = dict(
369+
path=user_input.get(
370+
"file_path",
371+
)
372+
or "document.txt",
373+
metadata=user_input.get("document_metadata"),
374+
document_type="inline_text",
375+
)
372376
return run_params

src/neo4j_graphrag/experimental/pipeline/kg_builder.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -145,16 +145,26 @@ def __init__(
145145
self.runner = PipelineRunner.from_config(config)
146146

147147
async def run_async(
148-
self, file_path: Optional[str] = None, text: Optional[str] = None
148+
self,
149+
file_path: Optional[str] = None,
150+
text: Optional[str] = None,
151+
document_metadata: Optional[dict[str, Any]] = None,
149152
) -> PipelineResult:
150153
"""
151154
Asynchronously runs the knowledge graph building process.
152155
153156
Args:
154-
file_path (Optional[str]): The path to the PDF file to process. Required if `from_pdf` is True.
157+
file_path (Optional[str]): The path to the PDF file to process. Required if `from_pdf` is True. If `from_pdf` is False, can be used to set the Document node path property.
155158
text (Optional[str]): The text content to process. Required if `from_pdf` is False.
159+
document_metadata (Optional[dict[str, Any]]): The metadata to attach to the document.
156160
157161
Returns:
158162
PipelineResult: The result of the pipeline execution.
159163
"""
160-
return await self.runner.run({"file_path": file_path, "text": text})
164+
return await self.runner.run(
165+
{
166+
"file_path": file_path,
167+
"text": text,
168+
"document_metadata": document_metadata,
169+
}
170+
)

tests/unit/experimental/components/test_lexical_graph_builder.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,11 @@ async def test_lexical_graph_builder_run_with_document() -> None:
7878
TextChunk(text="text chunk 1", index=1),
7979
]
8080
),
81-
document_info=DocumentInfo(path="test_lexical_graph", uid=doc_uid),
81+
document_info=DocumentInfo(
82+
path="test_lexical_graph",
83+
uid=doc_uid,
84+
document_type="my_type",
85+
),
8286
)
8387
assert isinstance(result, GraphResult)
8488
graph = result.graph
@@ -89,6 +93,7 @@ async def test_lexical_graph_builder_run_with_document() -> None:
8993
assert document.label == DEFAULT_DOCUMENT_NODE_LABEL
9094
assert document.properties["path"] == "test_lexical_graph"
9195
assert document.properties["createdAt"] is not None
96+
assert document.properties["document_type"] == "my_type"
9297
chunk1 = nodes[1]
9398
assert chunk1.label == DEFAULT_CHUNK_NODE_LABEL
9499
chunk2 = nodes[2]

0 commit comments

Comments
 (0)