Skip to content

Commit a07cd9c

Browse files
authored
Explicit configuration of schema extraction (#374)
* Explicit configuration of schema extraction * Renaming
1 parent ddc78ce commit a07cd9c

File tree

5 files changed

+43
-8
lines changed

5 files changed

+43
-8
lines changed

docs/source/user_guide_kg_builder.rst

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,19 @@ This schema information can be provided to the `SimpleKGBuilder` as demonstrated
131131
# ...
132132
)
133133
134-
.. note::
135-
By default, if no schema is provided to the SimpleKGPipeline, automatic schema extraction will be performed using the LLM (See the :ref:`Automatic Schema Extraction`).
134+
135+
Schema Parameter Behavior
136+
-------------------------
137+
138+
The `schema` parameter controls how entity and relation extraction is performed:
139+
140+
* **EXTRACTED**: ``schema="EXTRACTED"`` or ``schema=None`` (the default value)
141+
The schema is automatically extracted from the input text once, using the LLM. This guiding schema is then used to structure entity and relation extraction for all chunks, which guarantees that all chunks have the same guiding schema.
142+
(See :ref:`Automatic Schema Extraction`)
143+
144+
* **FREE**: ``schema="FREE"`` or empty schema (``{"node_types": ()}``)
145+
No schema extraction is performed. Entity and relation extraction proceeds without a predefined or derived schema, resulting in unguided extraction. Use this to bypass automatic schema extraction.
146+
136147

137148
Extra configurations
138149
--------------------

src/neo4j_graphrag/experimental/components/schema.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,10 @@ def node_type_from_label(self, label: str) -> Optional[NodeType]:
226226
def relationship_type_from_label(self, label: str) -> Optional[RelationshipType]:
227227
return self._relationship_type_index.get(label)
228228

229+
@classmethod
230+
def create_empty(cls) -> Self:
231+
return cls(node_types=tuple())
232+
229233
def save(
230234
self,
231235
file_path: Union[str, Path],

src/neo4j_graphrag/experimental/pipeline/config/template_pipeline/simple_kg_builder.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,9 @@
2222
Sequence,
2323
Union,
2424
)
25-
import logging
2625
import warnings
2726

28-
from pydantic import ConfigDict, Field, model_validator
27+
from pydantic import ConfigDict, Field, model_validator, field_validator
2928
from typing_extensions import Self
3029

3130
from neo4j_graphrag.experimental.components.embedder import TextChunkEmbedder
@@ -66,8 +65,6 @@
6665
)
6766
from neo4j_graphrag.generation.prompts import ERExtractionTemplate
6867

69-
logger = logging.getLogger(__name__)
70-
7168

7269
class SimpleKGPipelineConfig(TemplatePipelineConfig):
7370
COMPONENTS: ClassVar[list[str]] = [
@@ -102,6 +99,15 @@ class SimpleKGPipelineConfig(TemplatePipelineConfig):
10299

103100
model_config = ConfigDict(arbitrary_types_allowed=True)
104101

102+
@field_validator("schema_", mode="before")
103+
@classmethod
104+
def validate_schema_literal(cls, v: Any) -> Any:
105+
if v == "FREE": # same as "empty" schema, no guiding schema
106+
return GraphSchema.create_empty()
107+
if v == "EXTRACTED": # same as no schema, schema will be extracted by LLM
108+
return None
109+
return v
110+
105111
@model_validator(mode="after")
106112
def handle_schema_precedence(self) -> Self:
107113
"""Handle schema precedence and warnings"""

src/neo4j_graphrag/experimental/pipeline/kg_builder.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
from __future__ import annotations
1717

18-
from typing import List, Optional, Sequence, Union, Any
18+
from typing import List, Optional, Sequence, Union, Any, Literal
1919
import logging
2020

2121
import neo4j
@@ -99,7 +99,13 @@ def __init__(
9999
entities: Optional[Sequence[EntityInputType]] = None,
100100
relations: Optional[Sequence[RelationInputType]] = None,
101101
potential_schema: Optional[List[tuple[str, str, str]]] = None,
102-
schema: Optional[Union[GraphSchema, dict[str, list[Any]]]] = None,
102+
schema: Optional[
103+
Union[
104+
GraphSchema,
105+
dict[str, list[Any]],
106+
Literal["FREE", "EXTRACTED"],
107+
],
108+
] = None,
103109
from_pdf: bool = True,
104110
text_splitter: Optional[TextSplitter] = None,
105111
pdf_loader: Optional[DataLoader] = None,

tests/unit/experimental/pipeline/config/template_pipeline/test_simple_kg_builder.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,14 @@ def test_simple_kg_pipeline_config_manual_schema() -> None:
138138
assert isinstance(config._get_schema(), SchemaBuilder)
139139

140140

141+
def test_simple_kg_pipeline_config_literal_schema_validation() -> None:
142+
config = SimpleKGPipelineConfig(schema="FREE") # type: ignore
143+
assert config.schema_ == GraphSchema.create_empty()
144+
145+
config = SimpleKGPipelineConfig(schema="EXTRACTED") # type: ignore
146+
assert config.schema_ is None
147+
148+
141149
def test_simple_kg_pipeline_config_schema_run_params() -> None:
142150
config = SimpleKGPipelineConfig(
143151
entities=["Person"],

0 commit comments

Comments
 (0)