Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions codebase_rag/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -883,9 +883,16 @@ class EventType(StrEnum):
CYPHER_DELETE_CALLS = "MATCH ()-[r:CALLS]->() DELETE r"

# (H) Queries for orphan pruning — returns all paths stored in the graph
CYPHER_ALL_FILE_PATHS = "MATCH (f:File) RETURN f.path AS path"
CYPHER_ALL_MODULE_PATHS = "MATCH (m:Module) RETURN m.path AS path"
CYPHER_ALL_FOLDER_PATHS = "MATCH (f:Folder) RETURN f.path AS path"
# Every File node: its `path` and `absolute_path` properties.
CYPHER_ALL_FILE_PATHS = (
    "MATCH (f:File) "
    "RETURN f.path AS path, f.absolute_path AS absolute_path"
)
# Module nodes whose `is_external` is NULL or false: `path` and
# `qualified_name` properties.
CYPHER_ALL_MODULE_PATHS_INTERNAL = (
    "MATCH (m:Module) "
    "WHERE m.is_external IS NULL OR m.is_external = false "
    "RETURN m.path AS path, m.qualified_name AS qualified_name"
)
# Every Folder node: its `path` and `absolute_path` properties.
CYPHER_ALL_FOLDER_PATHS = (
    "MATCH (f:Folder) "
    "RETURN f.path AS path, f.absolute_path AS absolute_path"
)

REALTIME_LOGGER_FORMAT = (
"<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | "
Expand Down
27 changes: 20 additions & 7 deletions codebase_rag/graph_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,7 @@
file_path, root_node, language, self.queries
)

def _prune_orphan_nodes(self) -> None:

Check failure on line 490 in codebase_rag/graph_updater.py

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Refactor this function to reduce its Cognitive Complexity from 26 to the 15 allowed.

See more on https://sonarcloud.io/project/issues?id=vitali87_code-graph-rag&issues=AZ0c6TzKiE7KP8YCXxc4&open=AZ0c6TzKiE7KP8YCXxc4&pullRequest=485
"""Remove graph nodes whose files/folders no longer exist on disk."""
if not isinstance(self.ingestor, QueryProtocol):
return
Expand All @@ -495,20 +495,33 @@
logger.info(ls.PRUNE_START)
total_pruned = 0

project_prefix = self.project_name + "."
repo_abs = self.repo_path.resolve().as_posix()
prune_specs: list[tuple[str, str, str]] = [
(cs.CYPHER_ALL_FILE_PATHS, cs.CYPHER_DELETE_FILE, "File"),
(cs.CYPHER_ALL_MODULE_PATHS, cs.CYPHER_DELETE_MODULE, "Module"),
(
cs.CYPHER_ALL_MODULE_PATHS_INTERNAL,
cs.CYPHER_DELETE_MODULE,
"Module",
),
(cs.CYPHER_ALL_FOLDER_PATHS, cs.CYPHER_DELETE_FOLDER, "Folder"),
]

for query_all, delete_query, label in prune_specs:
rows = self.ingestor.fetch_all(query_all)
orphans = [
r["path"]
for r in rows
if r.get("path")
and not (self.repo_path / r["path"]).exists()
]
orphans = []
for r in rows:
path = r.get("path")
if not isinstance(path, str) or not path:
continue
abs_path = r.get("absolute_path")
qn = r.get("qualified_name", "")
if isinstance(abs_path, str) and not abs_path.startswith(repo_abs):
continue
if isinstance(qn, str) and qn and not qn.startswith(project_prefix):
continue
if not (self.repo_path / path).exists():
orphans.append(path)
Comment on lines 500 to +524
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

This change removes the pruning logic for orphan File and Folder nodes, restricting it to only Module nodes. While the hash-cache mechanism handles deleted files, orphan Folder nodes (e.g., from a deleted empty directory) will no longer be removed from the graph. This can lead to stale data accumulating over time.

Consider reintroducing pruning for Folder nodes. The new orphan detection logic, which checks for a project-prefixed qualified_name, is specific to modules, so you'll need to adapt it for folders.

        prune_specs: list[tuple[str, str, str]] = [
            (
                cs.CYPHER_ALL_MODULE_PATHS_INTERNAL,
                cs.CYPHER_DELETE_MODULE,
                "Module",
            ),
            (
                cs.CYPHER_ALL_FOLDER_PATHS,
                cs.CYPHER_DELETE_FOLDER,
                "Folder",
            ),
        ]

        for query_all, delete_query, label in prune_specs:
            rows = self.ingestor.fetch_all(query_all)
            orphans = []
            for r in rows:
                path = r.get("path")
                if not isinstance(path, str) or not path:
                    continue

                # The project prefix check is only applicable to Modules
                if label == "Module":
                    qn = r.get("qualified_name", "")
                    if isinstance(qn, str) and qn and not qn.startswith(project_prefix):
                        continue

                if not (self.repo_path / path).exists():
                    orphans.append(path)


if orphans:
logger.info(ls.PRUNE_FOUND, count=len(orphans), label=label)
Expand Down
Loading
Loading