From ab6eb5f5123e3bc7c84d09143689d41894455276 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Thu, 24 Jul 2025 16:32:38 +0200 Subject: [PATCH 01/17] test --- src/gitingest/entrypoint.py | 35 ++++++++++------- src/gitingest/ingestion.py | 15 ++++---- src/gitingest/schemas/__init__.py | 4 +- src/gitingest/schemas/filesystem.py | 58 +++++++++++++++++++++++++++++ src/server/query_processor.py | 6 ++- 5 files changed, 93 insertions(+), 25 deletions(-) diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index f6b5c8c8..eb6b6e22 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -27,6 +27,7 @@ from types import TracebackType from gitingest.schemas import IngestionQuery + from gitingest.schemas import Context # Initialize logger for this module logger = get_logger(__name__) @@ -51,6 +52,8 @@ async def ingest_async( and processes its files according to the specified query parameters. It returns a summary, a tree-like structure of the files, and the content of the files. The results can optionally be written to an output file. + The output is generated lazily using a Context object and its .generate_digest() method. + Parameters ---------- source : str @@ -142,6 +145,8 @@ async def ingest_async( if output: logger.debug("Writing output to file", extra={"output_path": output}) + context = ingest_query(query) + summary, tree, content = context.generate_digest() await _write_output(tree, content=content, target=output) logger.info("Ingestion completed successfully") @@ -167,6 +172,8 @@ def ingest( and processes its files according to the specified query parameters. It returns a summary, a tree-like structure of the files, and the content of the files. The results can optionally be written to an output file. + The output is generated lazily using a Context object and its .generate_digest() method. + Parameters ---------- source : str @@ -206,20 +213,20 @@ def ingest( ``ingest_async`` : The asynchronous version of this function. """ - return asyncio.run( - ingest_async( - source=source, - max_file_size=max_file_size, - include_patterns=include_patterns, - exclude_patterns=exclude_patterns, - branch=branch, - tag=tag, - include_gitignored=include_gitignored, - include_submodules=include_submodules, - token=token, - output=output, - ), - ) + import asyncio + context = asyncio.run(ingest_async( + source, + max_file_size=max_file_size, + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + branch=branch, + tag=tag, + include_gitignored=include_gitignored, + include_submodules=include_submodules, + token=token, + output=output, + )) + return context.generate_digest() def _override_branch_and_tag(query: IngestionQuery, branch: str | None, tag: str | None) -> None: diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 01a2c8f3..836391d4 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -7,7 +7,7 @@ from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES from gitingest.output_formatter import format_node -from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats +from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats, Context from gitingest.utils.ingestion_utils import _should_exclude, _should_include from gitingest.utils.logging_config import get_logger @@ -18,12 +18,11 @@ logger = get_logger(__name__) -def ingest_query(query: IngestionQuery) -> tuple[str, str, str]: +def ingest_query(query: IngestionQuery) -> Context: """Run the ingestion process for a parsed query. This is the main entry point for analyzing a codebase directory or single file. It processes the query - parameters, reads the file or directory content, and generates a summary, directory structure, and file content, - along with token estimations. + parameters, reads the file or directory content, and returns a Context object that can generate the final output digest on demand. Parameters ---------- @@ -32,8 +31,8 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]: Returns ------- - tuple[str, str, str] - A tuple containing the summary, directory structure, and file contents. + Context + A Context object representing the ingested file system nodes. Call .generate_digest() to get the summary, directory structure, and file contents. Raises ------ @@ -91,7 +90,7 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]: "file_size": file_node.size, }, ) - return format_node(file_node, query=query) + return Context([file_node]) logger.info("Processing directory", extra={"directory_path": str(path)}) @@ -117,7 +116,7 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]: }, ) - return format_node(root_node, query=query) + return Context([root_node]) def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystemStats) -> None: diff --git a/src/gitingest/schemas/__init__.py b/src/gitingest/schemas/__init__.py index db5cb12f..ebaf9cbb 100644 --- a/src/gitingest/schemas/__init__.py +++ b/src/gitingest/schemas/__init__.py @@ -1,7 +1,7 @@ """Module containing the schemas for the Gitingest package.""" from gitingest.schemas.cloning import CloneConfig -from gitingest.schemas.filesystem import FileSystemNode, FileSystemNodeType, FileSystemStats +from gitingest.schemas.filesystem import Context, FileSystemNode, FileSystemNodeType, FileSystemStats from gitingest.schemas.ingestion import IngestionQuery -__all__ = ["CloneConfig", "FileSystemNode", "FileSystemNodeType", "FileSystemStats", "IngestionQuery"] +__all__ = ["CloneConfig", "FileSystemNode", "FileSystemNodeType", "FileSystemStats", "IngestionQuery", "Context"] diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index cc66e7b1..a500b5d3 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -159,3 +159,61 @@ def content(self) -> str: # pylint: disable=too-many-return-statements return fp.read() except (OSError, UnicodeDecodeError) as exc: return f"Error reading file with {good_enc!r}: {exc}" + + +@dataclass +class Context: + """Context for holding a list of FileSystemNode objects and generating a digest on demand.""" + nodes: list[FileSystemNode] + + def generate_digest(self) -> tuple[str, str, str]: + """Generate a summary, directory structure, and file contents for the context's nodes. + + Returns + ------- + tuple[str, str, str] + A tuple containing the summary, directory structure, and file contents. + """ + summary_lines = ["Context Digest"] + total_files = 0 + for node in self.nodes: + if node.type == FileSystemNodeType.DIRECTORY: + total_files += node.file_count + elif node.type == FileSystemNodeType.FILE: + total_files += 1 + summary_lines.append(f"Files analyzed: {total_files}") + summary = "\n".join(summary_lines) + + # Directory structure + tree_lines = ["Directory structure:"] + for node in self.nodes: + tree_lines.append(self._create_tree_structure(node)) + tree = "\n".join(tree_lines) + + # File contents + content_lines = [] + for node in self.nodes: + content_lines.append(self._gather_file_contents(node)) + content = "\n".join(content_lines) + + return summary, tree, content + + def _gather_file_contents(self, node: FileSystemNode) -> str: + if node.type != FileSystemNodeType.DIRECTORY: + return node.content_string + return "\n".join(self._gather_file_contents(child) for child in node.children) + + def _create_tree_structure(self, node: FileSystemNode, prefix: str = "", is_last: bool = True) -> str: + tree_str = "" + current_prefix = "└── " if is_last else "├── " + display_name = node.name + if node.type == FileSystemNodeType.DIRECTORY: + display_name += "/" + elif node.type == FileSystemNodeType.SYMLINK: + display_name += " -> " + readlink(node.path).name + tree_str += f"{prefix}{current_prefix}{display_name}\n" + if node.type == FileSystemNodeType.DIRECTORY and node.children: + prefix += " " if is_last else "│ " + for i, child in enumerate(node.children): + tree_str += self._create_tree_structure(child, prefix=prefix, is_last=i == len(node.children) - 1) + return tree_str diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 03f52f16..5079df4a 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -22,6 +22,7 @@ upload_metadata_to_s3, upload_to_s3, ) +from gitingest.schemas import Context from server.server_config import MAX_DISPLAY_SIZE # Initialize logger for this module @@ -301,7 +302,10 @@ async def process_query( raise RuntimeError(msg) try: - summary, tree, content = ingest_query(query) + context = ingest_query(query) + summary, tree, content = context.generate_digest() + + # Prepare the digest content (tree + content) digest_content = tree + "\n" + content _store_digest_content(query, clone_config, digest_content, summary, tree, content) except Exception as exc: From b82624cf7757c0c1fedb9739efa3feb87c545847 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Fri, 25 Jul 2025 02:10:30 +0200 Subject: [PATCH 02/17] wip before jinja --- src/gitingest/entrypoint.py | 32 ++--- src/gitingest/ingestion.py | 47 ++++--- src/gitingest/output_formatter.py | 77 +++++----- src/gitingest/schemas/__init__.py | 4 +- src/gitingest/schemas/filesystem.py | 211 +++++++++------------------- src/server/query_processor.py | 16 ++- src/server/routers_utils.py | 7 + 7 files changed, 160 insertions(+), 234 deletions(-) diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index eb6b6e22..fff76ad6 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -45,12 +45,11 @@ async def ingest_async( include_submodules: bool = False, token: str | None = None, output: str | None = None, -) -> tuple[str, str, str]: +) -> str: """Ingest a source and process its contents. This function analyzes a source (URL or local path), clones the corresponding repository (if applicable), - and processes its files according to the specified query parameters. It returns a summary, a tree-like - structure of the files, and the content of the files. The results can optionally be written to an output file. + and processes its files according to the specified query parameters. It returns a single digest string. The output is generated lazily using a Context object and its .generate_digest() method. @@ -82,11 +81,8 @@ async def ingest_async( Returns ------- - tuple[str, str, str] - A tuple containing: - - A summary string of the analyzed repository or directory. - - A tree-like string representation of the file structure. - - The content of the files in the repository or directory. + str + The full digest string. """ logger.info("Starting ingestion process", extra={"source": source}) @@ -146,11 +142,10 @@ async def ingest_async( if output: logger.debug("Writing output to file", extra={"output_path": output}) context = ingest_query(query) - summary, tree, content = context.generate_digest() - await _write_output(tree, content=content, target=output) - + digest = context.generate_digest() + await _write_output(digest, content=None, target=output) logger.info("Ingestion completed successfully") - return summary, tree, content + return digest def ingest( @@ -165,12 +160,11 @@ def ingest( include_submodules: bool = False, token: str | None = None, output: str | None = None, -) -> tuple[str, str, str]: +) -> str: """Provide a synchronous wrapper around ``ingest_async``. This function analyzes a source (URL or local path), clones the corresponding repository (if applicable), - and processes its files according to the specified query parameters. It returns a summary, a tree-like - structure of the files, and the content of the files. The results can optionally be written to an output file. + and processes its files according to the specified query parameters. It returns a single digest string. The output is generated lazily using a Context object and its .generate_digest() method. @@ -202,18 +196,14 @@ def ingest( Returns ------- - tuple[str, str, str] - A tuple containing: - - A summary string of the analyzed repository or directory. - - A tree-like string representation of the file structure. - - The content of the files in the repository or directory. + str + The full digest string. See Also -------- ``ingest_async`` : The asynchronous version of this function. """ - import asyncio context = asyncio.run(ingest_async( source, max_file_size=max_file_size, diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 836391d4..74f9bcb8 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -6,8 +6,9 @@ from typing import TYPE_CHECKING from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES -from gitingest.output_formatter import format_node -from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats, Context +from gitingest.output_formatter import DefaultFormatter +from gitingest.schemas import FileSystemNode, FileSystemStats, Context +from gitingest.schemas.filesystem import FileSystemDirectory, FileSystemFile, FileSystemSymlink from gitingest.utils.ingestion_utils import _should_exclude, _should_include from gitingest.utils.logging_config import get_logger @@ -69,11 +70,16 @@ def ingest_query(query: IngestionQuery) -> Context: relative_path = path.relative_to(query.local_path) - file_node = FileSystemNode( + # file_node = FileSystemNode( + # name=path.name, + # type=FileSystemNodeType.FILE, + # size=path.stat().st_size, + # file_count=1, + # path_str=str(relative_path), + # path=path, + # ) + file_node = FileSystemFile( name=path.name, - type=FileSystemNodeType.FILE, - size=path.stat().st_size, - file_count=1, path_str=str(relative_path), path=path, ) @@ -90,13 +96,15 @@ def ingest_query(query: IngestionQuery) -> Context: "file_size": file_node.size, }, ) - return Context([file_node]) - logger.info("Processing directory", extra={"directory_path": str(path)}) - - root_node = FileSystemNode( + # root_node = FileSystemNode( + # name=path.name, + # type=FileSystemNodeType.DIRECTORY, + # path_str=str(path.relative_to(query.local_path)), + # path=path, + # ) + root_node = FileSystemDirectory( name=path.name, - type=FileSystemNodeType.DIRECTORY, path_str=str(path.relative_to(query.local_path)), path=path, ) @@ -116,7 +124,7 @@ def ingest_query(query: IngestionQuery) -> Context: }, ) - return Context([root_node]) + return Context([root_node], DefaultFormatter(), query) def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystemStats) -> None: @@ -160,9 +168,8 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem continue _process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path) elif sub_path.is_dir(): - child_directory_node = FileSystemNode( + child_directory_node = FileSystemDirectory( name=sub_path.name, - type=FileSystemNodeType.DIRECTORY, path_str=str(sub_path.relative_to(query.local_path)), path=sub_path, depth=node.depth + 1, @@ -200,9 +207,8 @@ def _process_symlink(path: Path, parent_node: FileSystemNode, stats: FileSystemS The base path of the repository or directory being processed. """ - child = FileSystemNode( + child = FileSystemSymlink( name=path.name, - type=FileSystemNodeType.SYMLINK, path_str=str(path.relative_to(local_path)), path=path, depth=parent_node.depth + 1, @@ -212,7 +218,7 @@ def _process_symlink(path: Path, parent_node: FileSystemNode, stats: FileSystemS parent_node.file_count += 1 -def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None: +def _process_file(path: Path, parent_node: FileSystemDirectory, stats: FileSystemStats, local_path: Path) -> None: """Process a file in the file system. This function checks the file's size, increments the statistics, and reads its content. @@ -222,7 +228,7 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat ---------- path : Path The full path of the file. - parent_node : FileSystemNode + parent_node : FileSystemDirectory The dictionary to accumulate the results. stats : FileSystemStats Statistics tracking object for the total file count and size. @@ -257,11 +263,8 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat stats.total_files += 1 stats.total_size += file_size - child = FileSystemNode( + child = FileSystemFile( name=path.name, - type=FileSystemNodeType.FILE, - size=file_size, - file_count=1, path_str=str(path.relative_to(local_path)), path=path, depth=parent_node.depth + 1, diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index 5c2b59ae..0afd63a3 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -8,8 +8,11 @@ import requests.exceptions import tiktoken -from gitingest.schemas import FileSystemNode, FileSystemNodeType +from gitingest.schemas import FileSystemNode from gitingest.utils.compat_func import readlink +from functools import singledispatchmethod +from gitingest.schemas import Source, FileSystemFile, FileSystemDirectory, FileSystemSymlink +from gitingest.schemas.filesystem import SEPARATOR from gitingest.utils.logging_config import get_logger if TYPE_CHECKING: @@ -24,43 +27,41 @@ ] -def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str, str]: - """Generate a summary, directory structure, and file contents for a given file system node. - - If the node represents a directory, the function will recursively process its contents. - - Parameters - ---------- - node : FileSystemNode - The file system node to be summarized. - query : IngestionQuery - The parsed query object containing information about the repository and query parameters. - - Returns - ------- - tuple[str, str, str] - A tuple containing the summary, directory structure, and file contents. - - """ - is_single_file = node.type == FileSystemNodeType.FILE - summary = _create_summary_prefix(query, single_file=is_single_file) - - if node.type == FileSystemNodeType.DIRECTORY: - summary += f"Files analyzed: {node.file_count}\n" - elif node.type == FileSystemNodeType.FILE: - summary += f"File: {node.name}\n" - summary += f"Lines: {len(node.content.splitlines()):,}\n" - - tree = "Directory structure:\n" + _create_tree_structure(query, node=node) - - content = _gather_file_contents(node) - - token_estimate = _format_token_count(tree + content) - if token_estimate: - summary += f"\nEstimated tokens: {token_estimate}" - - return summary, tree, content - +class Formatter: + @singledispatchmethod + def format(self, node: Source, query): + return f"{getattr(node, 'content', '')}" + + @format.register + def _(self, node: FileSystemFile, query): + return ( + f"{SEPARATOR}\n" + f"{node.name}\n" + f"{SEPARATOR}\n\n" + f"{node.content}" + ) + + @format.register + def _(self, node: FileSystemDirectory, query): + formatted = [] + for child in node.children: + formatted.append(self.format(child, query)) + return "\n".join(formatted) + + @format.register + def _(self, node: FileSystemSymlink, query): + target = getattr(node, 'target', None) + target_str = f" -> {target}" if target else "" + return ( + f"{SEPARATOR}\n" + f"{node.name}{target_str}\n" + f"{SEPARATOR}\n" + ) + +class DefaultFormatter(Formatter): + pass + +# Backward compatibility def _create_summary_prefix(query: IngestionQuery, *, single_file: bool = False) -> str: """Create a prefix string for summarizing a repository or local directory. diff --git a/src/gitingest/schemas/__init__.py b/src/gitingest/schemas/__init__.py index ebaf9cbb..6cf0c3cc 100644 --- a/src/gitingest/schemas/__init__.py +++ b/src/gitingest/schemas/__init__.py @@ -1,7 +1,7 @@ """Module containing the schemas for the Gitingest package.""" from gitingest.schemas.cloning import CloneConfig -from gitingest.schemas.filesystem import Context, FileSystemNode, FileSystemNodeType, FileSystemStats +from gitingest.schemas.filesystem import FileSystemNode, FileSystemFile, FileSystemDirectory, FileSystemSymlink, FileSystemStats, Context, Source from gitingest.schemas.ingestion import IngestionQuery -__all__ = ["CloneConfig", "FileSystemNode", "FileSystemNodeType", "FileSystemStats", "IngestionQuery", "Context"] +__all__ = ["CloneConfig", "FileSystemNode", "FileSystemFile", "FileSystemDirectory", "FileSystemSymlink", "FileSystemStats", "IngestionQuery", "Context"] diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index a500b5d3..49643c71 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -6,6 +6,7 @@ from dataclasses import dataclass, field from enum import Enum, auto from typing import TYPE_CHECKING +from abc import ABC from gitingest.utils.compat_func import readlink from gitingest.utils.file_utils import _decodes, _get_preferred_encodings, _read_chunk @@ -13,9 +14,12 @@ if TYPE_CHECKING: from pathlib import Path + from gitingest.schemas import IngestionQuery + from gitingest.output_formatter import Formatter SEPARATOR = "=" * 48 # Tiktoken, the tokenizer openai uses, counts 2 tokens if we have more than 48 - +CONTEXT_HEADER = "# Generated using https://gitingest.com{}\n" # Replace with /user/repo if we have it otherwise leave it blank +CONTEXT_FOOTER = "# End of gitingest context\n" class FileSystemNodeType(Enum): """Enum representing the type of a file system node (directory or file).""" @@ -33,22 +37,44 @@ class FileSystemStats: total_size: int = 0 -@dataclass -class FileSystemNode: # pylint: disable=too-many-instance-attributes - """Class representing a node in the file system (either a file or directory). +class Source: + """Abstract base class for all sources (files, directories, etc).""" + summary: str = "" + tree: str = "" + @property + def content(self) -> str: + return self._content + @content.setter + def content(self, value: str) -> None: + self._content = value + +class FileSystemNode(Source): + """Base class for all file system nodes (file, directory, symlink).""" + def __init__(self, name: str, path_str: str, path: 'Path', depth: int = 0): + self.name = name + self.path_str = path_str + self.path = path + self.depth = depth + self.summary = "" + self.tree = "" + self.children: list[FileSystemNode] = [] + self.size: int = 0 - Tracks properties of files/directories for comprehensive analysis. - """ + @property + def content(self) -> str: + raise NotImplementedError("Content is not implemented for FileSystemNode") - name: str - type: FileSystemNodeType - path_str: str - path: Path - size: int = 0 +class FileSystemFile(FileSystemNode): + @property + def content(self) -> str: + with open(self.path, "r", encoding="utf-8") as f: + return f.read() + +class FileSystemDirectory(FileSystemNode): + children: list['FileSystemNode'] = field(default_factory=list) file_count: int = 0 dir_count: int = 0 - depth: int = 0 - children: list[FileSystemNode] = field(default_factory=list) + type: FileSystemNodeType = FileSystemNodeType.DIRECTORY def sort_children(self) -> None: """Sort the children nodes of a directory according to a specific order. @@ -65,7 +91,6 @@ def sort_children(self) -> None: ------ ValueError If the node is not a directory. - """ if self.type != FileSystemNodeType.DIRECTORY: msg = "Cannot sort children of a non-directory node" @@ -75,7 +100,7 @@ def _sort_key(child: FileSystemNode) -> tuple[int, str]: # returns the priority order for the sort function, 0 is first # Groups: 0=README, 1=regular file, 2=hidden file, 3=regular dir, 4=hidden dir name = child.name.lower() - if child.type == FileSystemNodeType.FILE: + if hasattr(child, 'type') and child.type == FileSystemNodeType.FILE: if name == "readme" or name.startswith("readme."): return (0, name) return (1 if not name.startswith(".") else 2, name) @@ -83,137 +108,35 @@ def _sort_key(child: FileSystemNode) -> tuple[int, str]: self.children.sort(key=_sort_key) - @property - def content_string(self) -> str: - """Return the content of the node as a string, including path and content. - - Returns - ------- - str - A string representation of the node's content. - - """ - parts = [ - SEPARATOR, - f"{self.type.name}: {str(self.path_str).replace(os.sep, '/')}" - + (f" -> {readlink(self.path).name}" if self.type == FileSystemNodeType.SYMLINK else ""), - SEPARATOR, - f"{self.content}", - ] - - return "\n".join(parts) + "\n\n" - - @property - def content(self) -> str: # pylint: disable=too-many-return-statements - """Return file content (if text / notebook) or an explanatory placeholder. - - Heuristically decides whether the file is text or binary by decoding a small chunk of the file - with multiple encodings and checking for common binary markers. - - Returns - ------- - str - The content of the file, or an error message if the file could not be read. - - Raises - ------ - ValueError - If the node is a directory. - - """ - if self.type == FileSystemNodeType.DIRECTORY: - msg = "Cannot read content of a directory node" - raise ValueError(msg) - - if self.type == FileSystemNodeType.SYMLINK: - return "" # TODO: are we including the empty content of symlinks? - - if self.path.suffix == ".ipynb": # Notebook - try: - return process_notebook(self.path) - except Exception as exc: - return f"Error processing notebook: {exc}" - - chunk = _read_chunk(self.path) - - if chunk is None: - return "Error reading file" - - if chunk == b"": - return "[Empty file]" - - if not _decodes(chunk, "utf-8"): - return "[Binary file]" - - # Find the first encoding that decodes the sample - good_enc: str | None = next( - (enc for enc in _get_preferred_encodings() if _decodes(chunk, encoding=enc)), - None, - ) - - if good_enc is None: - return "Error: Unable to decode file with available encodings" - - try: - with self.path.open(encoding=good_enc) as fp: - return fp.read() - except (OSError, UnicodeDecodeError) as exc: - return f"Error reading file with {good_enc!r}: {exc}" +class FileSystemSymlink(FileSystemNode): + # Add symlink-specific fields if needed + pass @dataclass class Context: - """Context for holding a list of FileSystemNode objects and generating a digest on demand.""" - nodes: list[FileSystemNode] - - def generate_digest(self) -> tuple[str, str, str]: - """Generate a summary, directory structure, and file contents for the context's nodes. - - Returns - ------- - tuple[str, str, str] - A tuple containing the summary, directory structure, and file contents. - """ - summary_lines = ["Context Digest"] - total_files = 0 - for node in self.nodes: - if node.type == FileSystemNodeType.DIRECTORY: - total_files += node.file_count - elif node.type == FileSystemNodeType.FILE: - total_files += 1 - summary_lines.append(f"Files analyzed: {total_files}") - summary = "\n".join(summary_lines) - - # Directory structure - tree_lines = ["Directory structure:"] - for node in self.nodes: - tree_lines.append(self._create_tree_structure(node)) - tree = "\n".join(tree_lines) - - # File contents - content_lines = [] + """Context for holding a list of Source objects and generating a digest on demand using a Formatter. + + Attributes + ---------- + nodes : list[Source] + The list of source objects to generate a digest for. + formatter : Formatter + The formatter to use for formatting sources. + query : IngestionQuery + The query context. + """ + nodes: list[Source] + formatter: Formatter + query: IngestionQuery + + def generate_digest(self) -> str: + if self.query.user_name and self.query.repo_name: + context_header = CONTEXT_HEADER.format(f"/{self.query.user_name}/{self.query.repo_name}") + else: + context_header = CONTEXT_HEADER.format("") + context_footer = CONTEXT_FOOTER + formatted = [] for node in self.nodes: - content_lines.append(self._gather_file_contents(node)) - content = "\n".join(content_lines) - - return summary, tree, content - - def _gather_file_contents(self, node: FileSystemNode) -> str: - if node.type != FileSystemNodeType.DIRECTORY: - return node.content_string - return "\n".join(self._gather_file_contents(child) for child in node.children) - - def _create_tree_structure(self, node: FileSystemNode, prefix: str = "", is_last: bool = True) -> str: - tree_str = "" - current_prefix = "└── " if is_last else "├── " - display_name = node.name - if node.type == FileSystemNodeType.DIRECTORY: - display_name += "/" - elif node.type == FileSystemNodeType.SYMLINK: - display_name += " -> " + readlink(node.path).name - tree_str += f"{prefix}{current_prefix}{display_name}\n" - if node.type == FileSystemNodeType.DIRECTORY and node.children: - prefix += " " if is_last else "│ " - for i, child in enumerate(node.children): - tree_str += self._create_tree_structure(child, prefix=prefix, is_last=i == len(node.children) - 1) - return tree_str + formatted.append(self.formatter.format(node, self.query)) + return context_header + "\n".join(formatted) + context_footer diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 5079df4a..1bbf44b3 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -303,6 +303,8 @@ async def process_query( try: context = ingest_query(query) + digest = context.generate_digest() + summary, tree, content = context.generate_digest() # Prepare the digest content (tree + content) @@ -314,10 +316,10 @@ async def process_query( _cleanup_repository(clone_config) return IngestErrorResponse(error=str(exc)) - if len(content) > MAX_DISPLAY_SIZE: - content = ( - f"(Files content cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " - "download full ingest to see more)\n" + content[:MAX_DISPLAY_SIZE] + if len(digest) > MAX_DISPLAY_SIZE: + digest = ( + f"(Digest cropped to {int(MAX_DISPLAY_SIZE / 1_000)}k characters, " + "download full ingest to see more)\n" + digest[:MAX_DISPLAY_SIZE] ) _print_success( @@ -325,7 +327,7 @@ async def process_query( max_file_size=max_file_size, pattern_type=pattern_type, pattern=pattern, - summary=summary, + summary=digest, ) digest_url = _generate_digest_url(query) @@ -336,9 +338,9 @@ async def process_query( return IngestSuccessResponse( repo_url=input_text, short_repo_url=short_repo_url, - summary=summary, + summary="", digest_url=digest_url, - tree=tree, + tree="", content=content, default_max_file_size=max_file_size, pattern_type=pattern_type, diff --git a/src/server/routers_utils.py b/src/server/routers_utils.py index 3eaf0e59..ddea340f 100644 --- a/src/server/routers_utils.py +++ b/src/server/routers_utils.py @@ -2,6 +2,7 @@ from __future__ import annotations +import traceback from typing import Any from fastapi import status @@ -40,6 +41,8 @@ async def _perform_ingestion( ) if isinstance(result, IngestErrorResponse): + # print stack trace to console for debugging + print(traceback.format_exc()) # Return structured error response with 400 status code return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content=result.model_dump()) @@ -49,9 +52,13 @@ async def _perform_ingestion( except ValueError as ve: # Handle validation errors with 400 status code error_response = IngestErrorResponse(error=f"Validation error: {ve!s}") + # print stack trace to console for debugging + print(traceback.format_exc()) return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content=error_response.model_dump()) except Exception as exc: # Handle unexpected errors with 500 status code error_response = IngestErrorResponse(error=f"Internal server error: {exc!s}") + # print stack trace to console for debugging + print(traceback.format_exc()) return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content=error_response.model_dump()) From 0765f0ec749f14ddf4bc01f020310d0ee889cd32 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Fri, 25 Jul 2025 02:46:39 +0200 Subject: [PATCH 03/17] idk --- src/gitingest/output_formatter.py | 103 +++++++++++++++++++--------- src/gitingest/schemas/filesystem.py | 4 ++ 2 files changed, 73 insertions(+), 34 deletions(-) diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index 0afd63a3..40a548b8 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -14,6 +14,7 @@ from gitingest.schemas import Source, FileSystemFile, FileSystemDirectory, FileSystemSymlink from gitingest.schemas.filesystem import SEPARATOR from gitingest.utils.logging_config import get_logger +from jinja2 import Environment, BaseLoader if TYPE_CHECKING: from gitingest.schemas import IngestionQuery @@ -27,40 +28,6 @@ ] -class Formatter: - @singledispatchmethod - def format(self, node: Source, query): - return f"{getattr(node, 'content', '')}" - - @format.register - def _(self, node: FileSystemFile, query): - return ( - f"{SEPARATOR}\n" - f"{node.name}\n" - f"{SEPARATOR}\n\n" - f"{node.content}" - ) - - @format.register - def _(self, node: FileSystemDirectory, query): - formatted = [] - for child in node.children: - formatted.append(self.format(child, query)) - return "\n".join(formatted) - - @format.register - def _(self, node: FileSystemSymlink, query): - target = getattr(node, 'target', None) - target_str = f" -> {target}" if target else "" - return ( - f"{SEPARATOR}\n" - f"{node.name}{target_str}\n" - f"{SEPARATOR}\n" - ) - -class DefaultFormatter(Formatter): - pass - # Backward compatibility def _create_summary_prefix(query: IngestionQuery, *, single_file: bool = False) -> str: @@ -209,3 +176,71 @@ def _format_token_count(text: str) -> str | None: return f"{total_tokens / threshold:.1f}{suffix}" return str(total_tokens) + +# Rename JinjaFormatter to DefaultFormatter throughout the file +class DefaultFormatter: + def __init__(self): + self.env = Environment(loader=BaseLoader()) + + @singledispatchmethod + def format(self, node: Source, query): + return f"{getattr(node, 'content', '')}" + + @singledispatchmethod + def summary(self, node: Source, query): + # Default summary: just the name + return f"{getattr(node, 'name', '')}" + + @format.register + def _(self, node: FileSystemFile, query): + template = \ +""" +{{ SEPARATOR }} +{{ node.name }} +{{ SEPARATOR }} + +{{ node.content }} +""" + file_template = self.env.from_string(template) + return file_template.render(SEPARATOR=SEPARATOR, node=node, query=query, formatter=self) + + @format.register + def _(self, node: FileSystemDirectory, query): + template = \ +""" +{% for child in node.children %} +{{ formatter.format(child, query) }} +{% endfor %} +""" + dir_template = self.env.from_string(template) + return dir_template.render(node=node, query=query, formatter=self) + + @summary.register + def _(self, node: FileSystemDirectory, query): + template = """ +{%- macro render_tree(node, prefix='', is_last=True) -%} + {{ prefix }}{{ '└── ' if is_last else '├── ' }}{{ node.name }}{% if node.type == 'directory' %}/{% endif %} + {%- if node.type == 'directory' and node.children %} + {%- for i, child in enumerate(node.children) %} + {{ render_tree(child, prefix + (' ' if is_last else '│ '), i == (node.children | length - 1)) }} + {%- endfor %} + {%- endif %} +{%- endmacro %} + +Directory structure: +{{ render_tree(node) }} +""" + summary_template = self.env.from_string(template) + return summary_template.render(node=node, query=query, formatter=self) + + + @format.register + def _(self, node: FileSystemSymlink, query): + template = \ +""" +{{ SEPARATOR }} +{{ node.name }}{% if node.target %} -> {{ node.target }}{% endif %} +{{ SEPARATOR }} +""" + symlink_template = self.env.from_string(template) + return symlink_template.render(SEPARATOR=SEPARATOR, node=node, query=query, formatter=self) diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index 49643c71..3f023c9b 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -140,3 +140,7 @@ def generate_digest(self) -> str: for node in self.nodes: formatted.append(self.formatter.format(node, self.query)) return context_header + "\n".join(formatted) + context_footer + + @property + def summary(self): + return "\n".join(self.formatter.summary(node, self.query) for node in self.nodes) \ No newline at end of file From 78695f9bfe1cd7dbe086383458dcb48fd12c5563 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Fri, 25 Jul 2025 16:23:31 +0200 Subject: [PATCH 04/17] idk --- src/gitingest/output_formatter.py | 11 +-- src/gitingest/schemas/filesystem.py | 102 +++++++++++++++------------- 2 files changed, 54 insertions(+), 59 deletions(-) diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index 40a548b8..c2d75dc4 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -218,17 +218,8 @@ def _(self, node: FileSystemDirectory, query): @summary.register def _(self, node: FileSystemDirectory, query): template = """ -{%- macro render_tree(node, prefix='', is_last=True) -%} - {{ prefix }}{{ '└── ' if is_last else '├── ' }}{{ node.name }}{% if node.type == 'directory' %}/{% endif %} - {%- if node.type == 'directory' and node.children %} - {%- for i, child in enumerate(node.children) %} - {{ render_tree(child, prefix + (' ' if is_last else '│ '), i == (node.children | length - 1)) }} - {%- endfor %} - {%- endif %} -{%- endmacro %} - Directory structure: -{{ render_tree(node) }} +{{ node.tree }} """ summary_template = self.env.from_string(template) return summary_template.render(node=node, query=query, formatter=self) diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index 3f023c9b..2bc8e7fe 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -7,6 +7,7 @@ from enum import Enum, auto from typing import TYPE_CHECKING from abc import ABC +from functools import singledispatchmethod from gitingest.utils.compat_func import readlink from gitingest.utils.file_utils import _decodes, _get_preferred_encodings, _read_chunk @@ -36,40 +37,44 @@ class FileSystemStats: total_files: int = 0 total_size: int = 0 - -class Source: +@dataclass +class Source(ABC): """Abstract base class for all sources (files, directories, etc).""" - summary: str = "" - tree: str = "" @property - def content(self) -> str: - return self._content - @content.setter - def content(self, value: str) -> None: - self._content = value + def tree(self) -> str: + return self._tree() + @property + def summary(self) -> str: + return getattr(self, "_summary", "") + @summary.setter + def summary(self, value: str) -> None: + self._summary = value +@dataclass class FileSystemNode(Source): - """Base class for all file system nodes (file, directory, symlink).""" - def __init__(self, name: str, path_str: str, path: 'Path', depth: int = 0): - self.name = name - self.path_str = path_str - self.path = path - self.depth = depth - self.summary = "" - self.tree = "" - self.children: list[FileSystemNode] = [] - self.size: int = 0 + name: str + path_str: str + path: Path + depth: int = 0 + size: int = 0 @property - def content(self) -> str: - raise NotImplementedError("Content is not implemented for FileSystemNode") + def tree(self): + return self._tree() + + @singledispatchmethod + def _tree(self): + return self.name +@dataclass class FileSystemFile(FileSystemNode): - @property - def content(self) -> str: - with open(self.path, "r", encoding="utf-8") as f: - return f.read() + pass # Nothing for now +@FileSystemNode._tree.register +def _(self: 'FileSystemFile'): + return self.name + +@dataclass class FileSystemDirectory(FileSystemNode): children: list['FileSystemNode'] = field(default_factory=list) file_count: int = 0 @@ -77,40 +82,39 @@ class FileSystemDirectory(FileSystemNode): type: FileSystemNodeType = FileSystemNodeType.DIRECTORY def sort_children(self) -> None: - """Sort the children nodes of a directory according to a specific order. - - Order of sorting: - 2. Regular files (not starting with dot) - 3. Hidden files (starting with dot) - 4. Regular directories (not starting with dot) - 5. Hidden directories (starting with dot) - - All groups are sorted alphanumerically within themselves. - - Raises - ------ - ValueError - If the node is not a directory. - """ - if self.type != FileSystemNodeType.DIRECTORY: - msg = "Cannot sort children of a non-directory node" - raise ValueError(msg) - + """Sort the children nodes of a directory according to a specific order.""" def _sort_key(child: FileSystemNode) -> tuple[int, str]: - # returns the priority order for the sort function, 0 is first - # Groups: 0=README, 1=regular file, 2=hidden file, 3=regular dir, 4=hidden dir name = child.name.lower() - if hasattr(child, 'type') and child.type == FileSystemNodeType.FILE: + if hasattr(child, 'type') and getattr(child, 'type', None) == FileSystemNodeType.FILE: if name == "readme" or name.startswith("readme."): return (0, name) return (1 if not name.startswith(".") else 2, name) return (3 if not name.startswith(".") else 4, name) - self.children.sort(key=_sort_key) +@FileSystemNode._tree.register +def _(self: 'FileSystemDirectory'): + def render_tree(node, prefix="", is_last=True): + lines = [] + current_prefix = "└── " if is_last else "├── " + display_name = node.name + "/" + lines.append(f"{prefix}{current_prefix}{display_name}") + if hasattr(node, 'children') and node.children: + new_prefix = prefix + (" " if is_last else "│ ") + for i, child in enumerate(node.children): + is_last_child = i == len(node.children) - 1 + lines.extend(child._tree()(child, prefix=new_prefix, is_last=is_last_child) if hasattr(child, '_tree') else [child.name]) + return lines + return "\n".join(render_tree(self)) + +@dataclass class FileSystemSymlink(FileSystemNode): + target: str = "" # Add symlink-specific fields if needed - pass + +@FileSystemNode._tree.register +def _(self: 'FileSystemSymlink'): + return f"{self.name} -> {self.target}" if self.target else self.name @dataclass From e1f687c9949fed9c5d6b624a2244e277c1c4f689 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Sun, 27 Jul 2025 14:51:50 +0200 Subject: [PATCH 05/17] idk --- src/gitingest/output_formatter.py | 4 ++-- src/server/query_processor.py | 23 +++++++++++++++++++---- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index c2d75dc4..13e166f6 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -177,7 +177,6 @@ def _format_token_count(text: str) -> str | None: return str(total_tokens) -# Rename JinjaFormatter to DefaultFormatter throughout the file class DefaultFormatter: def __init__(self): self.env = Environment(loader=BaseLoader()) @@ -217,7 +216,8 @@ def _(self, node: FileSystemDirectory, query): @summary.register def _(self, node: FileSystemDirectory, query): - template = """ + template = \ +""" Directory structure: {{ node.tree }} """ diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 1bbf44b3..8f034b86 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -305,11 +305,26 @@ async def process_query( context = ingest_query(query) digest = context.generate_digest() - summary, tree, content = context.generate_digest() + # Store digest based on S3 configuration + if is_s3_enabled(): + # Upload to S3 instead of storing locally + s3_file_path = generate_s3_file_path( + source=query.url, + user_name=cast("str", query.user_name), + repo_name=cast("str", query.repo_name), + commit=query.commit, + include_patterns=query.include_patterns, + ignore_patterns=query.ignore_patterns, + ) + s3_url = upload_to_s3(content=context.digest, s3_file_path=s3_file_path, ingest_id=query.id) + # Store S3 URL in query for later use + query.s3_url = s3_url + else: + # Store locally + local_txt_file = Path(clone_config.local_path).with_suffix(".txt") + with local_txt_file.open("w", encoding="utf-8") as f: + f.write(digest) - # Prepare the digest content (tree + content) - digest_content = tree + "\n" + content - _store_digest_content(query, clone_config, digest_content, summary, tree, content) except Exception as exc: _print_error(query.url, exc, max_file_size, pattern_type, pattern) # Clean up repository even if processing failed From 0e6c5bfe42070477af69cdebb7448eaf3fd682e9 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Sun, 27 Jul 2025 16:25:16 +0200 Subject: [PATCH 06/17] commit avant la catastrophe --- src/gitingest/ingestion.py | 29 ++++--- src/gitingest/output_formatter.py | 116 +++++++++++++++++++++------- src/gitingest/schemas/__init__.py | 4 +- src/gitingest/schemas/filesystem.py | 14 +++- src/server/query_processor.py | 15 ++-- 5 files changed, 128 insertions(+), 50 deletions(-) diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 74f9bcb8..7c828e8d 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -6,9 +6,9 @@ from typing import TYPE_CHECKING from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES -from gitingest.output_formatter import DefaultFormatter +from gitingest.output_formatter import DefaultFormatter, StupidFormatter from gitingest.schemas import FileSystemNode, FileSystemStats, Context -from gitingest.schemas.filesystem import FileSystemDirectory, FileSystemFile, FileSystemSymlink +from gitingest.schemas.filesystem import FileSystemDirectory, FileSystemFile, FileSystemSymlink, FileSystemTextFile from gitingest.utils.ingestion_utils import _should_exclude, _should_include from gitingest.utils.logging_config import get_logger @@ -96,6 +96,7 @@ def ingest_query(query: IngestionQuery) -> Context: "file_size": file_node.size, }, ) + return Context([file_node], StupidFormatter(), query) # root_node = FileSystemNode( # name=path.name, @@ -124,7 +125,7 @@ def ingest_query(query: IngestionQuery) -> Context: }, ) - return Context([root_node], DefaultFormatter(), query) + return Context([root_node], StupidFormatter(), query) def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystemStats) -> None: @@ -263,12 +264,22 @@ def _process_file(path: Path, parent_node: FileSystemDirectory, stats: FileSyste stats.total_files += 1 stats.total_size += file_size - child = FileSystemFile( - name=path.name, - path_str=str(path.relative_to(local_path)), - path=path, - depth=parent_node.depth + 1, - ) + # if file is a .txt file, create a FileSystemTextFile + if path.suffix == ".txt": + child = FileSystemTextFile( + name=path.name, + path_str=str(path.relative_to(local_path)), + path=path, + depth=parent_node.depth + 1, + ) + else: + + child = FileSystemFile( + name=path.name, + path_str=str(path.relative_to(local_path)), + path=path, + depth=parent_node.depth + 1, + ) parent_node.children.append(child) parent_node.size += file_size diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index 13e166f6..13553443 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -11,11 +11,36 @@ from gitingest.schemas import FileSystemNode from gitingest.utils.compat_func import readlink from functools import singledispatchmethod -from gitingest.schemas import Source, FileSystemFile, FileSystemDirectory, FileSystemSymlink -from gitingest.schemas.filesystem import SEPARATOR +from gitingest.schemas import Source, FileSystemFile, FileSystemDirectory, FileSystemSymlink, FileSystemTextFile +from gitingest.schemas.filesystem import SEPARATOR, FileSystemNodeType from gitingest.utils.logging_config import get_logger from jinja2 import Environment, BaseLoader + +class OverridableDispatcher: + """Custom dispatcher that allows later registrations to override earlier ones, even for parent types.""" + + def __init__(self, default_func): + self.default_func = default_func + self.registry = [] # List of (type, func) in registration order + + def register(self, type_): + def decorator(func): + # Remove any existing registration for this exact type + self.registry = [(t, f) for t, f in self.registry if t != type_] + # Add new registration at the end (highest priority) + self.registry.append((type_, func)) + return func + return decorator + + def __call__(self, instance, *args, **kwargs): + # Check registrations in reverse order (most recent first) + for type_, func in reversed(self.registry): + if isinstance(instance, type_): + return func(instance, *args, **kwargs) + # Fall back to default + return self.default_func(instance, *args, **kwargs) + if TYPE_CHECKING: from gitingest.schemas import IngestionQuery @@ -179,20 +204,26 @@ def _format_token_count(text: str) -> str | None: class DefaultFormatter: def __init__(self): + self.separator = SEPARATOR self.env = Environment(loader=BaseLoader()) - @singledispatchmethod - def format(self, node: Source, query): - return f"{getattr(node, 'content', '')}" + # Set up custom dispatchers + def _default_format(node: Source, query): + return f"{getattr(node, 'content', '')}" + + def _default_summary(node: Source, query): + return f"{getattr(node, 'name', '')}" - @singledispatchmethod - def summary(self, node: Source, query): - # Default summary: just the name - return f"{getattr(node, 'name', '')}" + self.format = OverridableDispatcher(_default_format) + self.summary = OverridableDispatcher(_default_summary) - @format.register - def _(self, node: FileSystemFile, query): - template = \ + # Register the default implementations + self._register_defaults() + + def _register_defaults(self): + @self.format.register(FileSystemFile) + def _(node: FileSystemFile, query): + template = \ """ {{ SEPARATOR }} {{ node.name }} @@ -200,38 +231,63 @@ def _(self, node: FileSystemFile, query): {{ node.content }} """ - file_template = self.env.from_string(template) - return file_template.render(SEPARATOR=SEPARATOR, node=node, query=query, formatter=self) + file_template = self.env.from_string(template) + return file_template.render(SEPARATOR=SEPARATOR, node=node, query=query, formatter=self) - @format.register - def _(self, node: FileSystemDirectory, query): - template = \ + @self.format.register(FileSystemDirectory) + def _(node: FileSystemDirectory, query): + template = \ """ {% for child in node.children %} {{ formatter.format(child, query) }} {% endfor %} """ - dir_template = self.env.from_string(template) - return dir_template.render(node=node, query=query, formatter=self) + dir_template = self.env.from_string(template) + return dir_template.render(node=node, query=query, formatter=self) - @summary.register - def _(self, node: FileSystemDirectory, query): - template = \ + @self.summary.register(FileSystemDirectory) + def _(node: FileSystemDirectory, query): + template = \ """ Directory structure: {{ node.tree }} """ - summary_template = self.env.from_string(template) - return summary_template.render(node=node, query=query, formatter=self) - + summary_template = self.env.from_string(template) + return summary_template.render(node=node, query=query, formatter=self) - @format.register - def _(self, node: FileSystemSymlink, query): - template = \ + @self.format.register(FileSystemSymlink) + def _(node: FileSystemSymlink, query): + template = \ """ {{ SEPARATOR }} {{ node.name }}{% if node.target %} -> {{ node.target }}{% endif %} {{ SEPARATOR }} """ - symlink_template = self.env.from_string(template) - return symlink_template.render(SEPARATOR=SEPARATOR, node=node, query=query, formatter=self) + symlink_template = self.env.from_string(template) + return symlink_template.render(SEPARATOR=SEPARATOR, node=node, query=query, formatter=self) + +class StupidFormatter(DefaultFormatter): + def __init__(self): + super().__init__() + + @self.summary.register(FileSystemTextFile) + def _(node: FileSystemTextFile, query): + template = \ +""" +{{ SEPARATOR }} +{{ node.name }} +{{ SEPARATOR }} +FileSystemTextFile +""" + + @self.format.register(FileSystemFile) + def _(node: FileSystemFile, query): + template = \ +""" +{{ SEPARATOR }} +{{ node.name }} +{{ SEPARATOR }} +FileSystemFile +""" + file_template = self.env.from_string(template) + return file_template.render(SEPARATOR=SEPARATOR, node=node, query=query, formatter=self) diff --git a/src/gitingest/schemas/__init__.py b/src/gitingest/schemas/__init__.py index 6cf0c3cc..3478ac64 100644 --- a/src/gitingest/schemas/__init__.py +++ b/src/gitingest/schemas/__init__.py @@ -1,7 +1,7 @@ """Module containing the schemas for the Gitingest package.""" from gitingest.schemas.cloning import CloneConfig -from gitingest.schemas.filesystem import FileSystemNode, FileSystemFile, FileSystemDirectory, FileSystemSymlink, FileSystemStats, Context, Source +from gitingest.schemas.filesystem import FileSystemNode, FileSystemFile, FileSystemDirectory, FileSystemSymlink, FileSystemTextFile, FileSystemStats, Context, Source from gitingest.schemas.ingestion import IngestionQuery -__all__ = ["CloneConfig", "FileSystemNode", "FileSystemFile", "FileSystemDirectory", "FileSystemSymlink", "FileSystemStats", "IngestionQuery", "Context"] +__all__ = ["CloneConfig", "FileSystemNode", "FileSystemFile", "FileSystemDirectory", "FileSystemSymlink", "FileSystemTextFile", "FileSystemStats", "IngestionQuery", "Context"] diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index 2bc8e7fe..6cb000a8 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -68,7 +68,18 @@ def _tree(self): @dataclass class FileSystemFile(FileSystemNode): - pass # Nothing for now + @property + def content(self): + # read the file + try: + with open(self.path, "r") as f: + return f.read() + except Exception as e: + return f"Error reading content of {self.name}: {e}" + +@dataclass +class FileSystemTextFile(FileSystemFile): + pass @FileSystemNode._tree.register def _(self: 'FileSystemFile'): @@ -117,7 +128,6 @@ def _(self: 'FileSystemSymlink'): return f"{self.name} -> {self.target}" if self.target else self.name -@dataclass class Context: """Context for holding a list of Source objects and generating a digest on demand using a Formatter. diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 8f034b86..483d236a 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -322,6 +322,7 @@ async def process_query( else: # Store locally local_txt_file = Path(clone_config.local_path).with_suffix(".txt") + print(f"Writing to {local_txt_file}") with local_txt_file.open("w", encoding="utf-8") as f: f.write(digest) @@ -337,13 +338,13 @@ async def process_query( "download full ingest to see more)\n" + digest[:MAX_DISPLAY_SIZE] ) - _print_success( - url=query.url, - max_file_size=max_file_size, - pattern_type=pattern_type, - pattern=pattern, - summary=digest, - ) + # _print_success( + # url=query.url, + # max_file_size=max_file_size, + # pattern_type=pattern_type, + # pattern=pattern, + # summary=digest, + # ) digest_url = _generate_digest_url(query) From 6d2941bc75bcd9a7e2fa510b1729d3f1b028b023 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Sun, 27 Jul 2025 18:02:30 +0200 Subject: [PATCH 07/17] commit post catastrophe --- src/gitingest/entrypoint.py | 11 ++++++----- src/gitingest/ingestion.py | 2 +- src/gitingest/output_formatter.py | 26 +++++++++++++++++++++++++- src/gitingest/schemas/filesystem.py | 23 +++++++---------------- src/server/query_processor.py | 5 +++-- 5 files changed, 42 insertions(+), 25 deletions(-) diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index fff76ad6..b289f1e8 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -15,6 +15,7 @@ from gitingest.clone import clone_repo from gitingest.config import MAX_FILE_SIZE from gitingest.ingestion import ingest_query +from gitingest.output_formatter import generate_digest from gitingest.query_parser import parse_local_dir_path, parse_remote_repo from gitingest.utils.auth import resolve_token from gitingest.utils.compat_func import removesuffix @@ -51,7 +52,7 @@ async def ingest_async( This function analyzes a source (URL or local path), clones the corresponding repository (if applicable), and processes its files according to the specified query parameters. It returns a single digest string. - The output is generated lazily using a Context object and its .generate_digest() method. + The output is generated lazily using a Context object and the generate_digest() function. Parameters ---------- @@ -142,7 +143,7 @@ async def ingest_async( if output: logger.debug("Writing output to file", extra={"output_path": output}) context = ingest_query(query) - digest = context.generate_digest() + digest = generate_digest(context) await _write_output(digest, content=None, target=output) logger.info("Ingestion completed successfully") return digest @@ -166,7 +167,7 @@ def ingest( This function analyzes a source (URL or local path), clones the corresponding repository (if applicable), and processes its files according to the specified query parameters. It returns a single digest string. - The output is generated lazily using a Context object and its .generate_digest() method. + The output is generated lazily using a Context object and the generate_digest() function. Parameters ---------- @@ -204,7 +205,7 @@ def ingest( ``ingest_async`` : The asynchronous version of this function. """ - context = asyncio.run(ingest_async( + digest = asyncio.run(ingest_async( source, max_file_size=max_file_size, include_patterns=include_patterns, @@ -216,7 +217,7 @@ def ingest( token=token, output=output, )) - return context.generate_digest() + return digest def _override_branch_and_tag(query: IngestionQuery, branch: str | None, tag: str | None) -> None: diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 7c828e8d..3a770a87 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -33,7 +33,7 @@ def ingest_query(query: IngestionQuery) -> Context: Returns ------- Context - A Context object representing the ingested file system nodes. Call .generate_digest() to get the summary, directory structure, and file contents. + A Context object representing the ingested file system nodes. Use generate_digest(context) to get the summary, directory structure, and file contents. Raises ------ diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index 13553443..3ed304ed 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -12,7 +12,7 @@ from gitingest.utils.compat_func import readlink from functools import singledispatchmethod from gitingest.schemas import Source, FileSystemFile, FileSystemDirectory, FileSystemSymlink, FileSystemTextFile -from gitingest.schemas.filesystem import SEPARATOR, FileSystemNodeType +from gitingest.schemas.filesystem import SEPARATOR, FileSystemNodeType, CONTEXT_HEADER, CONTEXT_FOOTER from gitingest.utils.logging_config import get_logger from jinja2 import Environment, BaseLoader @@ -291,3 +291,27 @@ def _(node: FileSystemFile, query): """ file_template = self.env.from_string(template) return file_template.render(SEPARATOR=SEPARATOR, node=node, query=query, formatter=self) + + +def generate_digest(context) -> str: + """Generate a digest from a Context object. + + Parameters + ---------- + context : Context + The context object containing nodes, formatter, and query. + + Returns + ------- + str + The formatted digest string with header, content, and footer. + """ + if context.query.user_name and context.query.repo_name: + context_header = CONTEXT_HEADER.format(f"/{context.query.user_name}/{context.query.repo_name}") + else: + context_header = CONTEXT_HEADER.format("") + context_footer = CONTEXT_FOOTER + formatted = [] + for node in context.nodes: + formatted.append(context.formatter.format(node, context.query)) + return context_header + "\n".join(formatted) + context_footer diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index 6cb000a8..98858f95 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -129,31 +129,22 @@ def _(self: 'FileSystemSymlink'): class Context: - """Context for holding a list of Source objects and generating a digest on demand using a Formatter. + """Context for holding a list of Source objects that can be formatted using a Formatter. Attributes ---------- nodes : list[Source] - The list of source objects to generate a digest for. + The list of source objects to format. formatter : Formatter The formatter to use for formatting sources. query : IngestionQuery The query context. """ - nodes: list[Source] - formatter: Formatter - query: IngestionQuery - - def generate_digest(self) -> str: - if self.query.user_name and self.query.repo_name: - context_header = CONTEXT_HEADER.format(f"/{self.query.user_name}/{self.query.repo_name}") - else: - context_header = CONTEXT_HEADER.format("") - context_footer = CONTEXT_FOOTER - formatted = [] - for node in self.nodes: - formatted.append(self.formatter.format(node, self.query)) - return context_header + "\n".join(formatted) + context_footer + + def __init__(self, nodes: list[Source], formatter: Formatter, query: IngestionQuery): + self.nodes = nodes + self.formatter = formatter + self.query = query @property def summary(self): diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 483d236a..871e9b38 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -8,6 +8,7 @@ from gitingest.clone import clone_repo from gitingest.ingestion import ingest_query +from gitingest.output_formatter import generate_digest from gitingest.query_parser import parse_remote_repo from gitingest.utils.git_utils import resolve_commit, validate_github_token from gitingest.utils.logging_config import get_logger @@ -303,7 +304,7 @@ async def process_query( try: context = ingest_query(query) - digest = context.generate_digest() + digest = generate_digest(context) # Store digest based on S3 configuration if is_s3_enabled(): @@ -357,7 +358,7 @@ async def process_query( summary="", digest_url=digest_url, tree="", - content=content, + content=digest, default_max_file_size=max_file_size, pattern_type=pattern_type, pattern=pattern, From fcb7e01b4f95df8fbf927ebfee89a0a1d32e72dc Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Mon, 28 Jul 2025 03:42:24 +0200 Subject: [PATCH 08/17] wip --- src/gitingest/ingestion.py | 39 ++-- src/gitingest/output_formatter.py | 229 ++++++++++++----------- src/gitingest/schemas/__init__.py | 4 +- src/gitingest/schemas/filesystem.py | 70 +++---- src/server/query_processor.py | 9 +- tests/test_output_formatter.py | 278 ++++++++++++++++++++++++++++ 6 files changed, 446 insertions(+), 183 deletions(-) create mode 100644 tests/test_output_formatter.py diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 3a770a87..398463c0 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -6,9 +6,9 @@ from typing import TYPE_CHECKING from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES -from gitingest.output_formatter import DefaultFormatter, StupidFormatter +from gitingest.output_formatter import DefaultFormatter, DebugFormatter, SummaryFormatter from gitingest.schemas import FileSystemNode, FileSystemStats, Context -from gitingest.schemas.filesystem import FileSystemDirectory, FileSystemFile, FileSystemSymlink, FileSystemTextFile +from gitingest.schemas.filesystem import FileSystemDirectory, FileSystemFile, FileSystemSymlink from gitingest.utils.ingestion_utils import _should_exclude, _should_include from gitingest.utils.logging_config import get_logger @@ -70,14 +70,6 @@ def ingest_query(query: IngestionQuery) -> Context: relative_path = path.relative_to(query.local_path) - # file_node = FileSystemNode( - # name=path.name, - # type=FileSystemNodeType.FILE, - # size=path.stat().st_size, - # file_count=1, - # path_str=str(relative_path), - # path=path, - # ) file_node = FileSystemFile( name=path.name, path_str=str(relative_path), @@ -96,7 +88,7 @@ def ingest_query(query: IngestionQuery) -> Context: "file_size": file_node.size, }, ) - return Context([file_node], StupidFormatter(), query) + return Context([file_node], query) # root_node = FileSystemNode( # name=path.name, @@ -125,7 +117,7 @@ def ingest_query(query: IngestionQuery) -> Context: }, ) - return Context([root_node], StupidFormatter(), query) + return Context([root_node], query) def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystemStats) -> None: @@ -264,22 +256,13 @@ def _process_file(path: Path, parent_node: FileSystemDirectory, stats: FileSyste stats.total_files += 1 stats.total_size += file_size - # if file is a .txt file, create a FileSystemTextFile - if path.suffix == ".txt": - child = FileSystemTextFile( - name=path.name, - path_str=str(path.relative_to(local_path)), - path=path, - depth=parent_node.depth + 1, - ) - else: - child = FileSystemFile( - name=path.name, - path_str=str(path.relative_to(local_path)), - path=path, - depth=parent_node.depth + 1, - ) + child = FileSystemFile( + name=path.name, + path_str=str(path.relative_to(local_path)), + path=path, + depth=parent_node.depth + 1, + ) parent_node.children.append(child) parent_node.size += file_size @@ -290,7 +273,7 @@ def limit_exceeded(stats: FileSystemStats, depth: int) -> bool: """Check if any of the traversal limits have been exceeded. This function checks if the current traversal has exceeded any of the configured limits: - maximum directory depth, maximum number of files, or maximum total size in bytes. + maximum directory depth, ma ximum number of files, or maximum total size in bytes. Parameters ---------- diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index 3ed304ed..30e68354 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -11,36 +11,11 @@ from gitingest.schemas import FileSystemNode from gitingest.utils.compat_func import readlink from functools import singledispatchmethod -from gitingest.schemas import Source, FileSystemFile, FileSystemDirectory, FileSystemSymlink, FileSystemTextFile -from gitingest.schemas.filesystem import SEPARATOR, FileSystemNodeType, CONTEXT_HEADER, CONTEXT_FOOTER +from gitingest.schemas import Source, FileSystemFile, FileSystemDirectory, FileSystemSymlink +from gitingest.schemas.filesystem import SEPARATOR, Context, FileSystemNodeType from gitingest.utils.logging_config import get_logger from jinja2 import Environment, BaseLoader - -class OverridableDispatcher: - """Custom dispatcher that allows later registrations to override earlier ones, even for parent types.""" - - def __init__(self, default_func): - self.default_func = default_func - self.registry = [] # List of (type, func) in registration order - - def register(self, type_): - def decorator(func): - # Remove any existing registration for this exact type - self.registry = [(t, f) for t, f in self.registry if t != type_] - # Add new registration at the end (highest priority) - self.registry.append((type_, func)) - return func - return decorator - - def __call__(self, instance, *args, **kwargs): - # Check registrations in reverse order (most recent first) - for type_, func in reversed(self.registry): - if isinstance(instance, type_): - return func(instance, *args, **kwargs) - # Fall back to default - return self.default_func(instance, *args, **kwargs) - if TYPE_CHECKING: from gitingest.schemas import IngestionQuery @@ -202,28 +177,38 @@ def _format_token_count(text: str) -> str | None: return str(total_tokens) + +def generate_digest(context: Context) -> str: + """Generate a digest string from a Context object. + + This is a convenience function that uses the DefaultFormatter to format a Context. + + Parameters + ---------- + context : Context + The Context object containing sources and query information. + + Returns + ------- + str + The formatted digest string. + """ + formatter = DefaultFormatter() + return formatter.format(context, context.query) + + class DefaultFormatter: def __init__(self): self.separator = SEPARATOR self.env = Environment(loader=BaseLoader()) - # Set up custom dispatchers - def _default_format(node: Source, query): - return f"{getattr(node, 'content', '')}" - - def _default_summary(node: Source, query): - return f"{getattr(node, 'name', '')}" + @singledispatchmethod + def format(self, node: Source, query): + return f"{getattr(node, 'content', '')}" - self.format = OverridableDispatcher(_default_format) - self.summary = OverridableDispatcher(_default_summary) - - # Register the default implementations - self._register_defaults() - - def _register_defaults(self): - @self.format.register(FileSystemFile) - def _(node: FileSystemFile, query): - template = \ + @format.register + def _(self, node: FileSystemFile, query): + template = \ """ {{ SEPARATOR }} {{ node.name }} @@ -231,87 +216,125 @@ def _(node: FileSystemFile, query): {{ node.content }} """ - file_template = self.env.from_string(template) - return file_template.render(SEPARATOR=SEPARATOR, node=node, query=query, formatter=self) + file_template = self.env.from_string(template) + return file_template.render(SEPARATOR=SEPARATOR, node=node, query=query, formatter=self) - @self.format.register(FileSystemDirectory) - def _(node: FileSystemDirectory, query): - template = \ + @format.register + def _(self, node: FileSystemDirectory, query): + template = \ """ +{% if node.depth == 0 %} +{{ node.name }}: +{{ node.tree }} + +{% endif %} {% for child in node.children %} {{ formatter.format(child, query) }} {% endfor %} """ - dir_template = self.env.from_string(template) - return dir_template.render(node=node, query=query, formatter=self) + dir_template = self.env.from_string(template) + return dir_template.render(node=node, query=query, formatter=self) - @self.summary.register(FileSystemDirectory) - def _(node: FileSystemDirectory, query): - template = \ -""" -Directory structure: -{{ node.tree }} -""" - summary_template = self.env.from_string(template) - return summary_template.render(node=node, query=query, formatter=self) - - @self.format.register(FileSystemSymlink) - def _(node: FileSystemSymlink, query): - template = \ + @format.register + def _(self, node: FileSystemSymlink, query): + template = \ """ {{ SEPARATOR }} {{ node.name }}{% if node.target %} -> {{ node.target }}{% endif %} {{ SEPARATOR }} """ - symlink_template = self.env.from_string(template) - return symlink_template.render(SEPARATOR=SEPARATOR, node=node, query=query, formatter=self) + symlink_template = self.env.from_string(template) + return symlink_template.render(SEPARATOR=SEPARATOR, node=node, query=query, formatter=self) + + @format.register + def _(self, context: Context, query): + """Format a Context by formatting all its sources.""" + template = \ +""" +# Generated using https://gitingest.com/{{ context.query.user_name }}/{{ context.query.repo_name }} +Sources used: +{% for source in context.sources %} +- {{ source.name }}: {{ source.__class__.__name__ }} +{% endfor %} + +{% for source in context.sources %} +{{ formatter.format(source, context.query) }} +{% endfor %} +# End of generated content +""" + context_template = self.env.from_string(template) + return context_template.render(context=context, formatter=self) -class StupidFormatter(DefaultFormatter): + +class DebugFormatter: def __init__(self): - super().__init__() + self.separator = SEPARATOR + self.env = Environment(loader=BaseLoader()) - @self.summary.register(FileSystemTextFile) - def _(node: FileSystemTextFile, query): - template = \ + @singledispatchmethod + def format(self, node: Source, query): + """Format any Source type with debug information.""" + # Get the actual class name + class_name = node.__class__.__name__ + + # Get all field names (both from dataclass fields and regular attributes) + field_names = [] + + # Try to get dataclass fields first + try: + if hasattr(node, '__dataclass_fields__') and hasattr(node.__dataclass_fields__, 'keys'): + field_names.extend(node.__dataclass_fields__.keys()) + else: + raise AttributeError # Fall through to backup method + except (AttributeError, TypeError): + # Fall back to getting all non-private attributes + field_names = [attr for attr in dir(node) + if not attr.startswith('_') + and not callable(getattr(node, attr, None))] + + # Format the debug output + fields_str = ", ".join(field_names) + template = \ """ {{ SEPARATOR }} -{{ node.name }} +DEBUG: {{ class_name }} +Fields: {{ fields_str }} {{ SEPARATOR }} -FileSystemTextFile """ + debug_template = self.env.from_string(template) + return debug_template.render( + SEPARATOR=SEPARATOR, + class_name=class_name, + fields_str=fields_str + ) - @self.format.register(FileSystemFile) - def _(node: FileSystemFile, query): - template = \ + +class SummaryFormatter: + """Dedicated formatter for generating summaries of filesystem nodes.""" + + def __init__(self): + self.env = Environment(loader=BaseLoader()) + + @singledispatchmethod + def summary(self, node: Source, query): + return f"{getattr(node, 'name', '')}" + + @summary.register + def _(self, node: FileSystemDirectory, query): + template = \ """ -{{ SEPARATOR }} -{{ node.name }} -{{ SEPARATOR }} -FileSystemFile +Directory structure: +{{ node.tree }} """ - file_template = self.env.from_string(template) - return file_template.render(SEPARATOR=SEPARATOR, node=node, query=query, formatter=self) + summary_template = self.env.from_string(template) + return summary_template.render(node=node, query=query) -def generate_digest(context) -> str: - """Generate a digest from a Context object. - - Parameters - ---------- - context : Context - The context object containing nodes, formatter, and query. - - Returns - ------- - str - The formatted digest string with header, content, and footer. - """ - if context.query.user_name and context.query.repo_name: - context_header = CONTEXT_HEADER.format(f"/{context.query.user_name}/{context.query.repo_name}") - else: - context_header = CONTEXT_HEADER.format("") - context_footer = CONTEXT_FOOTER - formatted = [] - for node in context.nodes: - formatted.append(context.formatter.format(node, context.query)) - return context_header + "\n".join(formatted) + context_footer + @summary.register + def _(self, context: Context, query): + template = \ +""" +{{ context.summary }} +""" + summary_template = self.env.from_string(template) + return summary_template.render(context=context, query=query) \ No newline at end of file diff --git a/src/gitingest/schemas/__init__.py b/src/gitingest/schemas/__init__.py index 3478ac64..6cf0c3cc 100644 --- a/src/gitingest/schemas/__init__.py +++ b/src/gitingest/schemas/__init__.py @@ -1,7 +1,7 @@ """Module containing the schemas for the Gitingest package.""" from gitingest.schemas.cloning import CloneConfig -from gitingest.schemas.filesystem import FileSystemNode, FileSystemFile, FileSystemDirectory, FileSystemSymlink, FileSystemTextFile, FileSystemStats, Context, Source +from gitingest.schemas.filesystem import FileSystemNode, FileSystemFile, FileSystemDirectory, FileSystemSymlink, FileSystemStats, Context, Source from gitingest.schemas.ingestion import IngestionQuery -__all__ = ["CloneConfig", "FileSystemNode", "FileSystemFile", "FileSystemDirectory", "FileSystemSymlink", "FileSystemTextFile", "FileSystemStats", "IngestionQuery", "Context"] +__all__ = ["CloneConfig", "FileSystemNode", "FileSystemFile", "FileSystemDirectory", "FileSystemSymlink", "FileSystemStats", "IngestionQuery", "Context"] diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index 98858f95..a03afaa0 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -19,8 +19,6 @@ from gitingest.output_formatter import Formatter SEPARATOR = "=" * 48 # Tiktoken, the tokenizer openai uses, counts 2 tokens if we have more than 48 -CONTEXT_HEADER = "# Generated using https://gitingest.com{}\n" # Replace with /user/repo if we have it otherwise leave it blank -CONTEXT_FOOTER = "# End of gitingest context\n" class FileSystemNodeType(Enum): """Enum representing the type of a file system node (directory or file).""" @@ -40,15 +38,7 @@ class FileSystemStats: @dataclass class Source(ABC): """Abstract base class for all sources (files, directories, etc).""" - @property - def tree(self) -> str: - return self._tree() - @property - def summary(self) -> str: - return getattr(self, "_summary", "") - @summary.setter - def summary(self, value: str) -> None: - self._summary = value + pass @dataclass class FileSystemNode(Source): @@ -60,10 +50,6 @@ class FileSystemNode(Source): @property def tree(self): - return self._tree() - - @singledispatchmethod - def _tree(self): return self.name @dataclass @@ -77,13 +63,10 @@ def content(self): except Exception as e: return f"Error reading content of {self.name}: {e}" -@dataclass -class FileSystemTextFile(FileSystemFile): - pass + def render_tree(self, prefix="", is_last=True): + current_prefix = "└── " if is_last else "├── " + return [f"{prefix}{current_prefix}{self.name}"] -@FileSystemNode._tree.register -def _(self: 'FileSystemFile'): - return self.name @dataclass class FileSystemDirectory(FileSystemNode): @@ -103,49 +86,44 @@ def _sort_key(child: FileSystemNode) -> tuple[int, str]: return (3 if not name.startswith(".") else 4, name) self.children.sort(key=_sort_key) -@FileSystemNode._tree.register -def _(self: 'FileSystemDirectory'): - def render_tree(node, prefix="", is_last=True): + def render_tree(self, prefix="", is_last=True): lines = [] current_prefix = "└── " if is_last else "├── " - display_name = node.name + "/" + display_name = self.name + "/" lines.append(f"{prefix}{current_prefix}{display_name}") - if hasattr(node, 'children') and node.children: + if hasattr(self, 'children') and self.children: new_prefix = prefix + (" " if is_last else "│ ") - for i, child in enumerate(node.children): - is_last_child = i == len(node.children) - 1 - lines.extend(child._tree()(child, prefix=new_prefix, is_last=is_last_child) if hasattr(child, '_tree') else [child.name]) + for i, child in enumerate(self.children): + is_last_child = i == len(self.children) - 1 + lines.extend(child.render_tree(prefix=new_prefix, is_last=is_last_child)) return lines - return "\n".join(render_tree(self)) + + @property + def tree(self): + return "\n".join(self.render_tree()) @dataclass class FileSystemSymlink(FileSystemNode): target: str = "" # Add symlink-specific fields if needed -@FileSystemNode._tree.register -def _(self: 'FileSystemSymlink'): - return f"{self.name} -> {self.target}" if self.target else self.name + def render_tree(self, prefix="", is_last=True): + current_prefix = "└── " if is_last else "├── " + display_name = f"{self.name} -> {self.target}" if self.target else self.name + return [f"{prefix}{current_prefix}{display_name}"] -class Context: - """Context for holding a list of Source objects that can be formatted using a Formatter. +class Context(Source): + """The Context object is a general container for multiple unrelated sources. Attributes ---------- - nodes : list[Source] + sources : list[Source] The list of source objects to format. - formatter : Formatter - The formatter to use for formatting sources. query : IngestionQuery The query context. """ - - def __init__(self, nodes: list[Source], formatter: Formatter, query: IngestionQuery): - self.nodes = nodes - self.formatter = formatter - self.query = query - @property - def summary(self): - return "\n".join(self.formatter.summary(node, self.query) for node in self.nodes) \ No newline at end of file + def __init__(self, sources: list[Source], query: IngestionQuery): + self.sources = sources + self.query = query diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 871e9b38..6f60c175 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -8,7 +8,7 @@ from gitingest.clone import clone_repo from gitingest.ingestion import ingest_query -from gitingest.output_formatter import generate_digest +from gitingest.output_formatter import DefaultFormatter, SummaryFormatter from gitingest.query_parser import parse_remote_repo from gitingest.utils.git_utils import resolve_commit, validate_github_token from gitingest.utils.logging_config import get_logger @@ -304,7 +304,8 @@ async def process_query( try: context = ingest_query(query) - digest = generate_digest(context) + digest = DefaultFormatter().format(context, query) + summary = SummaryFormatter().summary(context, query) # Store digest based on S3 configuration if is_s3_enabled(): @@ -355,9 +356,9 @@ async def process_query( return IngestSuccessResponse( repo_url=input_text, short_repo_url=short_repo_url, - summary="", + summary=summary, digest_url=digest_url, - tree="", + tree=context.sources[0].tree, # TODO: this is a hack to get the tree of the first source content=digest, default_max_file_size=max_file_size, pattern_type=pattern_type, diff --git a/tests/test_output_formatter.py b/tests/test_output_formatter.py new file mode 100644 index 00000000..f916e2ee --- /dev/null +++ b/tests/test_output_formatter.py @@ -0,0 +1,278 @@ +"""Tests for the output_formatter module. + +These tests validate the formatting behavior of DefaultFormatter and StupidFormatter +for different FileSystemNode types (File, Directory, Symlink). +""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import Mock, patch + +import pytest + +from gitingest.output_formatter import DefaultFormatter, DebugFormatter, SummaryFormatter +from gitingest.schemas import FileSystemFile, FileSystemDirectory, FileSystemSymlink, IngestionQuery +from gitingest.schemas.filesystem import FileSystemNodeType + + +@pytest.fixture +def mock_query() -> IngestionQuery: + """Create a mock IngestionQuery for testing.""" + query = Mock(spec=IngestionQuery) + query.user_name = "test_user" + query.repo_name = "test_repo" + query.slug = "test_slug" + query.branch = "main" + query.commit = "abc123" + query.subpath = "/" + query.tag = None + return query + + +@pytest.fixture +def mock_file_node() -> FileSystemFile: + """Create a mock FileSystemFile for testing.""" + file_node = Mock(spec=FileSystemFile) + file_node.name = "test_file.py" + file_node.path = Path("/fake/path/test_file.py") + file_node.path_str = "/fake/path/test_file.py" + file_node.content = "print('hello world')\nprint('test content')" + file_node.size = 100 + file_node.depth = 1 + file_node.type = FileSystemNodeType.FILE + return file_node + + +@pytest.fixture +def mock_directory_node() -> FileSystemDirectory: + """Create a mock FileSystemDirectory for testing.""" + dir_node = Mock(spec=FileSystemDirectory) + dir_node.name = "src" + dir_node.path = Path("/fake/path/src") + dir_node.path_str = "/fake/path/src" + dir_node.children = [] + dir_node.file_count = 2 + dir_node.dir_count = 1 + dir_node.size = 500 + dir_node.depth = 0 + dir_node.type = FileSystemNodeType.DIRECTORY + dir_node.tree = "src/\n├── file1.py\n└── file2.py" + return dir_node + + +@pytest.fixture +def mock_symlink_node() -> FileSystemSymlink: + """Create a mock FileSystemSymlink for testing.""" + symlink_node = Mock(spec=FileSystemSymlink) + symlink_node.name = "link_to_file" + symlink_node.path = Path("/fake/path/link_to_file") + symlink_node.path_str = "/fake/path/link_to_file" + symlink_node.target = "target_file.py" + symlink_node.size = 0 + symlink_node.depth = 1 + symlink_node.type = FileSystemNodeType.SYMLINK + return symlink_node + + +class TestDefaultFormatter: + """Test cases for DefaultFormatter class.""" + + def test_init(self): + """Test DefaultFormatter initialization.""" + formatter = DefaultFormatter() + assert formatter.env is not None + assert formatter.format is not None + + def test_format_file_node(self, mock_file_node, mock_query): + """Test formatting a FileSystemFile node.""" + formatter = DefaultFormatter() + result = formatter.format(mock_file_node, mock_query) + + # Should contain separator, filename, and content + assert "================================================" in result + assert "test_file.py" in result + assert "print('hello world')" in result + assert "print('test content')" in result + + def test_format_directory_node(self, mock_directory_node, mock_query): + """Test formatting a FileSystemDirectory node.""" + # Create mock child nodes + child1 = Mock() + child2 = Mock() + mock_directory_node.children = [child1, child2] + + formatter = DefaultFormatter() + + # Mock the format method calls for children + with patch.object(formatter, 'format', side_effect=lambda node, query: f"formatted_{node.name}" if hasattr(node, 'name') else "formatted_child") as mock_format: + # Need to call the actual method for the directory node itself + mock_format.side_effect = None + result = formatter.format(mock_directory_node, mock_query) + + # Reset side effect and call again to test child formatting + mock_format.side_effect = lambda node, query: f"formatted_{getattr(node, 'name', 'child')}" + result = formatter.format(mock_directory_node, mock_query) + + def test_format_symlink_node(self, mock_symlink_node, mock_query): + """Test formatting a FileSystemSymlink node.""" + formatter = DefaultFormatter() + result = formatter.format(mock_symlink_node, mock_query) + + # Should contain separator, filename, and target + assert "================================================" in result + assert "link_to_file" in result + assert "target_file.py" in result + + def test_format_symlink_node_no_target(self, mock_symlink_node, mock_query): + """Test formatting a FileSystemSymlink node without target.""" + mock_symlink_node.target = "" + formatter = DefaultFormatter() + result = formatter.format(mock_symlink_node, mock_query) + + # Should contain separator and filename but no arrow + assert "================================================" in result + assert "link_to_file" in result + assert " -> " not in result + +class TestSummaryFormatter: + """Test cases for SummaryFormatter class.""" + + def test_init(self): + """Test SummaryFormatter initialization.""" + formatter = SummaryFormatter() + assert formatter.env is not None + assert formatter.summary is not None + + def test_summary_directory_node(self, mock_directory_node, mock_query): + """Test summary generation for a FileSystemDirectory node.""" + formatter = SummaryFormatter() + result = formatter.summary(mock_directory_node, mock_query) + + assert "Directory structure:" in result + assert "src/" in result + assert "file1.py" in result + assert "file2.py" in result + + def test_summary_file_node_default(self, mock_file_node, mock_query): + """Test default summary for FileSystemFile node.""" + formatter = SummaryFormatter() + result = formatter.summary(mock_file_node, mock_query) + + # Should use default handler and return the name + assert "test_file.py" in result + + +class TestDebugFormatter: + """Test cases for DebugFormatter class.""" + + def test_init(self): + """Test DebugFormatter initialization.""" + formatter = DebugFormatter() + assert formatter.env is not None + assert formatter.format is not None + + def test_format_file_node_debug_info(self, mock_file_node, mock_query): + """Test that DebugFormatter shows debug info for FileSystemFile.""" + formatter = DebugFormatter() + result = formatter.format(mock_file_node, mock_query) + + # Should contain debug information + assert "================================================" in result + assert "DEBUG: FileSystemFile" in result + assert "Fields:" in result + # Should contain field names + assert "name" in result + assert "path" in result + assert "size" in result + + def test_format_directory_node_debug_info(self, mock_directory_node, mock_query): + """Test that DebugFormatter shows debug info for FileSystemDirectory.""" + formatter = DebugFormatter() + result = formatter.format(mock_directory_node, mock_query) + + # Should contain debug information + assert "DEBUG: FileSystemDirectory" in result + assert "Fields:" in result + assert "name" in result + assert "children" in result + + def test_format_symlink_node_debug_info(self, mock_symlink_node, mock_query): + """Test that DebugFormatter shows debug info for FileSystemSymlink.""" + formatter = DebugFormatter() + result = formatter.format(mock_symlink_node, mock_query) + + # Should contain debug information + assert "DEBUG: FileSystemSymlink" in result + assert "Fields:" in result + assert "name" in result + assert "target" in result + + def test_format_all_node_types_show_debug(self, mock_file_node, mock_directory_node, mock_symlink_node, mock_query): + """Test that DebugFormatter shows debug info for all node types.""" + formatter = DebugFormatter() + + file_result = formatter.format(mock_file_node, mock_query) + dir_result = formatter.format(mock_directory_node, mock_query) + symlink_result = formatter.format(mock_symlink_node, mock_query) + + # All should contain debug headers + assert "DEBUG: FileSystemFile" in file_result + assert "DEBUG: FileSystemDirectory" in dir_result + assert "DEBUG: FileSystemSymlink" in symlink_result + + # All should contain field information + assert "Fields:" in file_result + assert "Fields:" in dir_result + assert "Fields:" in symlink_result + + def test_debug_formatter_vs_default_formatter(self, mock_file_node, mock_query): + """Test that DebugFormatter produces different output than DefaultFormatter.""" + default_formatter = DefaultFormatter() + debug_formatter = DebugFormatter() + + default_result = default_formatter.format(mock_file_node, mock_query) + debug_result = debug_formatter.format(mock_file_node, mock_query) + + # Results should be different + assert default_result != debug_result + + # Debug should contain debug info, default should not + assert "DEBUG:" in debug_result + assert "DEBUG:" not in default_result + + # Debug should show fields, default shows content + assert "Fields:" in debug_result + assert "Fields:" not in default_result + + +class TestFormatterEdgeCases: + """Test edge cases and error conditions.""" + + def test_format_unknown_node_type(self, mock_query): + """Test formatting with an unknown node type.""" + unknown_node = Mock() + unknown_node.name = "unknown" + + formatter = DefaultFormatter() + # Should fall back to default behavior + result = formatter.format(unknown_node, mock_query) + assert result is not None + + def test_format_node_without_name(self, mock_query): + """Test formatting a node without a name attribute.""" + nameless_node = Mock(spec=FileSystemFile) + # Remove name attribute + del nameless_node.name + + formatter = DebugFormatter() + # Should handle gracefully (jinja template will show empty) + result = formatter.format(nameless_node, mock_query) + assert result is not None + + def test_format_with_none_query(self, mock_file_node): + """Test formatting with None query.""" + formatter = DefaultFormatter() + # Should handle None query gracefully + result = formatter.format(mock_file_node, None) + assert result is not None \ No newline at end of file From 8e7070d4185c18efa8d847ccca1c99272832852d Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Mon, 28 Jul 2025 03:51:23 +0200 Subject: [PATCH 09/17] feat: add GitRepository Type --- src/gitingest/ingestion.py | 56 +++++++++++++++++---------- src/gitingest/output_formatter.py | 59 ++++++++++++++++------------- src/gitingest/schemas/__init__.py | 23 ++++++++++- src/gitingest/schemas/filesystem.py | 47 ++++++++++++++++------- 4 files changed, 123 insertions(+), 62 deletions(-) diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 398463c0..7910316a 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -6,9 +6,8 @@ from typing import TYPE_CHECKING from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES -from gitingest.output_formatter import DefaultFormatter, DebugFormatter, SummaryFormatter -from gitingest.schemas import FileSystemNode, FileSystemStats, Context -from gitingest.schemas.filesystem import FileSystemDirectory, FileSystemFile, FileSystemSymlink +from gitingest.schemas import Context, FileSystemNode, FileSystemStats +from gitingest.schemas.filesystem import FileSystemDirectory, FileSystemFile, FileSystemSymlink, GitRepository from gitingest.utils.ingestion_utils import _should_exclude, _should_include from gitingest.utils.logging_config import get_logger @@ -19,6 +18,11 @@ logger = get_logger(__name__) +def _is_git_repository(path: Path) -> bool: + """Check if a directory contains a .git folder.""" + return (path / ".git").exists() + + def ingest_query(query: IngestionQuery) -> Context: """Run the ingestion process for a parsed query. @@ -90,17 +94,19 @@ def ingest_query(query: IngestionQuery) -> Context: ) return Context([file_node], query) - # root_node = FileSystemNode( - # name=path.name, - # type=FileSystemNodeType.DIRECTORY, - # path_str=str(path.relative_to(query.local_path)), - # path=path, - # ) - root_node = FileSystemDirectory( - name=path.name, - path_str=str(path.relative_to(query.local_path)), - path=path, - ) + # Check if this is a git repository and create appropriate node type + if _is_git_repository(path): + root_node = GitRepository( + name=path.name, + path_str=str(path.relative_to(query.local_path)), + path=path, + ) + else: + root_node = FileSystemDirectory( + name=path.name, + path_str=str(path.relative_to(query.local_path)), + path=path, + ) stats = FileSystemStats() @@ -161,12 +167,21 @@ def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystem continue _process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path) elif sub_path.is_dir(): - child_directory_node = FileSystemDirectory( - name=sub_path.name, - path_str=str(sub_path.relative_to(query.local_path)), - path=sub_path, - depth=node.depth + 1, - ) + # Check if this subdirectory is a git repository + if _is_git_repository(sub_path): + child_directory_node = GitRepository( + name=sub_path.name, + path_str=str(sub_path.relative_to(query.local_path)), + path=sub_path, + depth=node.depth + 1, + ) + else: + child_directory_node = FileSystemDirectory( + name=sub_path.name, + path_str=str(sub_path.relative_to(query.local_path)), + path=sub_path, + depth=node.depth + 1, + ) _process_node(node=child_directory_node, query=query, stats=stats) @@ -256,7 +271,6 @@ def _process_file(path: Path, parent_node: FileSystemDirectory, stats: FileSyste stats.total_files += 1 stats.total_size += file_size - child = FileSystemFile( name=path.name, path_str=str(path.relative_to(local_path)), diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index 30e68354..b98b250f 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -3,18 +3,17 @@ from __future__ import annotations import ssl +from functools import singledispatchmethod from typing import TYPE_CHECKING import requests.exceptions import tiktoken +from jinja2 import BaseLoader, Environment -from gitingest.schemas import FileSystemNode +from gitingest.schemas import FileSystemDirectory, FileSystemFile, FileSystemNode, FileSystemSymlink, Source +from gitingest.schemas.filesystem import SEPARATOR, Context, FileSystemNodeType, GitRepository from gitingest.utils.compat_func import readlink -from functools import singledispatchmethod -from gitingest.schemas import Source, FileSystemFile, FileSystemDirectory, FileSystemSymlink -from gitingest.schemas.filesystem import SEPARATOR, Context, FileSystemNodeType from gitingest.utils.logging_config import get_logger -from jinja2 import Environment, BaseLoader if TYPE_CHECKING: from gitingest.schemas import IngestionQuery @@ -30,6 +29,7 @@ # Backward compatibility + def _create_summary_prefix(query: IngestionQuery, *, single_file: bool = False) -> str: """Create a prefix string for summarizing a repository or local directory. @@ -208,8 +208,7 @@ def format(self, node: Source, query): @format.register def _(self, node: FileSystemFile, query): - template = \ -""" + template = """ {{ SEPARATOR }} {{ node.name }} {{ SEPARATOR }} @@ -221,8 +220,7 @@ def _(self, node: FileSystemFile, query): @format.register def _(self, node: FileSystemDirectory, query): - template = \ -""" + template = """ {% if node.depth == 0 %} {{ node.name }}: {{ node.tree }} @@ -236,9 +234,23 @@ def _(self, node: FileSystemDirectory, query): return dir_template.render(node=node, query=query, formatter=self) @format.register - def _(self, node: FileSystemSymlink, query): - template = \ + def _(self, node: GitRepository, query): + template = """ +{% if node.depth == 0 %} +🔗 Git Repository: {{ node.name }} +{{ node.tree }} + +{% endif %} +{% for child in node.children %} +{{ formatter.format(child, query) }} +{% endfor %} """ + git_template = self.env.from_string(template) + return git_template.render(node=node, query=query, formatter=self) + + @format.register + def _(self, node: FileSystemSymlink, query): + template = """ {{ SEPARATOR }} {{ node.name }}{% if node.target %} -> {{ node.target }}{% endif %} {{ SEPARATOR }} @@ -249,8 +261,7 @@ def _(self, node: FileSystemSymlink, query): @format.register def _(self, context: Context, query): """Format a Context by formatting all its sources.""" - template = \ -""" + template = """ # Generated using https://gitingest.com/{{ context.query.user_name }}/{{ context.query.repo_name }} Sources used: {% for source in context.sources %} @@ -282,20 +293,19 @@ def format(self, node: Source, query): # Try to get dataclass fields first try: - if hasattr(node, '__dataclass_fields__') and hasattr(node.__dataclass_fields__, 'keys'): + if hasattr(node, "__dataclass_fields__") and hasattr(node.__dataclass_fields__, "keys"): field_names.extend(node.__dataclass_fields__.keys()) else: raise AttributeError # Fall through to backup method except (AttributeError, TypeError): # Fall back to getting all non-private attributes - field_names = [attr for attr in dir(node) - if not attr.startswith('_') - and not callable(getattr(node, attr, None))] + field_names = [ + attr for attr in dir(node) if not attr.startswith("_") and not callable(getattr(node, attr, None)) + ] # Format the debug output fields_str = ", ".join(field_names) - template = \ -""" + template = """ {{ SEPARATOR }} DEBUG: {{ class_name }} Fields: {{ fields_str }} @@ -305,7 +315,7 @@ def format(self, node: Source, query): return debug_template.render( SEPARATOR=SEPARATOR, class_name=class_name, - fields_str=fields_str + fields_str=fields_str, ) @@ -321,20 +331,17 @@ def summary(self, node: Source, query): @summary.register def _(self, node: FileSystemDirectory, query): - template = \ -""" + template = """ Directory structure: {{ node.tree }} """ summary_template = self.env.from_string(template) return summary_template.render(node=node, query=query) - @summary.register def _(self, context: Context, query): - template = \ -""" + template = """ {{ context.summary }} """ summary_template = self.env.from_string(template) - return summary_template.render(context=context, query=query) \ No newline at end of file + return summary_template.render(context=context, query=query) diff --git a/src/gitingest/schemas/__init__.py b/src/gitingest/schemas/__init__.py index 6cf0c3cc..ac8ca2b3 100644 --- a/src/gitingest/schemas/__init__.py +++ b/src/gitingest/schemas/__init__.py @@ -1,7 +1,26 @@ """Module containing the schemas for the Gitingest package.""" from gitingest.schemas.cloning import CloneConfig -from gitingest.schemas.filesystem import FileSystemNode, FileSystemFile, FileSystemDirectory, FileSystemSymlink, FileSystemStats, Context, Source +from gitingest.schemas.filesystem import ( + Context, + FileSystemDirectory, + FileSystemFile, + FileSystemNode, + FileSystemStats, + FileSystemSymlink, + GitRepository, + Source, +) from gitingest.schemas.ingestion import IngestionQuery -__all__ = ["CloneConfig", "FileSystemNode", "FileSystemFile", "FileSystemDirectory", "FileSystemSymlink", "FileSystemStats", "IngestionQuery", "Context"] +__all__ = [ + "CloneConfig", + "Context", + "FileSystemDirectory", + "FileSystemFile", + "FileSystemNode", + "FileSystemStats", + "FileSystemSymlink", + "GitRepository", + "IngestionQuery", +] diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index a03afaa0..17b72997 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -2,24 +2,19 @@ from __future__ import annotations -import os +from abc import ABC from dataclasses import dataclass, field from enum import Enum, auto from typing import TYPE_CHECKING -from abc import ABC -from functools import singledispatchmethod - -from gitingest.utils.compat_func import readlink -from gitingest.utils.file_utils import _decodes, _get_preferred_encodings, _read_chunk -from gitingest.utils.notebook import process_notebook if TYPE_CHECKING: from pathlib import Path + from gitingest.schemas import IngestionQuery - from gitingest.output_formatter import Formatter SEPARATOR = "=" * 48 # Tiktoken, the tokenizer openai uses, counts 2 tokens if we have more than 48 + class FileSystemNodeType(Enum): """Enum representing the type of a file system node (directory or file).""" @@ -35,10 +30,11 @@ class FileSystemStats: total_files: int = 0 total_size: int = 0 + @dataclass class Source(ABC): """Abstract base class for all sources (files, directories, etc).""" - pass + @dataclass class FileSystemNode(Source): @@ -52,13 +48,14 @@ class FileSystemNode(Source): def tree(self): return self.name + @dataclass class FileSystemFile(FileSystemNode): @property def content(self): # read the file try: - with open(self.path, "r") as f: + with open(self.path) as f: return f.read() except Exception as e: return f"Error reading content of {self.name}: {e}" @@ -70,20 +67,22 @@ def render_tree(self, prefix="", is_last=True): @dataclass class FileSystemDirectory(FileSystemNode): - children: list['FileSystemNode'] = field(default_factory=list) + children: list[FileSystemNode] = field(default_factory=list) file_count: int = 0 dir_count: int = 0 type: FileSystemNodeType = FileSystemNodeType.DIRECTORY def sort_children(self) -> None: """Sort the children nodes of a directory according to a specific order.""" + def _sort_key(child: FileSystemNode) -> tuple[int, str]: name = child.name.lower() - if hasattr(child, 'type') and getattr(child, 'type', None) == FileSystemNodeType.FILE: + if hasattr(child, "type") and getattr(child, "type", None) == FileSystemNodeType.FILE: if name == "readme" or name.startswith("readme."): return (0, name) return (1 if not name.startswith(".") else 2, name) return (3 if not name.startswith(".") else 4, name) + self.children.sort(key=_sort_key) def render_tree(self, prefix="", is_last=True): @@ -91,7 +90,7 @@ def render_tree(self, prefix="", is_last=True): current_prefix = "└── " if is_last else "├── " display_name = self.name + "/" lines.append(f"{prefix}{current_prefix}{display_name}") - if hasattr(self, 'children') and self.children: + if hasattr(self, "children") and self.children: new_prefix = prefix + (" " if is_last else "│ ") for i, child in enumerate(self.children): is_last_child = i == len(self.children) - 1 @@ -102,6 +101,27 @@ def render_tree(self, prefix="", is_last=True): def tree(self): return "\n".join(self.render_tree()) + +@dataclass +class GitRepository(FileSystemDirectory): + """A directory that contains a .git folder, representing a Git repository.""" + + git_info: dict = field(default_factory=dict) # Store git metadata like branch, commit, etc. + + def render_tree(self, prefix="", is_last=True): + lines = [] + current_prefix = "└── " if is_last else "├── " + # Mark as git repo in the tree + display_name = f"{self.name}/ (git repository)" + lines.append(f"{prefix}{current_prefix}{display_name}") + if hasattr(self, "children") and self.children: + new_prefix = prefix + (" " if is_last else "│ ") + for i, child in enumerate(self.children): + is_last_child = i == len(self.children) - 1 + lines.extend(child.render_tree(prefix=new_prefix, is_last=is_last_child)) + return lines + + @dataclass class FileSystemSymlink(FileSystemNode): target: str = "" @@ -122,6 +142,7 @@ class Context(Source): The list of source objects to format. query : IngestionQuery The query context. + """ def __init__(self, sources: list[Source], query: IngestionQuery): From 8a46848f762fcdb15f67959a05f0ed97901948c9 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Mon, 28 Jul 2025 04:06:24 +0200 Subject: [PATCH 10/17] fix: clean up Jinja templates for better readability --- src/gitingest/output_formatter.py | 37 +++++++++++++------------------ 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index b98b250f..93f0f95f 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -212,7 +212,6 @@ def _(self, node: FileSystemFile, query): {{ SEPARATOR }} {{ node.name }} {{ SEPARATOR }} - {{ node.content }} """ file_template = self.env.from_string(template) @@ -220,31 +219,25 @@ def _(self, node: FileSystemFile, query): @format.register def _(self, node: FileSystemDirectory, query): - template = """ -{% if node.depth == 0 %} -{{ node.name }}: + template = """{%- if node.depth == 0 %}{{ node.name }}: {{ node.tree }} -{% endif %} -{% for child in node.children %} +{% endif -%} +{%- for child in node.children -%} {{ formatter.format(child, query) }} -{% endfor %} -""" +{%- endfor -%}""" dir_template = self.env.from_string(template) return dir_template.render(node=node, query=query, formatter=self) @format.register def _(self, node: GitRepository, query): - template = """ -{% if node.depth == 0 %} -🔗 Git Repository: {{ node.name }} + template = """{%- if node.depth == 0 %}🔗 Git Repository: {{ node.name }} {{ node.tree }} -{% endif %} -{% for child in node.children %} +{% endif -%} +{%- for child in node.children -%} {{ formatter.format(child, query) }} -{% endfor %} -""" +{%- endfor -%}""" git_template = self.env.from_string(template) return git_template.render(node=node, query=query, formatter=self) @@ -261,18 +254,18 @@ def _(self, node: FileSystemSymlink, query): @format.register def _(self, context: Context, query): """Format a Context by formatting all its sources.""" - template = """ -# Generated using https://gitingest.com/{{ context.query.user_name }}/{{ context.query.repo_name }} + template = \ +"""# Generated using https://gitingest.com/{{ context.query.user_name }}/{{ context.query.repo_name }} + Sources used: -{% for source in context.sources %} +{%- for source in context.sources %} - {{ source.name }}: {{ source.__class__.__name__ }} {% endfor %} -{% for source in context.sources %} +{%- for source in context.sources %} {{ formatter.format(source, context.query) }} -{% endfor %} -# End of generated content -""" +{%- endfor %} +# End of generated content""" context_template = self.env.from_string(template) return context_template.render(context=context, formatter=self) From 3abac0b2c260d6740259b30d9f82d36a191b7183 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Mon, 28 Jul 2025 16:42:05 +0200 Subject: [PATCH 11/17] feat: add unit tests for output formatting and enhance file system schema --- src/gitingest/output_formatter.py | 6 +++-- src/gitingest/schemas/filesystem.py | 1 + src/server/query_processor.py | 2 +- test_formatting.py | 36 +++++++++++++++++++++++++++++ 4 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 test_formatting.py diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index 93f0f95f..0063ed7e 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -324,7 +324,7 @@ def summary(self, node: Source, query): @summary.register def _(self, node: FileSystemDirectory, query): - template = """ + template = """ \ Directory structure: {{ node.tree }} """ @@ -334,7 +334,9 @@ def _(self, node: FileSystemDirectory, query): @summary.register def _(self, context: Context, query): template = """ -{{ context.summary }} +Repository: {{ context.query.user_name }}/{{ context.query.repo_name }} +Commit: {{ context.query.commit }} +Files analyzed: {{ context.sources[0].file_count }} """ summary_template = self.env.from_string(template) return summary_template.render(context=context, query=query) diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index 17b72997..3f1672b9 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -70,6 +70,7 @@ class FileSystemDirectory(FileSystemNode): children: list[FileSystemNode] = field(default_factory=list) file_count: int = 0 dir_count: int = 0 + file_count_total: int = 0 type: FileSystemNodeType = FileSystemNodeType.DIRECTORY def sort_children(self) -> None: diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 6f60c175..f0997057 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -8,7 +8,7 @@ from gitingest.clone import clone_repo from gitingest.ingestion import ingest_query -from gitingest.output_formatter import DefaultFormatter, SummaryFormatter +from gitingest.output_formatter import DebugFormatter, DefaultFormatter, SummaryFormatter from gitingest.query_parser import parse_remote_repo from gitingest.utils.git_utils import resolve_commit, validate_github_token from gitingest.utils.logging_config import get_logger diff --git a/test_formatting.py b/test_formatting.py new file mode 100644 index 00000000..abb97543 --- /dev/null +++ b/test_formatting.py @@ -0,0 +1,36 @@ +from pathlib import Path +from unittest.mock import Mock +from gitingest.output_formatter import DefaultFormatter +from gitingest.schemas.filesystem import Context, GitRepository, FileSystemFile + +# Create a mock query +mock_query = Mock() +mock_query.user_name = "test_user" +mock_query.repo_name = "test_repo" + +# Create a simple file +mock_file = FileSystemFile( + name="test.py", + path_str="test.py", + path=Path("test.py"), +) +mock_file.content = "print('hello world')" + +# Create a git repository with the file +mock_repo = GitRepository( + name="test_repo", + path_str="", + path=Path("."), + children=[mock_file] +) + +# Create context +context = Context([mock_repo], mock_query) + +# Test formatting +formatter = DefaultFormatter() +result = formatter.format(context, mock_query) +print("RESULT:") +print(repr(result)) +print("\nFORMATTED:") +print(result) \ No newline at end of file From 1f3f3af4684f8f6ebd0a804023062f6a6407690b Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Thu, 31 Jul 2025 02:31:55 +0200 Subject: [PATCH 12/17] feat: extract formatter templates to files and rename Context to ContextV1 --- src/gitingest/entrypoint.py | 6 +- src/gitingest/format/DebugFormatter/Source.j2 | 4 + .../format/DefaultFormatter/ContextV1.j2 | 11 + .../DefaultFormatter/FileSystemDirectory.j2 | 7 + .../format/DefaultFormatter/FileSystemFile.j2 | 4 + .../DefaultFormatter/FileSystemSymlink.j2 | 3 + .../format/DefaultFormatter/GitRepository.j2 | 7 + .../format/SummaryFormatter/ContextV1.j2 | 3 + .../SummaryFormatter/FileSystemDirectory.j2 | 2 + src/gitingest/ingestion.py | 14 +- src/gitingest/output_formatter.py | 235 ++++++++---------- src/gitingest/schemas/__init__.py | 4 +- src/gitingest/schemas/filesystem.py | 4 +- src/server/query_processor.py | 2 +- test_formatting.py | 4 +- 15 files changed, 167 insertions(+), 143 deletions(-) create mode 100644 src/gitingest/format/DebugFormatter/Source.j2 create mode 100644 src/gitingest/format/DefaultFormatter/ContextV1.j2 create mode 100644 src/gitingest/format/DefaultFormatter/FileSystemDirectory.j2 create mode 100644 src/gitingest/format/DefaultFormatter/FileSystemFile.j2 create mode 100644 src/gitingest/format/DefaultFormatter/FileSystemSymlink.j2 create mode 100644 src/gitingest/format/DefaultFormatter/GitRepository.j2 create mode 100644 src/gitingest/format/SummaryFormatter/ContextV1.j2 create mode 100644 src/gitingest/format/SummaryFormatter/FileSystemDirectory.j2 diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index b289f1e8..e024da8b 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -28,7 +28,7 @@ from types import TracebackType from gitingest.schemas import IngestionQuery - from gitingest.schemas import Context + from gitingest.schemas import ContextV1 # Initialize logger for this module logger = get_logger(__name__) @@ -52,7 +52,7 @@ async def ingest_async( This function analyzes a source (URL or local path), clones the corresponding repository (if applicable), and processes its files according to the specified query parameters. It returns a single digest string. - The output is generated lazily using a Context object and the generate_digest() function. + The output is generated lazily using a ContextV1 object and the generate_digest() function. Parameters ---------- @@ -167,7 +167,7 @@ def ingest( This function analyzes a source (URL or local path), clones the corresponding repository (if applicable), and processes its files according to the specified query parameters. It returns a single digest string. - The output is generated lazily using a Context object and the generate_digest() function. + The output is generated lazily using a ContextV1 object and the generate_digest() function. Parameters ---------- diff --git a/src/gitingest/format/DebugFormatter/Source.j2 b/src/gitingest/format/DebugFormatter/Source.j2 new file mode 100644 index 00000000..277ea18d --- /dev/null +++ b/src/gitingest/format/DebugFormatter/Source.j2 @@ -0,0 +1,4 @@ +{{ SEPARATOR }} +DEBUG: {{ class_name }} +Fields: {{ fields_str }} +{{ SEPARATOR }} \ No newline at end of file diff --git a/src/gitingest/format/DefaultFormatter/ContextV1.j2 b/src/gitingest/format/DefaultFormatter/ContextV1.j2 new file mode 100644 index 00000000..d53c3f38 --- /dev/null +++ b/src/gitingest/format/DefaultFormatter/ContextV1.j2 @@ -0,0 +1,11 @@ +# Generated using https://gitingest.com/{{ context.query.user_name }}/{{ context.query.repo_name }} + +Sources used: +{%- for source in context.sources %} +- {{ source.name }}: {{ source.__class__.__name__ }} +{% endfor %} + +{%- for source in context.sources %} +{{ formatter.format(source, context.query) }} +{%- endfor %} +# End of generated content \ No newline at end of file diff --git a/src/gitingest/format/DefaultFormatter/FileSystemDirectory.j2 b/src/gitingest/format/DefaultFormatter/FileSystemDirectory.j2 new file mode 100644 index 00000000..9a1d30b2 --- /dev/null +++ b/src/gitingest/format/DefaultFormatter/FileSystemDirectory.j2 @@ -0,0 +1,7 @@ +{%- if node.depth == 0 %}{{ node.name }}: +{{ node.tree }} + +{% endif -%} +{%- for child in node.children -%} +{{ formatter.format(child, query) }} +{%- endfor -%} \ No newline at end of file diff --git a/src/gitingest/format/DefaultFormatter/FileSystemFile.j2 b/src/gitingest/format/DefaultFormatter/FileSystemFile.j2 new file mode 100644 index 00000000..813d242c --- /dev/null +++ b/src/gitingest/format/DefaultFormatter/FileSystemFile.j2 @@ -0,0 +1,4 @@ +{{ SEPARATOR }} +{{ node.name }} +{{ SEPARATOR }} +{{ node.content }} \ No newline at end of file diff --git a/src/gitingest/format/DefaultFormatter/FileSystemSymlink.j2 b/src/gitingest/format/DefaultFormatter/FileSystemSymlink.j2 new file mode 100644 index 00000000..1c89b385 --- /dev/null +++ b/src/gitingest/format/DefaultFormatter/FileSystemSymlink.j2 @@ -0,0 +1,3 @@ +{{ SEPARATOR }} +{{ node.name }}{% if node.target %} -> {{ node.target }}{% endif %} +{{ SEPARATOR }} \ No newline at end of file diff --git a/src/gitingest/format/DefaultFormatter/GitRepository.j2 b/src/gitingest/format/DefaultFormatter/GitRepository.j2 new file mode 100644 index 00000000..3228dbcc --- /dev/null +++ b/src/gitingest/format/DefaultFormatter/GitRepository.j2 @@ -0,0 +1,7 @@ +{%- if node.depth == 0 %}🔗 Git Repository: {{ node.name }} +{{ node.tree }} + +{% endif -%} +{%- for child in node.children -%} +{{ formatter.format(child, query) }} +{%- endfor -%} \ No newline at end of file diff --git a/src/gitingest/format/SummaryFormatter/ContextV1.j2 b/src/gitingest/format/SummaryFormatter/ContextV1.j2 new file mode 100644 index 00000000..86bd1e73 --- /dev/null +++ b/src/gitingest/format/SummaryFormatter/ContextV1.j2 @@ -0,0 +1,3 @@ +Repository: {{ context.query.user_name }}/{{ context.query.repo_name }} +Commit: {{ context.query.commit }} +Files analyzed: {{ context.sources[0].file_count }} \ No newline at end of file diff --git a/src/gitingest/format/SummaryFormatter/FileSystemDirectory.j2 b/src/gitingest/format/SummaryFormatter/FileSystemDirectory.j2 new file mode 100644 index 00000000..d32dfe80 --- /dev/null +++ b/src/gitingest/format/SummaryFormatter/FileSystemDirectory.j2 @@ -0,0 +1,2 @@ +Directory structure: +{{ node.tree }} \ No newline at end of file diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 7910316a..c1a1351c 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES -from gitingest.schemas import Context, FileSystemNode, FileSystemStats +from gitingest.schemas import ContextV1, FileSystemNode, FileSystemStats from gitingest.schemas.filesystem import FileSystemDirectory, FileSystemFile, FileSystemSymlink, GitRepository from gitingest.utils.ingestion_utils import _should_exclude, _should_include from gitingest.utils.logging_config import get_logger @@ -23,11 +23,11 @@ def _is_git_repository(path: Path) -> bool: return (path / ".git").exists() -def ingest_query(query: IngestionQuery) -> Context: +def ingest_query(query: IngestionQuery) -> ContextV1: """Run the ingestion process for a parsed query. This is the main entry point for analyzing a codebase directory or single file. It processes the query - parameters, reads the file or directory content, and returns a Context object that can generate the final output digest on demand. + parameters, reads the file or directory content, and returns a ContextV1 object that can generate the final output digest on demand. Parameters ---------- @@ -36,8 +36,8 @@ def ingest_query(query: IngestionQuery) -> Context: Returns ------- - Context - A Context object representing the ingested file system nodes. Use generate_digest(context) to get the summary, directory structure, and file contents. + ContextV1 + A ContextV1 object representing the ingested file system nodes. Use generate_digest(context) to get the summary, directory structure, and file contents. Raises ------ @@ -92,7 +92,7 @@ def ingest_query(query: IngestionQuery) -> Context: "file_size": file_node.size, }, ) - return Context([file_node], query) + return ContextV1([file_node], query) # Check if this is a git repository and create appropriate node type if _is_git_repository(path): @@ -123,7 +123,7 @@ def ingest_query(query: IngestionQuery) -> Context: }, ) - return Context([root_node], query) + return ContextV1([root_node], query) def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystemStats) -> None: diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index 0063ed7e..34390e42 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -4,14 +4,15 @@ import ssl from functools import singledispatchmethod +from pathlib import Path from typing import TYPE_CHECKING import requests.exceptions import tiktoken -from jinja2 import BaseLoader, Environment +from jinja2 import Environment, FileSystemLoader, TemplateNotFound from gitingest.schemas import FileSystemDirectory, FileSystemFile, FileSystemNode, FileSystemSymlink, Source -from gitingest.schemas.filesystem import SEPARATOR, Context, FileSystemNodeType, GitRepository +from gitingest.schemas.filesystem import SEPARATOR, ContextV1, FileSystemNodeType, GitRepository from gitingest.utils.compat_func import readlink from gitingest.utils.logging_config import get_logger @@ -178,15 +179,15 @@ def _format_token_count(text: str) -> str | None: return str(total_tokens) -def generate_digest(context: Context) -> str: - """Generate a digest string from a Context object. +def generate_digest(context: ContextV1) -> str: + """Generate a digest string from a ContextV1 object. - This is a convenience function that uses the DefaultFormatter to format a Context. + This is a convenience function that uses the DefaultFormatter to format a ContextV1. Parameters ---------- - context : Context - The Context object containing sources and query information. + context : ContextV1 + The ContextV1 object containing sources and query information. Returns ------- @@ -200,143 +201,125 @@ def generate_digest(context: Context) -> str: class DefaultFormatter: def __init__(self): self.separator = SEPARATOR - self.env = Environment(loader=BaseLoader()) + template_dir = Path(__file__).parent / "format" / "DefaultFormatter" + self.env = Environment(loader=FileSystemLoader(template_dir)) + + def _get_template_for_node(self, node): + """Get template based on node class name.""" + template_name = f"{node.__class__.__name__}.j2" + return self.env.get_template(template_name) @singledispatchmethod def format(self, node: Source, query): - return f"{getattr(node, 'content', '')}" - - @format.register - def _(self, node: FileSystemFile, query): - template = """ -{{ SEPARATOR }} -{{ node.name }} -{{ SEPARATOR }} -{{ node.content }} -""" - file_template = self.env.from_string(template) - return file_template.render(SEPARATOR=SEPARATOR, node=node, query=query, formatter=self) - - @format.register - def _(self, node: FileSystemDirectory, query): - template = """{%- if node.depth == 0 %}{{ node.name }}: -{{ node.tree }} - -{% endif -%} -{%- for child in node.children -%} -{{ formatter.format(child, query) }} -{%- endfor -%}""" - dir_template = self.env.from_string(template) - return dir_template.render(node=node, query=query, formatter=self) - - @format.register - def _(self, node: GitRepository, query): - template = """{%- if node.depth == 0 %}🔗 Git Repository: {{ node.name }} -{{ node.tree }} - -{% endif -%} -{%- for child in node.children -%} -{{ formatter.format(child, query) }} -{%- endfor -%}""" - git_template = self.env.from_string(template) - return git_template.render(node=node, query=query, formatter=self) - - @format.register - def _(self, node: FileSystemSymlink, query): - template = """ -{{ SEPARATOR }} -{{ node.name }}{% if node.target %} -> {{ node.target }}{% endif %} -{{ SEPARATOR }} -""" - symlink_template = self.env.from_string(template) - return symlink_template.render(SEPARATOR=SEPARATOR, node=node, query=query, formatter=self) - - @format.register - def _(self, context: Context, query): - """Format a Context by formatting all its sources.""" - template = \ -"""# Generated using https://gitingest.com/{{ context.query.user_name }}/{{ context.query.repo_name }} - -Sources used: -{%- for source in context.sources %} -- {{ source.name }}: {{ source.__class__.__name__ }} -{% endfor %} - -{%- for source in context.sources %} -{{ formatter.format(source, context.query) }} -{%- endfor %} -# End of generated content""" - context_template = self.env.from_string(template) - return context_template.render(context=context, formatter=self) + """Dynamically format any node type based on available templates.""" + try: + template = self._get_template_for_node(node) + # Provide common template variables + context_vars = { + 'node': node, + 'query': query, + 'formatter': self, + 'SEPARATOR': SEPARATOR + } + # Special handling for ContextV1 objects + if isinstance(node, ContextV1): + context_vars['context'] = node + # Use ContextV1 for backward compatibility + template = self.env.get_template("ContextV1.j2") + + return template.render(**context_vars) + except TemplateNotFound: + # Fallback: return content if available, otherwise empty string + return f"{getattr(node, 'content', '')}" class DebugFormatter: def __init__(self): self.separator = SEPARATOR - self.env = Environment(loader=BaseLoader()) + template_dir = Path(__file__).parent / "format" / "DebugFormatter" + self.env = Environment(loader=FileSystemLoader(template_dir)) + + def _get_template_for_node(self, node): + """Get template based on node class name.""" + template_name = f"{node.__class__.__name__}.j2" + return self.env.get_template(template_name) @singledispatchmethod def format(self, node: Source, query): - """Format any Source type with debug information.""" - # Get the actual class name - class_name = node.__class__.__name__ - - # Get all field names (both from dataclass fields and regular attributes) - field_names = [] - - # Try to get dataclass fields first + """Dynamically format any node type with debug information.""" try: - if hasattr(node, "__dataclass_fields__") and hasattr(node.__dataclass_fields__, "keys"): - field_names.extend(node.__dataclass_fields__.keys()) - else: - raise AttributeError # Fall through to backup method - except (AttributeError, TypeError): - # Fall back to getting all non-private attributes - field_names = [ - attr for attr in dir(node) if not attr.startswith("_") and not callable(getattr(node, attr, None)) - ] - - # Format the debug output - fields_str = ", ".join(field_names) - template = """ -{{ SEPARATOR }} -DEBUG: {{ class_name }} -Fields: {{ fields_str }} -{{ SEPARATOR }} -""" - debug_template = self.env.from_string(template) - return debug_template.render( - SEPARATOR=SEPARATOR, - class_name=class_name, - fields_str=fields_str, - ) + # Get the actual class name + class_name = node.__class__.__name__ + + # Get all field names (both from dataclass fields and regular attributes) + field_names = [] + + # Try to get dataclass fields first + try: + if hasattr(node, "__dataclass_fields__") and hasattr(node.__dataclass_fields__, "keys"): + field_names.extend(node.__dataclass_fields__.keys()) + else: + raise AttributeError # Fall through to backup method + except (AttributeError, TypeError): + # Fall back to getting all non-private attributes + field_names = [ + attr for attr in dir(node) if not attr.startswith("_") and not callable(getattr(node, attr, None)) + ] + + # Format the debug output + fields_str = ", ".join(field_names) + + # Try to get specific template, fallback to Source.j2 + try: + template = self._get_template_for_node(node) + except TemplateNotFound: + template = self.env.get_template("Source.j2") + + return template.render( + SEPARATOR=SEPARATOR, + class_name=class_name, + fields_str=fields_str, + node=node, + query=query, + formatter=self + ) + except TemplateNotFound: + # Ultimate fallback + return f"DEBUG: {node.__class__.__name__}" class SummaryFormatter: """Dedicated formatter for generating summaries of filesystem nodes.""" def __init__(self): - self.env = Environment(loader=BaseLoader()) + template_dir = Path(__file__).parent / "format" / "SummaryFormatter" + self.env = Environment(loader=FileSystemLoader(template_dir)) + + def _get_template_for_node(self, node): + """Get template based on node class name.""" + template_name = f"{node.__class__.__name__}.j2" + return self.env.get_template(template_name) @singledispatchmethod def summary(self, node: Source, query): - return f"{getattr(node, 'name', '')}" - - @summary.register - def _(self, node: FileSystemDirectory, query): - template = """ \ -Directory structure: -{{ node.tree }} -""" - summary_template = self.env.from_string(template) - return summary_template.render(node=node, query=query) - - @summary.register - def _(self, context: Context, query): - template = """ -Repository: {{ context.query.user_name }}/{{ context.query.repo_name }} -Commit: {{ context.query.commit }} -Files analyzed: {{ context.sources[0].file_count }} -""" - summary_template = self.env.from_string(template) - return summary_template.render(context=context, query=query) + """Dynamically generate summary for any node type based on available templates.""" + try: + # Provide common template variables + context_vars = { + 'node': node, + 'query': query, + 'formatter': self + } + + # Special handling for ContextV1 objects + if isinstance(node, ContextV1): + context_vars['context'] = node + # Use ContextV1 for backward compatibility + template = self.env.get_template("ContextV1.j2") + else: + template = self._get_template_for_node(node) + + return template.render(**context_vars) + except TemplateNotFound: + # Fallback: return name if available + return f"{getattr(node, 'name', '')}" diff --git a/src/gitingest/schemas/__init__.py b/src/gitingest/schemas/__init__.py index ac8ca2b3..5c5706be 100644 --- a/src/gitingest/schemas/__init__.py +++ b/src/gitingest/schemas/__init__.py @@ -2,7 +2,7 @@ from gitingest.schemas.cloning import CloneConfig from gitingest.schemas.filesystem import ( - Context, + ContextV1, FileSystemDirectory, FileSystemFile, FileSystemNode, @@ -15,7 +15,7 @@ __all__ = [ "CloneConfig", - "Context", + "ContextV1", "FileSystemDirectory", "FileSystemFile", "FileSystemNode", diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index 3f1672b9..eb7deec8 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -134,8 +134,8 @@ def render_tree(self, prefix="", is_last=True): return [f"{prefix}{current_prefix}{display_name}"] -class Context(Source): - """The Context object is a general container for multiple unrelated sources. +class ContextV1(Source): + """The ContextV1 object is a general container for multiple unrelated sources. Attributes ---------- diff --git a/src/server/query_processor.py b/src/server/query_processor.py index f0997057..83af607e 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -23,7 +23,7 @@ upload_metadata_to_s3, upload_to_s3, ) -from gitingest.schemas import Context +from gitingest.schemas import ContextV1 from server.server_config import MAX_DISPLAY_SIZE # Initialize logger for this module diff --git a/test_formatting.py b/test_formatting.py index abb97543..59f557da 100644 --- a/test_formatting.py +++ b/test_formatting.py @@ -1,7 +1,7 @@ from pathlib import Path from unittest.mock import Mock from gitingest.output_formatter import DefaultFormatter -from gitingest.schemas.filesystem import Context, GitRepository, FileSystemFile +from gitingest.schemas.filesystem import ContextV1, GitRepository, FileSystemFile # Create a mock query mock_query = Mock() @@ -25,7 +25,7 @@ ) # Create context -context = Context([mock_repo], mock_query) +context = ContextV1([mock_repo], mock_query) # Test formatting formatter = DefaultFormatter() From 9a717c7d8dcd96f56040bb66cf97cb04744a6c48 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Thu, 31 Jul 2025 16:49:13 +0200 Subject: [PATCH 13/17] feat: move contextv1 to its own file --- .pre-commit-config.yaml | 1 + .../format/DefaultFormatter/ContextV1.j2 | 6 +- .../format/SummaryFormatter/ContextV1.j2 | 2 +- src/gitingest/output_formatter.py | 126 +++++++----------- src/gitingest/schemas/__init__.py | 3 +- src/gitingest/schemas/contextv1.py | 69 ++++++++++ src/gitingest/schemas/filesystem.py | 56 ++++---- test_formatting.py | 17 +-- 8 files changed, 159 insertions(+), 121 deletions(-) create mode 100644 src/gitingest/schemas/contextv1.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 85560838..a88f707a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -84,6 +84,7 @@ repos: rev: v1.36.4 hooks: - id: djlint-reformat-jinja + exclude: ^src/gitingest/format/ - repo: https://github.com/igorshubovych/markdownlint-cli rev: v0.45.0 diff --git a/src/gitingest/format/DefaultFormatter/ContextV1.j2 b/src/gitingest/format/DefaultFormatter/ContextV1.j2 index d53c3f38..0486beb9 100644 --- a/src/gitingest/format/DefaultFormatter/ContextV1.j2 +++ b/src/gitingest/format/DefaultFormatter/ContextV1.j2 @@ -1,11 +1,11 @@ -# Generated using https://gitingest.com/{{ context.query.user_name }}/{{ context.query.repo_name }} +# Generated using https://gitingest.com/{{ context.query.user_name }}/{{ context.query.repo_name }}{{ context.query.subpath }} Sources used: -{%- for source in context.sources %} +{%- for source in context %} - {{ source.name }}: {{ source.__class__.__name__ }} {% endfor %} {%- for source in context.sources %} {{ formatter.format(source, context.query) }} {%- endfor %} -# End of generated content \ No newline at end of file +# End of https://gitingest.com/{{ context.query.user_name }}/{{ context.query.repo_name }}{{ context.query.subpath }} diff --git a/src/gitingest/format/SummaryFormatter/ContextV1.j2 b/src/gitingest/format/SummaryFormatter/ContextV1.j2 index 86bd1e73..6c630ed2 100644 --- a/src/gitingest/format/SummaryFormatter/ContextV1.j2 +++ b/src/gitingest/format/SummaryFormatter/ContextV1.j2 @@ -1,3 +1,3 @@ Repository: {{ context.query.user_name }}/{{ context.query.repo_name }} Commit: {{ context.query.commit }} -Files analyzed: {{ context.sources[0].file_count }} \ No newline at end of file +Files analyzed: {{ context.file_count }} diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index 34390e42..a7adc8b0 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -9,10 +9,10 @@ import requests.exceptions import tiktoken -from jinja2 import Environment, FileSystemLoader, TemplateNotFound +from jinja2 import Environment, FileSystemLoader, Template, TemplateNotFound -from gitingest.schemas import FileSystemDirectory, FileSystemFile, FileSystemNode, FileSystemSymlink, Source -from gitingest.schemas.filesystem import SEPARATOR, ContextV1, FileSystemNodeType, GitRepository +from gitingest.schemas import ContextV1, FileSystemNode, Source +from gitingest.schemas.filesystem import SEPARATOR, FileSystemNodeType from gitingest.utils.compat_func import readlink from gitingest.utils.logging_config import get_logger @@ -28,49 +28,6 @@ ] -# Backward compatibility - - -def _create_summary_prefix(query: IngestionQuery, *, single_file: bool = False) -> str: - """Create a prefix string for summarizing a repository or local directory. - - Includes repository name (if provided), commit/branch details, and subpath if relevant. - - Parameters - ---------- - query : IngestionQuery - The parsed query object containing information about the repository and query parameters. - single_file : bool - A flag indicating whether the summary is for a single file (default: ``False``). - - Returns - ------- - str - A summary prefix string containing repository, commit, branch, and subpath details. - - """ - parts = [] - - if query.user_name: - parts.append(f"Repository: {query.user_name}/{query.repo_name}") - else: - # Local scenario - parts.append(f"Directory: {query.slug}") - - if query.tag: - parts.append(f"Tag: {query.tag}") - elif query.branch and query.branch not in ("main", "master"): - parts.append(f"Branch: {query.branch}") - - if query.commit: - parts.append(f"Commit: {query.commit}") - - if query.subpath != "/" and not single_file: - parts.append(f"Subpath: {query.subpath}") - - return "\n".join(parts) + "\n" - - def _gather_file_contents(node: FileSystemNode) -> str: """Recursively gather contents of all files under the given node. @@ -181,52 +138,55 @@ def _format_token_count(text: str) -> str | None: def generate_digest(context: ContextV1) -> str: """Generate a digest string from a ContextV1 object. - + This is a convenience function that uses the DefaultFormatter to format a ContextV1. - + Parameters ---------- context : ContextV1 The ContextV1 object containing sources and query information. - + Returns ------- str The formatted digest string. + """ formatter = DefaultFormatter() return formatter.format(context, context.query) class DefaultFormatter: - def __init__(self): + """Default formatter for rendering filesystem nodes using Jinja2 templates.""" + + def __init__(self) -> None: self.separator = SEPARATOR template_dir = Path(__file__).parent / "format" / "DefaultFormatter" - self.env = Environment(loader=FileSystemLoader(template_dir)) - - def _get_template_for_node(self, node): + self.env = Environment(loader=FileSystemLoader(template_dir), autoescape=True) + + def _get_template_for_node(self, node: Source) -> Template: """Get template based on node class name.""" template_name = f"{node.__class__.__name__}.j2" return self.env.get_template(template_name) @singledispatchmethod - def format(self, node: Source, query): + def format(self, node: Source, query: IngestionQuery) -> str: """Dynamically format any node type based on available templates.""" try: template = self._get_template_for_node(node) # Provide common template variables context_vars = { - 'node': node, - 'query': query, - 'formatter': self, - 'SEPARATOR': SEPARATOR + "node": node, + "query": query, + "formatter": self, + "SEPARATOR": SEPARATOR, } # Special handling for ContextV1 objects if isinstance(node, ContextV1): - context_vars['context'] = node + context_vars["context"] = node # Use ContextV1 for backward compatibility template = self.env.get_template("ContextV1.j2") - + return template.render(**context_vars) except TemplateNotFound: # Fallback: return content if available, otherwise empty string @@ -234,18 +194,20 @@ def format(self, node: Source, query): class DebugFormatter: - def __init__(self): + """Debug formatter that shows detailed information about filesystem nodes.""" + + def __init__(self) -> None: self.separator = SEPARATOR template_dir = Path(__file__).parent / "format" / "DebugFormatter" - self.env = Environment(loader=FileSystemLoader(template_dir)) - - def _get_template_for_node(self, node): + self.env = Environment(loader=FileSystemLoader(template_dir), autoescape=True) + + def _get_template_for_node(self, node: Source) -> Template: """Get template based on node class name.""" template_name = f"{node.__class__.__name__}.j2" return self.env.get_template(template_name) @singledispatchmethod - def format(self, node: Source, query): + def format(self, node: Source, query: IngestionQuery) -> str: """Dynamically format any node type with debug information.""" try: # Get the actual class name @@ -255,11 +217,15 @@ def format(self, node: Source, query): field_names = [] # Try to get dataclass fields first + def _raise_no_dataclass_fields() -> None: + msg = "No dataclass fields found" + raise AttributeError(msg) + try: if hasattr(node, "__dataclass_fields__") and hasattr(node.__dataclass_fields__, "keys"): field_names.extend(node.__dataclass_fields__.keys()) else: - raise AttributeError # Fall through to backup method + _raise_no_dataclass_fields() # Fall through to backup method except (AttributeError, TypeError): # Fall back to getting all non-private attributes field_names = [ @@ -268,20 +234,20 @@ def format(self, node: Source, query): # Format the debug output fields_str = ", ".join(field_names) - + # Try to get specific template, fallback to Source.j2 try: template = self._get_template_for_node(node) except TemplateNotFound: template = self.env.get_template("Source.j2") - + return template.render( SEPARATOR=SEPARATOR, class_name=class_name, fields_str=fields_str, node=node, query=query, - formatter=self + formatter=self, ) except TemplateNotFound: # Ultimate fallback @@ -291,34 +257,34 @@ def format(self, node: Source, query): class SummaryFormatter: """Dedicated formatter for generating summaries of filesystem nodes.""" - def __init__(self): + def __init__(self) -> None: template_dir = Path(__file__).parent / "format" / "SummaryFormatter" - self.env = Environment(loader=FileSystemLoader(template_dir)) - - def _get_template_for_node(self, node): + self.env = Environment(loader=FileSystemLoader(template_dir), autoescape=True) + + def _get_template_for_node(self, node: Source) -> Template: """Get template based on node class name.""" template_name = f"{node.__class__.__name__}.j2" return self.env.get_template(template_name) @singledispatchmethod - def summary(self, node: Source, query): + def summary(self, node: Source, query: IngestionQuery) -> str: """Dynamically generate summary for any node type based on available templates.""" try: # Provide common template variables context_vars = { - 'node': node, - 'query': query, - 'formatter': self + "node": node, + "query": query, + "formatter": self, } - + # Special handling for ContextV1 objects if isinstance(node, ContextV1): - context_vars['context'] = node + context_vars["context"] = node # Use ContextV1 for backward compatibility template = self.env.get_template("ContextV1.j2") else: template = self._get_template_for_node(node) - + return template.render(**context_vars) except TemplateNotFound: # Fallback: return name if available diff --git a/src/gitingest/schemas/__init__.py b/src/gitingest/schemas/__init__.py index 5c5706be..8a05ccf3 100644 --- a/src/gitingest/schemas/__init__.py +++ b/src/gitingest/schemas/__init__.py @@ -1,8 +1,8 @@ """Module containing the schemas for the Gitingest package.""" from gitingest.schemas.cloning import CloneConfig +from gitingest.schemas.contextv1 import ContextV1 from gitingest.schemas.filesystem import ( - ContextV1, FileSystemDirectory, FileSystemFile, FileSystemNode, @@ -23,4 +23,5 @@ "FileSystemSymlink", "GitRepository", "IngestionQuery", + "Source", ] diff --git a/src/gitingest/schemas/contextv1.py b/src/gitingest/schemas/contextv1.py new file mode 100644 index 00000000..d605e762 --- /dev/null +++ b/src/gitingest/schemas/contextv1.py @@ -0,0 +1,69 @@ +"""Schema for ContextV1 objects used in formatting.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING, Iterator + +from gitingest.schemas.filesystem import FileSystemDirectory, FileSystemNode, Source + +if TYPE_CHECKING: + from gitingest.schemas import IngestionQuery + + +@dataclass +class ContextV1: + """The ContextV1 object is an object that contains all information needed to produce a formatted output. + + This object contains all information needed to produce a formatted output + similar to the "legacy" output. + + Attributes + ---------- + sources : list[Source] + List of source objects (files, directories, etc.) + query : IngestionQuery + The query context. + + """ + + sources: list[Source] + query: IngestionQuery + + @property + def sources_by_type(self) -> dict[str, list[Source]]: + """Return sources grouped by their class name.""" + result = {} + for source in self.sources: + class_name = source.__class__.__name__ + if class_name not in result: + result[class_name] = [] + result[class_name].append(source) + return result + + def __getitem__(self, key: str) -> list[Source]: + """Allow dict-like access to sources by type name.""" + sources_dict = self.sources_by_type + if key not in sources_dict: + error_msg = f"No sources of type '{key}' found" + raise KeyError(error_msg) + return sources_dict[key] + + def __iter__(self) -> Iterator[Source]: + """Allow iteration over all sources.""" + return iter(self.sources) + + @property + def file_count(self) -> int: + """Calculate total file count based on sources.""" + # No need to iterate on children, directories are already aware of their + # file count + total = 0 + for source in self.sources: + if isinstance(source, FileSystemDirectory): + # For directories, add their file_count + total += source.file_count + elif isinstance(source, FileSystemNode): + # For individual files/nodes, increment by 1 + total += 1 + return total diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index eb7deec8..49cf2cf0 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -2,7 +2,7 @@ from __future__ import annotations -from abc import ABC +from abc import ABC, abstractmethod from dataclasses import dataclass, field from enum import Enum, auto from typing import TYPE_CHECKING @@ -10,7 +10,6 @@ if TYPE_CHECKING: from pathlib import Path - from gitingest.schemas import IngestionQuery SEPARATOR = "=" * 48 # Tiktoken, the tokenizer openai uses, counts 2 tokens if we have more than 48 @@ -35,9 +34,15 @@ class FileSystemStats: class Source(ABC): """Abstract base class for all sources (files, directories, etc).""" + @abstractmethod + def render_tree(self, prefix: str = "", *, is_last: bool = True) -> list[str]: + """Render the tree representation of this source.""" + @dataclass class FileSystemNode(Source): + """Base class for filesystem nodes (files, directories, symlinks).""" + name: str path_str: str path: Path @@ -45,28 +50,34 @@ class FileSystemNode(Source): size: int = 0 @property - def tree(self): + def tree(self) -> str: + """Return the name of this node.""" return self.name @dataclass class FileSystemFile(FileSystemNode): + """Represents a file in the filesystem.""" + @property - def content(self): + def content(self) -> str: + """Read and return the content of the file.""" # read the file try: - with open(self.path) as f: - return f.read() + return self.path.read_text(encoding="utf-8") except Exception as e: return f"Error reading content of {self.name}: {e}" - def render_tree(self, prefix="", is_last=True): + def render_tree(self, prefix: str = "", *, is_last: bool = True) -> list[str]: + """Render the tree representation of this file.""" current_prefix = "└── " if is_last else "├── " return [f"{prefix}{current_prefix}{self.name}"] @dataclass class FileSystemDirectory(FileSystemNode): + """Represents a directory in the filesystem.""" + children: list[FileSystemNode] = field(default_factory=list) file_count: int = 0 dir_count: int = 0 @@ -86,7 +97,8 @@ def _sort_key(child: FileSystemNode) -> tuple[int, str]: self.children.sort(key=_sort_key) - def render_tree(self, prefix="", is_last=True): + def render_tree(self, prefix: str = "", *, is_last: bool = True) -> list[str]: + """Render the tree representation of this directory.""" lines = [] current_prefix = "└── " if is_last else "├── " display_name = self.name + "/" @@ -99,7 +111,8 @@ def render_tree(self, prefix="", is_last=True): return lines @property - def tree(self): + def tree(self) -> str: + """Return the tree representation of this directory.""" return "\n".join(self.render_tree()) @@ -109,7 +122,8 @@ class GitRepository(FileSystemDirectory): git_info: dict = field(default_factory=dict) # Store git metadata like branch, commit, etc. - def render_tree(self, prefix="", is_last=True): + def render_tree(self, prefix: str = "", *, is_last: bool = True) -> list[str]: + """Render the tree representation of this git repository.""" lines = [] current_prefix = "└── " if is_last else "├── " # Mark as git repo in the tree @@ -125,27 +139,13 @@ def render_tree(self, prefix="", is_last=True): @dataclass class FileSystemSymlink(FileSystemNode): + """Represents a symbolic link in the filesystem.""" + target: str = "" # Add symlink-specific fields if needed - def render_tree(self, prefix="", is_last=True): + def render_tree(self, prefix: str = "", *, is_last: bool = True) -> list[str]: + """Render the tree representation of this symlink.""" current_prefix = "└── " if is_last else "├── " display_name = f"{self.name} -> {self.target}" if self.target else self.name return [f"{prefix}{current_prefix}{display_name}"] - - -class ContextV1(Source): - """The ContextV1 object is a general container for multiple unrelated sources. - - Attributes - ---------- - sources : list[Source] - The list of source objects to format. - query : IngestionQuery - The query context. - - """ - - def __init__(self, sources: list[Source], query: IngestionQuery): - self.sources = sources - self.query = query diff --git a/test_formatting.py b/test_formatting.py index 59f557da..38cd2265 100644 --- a/test_formatting.py +++ b/test_formatting.py @@ -1,7 +1,11 @@ +"""Test script for formatting functionality.""" + from pathlib import Path from unittest.mock import Mock + from gitingest.output_formatter import DefaultFormatter -from gitingest.schemas.filesystem import ContextV1, GitRepository, FileSystemFile +from gitingest.schemas import ContextV1 +from gitingest.schemas.filesystem import FileSystemFile, GitRepository # Create a mock query mock_query = Mock() @@ -11,7 +15,7 @@ # Create a simple file mock_file = FileSystemFile( name="test.py", - path_str="test.py", + path_str="test.py", path=Path("test.py"), ) mock_file.content = "print('hello world')" @@ -20,8 +24,8 @@ mock_repo = GitRepository( name="test_repo", path_str="", - path=Path("."), - children=[mock_file] + path=Path(), + children=[mock_file], ) # Create context @@ -30,7 +34,4 @@ # Test formatting formatter = DefaultFormatter() result = formatter.format(context, mock_query) -print("RESULT:") -print(repr(result)) -print("\nFORMATTED:") -print(result) \ No newline at end of file +# Results can be inspected with debugger or logging if needed From 175d5c2dd43498a3955749ce47fa6e088565eea5 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Thu, 31 Jul 2025 17:14:21 +0200 Subject: [PATCH 14/17] fix: resolve all pre-commit hook issues and lint warnings --- src/gitingest/entrypoint.py | 29 ++-- src/gitingest/format/DebugFormatter/Source.j2 | 2 +- .../DefaultFormatter/FileSystemDirectory.j2 | 2 +- .../format/DefaultFormatter/FileSystemFile.j2 | 2 +- .../DefaultFormatter/FileSystemSymlink.j2 | 2 +- .../format/DefaultFormatter/GitRepository.j2 | 2 +- .../SummaryFormatter/FileSystemDirectory.j2 | 2 +- src/gitingest/ingestion.py | 12 +- src/server/query_processor.py | 18 +-- src/server/routers_utils.py | 16 ++- tests/test_output_formatter.py | 134 ++++++++++++------ 11 files changed, 132 insertions(+), 89 deletions(-) diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index e024da8b..2212ed03 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -28,7 +28,6 @@ from types import TracebackType from gitingest.schemas import IngestionQuery - from gitingest.schemas import ContextV1 # Initialize logger for this module logger = get_logger(__name__) @@ -138,7 +137,6 @@ async def ingest_async( _apply_gitignores(query) logger.info("Processing files and generating output") - summary, tree, content = ingest_query(query) if output: logger.debug("Writing output to file", extra={"output_path": output}) @@ -205,19 +203,20 @@ def ingest( ``ingest_async`` : The asynchronous version of this function. """ - digest = asyncio.run(ingest_async( - source, - max_file_size=max_file_size, - include_patterns=include_patterns, - exclude_patterns=exclude_patterns, - branch=branch, - tag=tag, - include_gitignored=include_gitignored, - include_submodules=include_submodules, - token=token, - output=output, - )) - return digest + return asyncio.run( + ingest_async( + source, + max_file_size=max_file_size, + include_patterns=include_patterns, + exclude_patterns=exclude_patterns, + branch=branch, + tag=tag, + include_gitignored=include_gitignored, + include_submodules=include_submodules, + token=token, + output=output, + ), + ) def _override_branch_and_tag(query: IngestionQuery, branch: str | None, tag: str | None) -> None: diff --git a/src/gitingest/format/DebugFormatter/Source.j2 b/src/gitingest/format/DebugFormatter/Source.j2 index 277ea18d..ecebd57b 100644 --- a/src/gitingest/format/DebugFormatter/Source.j2 +++ b/src/gitingest/format/DebugFormatter/Source.j2 @@ -1,4 +1,4 @@ {{ SEPARATOR }} DEBUG: {{ class_name }} Fields: {{ fields_str }} -{{ SEPARATOR }} \ No newline at end of file +{{ SEPARATOR }} diff --git a/src/gitingest/format/DefaultFormatter/FileSystemDirectory.j2 b/src/gitingest/format/DefaultFormatter/FileSystemDirectory.j2 index 9a1d30b2..4e003344 100644 --- a/src/gitingest/format/DefaultFormatter/FileSystemDirectory.j2 +++ b/src/gitingest/format/DefaultFormatter/FileSystemDirectory.j2 @@ -4,4 +4,4 @@ {% endif -%} {%- for child in node.children -%} {{ formatter.format(child, query) }} -{%- endfor -%} \ No newline at end of file +{%- endfor -%} diff --git a/src/gitingest/format/DefaultFormatter/FileSystemFile.j2 b/src/gitingest/format/DefaultFormatter/FileSystemFile.j2 index 813d242c..37567010 100644 --- a/src/gitingest/format/DefaultFormatter/FileSystemFile.j2 +++ b/src/gitingest/format/DefaultFormatter/FileSystemFile.j2 @@ -1,4 +1,4 @@ {{ SEPARATOR }} {{ node.name }} {{ SEPARATOR }} -{{ node.content }} \ No newline at end of file +{{ node.content }} diff --git a/src/gitingest/format/DefaultFormatter/FileSystemSymlink.j2 b/src/gitingest/format/DefaultFormatter/FileSystemSymlink.j2 index 1c89b385..ebb804aa 100644 --- a/src/gitingest/format/DefaultFormatter/FileSystemSymlink.j2 +++ b/src/gitingest/format/DefaultFormatter/FileSystemSymlink.j2 @@ -1,3 +1,3 @@ {{ SEPARATOR }} {{ node.name }}{% if node.target %} -> {{ node.target }}{% endif %} -{{ SEPARATOR }} \ No newline at end of file +{{ SEPARATOR }} diff --git a/src/gitingest/format/DefaultFormatter/GitRepository.j2 b/src/gitingest/format/DefaultFormatter/GitRepository.j2 index 3228dbcc..29995223 100644 --- a/src/gitingest/format/DefaultFormatter/GitRepository.j2 +++ b/src/gitingest/format/DefaultFormatter/GitRepository.j2 @@ -4,4 +4,4 @@ {% endif -%} {%- for child in node.children -%} {{ formatter.format(child, query) }} -{%- endfor -%} \ No newline at end of file +{%- endfor -%} diff --git a/src/gitingest/format/SummaryFormatter/FileSystemDirectory.j2 b/src/gitingest/format/SummaryFormatter/FileSystemDirectory.j2 index d32dfe80..dc23c8b7 100644 --- a/src/gitingest/format/SummaryFormatter/FileSystemDirectory.j2 +++ b/src/gitingest/format/SummaryFormatter/FileSystemDirectory.j2 @@ -1,2 +1,2 @@ Directory structure: -{{ node.tree }} \ No newline at end of file +{{ node.tree }} diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index c1a1351c..2366b4ed 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -26,8 +26,10 @@ def _is_git_repository(path: Path) -> bool: def ingest_query(query: IngestionQuery) -> ContextV1: """Run the ingestion process for a parsed query. - This is the main entry point for analyzing a codebase directory or single file. It processes the query - parameters, reads the file or directory content, and returns a ContextV1 object that can generate the final output digest on demand. + This is the main entry point for analyzing a codebase directory or single file. + + It processes the query parameters, reads the file or directory content, and returns + a ContextV1 object that can generate the final output digest on demand. Parameters ---------- @@ -37,7 +39,9 @@ def ingest_query(query: IngestionQuery) -> ContextV1: Returns ------- ContextV1 - A ContextV1 object representing the ingested file system nodes. Use generate_digest(context) to get the summary, directory structure, and file contents. + A ContextV1 object representing the ingested file system nodes. + Use generate_digest(context) to get the summary, directory structure, + and file contents. Raises ------ @@ -126,7 +130,7 @@ def ingest_query(query: IngestionQuery) -> ContextV1: return ContextV1([root_node], query) -def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystemStats) -> None: +def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystemStats) -> None: # noqa: C901 """Process a file or directory item within a directory. This function handles each file or directory item, checking if it should be included or excluded based on the diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 83af607e..c4c3a041 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -8,7 +8,7 @@ from gitingest.clone import clone_repo from gitingest.ingestion import ingest_query -from gitingest.output_formatter import DebugFormatter, DefaultFormatter, SummaryFormatter +from gitingest.output_formatter import DefaultFormatter, SummaryFormatter, generate_digest from gitingest.query_parser import parse_remote_repo from gitingest.utils.git_utils import resolve_commit, validate_github_token from gitingest.utils.logging_config import get_logger @@ -23,7 +23,6 @@ upload_metadata_to_s3, upload_to_s3, ) -from gitingest.schemas import ContextV1 from server.server_config import MAX_DISPLAY_SIZE # Initialize logger for this module @@ -314,17 +313,18 @@ async def process_query( source=query.url, user_name=cast("str", query.user_name), repo_name=cast("str", query.repo_name), + subpath=query.subpath, commit=query.commit, include_patterns=query.include_patterns, ignore_patterns=query.ignore_patterns, ) - s3_url = upload_to_s3(content=context.digest, s3_file_path=s3_file_path, ingest_id=query.id) + s3_url = upload_to_s3(content=generate_digest(context), s3_file_path=s3_file_path, ingest_id=query.id) # Store S3 URL in query for later use query.s3_url = s3_url else: # Store locally local_txt_file = Path(clone_config.local_path).with_suffix(".txt") - print(f"Writing to {local_txt_file}") + logger.info("Writing digest to local file", extra={"file_path": str(local_txt_file)}) with local_txt_file.open("w", encoding="utf-8") as f: f.write(digest) @@ -340,14 +340,6 @@ async def process_query( "download full ingest to see more)\n" + digest[:MAX_DISPLAY_SIZE] ) - # _print_success( - # url=query.url, - # max_file_size=max_file_size, - # pattern_type=pattern_type, - # pattern=pattern, - # summary=digest, - # ) - digest_url = _generate_digest_url(query) # Clean up the repository after successful processing @@ -358,7 +350,7 @@ async def process_query( short_repo_url=short_repo_url, summary=summary, digest_url=digest_url, - tree=context.sources[0].tree, # TODO: this is a hack to get the tree of the first source + tree=context.sources[0].tree, # TODO: this is a hack to get the tree of the first source content=digest, default_max_file_size=max_file_size, pattern_type=pattern_type, diff --git a/src/server/routers_utils.py b/src/server/routers_utils.py index ddea340f..f0471375 100644 --- a/src/server/routers_utils.py +++ b/src/server/routers_utils.py @@ -8,9 +8,13 @@ from fastapi import status from fastapi.responses import JSONResponse +from gitingest.utils.logging_config import get_logger from server.models import IngestErrorResponse, IngestSuccessResponse, PatternType from server.query_processor import process_query +# Initialize logger for this module +logger = get_logger(__name__) + COMMON_INGEST_RESPONSES: dict[int | str, dict[str, Any]] = { status.HTTP_200_OK: {"model": IngestSuccessResponse, "description": "Successful ingestion"}, status.HTTP_400_BAD_REQUEST: {"model": IngestErrorResponse, "description": "Bad request or processing error"}, @@ -41,8 +45,8 @@ async def _perform_ingestion( ) if isinstance(result, IngestErrorResponse): - # print stack trace to console for debugging - print(traceback.format_exc()) + # Log stack trace for debugging + logger.error("Ingest processing failed", extra={"traceback": traceback.format_exc()}) # Return structured error response with 400 status code return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content=result.model_dump()) @@ -52,13 +56,13 @@ async def _perform_ingestion( except ValueError as ve: # Handle validation errors with 400 status code error_response = IngestErrorResponse(error=f"Validation error: {ve!s}") - # print stack trace to console for debugging - print(traceback.format_exc()) + # Log stack trace for debugging + logger.exception("Validation error during ingest", extra={"error": str(ve)}) return JSONResponse(status_code=status.HTTP_400_BAD_REQUEST, content=error_response.model_dump()) except Exception as exc: # Handle unexpected errors with 500 status code error_response = IngestErrorResponse(error=f"Internal server error: {exc!s}") - # print stack trace to console for debugging - print(traceback.format_exc()) + # Log stack trace for debugging + logger.exception("Unexpected error during ingest", extra={"error": str(exc)}) return JSONResponse(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, content=error_response.model_dump()) diff --git a/tests/test_output_formatter.py b/tests/test_output_formatter.py index f916e2ee..6668bc7b 100644 --- a/tests/test_output_formatter.py +++ b/tests/test_output_formatter.py @@ -3,6 +3,7 @@ These tests validate the formatting behavior of DefaultFormatter and StupidFormatter for different FileSystemNode types (File, Directory, Symlink). """ +# pylint: disable=redefined-outer-name # pytest fixtures are expected to redefine names from __future__ import annotations @@ -11,8 +12,8 @@ import pytest -from gitingest.output_formatter import DefaultFormatter, DebugFormatter, SummaryFormatter -from gitingest.schemas import FileSystemFile, FileSystemDirectory, FileSystemSymlink, IngestionQuery +from gitingest.output_formatter import DebugFormatter, DefaultFormatter, SummaryFormatter +from gitingest.schemas import FileSystemDirectory, FileSystemFile, FileSystemSymlink, IngestionQuery from gitingest.schemas.filesystem import FileSystemNodeType @@ -78,87 +79,108 @@ def mock_symlink_node() -> FileSystemSymlink: class TestDefaultFormatter: """Test cases for DefaultFormatter class.""" - def test_init(self): + def test_init(self) -> None: """Test DefaultFormatter initialization.""" formatter = DefaultFormatter() assert formatter.env is not None assert formatter.format is not None - def test_format_file_node(self, mock_file_node, mock_query): + def test_format_file_node(self, mock_file_node: FileSystemFile, mock_query: IngestionQuery) -> None: """Test formatting a FileSystemFile node.""" formatter = DefaultFormatter() result = formatter.format(mock_file_node, mock_query) - + # Should contain separator, filename, and content assert "================================================" in result assert "test_file.py" in result assert "print('hello world')" in result assert "print('test content')" in result - def test_format_directory_node(self, mock_directory_node, mock_query): + def test_format_directory_node(self, mock_directory_node: FileSystemDirectory, mock_query: IngestionQuery) -> None: """Test formatting a FileSystemDirectory node.""" # Create mock child nodes child1 = Mock() child2 = Mock() mock_directory_node.children = [child1, child2] - + formatter = DefaultFormatter() - + # Mock the format method calls for children - with patch.object(formatter, 'format', side_effect=lambda node, query: f"formatted_{node.name}" if hasattr(node, 'name') else "formatted_child") as mock_format: + with patch.object( + formatter, + "format", + side_effect=lambda node, _: f"formatted_{node.name}" if hasattr(node, "name") else "formatted_child", + ) as mock_format: # Need to call the actual method for the directory node itself mock_format.side_effect = None - result = formatter.format(mock_directory_node, mock_query) - - # Reset side effect and call again to test child formatting - mock_format.side_effect = lambda node, query: f"formatted_{getattr(node, 'name', 'child')}" - result = formatter.format(mock_directory_node, mock_query) + formatter.format(mock_directory_node, mock_query) - def test_format_symlink_node(self, mock_symlink_node, mock_query): + # Reset side effect and call again to test child formatting + mock_format.side_effect = lambda node, _: f"formatted_{getattr(node, 'name', 'child')}" + formatter.format(mock_directory_node, mock_query) + + def test_format_symlink_node( + self, + mock_symlink_node: FileSystemSymlink, + mock_query: IngestionQuery, + ) -> None: """Test formatting a FileSystemSymlink node.""" formatter = DefaultFormatter() result = formatter.format(mock_symlink_node, mock_query) - + # Should contain separator, filename, and target assert "================================================" in result assert "link_to_file" in result assert "target_file.py" in result - def test_format_symlink_node_no_target(self, mock_symlink_node, mock_query): + def test_format_symlink_node_no_target( + self, + mock_symlink_node: FileSystemSymlink, + mock_query: IngestionQuery, + ) -> None: """Test formatting a FileSystemSymlink node without target.""" mock_symlink_node.target = "" formatter = DefaultFormatter() result = formatter.format(mock_symlink_node, mock_query) - + # Should contain separator and filename but no arrow assert "================================================" in result assert "link_to_file" in result assert " -> " not in result + class TestSummaryFormatter: """Test cases for SummaryFormatter class.""" - def test_init(self): + def test_init(self) -> None: """Test SummaryFormatter initialization.""" formatter = SummaryFormatter() assert formatter.env is not None assert formatter.summary is not None - def test_summary_directory_node(self, mock_directory_node, mock_query): + def test_summary_directory_node( + self, + mock_directory_node: FileSystemDirectory, + mock_query: IngestionQuery, + ) -> None: """Test summary generation for a FileSystemDirectory node.""" formatter = SummaryFormatter() result = formatter.summary(mock_directory_node, mock_query) - + assert "Directory structure:" in result assert "src/" in result assert "file1.py" in result assert "file2.py" in result - def test_summary_file_node_default(self, mock_file_node, mock_query): + def test_summary_file_node_default( + self, + mock_file_node: FileSystemFile, + mock_query: IngestionQuery, + ) -> None: """Test default summary for FileSystemFile node.""" formatter = SummaryFormatter() result = formatter.summary(mock_file_node, mock_query) - + # Should use default handler and return the name assert "test_file.py" in result @@ -166,17 +188,21 @@ def test_summary_file_node_default(self, mock_file_node, mock_query): class TestDebugFormatter: """Test cases for DebugFormatter class.""" - def test_init(self): + def test_init(self) -> None: """Test DebugFormatter initialization.""" formatter = DebugFormatter() assert formatter.env is not None assert formatter.format is not None - def test_format_file_node_debug_info(self, mock_file_node, mock_query): + def test_format_file_node_debug_info( + self, + mock_file_node: FileSystemFile, + mock_query: IngestionQuery, + ) -> None: """Test that DebugFormatter shows debug info for FileSystemFile.""" formatter = DebugFormatter() result = formatter.format(mock_file_node, mock_query) - + # Should contain debug information assert "================================================" in result assert "DEBUG: FileSystemFile" in result @@ -186,61 +212,79 @@ def test_format_file_node_debug_info(self, mock_file_node, mock_query): assert "path" in result assert "size" in result - def test_format_directory_node_debug_info(self, mock_directory_node, mock_query): + def test_format_directory_node_debug_info( + self, + mock_directory_node: FileSystemDirectory, + mock_query: IngestionQuery, + ) -> None: """Test that DebugFormatter shows debug info for FileSystemDirectory.""" formatter = DebugFormatter() result = formatter.format(mock_directory_node, mock_query) - + # Should contain debug information assert "DEBUG: FileSystemDirectory" in result assert "Fields:" in result assert "name" in result assert "children" in result - def test_format_symlink_node_debug_info(self, mock_symlink_node, mock_query): + def test_format_symlink_node_debug_info( + self, + mock_symlink_node: FileSystemSymlink, + mock_query: IngestionQuery, + ) -> None: """Test that DebugFormatter shows debug info for FileSystemSymlink.""" formatter = DebugFormatter() result = formatter.format(mock_symlink_node, mock_query) - + # Should contain debug information assert "DEBUG: FileSystemSymlink" in result assert "Fields:" in result assert "name" in result assert "target" in result - def test_format_all_node_types_show_debug(self, mock_file_node, mock_directory_node, mock_symlink_node, mock_query): + def test_format_all_node_types_show_debug( + self, + mock_file_node: FileSystemFile, + mock_directory_node: FileSystemDirectory, + mock_symlink_node: FileSystemSymlink, + mock_query: IngestionQuery, + ) -> None: """Test that DebugFormatter shows debug info for all node types.""" formatter = DebugFormatter() - + file_result = formatter.format(mock_file_node, mock_query) dir_result = formatter.format(mock_directory_node, mock_query) symlink_result = formatter.format(mock_symlink_node, mock_query) - + # All should contain debug headers assert "DEBUG: FileSystemFile" in file_result assert "DEBUG: FileSystemDirectory" in dir_result assert "DEBUG: FileSystemSymlink" in symlink_result - + # All should contain field information assert "Fields:" in file_result assert "Fields:" in dir_result assert "Fields:" in symlink_result - def test_debug_formatter_vs_default_formatter(self, mock_file_node, mock_query): + def test_debug_formatter_vs_default_formatter( + self, + mock_file_node: FileSystemFile, + mock_query: IngestionQuery, + ) -> None: """Test that DebugFormatter produces different output than DefaultFormatter.""" default_formatter = DefaultFormatter() debug_formatter = DebugFormatter() - + default_result = default_formatter.format(mock_file_node, mock_query) debug_result = debug_formatter.format(mock_file_node, mock_query) - + # Results should be different assert default_result != debug_result - + # Debug should contain debug info, default should not assert "DEBUG:" in debug_result assert "DEBUG:" not in default_result - + # Debug should show fields, default shows content assert "Fields:" in debug_result assert "Fields:" not in default_result @@ -249,30 +293,30 @@ def test_debug_formatter_vs_default_formatter(self, mock_file_node, mock_query): class TestFormatterEdgeCases: """Test edge cases and error conditions.""" - def test_format_unknown_node_type(self, mock_query): + def test_format_unknown_node_type(self, mock_query: IngestionQuery) -> None: """Test formatting with an unknown node type.""" unknown_node = Mock() unknown_node.name = "unknown" - + formatter = DefaultFormatter() # Should fall back to default behavior result = formatter.format(unknown_node, mock_query) assert result is not None - def test_format_node_without_name(self, mock_query): + def test_format_node_without_name(self, mock_query: IngestionQuery) -> None: """Test formatting a node without a name attribute.""" nameless_node = Mock(spec=FileSystemFile) # Remove name attribute del nameless_node.name - + formatter = DebugFormatter() # Should handle gracefully (jinja template will show empty) result = formatter.format(nameless_node, mock_query) assert result is not None - def test_format_with_none_query(self, mock_file_node): + def test_format_with_none_query(self, mock_file_node: FileSystemFile) -> None: """Test formatting with None query.""" formatter = DefaultFormatter() # Should handle None query gracefully result = formatter.format(mock_file_node, None) - assert result is not None \ No newline at end of file + assert result is not None From 6b680c7edf0587bd1a082b613cebc0baf54c6a59 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Thu, 31 Jul 2025 17:15:29 +0200 Subject: [PATCH 15/17] typo --- src/gitingest/ingestion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index 2366b4ed..b05e6357 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -291,7 +291,7 @@ def limit_exceeded(stats: FileSystemStats, depth: int) -> bool: """Check if any of the traversal limits have been exceeded. This function checks if the current traversal has exceeded any of the configured limits: - maximum directory depth, ma ximum number of files, or maximum total size in bytes. + maximum directory depth, maximum number of files, or maximum total size in bytes. Parameters ---------- From f4580c0b6cfa525f31c2a0dd82997b77a7b7c521 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Thu, 31 Jul 2025 17:17:34 +0200 Subject: [PATCH 16/17] typo --- src/server/query_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/query_processor.py b/src/server/query_processor.py index c4c3a041..546975a2 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -313,8 +313,8 @@ async def process_query( source=query.url, user_name=cast("str", query.user_name), repo_name=cast("str", query.repo_name), - subpath=query.subpath, commit=query.commit, + subpath=query.subpath, include_patterns=query.include_patterns, ignore_patterns=query.ignore_patterns, ) From 9a4e6f613178578bb21f29dfa79aeb315cb26a08 Mon Sep 17 00:00:00 2001 From: Nicolas IRAGNE Date: Sun, 3 Aug 2025 10:14:28 +0200 Subject: [PATCH 17/17] wip --- src/gitingest/entrypoint.py | 9 +- .../format/DefaultFormatter/ContextV1.j2 | 12 +- .../DefaultFormatter/FileSystemDirectory.j2 | 6 +- .../format/DefaultFormatter/FileSystemFile.j2 | 4 +- .../DefaultFormatter/FileSystemSymlink.j2 | 2 +- .../format/DefaultFormatter/GitRepository.j2 | 6 +- .../format/SummaryFormatter/ContextV1.j2 | 8 +- .../SummaryFormatter/FileSystemDirectory.j2 | 2 +- src/gitingest/ingestion.py | 6 +- src/gitingest/output_formatter.py | 116 ++++++++++++------ src/gitingest/query_parser.py | 8 +- src/gitingest/schemas/contextv1.py | 20 ++- src/gitingest/schemas/filesystem.py | 9 +- src/server/query_processor.py | 12 +- test_formatting.py | 37 ------ 15 files changed, 144 insertions(+), 113 deletions(-) delete mode 100644 test_formatting.py diff --git a/src/gitingest/entrypoint.py b/src/gitingest/entrypoint.py index 2212ed03..e3ecc9ee 100644 --- a/src/gitingest/entrypoint.py +++ b/src/gitingest/entrypoint.py @@ -15,7 +15,7 @@ from gitingest.clone import clone_repo from gitingest.config import MAX_FILE_SIZE from gitingest.ingestion import ingest_query -from gitingest.output_formatter import generate_digest +from gitingest.output_formatter import DefaultFormatter from gitingest.query_parser import parse_local_dir_path, parse_remote_repo from gitingest.utils.auth import resolve_token from gitingest.utils.compat_func import removesuffix @@ -51,7 +51,7 @@ async def ingest_async( This function analyzes a source (URL or local path), clones the corresponding repository (if applicable), and processes its files according to the specified query parameters. It returns a single digest string. - The output is generated lazily using a ContextV1 object and the generate_digest() function. + The output is generated lazily using a ContextV1 object and the DefaultFormatter class. Parameters ---------- @@ -141,7 +141,8 @@ async def ingest_async( if output: logger.debug("Writing output to file", extra={"output_path": output}) context = ingest_query(query) - digest = generate_digest(context) + formatter = DefaultFormatter() + digest = formatter.format(context, context.query) await _write_output(digest, content=None, target=output) logger.info("Ingestion completed successfully") return digest @@ -165,7 +166,7 @@ def ingest( This function analyzes a source (URL or local path), clones the corresponding repository (if applicable), and processes its files according to the specified query parameters. It returns a single digest string. - The output is generated lazily using a ContextV1 object and the generate_digest() function. + The output is generated lazily using a ContextV1 object and the DefaultFormatter class. Parameters ---------- diff --git a/src/gitingest/format/DefaultFormatter/ContextV1.j2 b/src/gitingest/format/DefaultFormatter/ContextV1.j2 index 0486beb9..e9a211d7 100644 --- a/src/gitingest/format/DefaultFormatter/ContextV1.j2 +++ b/src/gitingest/format/DefaultFormatter/ContextV1.j2 @@ -1,11 +1,11 @@ -# Generated using https://gitingest.com/{{ context.query.user_name }}/{{ context.query.repo_name }}{{ context.query.subpath }} +# Generated using https://gitingest.com/{{ source.query.user_name }}/{{ source.query.repo_name }}{{ source.query.subpath }} Sources used: -{%- for source in context %} -- {{ source.name }}: {{ source.__class__.__name__ }} +{%- for src in source %} +- {{ src.name }}: {{ src.__class__.__name__ }} {% endfor %} -{%- for source in context.sources %} -{{ formatter.format(source, context.query) }} +{%- for src in source.sources %} +{{ formatter.format(src, source.query) }} {%- endfor %} -# End of https://gitingest.com/{{ context.query.user_name }}/{{ context.query.repo_name }}{{ context.query.subpath }} +# End of https://gitingest.com/{{ source.query.user_name }}/{{ source.query.repo_name }}{{ source.query.subpath }} diff --git a/src/gitingest/format/DefaultFormatter/FileSystemDirectory.j2 b/src/gitingest/format/DefaultFormatter/FileSystemDirectory.j2 index 4e003344..211ef932 100644 --- a/src/gitingest/format/DefaultFormatter/FileSystemDirectory.j2 +++ b/src/gitingest/format/DefaultFormatter/FileSystemDirectory.j2 @@ -1,7 +1,7 @@ -{%- if node.depth == 0 %}{{ node.name }}: -{{ node.tree }} +{%- if source.depth == 0 %}{{ source.name }}: +{{ source.tree }} {% endif -%} -{%- for child in node.children -%} +{%- for child in source.children -%} {{ formatter.format(child, query) }} {%- endfor -%} diff --git a/src/gitingest/format/DefaultFormatter/FileSystemFile.j2 b/src/gitingest/format/DefaultFormatter/FileSystemFile.j2 index 37567010..a62a4312 100644 --- a/src/gitingest/format/DefaultFormatter/FileSystemFile.j2 +++ b/src/gitingest/format/DefaultFormatter/FileSystemFile.j2 @@ -1,4 +1,4 @@ {{ SEPARATOR }} -{{ node.name }} +{{ source.name }} {{ SEPARATOR }} -{{ node.content }} +{{ source.content }} diff --git a/src/gitingest/format/DefaultFormatter/FileSystemSymlink.j2 b/src/gitingest/format/DefaultFormatter/FileSystemSymlink.j2 index ebb804aa..b07ff641 100644 --- a/src/gitingest/format/DefaultFormatter/FileSystemSymlink.j2 +++ b/src/gitingest/format/DefaultFormatter/FileSystemSymlink.j2 @@ -1,3 +1,3 @@ {{ SEPARATOR }} -{{ node.name }}{% if node.target %} -> {{ node.target }}{% endif %} +{{ source.name }}{% if source.target %} -> {{ source.target }}{% endif %} {{ SEPARATOR }} diff --git a/src/gitingest/format/DefaultFormatter/GitRepository.j2 b/src/gitingest/format/DefaultFormatter/GitRepository.j2 index 29995223..d0cc8608 100644 --- a/src/gitingest/format/DefaultFormatter/GitRepository.j2 +++ b/src/gitingest/format/DefaultFormatter/GitRepository.j2 @@ -1,7 +1,7 @@ -{%- if node.depth == 0 %}🔗 Git Repository: {{ node.name }} -{{ node.tree }} +{%- if source.depth == 0 %}🔗 Git Repository: {{ source.name }} +{{ source.tree }} {% endif -%} -{%- for child in node.children -%} +{%- for child in source.children -%} {{ formatter.format(child, query) }} {%- endfor -%} diff --git a/src/gitingest/format/SummaryFormatter/ContextV1.j2 b/src/gitingest/format/SummaryFormatter/ContextV1.j2 index 6c630ed2..6d4fece7 100644 --- a/src/gitingest/format/SummaryFormatter/ContextV1.j2 +++ b/src/gitingest/format/SummaryFormatter/ContextV1.j2 @@ -1,3 +1,5 @@ -Repository: {{ context.query.user_name }}/{{ context.query.repo_name }} -Commit: {{ context.query.commit }} -Files analyzed: {{ context.file_count }} +Repository: {{ source.query.user_name }}/{{ source.query.repo_name }} +Commit: {{ source.query.commit }} +Files analyzed: {{ source.file_count }} + +Estimated tokens: {{ source.token_count }} diff --git a/src/gitingest/format/SummaryFormatter/FileSystemDirectory.j2 b/src/gitingest/format/SummaryFormatter/FileSystemDirectory.j2 index dc23c8b7..cb4b6511 100644 --- a/src/gitingest/format/SummaryFormatter/FileSystemDirectory.j2 +++ b/src/gitingest/format/SummaryFormatter/FileSystemDirectory.j2 @@ -1,2 +1,2 @@ Directory structure: -{{ node.tree }} +{{ source.tree }} diff --git a/src/gitingest/ingestion.py b/src/gitingest/ingestion.py index b05e6357..2afc7925 100644 --- a/src/gitingest/ingestion.py +++ b/src/gitingest/ingestion.py @@ -40,7 +40,7 @@ def ingest_query(query: IngestionQuery) -> ContextV1: ------- ContextV1 A ContextV1 object representing the ingested file system nodes. - Use generate_digest(context) to get the summary, directory structure, + Use str(DefaultFormatter(context)) to get the summary, directory structure, and file contents. Raises @@ -96,7 +96,7 @@ def ingest_query(query: IngestionQuery) -> ContextV1: "file_size": file_node.size, }, ) - return ContextV1([file_node], query) + return ContextV1(sources=[file_node], query=query) # Check if this is a git repository and create appropriate node type if _is_git_repository(path): @@ -127,7 +127,7 @@ def ingest_query(query: IngestionQuery) -> ContextV1: }, ) - return ContextV1([root_node], query) + return ContextV1(sources=[root_node], query=query) def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystemStats) -> None: # noqa: C901 diff --git a/src/gitingest/output_formatter.py b/src/gitingest/output_formatter.py index a7adc8b0..a7bb726a 100644 --- a/src/gitingest/output_formatter.py +++ b/src/gitingest/output_formatter.py @@ -11,7 +11,7 @@ import tiktoken from jinja2 import Environment, FileSystemLoader, Template, TemplateNotFound -from gitingest.schemas import ContextV1, FileSystemNode, Source +from gitingest.schemas import FileSystemNode, Source from gitingest.schemas.filesystem import SEPARATOR, FileSystemNodeType from gitingest.utils.compat_func import readlink from gitingest.utils.logging_config import get_logger @@ -136,15 +136,15 @@ def _format_token_count(text: str) -> str | None: return str(total_tokens) -def generate_digest(context: ContextV1) -> str: - """Generate a digest string from a ContextV1 object. +def generate_digest(context: Source) -> str: + """Generate a digest string from a Source object. - This is a convenience function that uses the DefaultFormatter to format a ContextV1. + This is a convenience function that uses the DefaultFormatter to format a Source. Parameters ---------- - context : ContextV1 - The ContextV1 object containing sources and query information. + context : Source + The Source object containing sources and query information. Returns ------- @@ -156,12 +156,12 @@ def generate_digest(context: ContextV1) -> str: return formatter.format(context, context.query) -class DefaultFormatter: - """Default formatter for rendering filesystem nodes using Jinja2 templates.""" +class Formatter: + """Base formatter class.""" - def __init__(self) -> None: + def __init__(self, template_subdir: str) -> None: self.separator = SEPARATOR - template_dir = Path(__file__).parent / "format" / "DefaultFormatter" + template_dir = Path(__file__).parent / "format" / template_subdir self.env = Environment(loader=FileSystemLoader(template_dir), autoescape=True) def _get_template_for_node(self, node: Source) -> Template: @@ -169,23 +169,72 @@ def _get_template_for_node(self, node: Source) -> Template: template_name = f"{node.__class__.__name__}.j2" return self.env.get_template(template_name) + +class DefaultFormatter(Formatter): + """Default formatter for rendering filesystem nodes using Jinja2 templates.""" + + def __init__(self) -> None: + super().__init__("DefaultFormatter") + + def format(self, source: Source, query: IngestionQuery) -> str: + """Format a source with the given query.""" + if query is None: + # Handle case where query is None (shouldn't happen in normal usage) + raise ValueError("ContextV1 must have a valid query object") + + # Calculate and set token count for ContextV1 + if hasattr(source, '_token_count'): + token_count = self._calculate_token_count(source) + source._token_count = token_count + # Also set token count in the extra dict + source.extra["token_count"] = token_count + + try: + return self._format_node(source, query) + except Exception as e: + # Log the error for debugging + import logging + logging.error(f"Error in DefaultFormatter: {e}") + raise + + def _calculate_token_count(self, source: Source) -> str: + """Calculate token count for the entire source.""" + # Gather all content from the source + content = self._gather_all_content(source) + return _format_token_count(content) or "Unknown" + + def _gather_all_content(self, node: Source) -> str: + """Recursively gather all content from the source tree.""" + content_parts = [] + + # Add content from the current node + if hasattr(node, 'content'): + content_parts.append(node.content) + + # Add content from all sources if it's a ContextV1 + if hasattr(node, 'sources'): + for source in node.sources: + content_parts.append(self._gather_all_content(source)) + + # Add content from children if it's a directory + if hasattr(node, 'children'): + for child in node.children: + content_parts.append(self._gather_all_content(child)) + + return "\n".join(filter(None, content_parts)) + @singledispatchmethod - def format(self, node: Source, query: IngestionQuery) -> str: + def _format_node(self, node: Source, query: IngestionQuery) -> str: """Dynamically format any node type based on available templates.""" try: template = self._get_template_for_node(node) # Provide common template variables context_vars = { - "node": node, + "source": node, "query": query, "formatter": self, "SEPARATOR": SEPARATOR, } - # Special handling for ContextV1 objects - if isinstance(node, ContextV1): - context_vars["context"] = node - # Use ContextV1 for backward compatibility - template = self.env.get_template("ContextV1.j2") return template.render(**context_vars) except TemplateNotFound: @@ -193,20 +242,17 @@ def format(self, node: Source, query: IngestionQuery) -> str: return f"{getattr(node, 'content', '')}" -class DebugFormatter: +class DebugFormatter(Formatter): """Debug formatter that shows detailed information about filesystem nodes.""" def __init__(self) -> None: - self.separator = SEPARATOR - template_dir = Path(__file__).parent / "format" / "DebugFormatter" - self.env = Environment(loader=FileSystemLoader(template_dir), autoescape=True) + super().__init__("DebugFormatter") def _get_template_for_node(self, node: Source) -> Template: """Get template based on node class name.""" template_name = f"{node.__class__.__name__}.j2" return self.env.get_template(template_name) - @singledispatchmethod def format(self, node: Source, query: IngestionQuery) -> str: """Dynamically format any node type with debug information.""" try: @@ -254,17 +300,18 @@ def _raise_no_dataclass_fields() -> None: return f"DEBUG: {node.__class__.__name__}" -class SummaryFormatter: +class SummaryFormatter(Formatter): """Dedicated formatter for generating summaries of filesystem nodes.""" def __init__(self) -> None: - template_dir = Path(__file__).parent / "format" / "SummaryFormatter" - self.env = Environment(loader=FileSystemLoader(template_dir), autoescape=True) + super().__init__("SummaryFormatter") - def _get_template_for_node(self, node: Source) -> Template: - """Get template based on node class name.""" - template_name = f"{node.__class__.__name__}.j2" - return self.env.get_template(template_name) + def format(self, source: Source, query: IngestionQuery) -> str: + """Generate the summary output.""" + if query is None: + # Handle case where query is None (shouldn't happen in normal usage) + raise ValueError("ContextV1 must have a valid query object") + return self.summary(source, query) @singledispatchmethod def summary(self, node: Source, query: IngestionQuery) -> str: @@ -272,19 +319,12 @@ def summary(self, node: Source, query: IngestionQuery) -> str: try: # Provide common template variables context_vars = { - "node": node, + "source": node, "query": query, "formatter": self, } - # Special handling for ContextV1 objects - if isinstance(node, ContextV1): - context_vars["context"] = node - # Use ContextV1 for backward compatibility - template = self.env.get_template("ContextV1.j2") - else: - template = self._get_template_for_node(node) - + template = self._get_template_for_node(node) return template.render(**context_vars) except TemplateNotFound: # Fallback: return name if available diff --git a/src/gitingest/query_parser.py b/src/gitingest/query_parser.py index dc4ccdef..c2de93d0 100644 --- a/src/gitingest/query_parser.py +++ b/src/gitingest/query_parser.py @@ -135,7 +135,13 @@ def parse_local_dir_path(path_str: str) -> IngestionQuery: """ path_obj = Path(path_str).resolve() slug = path_obj.name if path_str == "." else path_str.strip("/") - return IngestionQuery(local_path=path_obj, slug=slug, id=uuid.uuid4()) + return IngestionQuery( + local_path=path_obj, + slug=slug, + id=uuid.uuid4(), + user_name="local", # Set a default value for local paths + repo_name=slug, # Use the slug as the repo name for local paths + ) async def _configure_branch_or_tag( diff --git a/src/gitingest/schemas/contextv1.py b/src/gitingest/schemas/contextv1.py index d605e762..99e598bb 100644 --- a/src/gitingest/schemas/contextv1.py +++ b/src/gitingest/schemas/contextv1.py @@ -2,7 +2,8 @@ from __future__ import annotations -from dataclasses import dataclass +from dataclasses import dataclass, field +from pathlib import Path from typing import TYPE_CHECKING, Iterator from gitingest.schemas.filesystem import FileSystemDirectory, FileSystemNode, Source @@ -12,7 +13,7 @@ @dataclass -class ContextV1: +class ContextV1(Source): """The ContextV1 object is an object that contains all information needed to produce a formatted output. This object contains all information needed to produce a formatted output @@ -27,8 +28,19 @@ class ContextV1: """ - sources: list[Source] - query: IngestionQuery + sources: list[Source] = field(default_factory=list) + query: IngestionQuery = field(default=None) + + # Source fields + name: str = "context" + path_str: str = "" + path: Path = Path() + _token_count: str = "" + + def render_tree(self, prefix: str = "", *, is_last: bool = True) -> list[str]: + """Render the tree representation of this source.""" + # Return a simple tree representation for ContextV1 + return [f"{prefix}ContextV1: {len(self.sources)} sources"] @property def sources_by_type(self) -> dict[str, list[Source]]: diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index 49cf2cf0..e6ceff64 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -34,6 +34,9 @@ class FileSystemStats: class Source(ABC): """Abstract base class for all sources (files, directories, etc).""" + metadata: dict = field(default_factory=dict) + extra: dict = field(default_factory=dict) + @abstractmethod def render_tree(self, prefix: str = "", *, is_last: bool = True) -> list[str]: """Render the tree representation of this source.""" @@ -43,9 +46,9 @@ def render_tree(self, prefix: str = "", *, is_last: bool = True) -> list[str]: class FileSystemNode(Source): """Base class for filesystem nodes (files, directories, symlinks).""" - name: str - path_str: str - path: Path + name: str = "" + path_str: str = "" + path: Path = None # type: ignore depth: int = 0 size: int = 0 diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 546975a2..5b5dc226 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -8,7 +8,7 @@ from gitingest.clone import clone_repo from gitingest.ingestion import ingest_query -from gitingest.output_formatter import DefaultFormatter, SummaryFormatter, generate_digest +from gitingest.output_formatter import DefaultFormatter, SummaryFormatter from gitingest.query_parser import parse_remote_repo from gitingest.utils.git_utils import resolve_commit, validate_github_token from gitingest.utils.logging_config import get_logger @@ -303,8 +303,10 @@ async def process_query( try: context = ingest_query(query) - digest = DefaultFormatter().format(context, query) - summary = SummaryFormatter().summary(context, query) + formatter = DefaultFormatter() + digest = formatter.format(context, context.query) + summary_formatter = SummaryFormatter() + summary = summary_formatter.format(context, context.query) # Store digest based on S3 configuration if is_s3_enabled(): @@ -318,7 +320,9 @@ async def process_query( include_patterns=query.include_patterns, ignore_patterns=query.ignore_patterns, ) - s3_url = upload_to_s3(content=generate_digest(context), s3_file_path=s3_file_path, ingest_id=query.id) + s3_url = upload_to_s3( + content=formatter.format(context, context.query), s3_file_path=s3_file_path, ingest_id=query.id + ) # Store S3 URL in query for later use query.s3_url = s3_url else: diff --git a/test_formatting.py b/test_formatting.py deleted file mode 100644 index 38cd2265..00000000 --- a/test_formatting.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Test script for formatting functionality.""" - -from pathlib import Path -from unittest.mock import Mock - -from gitingest.output_formatter import DefaultFormatter -from gitingest.schemas import ContextV1 -from gitingest.schemas.filesystem import FileSystemFile, GitRepository - -# Create a mock query -mock_query = Mock() -mock_query.user_name = "test_user" -mock_query.repo_name = "test_repo" - -# Create a simple file -mock_file = FileSystemFile( - name="test.py", - path_str="test.py", - path=Path("test.py"), -) -mock_file.content = "print('hello world')" - -# Create a git repository with the file -mock_repo = GitRepository( - name="test_repo", - path_str="", - path=Path(), - children=[mock_file], -) - -# Create context -context = ContextV1([mock_repo], mock_query) - -# Test formatting -formatter = DefaultFormatter() -result = formatter.format(context, mock_query) -# Results can be inspected with debugger or logging if needed