Skip to content

Commit ba701a8

Browse files
feat: ignore files matched by .gitignore by default (use --include-gitignored to keep the previous behavior)
* use_gitignore flag to exclude gitignore --------- Co-authored-by: Filip Christiansen <[email protected]>
1 parent c19f275 commit ba701a8

File tree

10 files changed

+165
-17
lines changed

10 files changed

+165
-17
lines changed

.pre-commit-config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ repos:
105105
starlette>=0.40.0,
106106
tiktoken,
107107
tomli,
108+
pathspec,
108109
uvicorn>=0.11.7,
109110
]
110111
- id: pylint
@@ -124,6 +125,7 @@ repos:
124125
starlette>=0.40.0,
125126
tiktoken,
126127
tomli,
128+
pathspec,
127129
uvicorn>=0.11.7,
128130
]
129131

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,9 @@ export GITHUB_TOKEN=github_pat_...
109109
gitingest https://github.com/username/private-repo
110110
```
111111

112+
By default, files listed in `.gitignore` are skipped. Use `--include-gitignored` if you
113+
need those files in the digest.
114+
112115
By default, the digest is written to a text file (`digest.txt`) in your current working directory. You can customize the output in two ways:
113116

114117
- Use `--output/-o <filename>` to write to a specific file.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ dependencies = [
1313
"starlette>=0.40.0", # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw
1414
"tiktoken>=0.7.0", # Support for o200k_base encoding
1515
"tomli",
16+
"pathspec>=0.12.1",
1617
"typing_extensions; python_version < '3.10'",
1718
"uvicorn>=0.11.7", # Vulnerable to https://osv.dev/vulnerability/PYSEC-2020-150
1819
]

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
click>=8.0.0
22
fastapi[standard]>=0.109.1 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38
3+
pathspec>=0.12.1
34
pydantic
45
python-dotenv
56
slowapi

src/gitingest/cli.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,12 @@
4444
),
4545
)
4646
@click.option("--branch", "-b", default=None, help="Branch to clone and ingest")
47+
@click.option(
48+
"--include-gitignored",
49+
is_flag=True,
50+
default=False,
51+
help="Include files matched by .gitignore",
52+
)
4753
@click.option(
4854
"--token",
4955
"-t",
@@ -61,6 +67,7 @@ def main(
6167
exclude_pattern: Tuple[str, ...],
6268
include_pattern: Tuple[str, ...],
6369
branch: Optional[str],
70+
include_gitignored: bool,
6471
token: Optional[str],
6572
):
6673
"""
@@ -83,11 +90,12 @@ def main(
8390
Glob patterns for including files in the output.
8491
branch : str, optional
8592
Specific branch to ingest (defaults to the repository's default).
93+
include_gitignored : bool
94+
If provided, include files normally ignored by .gitignore.
8695
token: str, optional
8796
GitHub personal-access token (PAT). Needed when *source* refers to a
8897
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
8998
"""
90-
9199
asyncio.run(
92100
_async_main(
93101
source=source,
@@ -96,6 +104,7 @@ def main(
96104
exclude_pattern=exclude_pattern,
97105
include_pattern=include_pattern,
98106
branch=branch,
107+
include_gitignored=include_gitignored,
99108
token=token,
100109
)
101110
)
@@ -108,6 +117,7 @@ async def _async_main(
108117
exclude_pattern: Tuple[str, ...],
109118
include_pattern: Tuple[str, ...],
110119
branch: Optional[str],
120+
include_gitignored: bool,
111121
token: Optional[str],
112122
) -> None:
113123
"""
@@ -132,6 +142,8 @@ async def _async_main(
132142
Glob patterns for including files in the output.
133143
branch : str, optional
134144
Specific branch to ingest (defaults to the repository's default).
145+
include_gitignored : bool
146+
If provided, include files normally ignored by .gitignore.
135147
token: str, optional
136148
GitHub personal-access token (PAT). Needed when *source* refers to a
137149
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
@@ -160,6 +172,7 @@ async def _async_main(
160172
exclude_patterns=exclude_patterns,
161173
branch=branch,
162174
output=output_target,
175+
include_gitignored=include_gitignored,
163176
token=token,
164177
)
165178

src/gitingest/entrypoint.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from gitingest.config import TMP_BASE_PATH
1212
from gitingest.ingestion import ingest_query
1313
from gitingest.query_parsing import IngestionQuery, parse_query
14+
from gitingest.utils.ignore_patterns import load_gitignore_patterns
1415

1516

1617
async def ingest_async(
@@ -19,6 +20,7 @@ async def ingest_async(
1920
include_patterns: Optional[Union[str, Set[str]]] = None,
2021
exclude_patterns: Optional[Union[str, Set[str]]] = None,
2122
branch: Optional[str] = None,
23+
include_gitignored: bool = False,
2224
token: Optional[str] = None,
2325
output: Optional[str] = None,
2426
) -> Tuple[str, str, str]:
@@ -42,6 +44,8 @@ async def ingest_async(
4244
Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded.
4345
branch : str, optional
4446
The branch to clone and ingest. If `None`, the default branch is used.
47+
include_gitignored : bool
48+
If ``True``, include files ignored by ``.gitignore``. Defaults to ``False``.
4549
token : str, optional
4650
GitHub personal-access token (PAT). Needed when *source* refers to a
4751
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
@@ -76,6 +80,10 @@ async def ingest_async(
7680
token=token,
7781
)
7882

83+
if not include_gitignored:
84+
gitignore_patterns = load_gitignore_patterns(query.local_path)
85+
query.ignore_patterns.update(gitignore_patterns)
86+
7987
if query.url:
8088
selected_branch = branch if branch else query.branch # prioritize branch argument
8189
query.branch = selected_branch
@@ -117,6 +125,7 @@ def ingest(
117125
include_patterns: Optional[Union[str, Set[str]]] = None,
118126
exclude_patterns: Optional[Union[str, Set[str]]] = None,
119127
branch: Optional[str] = None,
128+
include_gitignored: bool = False,
120129
token: Optional[str] = None,
121130
output: Optional[str] = None,
122131
) -> Tuple[str, str, str]:
@@ -140,6 +149,8 @@ def ingest(
140149
Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded.
141150
branch : str, optional
142151
The branch to clone and ingest. If `None`, the default branch is used.
152+
include_gitignored : bool
153+
If ``True``, include files ignored by ``.gitignore``. Defaults to ``False``.
143154
token : str, optional
144155
GitHub personal-access token (PAT). Needed when *source* refers to a
145156
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
@@ -165,6 +176,7 @@ def ingest(
165176
include_patterns=include_patterns,
166177
exclude_patterns=exclude_patterns,
167178
branch=branch,
179+
include_gitignored=include_gitignored,
168180
token=token,
169181
output=output,
170182
)

src/gitingest/utils/ignore_patterns.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Default ignore patterns for Gitingest."""
22

3+
import os
4+
from pathlib import Path
35
from typing import Set
46

57
DEFAULT_IGNORE_PATTERNS: Set[str] = {
@@ -160,3 +162,47 @@
160162
# Gitingest
161163
"digest.txt",
162164
}
165+
166+
167+
def load_gitignore_patterns(root: Path) -> Set[str]:
    """
    Recursively load ignore patterns from all .gitignore files under the given root directory.

    Every pattern is rewritten so that it is expressed relative to ``root`` (with forward
    slashes), which lets patterns coming from nested ``.gitignore`` files be merged into a
    single collection. Blank lines and ``#`` comment lines are skipped, and a leading ``!``
    (negation) is preserved on the rewritten pattern.

    Parameters
    ----------
    root : Path
        The root directory to search for .gitignore files.

    Returns
    -------
    Set[str]
        A set of ignore patterns extracted from all .gitignore files found under the root directory.
        Because the result is a set, the order in which patterns appeared in the files is not
        retained.
    """
    patterns: Set[str] = set()
    for dirpath, _, filenames in os.walk(root):
        if ".gitignore" not in filenames:
            continue

        # Constant for the whole file, so compute it once rather than once per line.
        # ``as_posix`` normalises only the directory prefix; the pattern text itself is left
        # untouched so gitignore escapes such as ``\#`` survive.
        rel_dir = Path(os.path.relpath(dirpath, root)).as_posix()

        gitignore_path = Path(dirpath) / ".gitignore"
        with gitignore_path.open("r", encoding="utf-8") as f:
            for line in f:
                stripped = line.strip()

                if not stripped or stripped.startswith("#"):
                    continue

                negated = stripped.startswith("!")
                if negated:
                    stripped = stripped[1:]

                # Nothing left to match once slashes are removed (e.g. a bare "/" or "!").
                if not stripped.strip("/"):
                    continue

                pattern_body = _scope_gitignore_pattern(stripped, rel_dir)
                patterns.add(f"!{pattern_body}" if negated else pattern_body)

    return patterns


def _scope_gitignore_pattern(body: str, rel_dir: str) -> str:
    """Rewrite one pattern found in ``rel_dir``'s .gitignore so that it is relative to the root."""
    anchored = body.startswith("/")
    body = body.lstrip("/")

    if rel_dir in ("", "."):
        # Root-level file: keep the leading slash, it is what pins the pattern to the root.
        return f"/{body}" if anchored else body

    # A slash at the start or in the middle ties the pattern to the .gitignore's own directory.
    if anchored or "/" in body.rstrip("/"):
        return f"{rel_dir}/{body}"

    # A slash-less pattern applies at any depth below the .gitignore's directory.
    return f"{rel_dir}/**/{body}"

src/gitingest/utils/ingestion_utils.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
"""Utility functions for the ingestion process."""
22

3-
from fnmatch import fnmatch
43
from pathlib import Path
54
from typing import Set
65

6+
from pathspec import PathSpec
7+
78

89
def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> bool:
910
"""
@@ -38,10 +39,8 @@ def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) ->
3839
if path.is_dir():
3940
return True
4041

41-
for pattern in include_patterns:
42-
if fnmatch(rel_str, pattern):
43-
return True
44-
return False
42+
spec = PathSpec.from_lines("gitwildmatch", include_patterns)
43+
return spec.match_file(rel_str)
4544

4645

4746
def _should_exclude(path: Path, base_path: Path, ignore_patterns: Set[str]) -> bool:
@@ -73,7 +72,5 @@ def _should_exclude(path: Path, base_path: Path, ignore_patterns: Set[str]) -> b
7372
return True
7473

7574
rel_str = str(rel_path)
76-
for pattern in ignore_patterns:
77-
if pattern and fnmatch(rel_str, pattern):
78-
return True
79-
return False
75+
spec = PathSpec.from_lines("gitwildmatch", ignore_patterns)
76+
return spec.match_file(rel_str)

tests/test_gitignore_feature.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
"""
2+
Tests for the gitignore functionality in Gitingest.
3+
"""
4+
5+
from pathlib import Path
6+
7+
import pytest
8+
9+
from gitingest.entrypoint import ingest_async
10+
from gitingest.utils.ignore_patterns import load_gitignore_patterns
11+
12+
13+
@pytest.fixture(name="repo_path")
def repo_fixture(tmp_path: Path) -> Path:
    """
    Build a throwaway repository layout for the gitignore tests.

    The directory contains:
    - ``.gitignore`` listing ``exclude.txt``
    - ``include.txt``, which is expected to be processed
    - ``exclude.txt``, which is expected to be skipped while gitignore rules are respected
    """
    # File name -> content, written in the listed order.
    layout = {
        ".gitignore": "exclude.txt\n",
        "include.txt": "This file should be included.",
        "exclude.txt": "This file should be excluded.",
    }
    for file_name, text in layout.items():
        (tmp_path / file_name).write_text(text)

    return tmp_path
34+
35+
36+
def test_load_gitignore_patterns(tmp_path: Path):
    """
    Check that load_gitignore_patterns() picks up the patterns of a root-level .gitignore.
    """
    # Two real patterns followed by a comment line.
    (tmp_path / ".gitignore").write_text("exclude.txt\n*.log\n# a comment\n")

    loaded = load_gitignore_patterns(tmp_path)

    # Both real patterns must be present ...
    assert "exclude.txt" in loaded
    assert "*.log" in loaded
    # ... and no comment line may have leaked into the result.
    assert not any(entry.startswith("#") for entry in loaded)
52+
53+
54+
@pytest.mark.asyncio
async def test_ingest_with_gitignore(repo_path: Path):
    """
    Integration test: ingest_async() honours .gitignore rules unless told otherwise.

    With ``include_gitignored`` left at its default (``False``) the content of 'exclude.txt' is omitted.
    With ``include_gitignored=True`` the content of both files is present.
    """
    kept_text = "This file should be included."
    skipped_text = "This file should be excluded."
    source = str(repo_path)

    # Default behaviour: gitignore rules are applied.
    _, _, digest_default = await ingest_async(source=source)
    assert skipped_text not in digest_default
    assert kept_text in digest_default

    # Opt-out: gitignored files are ingested as well.
    _, _, digest_all = await ingest_async(source=source, include_gitignored=True)
    assert skipped_text in digest_all
    assert kept_text in digest_all

tests/test_ingestion.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -84,10 +84,10 @@ class PatternScenario(TypedDict):
8484
"*/file_dir2.txt",
8585
},
8686
"ignore_patterns": {*()},
87-
"expected_num_files": 3,
88-
"expected_content": {"file1.txt", "file2.py", "dir2/file_dir2.txt"},
89-
"expected_structure": {"test_repo/", "dir2/"},
90-
"expected_not_structure": {"src/", "subdir/", "dir1/"},
87+
"expected_num_files": 4,
88+
"expected_content": {"file1.txt", "file2.py", "dir1/file_dir1.txt", "dir2/file_dir2.txt"},
89+
"expected_structure": {"test_repo/", "dir1/", "dir2/"},
90+
"expected_not_structure": {"src/", "subdir/"},
9191
}
9292
),
9393
id="include-wildcard-directory",
@@ -114,9 +114,10 @@ class PatternScenario(TypedDict):
114114
{
115115
"include_patterns": {"**/file_dir2.txt", "src/**/*.py"},
116116
"ignore_patterns": {*()},
117-
"expected_num_files": 2,
117+
"expected_num_files": 3,
118118
"expected_content": {
119119
"dir2/file_dir2.txt",
120+
"src/subfile2.py",
120121
"src/subdir/file_subdir.py",
121122
},
122123
"expected_structure": {"test_repo/", "dir2/", "src/", "subdir/"},
@@ -169,12 +170,11 @@ class PatternScenario(TypedDict):
169170
{
170171
"include_patterns": {*()},
171172
"ignore_patterns": {"src/**/*.py"},
172-
"expected_num_files": 7,
173+
"expected_num_files": 6,
173174
"expected_content": {
174175
"file1.txt",
175176
"file2.py",
176177
"src/subfile1.txt",
177-
"src/subfile2.py",
178178
"src/subdir/file_subdir.txt",
179179
"dir1/file_dir1.txt",
180180
"dir2/file_dir2.txt",

0 commit comments

Comments
 (0)