Skip to content

Commit e5fadce

Browse files
feat(parser): relax host validation to support self-hosted GitLab & git.* domains (#314)
• Accept hosts starting with “git.” or “gitlab.” in _looks_like_git_host
• Update docstrings to document the heuristic
• Adjust git-host-agnostic tests: expect ValueError for slug form with custom hosts; add real GitLab instance (git.rwth-aachen.de) to matrix
1 parent: 4ee598c · commit: e5fadce

File tree

4 files changed

+44
-4
lines changed

4 files changed

+44
-4
lines changed

src/gitingest/utils/query_parser_utils.py

Lines changed: 31 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -60,22 +60,48 @@ def _is_valid_pattern(pattern: str) -> bool:
6060

6161
def _validate_host(host: str) -> None:
6262
"""
63-
Validate the given host against the known Git hosts.
63+
Validate a hostname.
64+
65+
The host is accepted if it is either present in the hard-coded `KNOWN_GIT_HOSTS` list or if it satisfies the
66+
simple heuristics in `_looks_like_git_host`, which try to recognise common self-hosted Git services (e.g. GitLab
67+
instances on sub-domains such as `gitlab.example.com` or `git.example.com`).
6468
6569
Parameters
6670
----------
6771
host : str
68-
The host to validate.
72+
Hostname (case-insensitive).
6973
7074
Raises
7175
------
7276
ValueError
73-
If the host is not a known Git host.
77+
If the host cannot be recognised as a probable Git hosting domain.
7478
"""
75-
if host not in KNOWN_GIT_HOSTS:
79+
host = host.lower()
80+
if host not in KNOWN_GIT_HOSTS and not _looks_like_git_host(host):
7681
raise ValueError(f"Unknown domain '{host}' in URL")
7782

7883

84+
def _looks_like_git_host(host: str) -> bool:
85+
"""
86+
Check if the given host looks like a Git host.
87+
88+
The current heuristic returns `True` when the host starts with `git.` (e.g. `git.example.com`) or starts with
89+
`gitlab.` (e.g. `gitlab.company.com`).
90+
91+
Parameters
92+
----------
93+
host : str
94+
Hostname (case-insensitive).
95+
96+
Returns
97+
-------
98+
bool
99+
True if the host looks like a Git host, otherwise False.
100+
"""
101+
host = host.lower()
102+
return host.startswith(("git.", "gitlab."))
103+
104+
79105
def _validate_url_scheme(scheme: str) -> None:
80106
"""
81107
Validate the given scheme against the known schemes.
@@ -90,6 +116,7 @@ def _validate_url_scheme(scheme: str) -> None:
90116
ValueError
91117
If the scheme is not 'http' or 'https'.
92118
"""
119+
scheme = scheme.lower()
93120
if scheme not in ("https", "http"):
94121
raise ValueError(f"Invalid URL scheme '{scheme}' in URL")
95122

tests/query_parser/__init__.py

Whitespace-only changes.

tests/query_parser/test_git_host_agnostic.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import pytest
1111

1212
from gitingest.query_parsing import parse_query
13+
from gitingest.utils.query_parser_utils import KNOWN_GIT_HOSTS
1314

1415
# Repository matrix: (host, user, repo)
1516
_REPOS: List[Tuple[str, str, str]] = [
@@ -18,6 +19,8 @@
1819
("bitbucket.org", "na-dna", "llm-knowledge-share"),
1920
("gitea.com", "xorm", "xorm"),
2021
("codeberg.org", "forgejo", "forgejo"),
22+
("git.rwth-aachen.de", "medialab", "19squared"),
23+
("gitlab.alpinelinux.org", "alpine", "apk-tools"),
2124
]
2225

2326

@@ -43,6 +46,13 @@ async def test_parse_query_without_host(
4346

4447
expected_url = f"https://{host}/{user}/{repo}"
4548

49+
# For slug form with a custom host (not in KNOWN_GIT_HOSTS) we expect a failure,
50+
# because the parser cannot guess which domain to use.
51+
if variant == "slug" and host not in KNOWN_GIT_HOSTS:
52+
with pytest.raises(ValueError):
53+
await parse_query(url, max_file_size=50, from_web=True)
54+
return
55+
4656
query = await parse_query(url, max_file_size=50, from_web=True)
4757

4858
# Compare against the canonical dict while ignoring unpredictable fields.

tests/query_parser/test_query_parser.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@
2424
"https://gitea.com/user/repo",
2525
"https://codeberg.org/user/repo",
2626
"https://gist.github.com/user/repo",
27+
"https://git.example.com/user/repo",
28+
"https://gitlab.example.com/user/repo",
29+
"https://gitlab.example.se/user/repo",
2730
]
2831

2932
URLS_HTTP: List[str] = [url.replace("https://", "http://") for url in URLS_HTTPS]

0 commit comments

Comments
 (0)