Skip to content

Commit 18220bb

Browse files
feat: Implement secret scanning with gitleaks and trufflehog (#319)
* feat: Implement secret scanning with gitleaks and trufflehog This commit introduces secret scanning to the project to prevent the accidental committal of credentials. Key changes include: - A new `.github/workflows/security.yml` workflow to run `gitleaks` and `trufflehog` on pull requests. - A `.pre-commit-config.yaml` to run `gitleaks` and `trufflehog` as pre-commit hooks. - A `.gitleaks.toml` configuration file to define rules and allowlists for `gitleaks`. - A new `scan-secrets` target in the `Makefile` for local scanning. - The `pre-commit-run` target in the `Makefile` has been updated to include the `scan-secrets` target. - Updated `README.md` to document the new feature. This addresses issue #270. * fix: Implement comprehensive code review fixes for PR #319 Addresses all issues identified in code review comment #3369314346: CRITICAL FIXES (3 blockers): 1. Restored pre-commit configuration - Added secret scanning hooks while preserving ALL existing hooks (Ruff, MyPy, GitHub workflow validation, Poetry check, test isolation, strangler pattern compliance) 2. Fixed Makefile pre-commit-run target - Restored Poetry dependency and removed circular dependency with scan-secrets 3. Enhanced security workflow - Added error handling, updated actions to v4, added permissions, and configured continue-on-error to prevent blocking PRs on false positives MAJOR IMPROVEMENTS: 4. Optimized secret scanning performance - Using native gitleaks/trufflehog binaries for pre-commit (fast), Docker only for CI 5. Enhanced gitleaks configuration: - Added rules for WatsonX, Anthropic, MLFlow, MinIO, PostgreSQL, JWT - Added entropy detection for high-entropy strings - Enhanced allowlist for test files, docs, deployment scripts - Added stopwords to reduce false positives - Fixed TOML format to use [[rules]] with explicit IDs 6. 
Removed unrelated test file formatting change CODE QUALITY FIXES: - Merged nested with statements into single parenthesized with statements (parenthesized context managers, Python 3.10+) - Removed unused test method argument - Fixed MyPy type ignore comments - Removed non-existent validate-ci.sh hook reference TECHNICAL DETAILS: - Pre-commit hooks now use language: system with native binaries - Gitleaks uses --staged flag for faster pre-commit execution - TruffleHog uses --only-verified to reduce false positives - Fixed deprecated stage names (commit -> pre-commit) - All secret scanning rules follow correct TOML array-of-tables format --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> Co-authored-by: Manav Gupta <[email protected]>
1 parent a9032f1 commit 18220bb

File tree

7 files changed

+232
-22
lines changed

7 files changed

+232
-22
lines changed

.github/workflows/security.yml

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
name: Security Scan
2+
3+
on:
4+
pull_request:
5+
branches: [ main ]
6+
workflow_dispatch:
7+
8+
permissions:
9+
contents: read
10+
security-events: write
11+
12+
jobs:
13+
gitleaks:
  # Scans the checked-out repository with gitleaks using the repo's .gitleaks.toml.
  # The job is advisory: neither the job nor the scan step may fail the workflow.
  name: gitleaks
  runs-on: ubuntu-latest
  continue-on-error: true # Don't block PR on false positives
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # 0 = fetch the full history instead of a shallow clone.
        fetch-depth: 0

    - name: Run gitleaks
      # The id lets the report step below read this step's raw outcome.
      id: gitleaks
      uses: gitleaks/gitleaks-action@v2
      env:
        GITLEAKS_CONFIG: .gitleaks.toml
        # gitleaks-action reads GITLEAKS_ENABLE_UPLOAD_ARTIFACT; the shorter
        # GITLEAKS_ENABLE_UPLOAD name is not a documented option and had no effect.
        GITLEAKS_ENABLE_UPLOAD_ARTIFACT: false
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      continue-on-error: true

    - name: Report gitleaks status
      # continue-on-error on the scan step rewrites its conclusion to success, so
      # failure() would never be true here. `outcome` is the result recorded
      # before continue-on-error is applied.
      if: steps.gitleaks.outcome == 'failure'
      run: |
        echo "⚠️ Gitleaks detected potential secrets. Please review the findings above."
        echo "If these are false positives, update .gitleaks.toml allowlist."
        exit 0
37+
38+
trufflehog:
  # Scans the pull request's commit range with TruffleHog, reporting only
  # verified secrets. The job is advisory and must not fail the workflow.
  name: trufflehog
  runs-on: ubuntu-latest
  continue-on-error: true # Don't block PR on false positives
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # 0 = fetch the full history instead of a shallow clone.
        fetch-depth: 0

    - name: Run trufflehog
      # The id lets the report step below read this step's raw outcome.
      id: trufflehog
      # NOTE(review): the version tag after "trufflehog@" is unreadable in this
      # rendering ("[email protected]"); restore the pinned tag before use.
      uses: trufflesecurity/[email protected]
      with:
        path: ./
        # NOTE(review): both expressions are empty on workflow_dispatch runs,
        # which carry no pull_request payload - confirm the action's fallback.
        base: ${{ github.event.pull_request.base.sha }}
        head: ${{ github.event.pull_request.head.sha }}
        extra_args: --only-verified
      continue-on-error: true

    - name: Report trufflehog status
      # continue-on-error on the scan step rewrites its conclusion to success, so
      # failure() would never be true here. `outcome` is the result recorded
      # before continue-on-error is applied.
      if: steps.trufflehog.outcome == 'failure'
      run: |
        echo "⚠️ TruffleHog detected verified secrets. Please review the findings above."
        echo "Remove any real secrets and rotate compromised credentials immediately."
        exit 0

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ __pycache__/
1313
venv/
1414
env/
1515
.venv/
16+
backend/.venv/
1617

1718
# Ignore version control files and directories
1819
.git/

.gitleaks.toml

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# .gitleaks.toml
2+
3+
[allowlist]
4+
description = "Allowlist for paths and commits that are known to be safe."
5+
paths = [
6+
'''gitleaks.toml''',
7+
'''(.*?)(go.sum|go.mod|vendor)''',
8+
'''(.*?)(package.json|package-lock.json|npm-shrinkwrap.json)''',
9+
'''(.*?)(Pipfile|Pipfile.lock|poetry.lock)''',
10+
'''(.*?)(Gemfile.lock|gems.locked)''',
11+
'''(.*?)(Cargo.lock)''',
12+
'''(.*?)(yarn.lock)''',
13+
'''(.*?)(composer.lock)''',
14+
'''(.*?)(.snap)''',
15+
'''(.*?)(\.md|\.txt)''', # Documentation files
16+
'''env\.example''', # Example env files
17+
'''(.*?)test_.*\.py''', # Test files with fixtures
18+
'''(.*?)tests/fixtures/.*''', # Test fixtures
19+
'''deployment/scripts/.*''', # Deployment scripts with env var templates
20+
'''\.env\..*''', # Environment template files
21+
]
22+
23+
# Stopwords to avoid false positives
24+
stopwords = [
25+
"example",
26+
"sample",
27+
"test",
28+
"mock",
29+
"dummy",
30+
"placeholder",
31+
]
32+
33+
[[rules]]
34+
id = "aws-access-token"
35+
description = "AWS Access Token"
36+
regex = '''(A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)[A-Z0-9]{16}'''
37+
tags = ["key", "AWS"]
38+
39+
[[rules]]
40+
id = "github-pat"
41+
description = "GitHub Personal Access Token"
42+
regex = '''ghp_[0-9a-zA-Z]{36}'''
43+
tags = ["key", "GitHub"]
44+
45+
[[rules]]
46+
id = "github-fine-grained-pat"
47+
description = "GitHub Fine-Grained Personal Access Token"
48+
regex = '''github_pat_[0-9a-zA-Z]{22}_[0-9a-zA-Z]{59}'''
49+
tags = ["key", "GitHub"]
50+
51+
[[rules]]
52+
id = "github-app-token"
53+
description = "GitHub App Token"
54+
regex = '''(ghu|ghs)_[0-9a-zA-Z]{36}'''
55+
tags = ["key", "GitHub"]
56+
57+
[[rules]]
58+
id = "github-refresh-token"
59+
description = "GitHub Refresh Token"
60+
regex = '''ghr_[0-9a-zA-Z]{76}'''
61+
tags = ["key", "GitHub"]
62+
63+
[[rules]]
64+
id = "slack-token"
65+
description = "Slack Token"
66+
regex = '''xox[baprs]-([0-9a-zA-Z-]{10,48})?'''
67+
tags = ["key", "Slack"]
68+
69+
[[rules]]
70+
id = "stripe-sk"
71+
description = "Stripe Secret Key"
72+
regex = '''sk_live_[0-9a-zA-Z]{24}'''
73+
tags = ["key", "Stripe"]
74+
75+
[[rules]]
76+
id = "stripe-rk"
77+
description = "Stripe Restricted Key"
78+
regex = '''rk_live_[0-9a-zA-Z]{24}'''
79+
tags = ["key", "Stripe"]
80+
81+
[[rules]]
82+
id = "private-key"
83+
description = "Private Key"
84+
regex = '''-----BEGIN ((EC|PGP|OPENSSH|RSA|DSA) )?PRIVATE KEY( BLOCK)?-----'''
85+
tags = ["key", "Asymmetric"]
86+
87+
[[rules]]
88+
id = "watsonx-api-key"
89+
description = "WatsonX API Key"
90+
regex = '''(?i)(WATSONX_APIKEY|WATSONX_API_KEY)\s*[=:]\s*['"]?([a-zA-Z0-9_-]{32,})['"]?'''
91+
tags = ["key", "WatsonX"]
92+
93+
[[rules]]
94+
id = "anthropic-api-key"
95+
description = "Anthropic API Key"
96+
regex = '''(?i)ANTHROPIC_API_KEY\s*[=:]\s*['"]?(sk-ant-[a-zA-Z0-9_-]{32,})['"]?'''
97+
tags = ["key", "Anthropic"]
98+
99+
[[rules]]
100+
id = "mlflow-credentials"
101+
description = "MLFlow Credentials"
102+
regex = '''(?i)MLFLOW_TRACKING_(USERNAME|PASSWORD)\s*[=:]\s*['"]?([^'"\s]{3,})['"]?'''
103+
tags = ["credentials", "MLFlow"]
104+
105+
[[rules]]
106+
id = "minio-credentials"
107+
description = "MinIO Credentials"
108+
regex = '''(?i)MINIO_ROOT_(USER|PASSWORD)\s*[=:]\s*['"]?([^'"\s]{3,})['"]?'''
109+
tags = ["credentials", "MinIO"]
110+
111+
[[rules]]
112+
id = "postgres-password"
113+
description = "PostgreSQL Password"
114+
regex = '''(?i)(POSTGRES_PASSWORD|COLLECTIONDB_PASSWORD)\s*[=:]\s*['"]?([^'"\s]{3,})['"]?'''
115+
tags = ["password", "PostgreSQL"]
116+
117+
[[rules]]
118+
id = "jwt-secret-key"
119+
description = "JWT Secret Key"
120+
regex = '''(?i)JWT_SECRET_KEY\s*[=:]\s*['"]?([a-zA-Z0-9_-]{32,})['"]?'''
121+
tags = ["secret", "JWT"]
122+
123+
[[rules]]
124+
id = "high-entropy-strings"
125+
description = "High Entropy String (possible secret)"
126+
regex = '''[a-zA-Z0-9+/=]{32,}'''
127+
entropy = 4.5
128+
tags = ["entropy"]

.pre-commit-config.yaml

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -87,13 +87,6 @@ repos:
8787
# Local validation
8888
- repo: local
8989
hooks:
90-
- id: validate-ci-locally
91-
name: Validate CI workflows can run locally
92-
entry: ./scripts/validate-ci.sh
93-
language: script
94-
files: ^\.github/workflows/.*\.ya?ml$
95-
pass_filenames: false
96-
9790
- id: python-poetry-check
9891
name: Check poetry configuration
9992
entry: bash -c 'cd backend && poetry check'
@@ -121,3 +114,16 @@ repos:
121114
language: system
122115
files: ^(backend|scripts)/.*\.py$
123116
pass_filenames: false
117+
118+
# Secret scanning hooks (gitleaks + trufflehog)
119+
- id: gitleaks
120+
name: Detect hardcoded secrets using Gitleaks
121+
entry: gitleaks protect --verbose --redact -c .gitleaks.toml --staged
122+
language: system
123+
stages: [pre-commit]
124+
125+
- id: trufflehog
126+
name: Detect hardcoded secrets using TruffleHog
127+
entry: trufflehog filesystem --directory . --only-verified
128+
language: system
129+
stages: [pre-commit]

Makefile

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1121,7 +1121,7 @@ format-check: venv
11211121
@echo "$(GREEN)✅ Format check completed$(NC)"
11221122

11231123
## Pre-commit targets
1124-
pre-commit-run:
1124+
pre-commit-run: venv
11251125
@echo "$(CYAN)🔧 Running pre-commit hooks on all files...$(NC)"
11261126
@cd backend && $(POETRY) run pre-commit run --all-files
11271127
@echo "$(GREEN)✅ Pre-commit run completed$(NC)"
@@ -1172,6 +1172,14 @@ security-check: venv
11721172
@cd backend && $(POETRY) run safety check || echo "$(YELLOW)⚠️ Some dependency vulnerabilities found$(NC)"
11731173
@echo "$(GREEN)✅ Security checks completed$(NC)"
11741174

1175+
scan-secrets:
1176+
@echo "$(CYAN)🔑 Running secret scanning...$(NC)"
1177+
@echo "Running gitleaks..."
1178+
@docker run --rm -v $(CURDIR):/path gitleaks/gitleaks:latest detect --source /path --config /path/.gitleaks.toml --verbose
1179+
@echo "Running trufflehog..."
1180+
@docker run --rm -v $(CURDIR):/path trufflesecurity/trufflehog:latest filesystem /path
1181+
@echo "$(GREEN)✅ Secret scanning completed$(NC)"
1182+
11751183
## Coverage targets
11761184
coverage: venv
11771185
@echo "$(CYAN)📈 Running tests with coverage...$(NC)"

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,9 @@ make lint
190190
# Security scanning
191191
make security-check
192192

193+
# Secret scanning
194+
make scan-secrets
195+
193196
# Coverage report
194197
make coverage
195198
```

backend/tests/unit/test_podcast_service_unit.py

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -138,12 +138,14 @@ async def test_get_podcast_returns_output(self, mock_service: PodcastService) ->
138138
mock_podcast = Mock()
139139
mock_podcast.user_id = user_id
140140

141-
with patch.object(mock_service.repository, "get_by_id", new=AsyncMock(return_value=mock_podcast)) as mock_get:
142-
with patch.object(mock_service.repository, "to_schema", return_value=mock_output):
143-
result = await mock_service.get_podcast(podcast_id, user_id)
141+
with (
142+
patch.object(mock_service.repository, "get_by_id", new=AsyncMock(return_value=mock_podcast)) as mock_get,
143+
patch.object(mock_service.repository, "to_schema", return_value=mock_output),
144+
):
145+
result = await mock_service.get_podcast(podcast_id, user_id)
144146

145-
assert result == mock_output
146-
mock_get.assert_called_once_with(podcast_id)
147+
assert result == mock_output
148+
mock_get.assert_called_once_with(podcast_id)
147149

148150
@pytest.mark.asyncio
149151
async def test_list_user_podcasts(self, mock_service: PodcastService) -> None:
@@ -165,12 +167,14 @@ async def test_delete_podcast(self, mock_service: PodcastService) -> None:
165167
podcast_id = uuid4()
166168
user_id = uuid4()
167169

168-
with patch.object(mock_service.repository, "get_by_id", new=AsyncMock(return_value=Mock(user_id=user_id))):
169-
with patch.object(mock_service.repository, "delete", new=AsyncMock(return_value=True)) as mock_delete:
170-
result = await mock_service.delete_podcast(podcast_id, user_id)
170+
with (
171+
patch.object(mock_service.repository, "get_by_id", new=AsyncMock(return_value=Mock(user_id=user_id))),
172+
patch.object(mock_service.repository, "delete", new=AsyncMock(return_value=True)) as mock_delete,
173+
):
174+
result = await mock_service.delete_podcast(podcast_id, user_id)
171175

172-
assert result is True
173-
mock_delete.assert_called_once_with(podcast_id)
176+
assert result is True
177+
mock_delete.assert_called_once_with(podcast_id)
174178

175179

176180
@pytest.mark.unit
@@ -224,7 +228,7 @@ def mock_service(self) -> PodcastService:
224228
collection_service=collection_service,
225229
search_service=search_service,
226230
)
227-
service.search_service.search = AsyncMock() # type: ignore[attr-defined]
231+
service.search_service.search = AsyncMock() # type: ignore[method-assign,attr-defined]
228232
return service
229233

230234
@pytest.mark.asyncio
@@ -247,9 +251,7 @@ async def test_retrieve_content_uses_description_in_query(self, mock_service: Po
247251
assert description in search_input.question
248252

249253
@pytest.mark.asyncio
250-
async def test_retrieve_content_uses_generic_query_without_description(
251-
self, mock_service: PodcastService
252-
) -> None:
254+
async def test_retrieve_content_uses_generic_query_without_description(self, mock_service: PodcastService) -> None:
253255
"""Unit: _retrieve_content uses generic query if no description."""
254256
podcast_input = PodcastGenerationInput(
255257
user_id=uuid4(),

0 commit comments

Comments
 (0)