Skip to content

Commit 963ad01

Browse files
author
Gal Ben David
committed
This is a major release dropping the C++ implementation in favor of a
Rust implementation. Using this library in production for more than a year has raised multiple concerns. The C++ concurrency model has proven to be hard to get right when using libgit2 and has shown many exceptions and race conditions. Moreover, C++ has shown problems with unicode strings and performance degradation. Using Rust ended up being more performant, safe, and easy to develop and maintain.
- Replaced the C++ implementation with Rust.
- Tiny changes in the API. Rule-adding functions dropped `regex` from their parameter names (e.g. `regex_pattern` is now `pattern`, `whitelist_regex_patterns` is now `whitelist_patterns`).
- The package now ships binary packages (wheels).
- More performance improvements, like avoiding scanning empty files.
- File path and extension skipping rules are now compared in lowercase.
1 parent dc01b37 commit 963ad01

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

80 files changed

+1874
-30322
lines changed

.github/workflows/build.yml

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,49 @@ name: Build
22
on: [push, pull_request]
33

44
jobs:
5-
build:
5+
lint:
66
if: github.event_name == 'push' && !startsWith(github.event.ref, 'refs/tags')
7-
runs-on: ubuntu-20.04
7+
runs-on: ubuntu-latest
8+
steps:
9+
- name: Checkout
10+
uses: actions/checkout@v1
11+
- name: Install latest rust
12+
uses: actions-rs/toolchain@v1
13+
with:
14+
toolchain: stable
15+
override: true
16+
components: clippy
17+
- name: Lint with clippy
18+
uses: actions-rs/cargo@v1
19+
with:
20+
command: clippy
21+
args: --all-targets --all-features
22+
test:
23+
runs-on: ${{ matrix.os }}
24+
needs: lint
825
strategy:
9-
max-parallel: 4
26+
fail-fast: false
1027
matrix:
11-
python-version: [3.6, 3.7, 3.8, pypy3]
28+
python-version: [3.6, 3.7, 3.8]
29+
os: [ubuntu-latest , macos-latest, windows-latest]
30+
1231
steps:
13-
- uses: actions/checkout@v1
14-
- name: Set up Python ${{ matrix.python-version }}
15-
uses: actions/setup-python@v1
16-
with:
17-
python-version: ${{ matrix.python-version }}
18-
- name: Install Ubuntu packages
19-
run: >-
20-
sudo apt install libre2-dev libgit2-dev;
21-
- name: Test module
22-
run: >-
23-
python setup.py test
32+
- name: Checkout
33+
uses: actions/checkout@v1
34+
- name: Set up Python ${{ matrix.python-version }}
35+
uses: actions/setup-python@v1
36+
with:
37+
python-version: ${{ matrix.python-version }}
38+
- name: Run image
39+
uses: abatilo/[email protected]
40+
- name: Install latest rust
41+
uses: actions-rs/toolchain@v1
42+
with:
43+
toolchain: stable
44+
override: true
45+
- name: Install dependencies
46+
run: poetry install
47+
- name: Build Python package
48+
run: poetry run maturin develop
49+
- name: pytest
50+
run: poetry run pytest tests

.github/workflows/deploy.yml

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,26 @@ on:
55

66
jobs:
77
deploy:
8-
runs-on: ubuntu-20.04
8+
runs-on: ${{ matrix.os }}
9+
strategy:
10+
fail-fast: false
11+
matrix:
12+
python-version: [3.6, 3.7, 3.8]
13+
os: [ubuntu-latest, macos-latest, windows-latest]
914
steps:
10-
- uses: actions/checkout@v1
11-
- name: Set up Python 3.8
12-
uses: actions/setup-python@v1
13-
with:
14-
python-version: 3.8
15-
- name: Build a source tarball
16-
run: >-
17-
python -m pip install --user --upgrade setuptools;
18-
python setup.py sdist;
19-
- name: Publish distribution 📦 to PyPI
20-
uses: pypa/gh-action-pypi-publish@master
21-
with:
22-
password: ${{ secrets.pypi_password }}
15+
- uses: actions/checkout@v1
16+
- uses: actions/setup-python@v1
17+
with:
18+
python-version: ${{ matrix.python-version }}
19+
- name: Install latest rust
20+
uses: actions-rs/toolchain@v1
21+
with:
22+
toolchain: stable
23+
override: true
24+
- name: Install dependencies
25+
run: |
26+
python -m pip install --upgrade pip maturin
27+
- name: Build & Publish to PyPi
28+
run: maturin publish --username __token__ --no-sdist --interpreter python${{matrix.python_version}} --manylinux=2014
29+
env:
30+
MATURIN_PASSWORD: ${{ secrets.pypi_password }}

.gitignore

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ parts/
2020
sdist/
2121
var/
2222
wheels/
23-
pip-wheel-metadata/
2423
share/python-wheels/
2524
*.egg-info/
2625
.installed.cfg
@@ -50,6 +49,7 @@ coverage.xml
5049
*.py,cover
5150
.hypothesis/
5251
.pytest_cache/
52+
cover/
5353

5454
# Translations
5555
*.mo
@@ -72,6 +72,7 @@ instance/
7272
docs/_build/
7373

7474
# PyBuilder
75+
.pybuilder/
7576
target/
7677

7778
# Jupyter Notebook
@@ -82,7 +83,9 @@ profile_default/
8283
ipython_config.py
8384

8485
# pyenv
85-
.python-version
86+
# For a library or package, you might want to ignore these files since the code is
87+
# intended to run in multiple environments; otherwise, check them in:
88+
# .python-version
8689

8790
# pipenv
8891
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
@@ -128,6 +131,22 @@ dmypy.json
128131
# Pyre type checker
129132
.pyre/
130133

131-
*.cppimporthash
132-
.rendered.*
133-
.vscode
134+
# pytype static type analyzer
135+
.pytype/
136+
137+
# Cython debug symbols
138+
cython_debug/
139+
.gitignore
140+
.gitignore
141+
142+
# Generated by Cargo
143+
# will have compiled files and executables
144+
debug/
145+
target/
146+
147+
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
148+
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
149+
Cargo.lock
150+
151+
# These are backup files generated by rustfmt
152+
**/*.rs.bk

Cargo.toml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
[package]
2+
name = "pyrepscan"
3+
version = "0.1.0"
4+
authors = ["Gal Ben David <[email protected]>"]
5+
edition = "2018"
6+
7+
[lib]
8+
name = "pyrepscan"
9+
crate-type = ["cdylib"]
10+
11+
[dependencies]
12+
git2 = "*"
13+
regex = "*"
14+
rayon = "*"
15+
chrono = "*"
16+
parking_lot = "*"
17+
18+
[dependencies.pyo3]
19+
version = "0.12"
20+
features = ["extension-module"]
21+
22+
[profile.release]
23+
lto = true
24+
panic = "abort"

MANIFEST.in

Lines changed: 0 additions & 4 deletions
This file was deleted.

README.md

Lines changed: 38 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<img src="https://raw.githubusercontent.com/intsights/PyRepScan/master/images/logo.png" alt="Logo">
44
</a>
55
<h3 align="center">
6-
A Git Repository Leaks Scanner Python library written in C++
6+
A Git Repository Secrets Scanner written in Rust
77
</h3>
88
</p>
99

@@ -19,7 +19,6 @@
1919
- [Built With](#built-with)
2020
- [Performance](#performance)
2121
- [CPU](#cpu)
22-
- [Prerequisites](#prerequisites)
2322
- [Installation](#installation)
2423
- [Documentation](#documentation)
2524
- [Usage](#usage)
@@ -29,37 +28,25 @@
2928

3029
## About The Project
3130

32-
PyRepScan is a python library written in C++. The library uses [libgit2](https://github.com/libgit2/libgit2) for repository parsing and traversing, [re2](https://github.com/google/re2) for regex pattern matching and [taskflow](https://github.com/taskflow/taskflow) for concurrency. The library was written to achieve high performance and python bindings.
31+
PyRepScan is a python library written in Rust. The library uses [git2-rs](https://github.com/rust-lang/git2-rs) for repository parsing and traversing, [regex](https://github.com/rust-lang/regex) for regex pattern matching and [rayon](https://github.com/rayon-rs/rayon) for concurrency. The library was written to achieve high performance and python bindings.
3332

3433

3534
### Built With
3635

37-
* [libgit2](https://github.com/libgit2/libgit2)
38-
* [re2](https://github.com/google/re2)
39-
* [taskflow](https://github.com/taskflow/taskflow)
36+
* [git2-rs](https://github.com/rust-lang/git2-rs)
37+
* [regex](https://github.com/rust-lang/regex)
38+
* [rayon](https://github.com/rayon-rs/rayon)
4039

4140

4241
### Performance
4342

4443
#### CPU
45-
| Library | Time | Improvement Factor |
44+
| Library | Time | Peak Memory |
4645
| ------------- | ------------- | ------------- |
47-
| [PyRepScan](https://github.com/intsights/PyRepScan) | 2.18s | 1.0x |
48-
| [gitleaks](https://github.com/zricethezav/gitleaks) | 63.0s | 28.9x |
46+
| [PyRepScan](https://github.com/intsights/PyRepScan) | 4s | 501,708 kb |
47+
| [gitleaks](https://github.com/zricethezav/gitleaks) | 507s | 823,016 kb |
4948

5049

51-
### Prerequisites
52-
53-
In order to compile this package you should have GCC & Python development package installed.
54-
* Fedora
55-
```sh
56-
sudo dnf install python3-devel gcc-c++ libgit2-devel re2-devel
57-
```
58-
* Ubuntu 20.04
59-
```sh
60-
sudo apt install python3-dev libgit2-dev libre2-dev
61-
```
62-
6350
### Installation
6451

6552
```sh
@@ -82,62 +69,62 @@ This class holds all the added rules for fast reuse.
8269
def add_content_rule(
8370
self,
8471
name: str,
85-
regex_pattern: str,
86-
whitelist_regex_patterns: typing.List[str],
87-
blacklist_regex_patterns: typing.List[str],
72+
pattern: str,
73+
whitelist_patterns: typing.List[str],
74+
blacklist_patterns: typing.List[str],
8875
) -> None
8976
```
9077
The `add_content_rule` function adds a new rule to an internal list of rules that could be reused multiple times against different repositories. The same name can be used multiple times and would lead to results which can hold the same name. Content rule means that the regex pattern would be tested against the content of the files.
9178
- `name` - The name of the rule so it can be identified.
92-
- `regex_pattern` - The regex pattern (RE2 syntax) to match against the content of the commited files.
93-
- `whitelist_regex_patterns` - A list of regex patterns (RE2 syntax) to match against the content of the committed file to filter in results. Only one of the patterns should be matched to pass through the result. There is an OR relation between the patterns.
94-
- `blacklist_regex_patterns` - A list of regex patterns (RE2 syntax) to match against the content of the committed file to filter out results. Only one of the patterns should be matched to omit the result. There is an OR relation between the patterns.
79+
- `pattern` - The regex pattern (Rust Regex syntax) to match against the content of the committed files.
80+
- `whitelist_patterns` - A list of regex patterns (Rust Regex syntax) to match against the content of the committed file to filter in results. Only one of the patterns should be matched to pass through the result. There is an OR relation between the patterns.
81+
- `blacklist_patterns` - A list of regex patterns (Rust Regex syntax) to match against the content of the committed file to filter out results. Only one of the patterns should be matched to omit the result. There is an OR relation between the patterns.
9582

9683

9784
```python
98-
def add_file_name_rule(
85+
def add_file_path_rule(
9986
self,
10087
name: str,
101-
regex_pattern: str,
88+
pattern: str,
10289
) -> None
10390
```
104-
The `add_file_name_rule` function adds a new rule to an internal list of rules that could be reused multiple times against different repositories. The same name can be used multiple times and would lead to results which can hold the same name. File name rule means that the regex pattern would be tested against the file names.
91+
The `add_file_path_rule` function adds a new rule to an internal list of rules that could be reused multiple times against different repositories. The same name can be used multiple times and would lead to results which can hold the same name. File path rule means that the regex pattern would be tested against the file paths.
10592
- `name` - The name of the rule so it can be identified.
106-
- `regex_pattern` - The regex pattern (RE2 syntax) to match against the file names of the commited files.
93+
- `pattern` - The regex pattern (Rust Regex syntax) to match against the file paths of the committed files.
10794

10895

10996
```python
110-
def add_ignored_file_extension(
97+
def add_file_extension_to_skip(
11198
self,
11299
file_extension: str,
113100
) -> None
114101
```
115-
The `add_ignored_file_extension` function adds a new file extension to the filtering phase to reduce the amount of inspected files and to increase the performance of the scan.
102+
The `add_file_extension_to_skip` function adds a new file extension to the filtering phase to reduce the amount of inspected files and to increase the performance of the scan.
116103
- `file_extension` - A file extension, without a leading dot, to filter out from the scan.
117104

118105

119106
```python
120-
def add_ignored_file_path(
107+
def add_file_path_to_skip(
121108
self,
122109
file_path: str,
123110
) -> None
124111
```
125-
The `add_ignored_file_path` function adds a new file pattern to the filtering phase to reduce the amount of inspected files and to increase the performance of the scan. Every file path that would include the `file_path` substring would be left out of the scanned files.
112+
The `add_file_path_to_skip` function adds a new file path pattern to the filtering phase to reduce the amount of inspected files and to increase the performance of the scan. Every file path that would include the `file_path` substring would be left out of the scanned files.
126113
- `file_path` - If the inspected file path would include this substring, it won't be scanned. This parameter is a free text.
127114

128115

129116
```python
130117
def scan(
131118
self,
132119
repository_path: str,
133-
branch_glob_pattern: '*',
134-
from_timestamp: int = 0,
120+
branch_glob_pattern: typing.Optional[str],
121+
from_timestamp: typing.Optional[int],
135122
) -> typing.List[typing.Dict[str, str]]
136123
```
137124
The `scan` function is the main function in the library. Calling this function would trigger a new scan that would return a list of matches. The scan function is a multithreaded operation that would utilize all the available cores in the system. The results would not include the file content but only the regex matching group. To retrieve the full file content one should take the `results['oid']` and call the `get_file_content` function.
138125
- `repository_path` - The git repository folder path.
139-
- `branch_glob_pattern` - A glob pattern to filter branches for the scan.
140-
- `from_timestamp` - A UTC timestamp (Int) that only commits that were created after this timestamp would be included in the scan.
126+
- `branch_glob_pattern` - A glob pattern to filter branches for the scan. If None is sent, defaults to `*`.
127+
- `from_timestamp` - A UTC timestamp (Int) that only commits that were created after this timestamp would be included in the scan. If None is sent, defaults to `0`.
141128

142129
A sample result would look like this:
143130
```python
@@ -157,6 +144,7 @@ A sample result would look like this:
157144

158145
```python
159146
def get_file_content(
147+
self,
160148
repository_path: str,
161149
file_oid: str,
162150
) -> bytes
@@ -176,32 +164,32 @@ grs = pyrepscan.GitRepositoryScanner()
176164
# Adds a specific rule, can be called multiple times or none
177165
grs.add_content_rule(
178166
name='First Rule',
179-
regex_pattern=r'(-----BEGIN PRIVATE KEY-----)',
180-
whitelist_regex_patterns=[],
181-
blacklist_regex_patterns=[],
167+
pattern=r'(-----BEGIN PRIVATE KEY-----)',
168+
whitelist_patterns=[],
169+
blacklist_patterns=[],
182170
)
183-
grs.add_file_name_rule(
171+
grs.add_file_path_rule(
184172
name='Second Rule',
185-
regex_pattern=r'.+\.pem',
173+
pattern=r'.+\.pem',
186174
)
187-
grs.add_file_name_rule(
175+
grs.add_file_path_rule(
188176
name='Third Rule',
189-
regex_pattern=r'(prod|dev|stage).+key',
177+
pattern=r'(prod|dev|stage).+key',
190178
)
191179

192180
# Add file extensions to ignore during the search
193-
grs.add_ignored_file_extension(
181+
grs.add_file_extension_to_skip(
194182
file_extension='bin',
195183
)
196-
grs.add_ignored_file_extension(
184+
grs.add_file_extension_to_skip(
197185
file_extension='jpg',
198186
)
199187

200188
# Add file paths to ignore during the search. Free text is allowed
201-
grs.add_ignored_file_path(
189+
grs.add_file_path_to_skip(
202190
file_path='site-packages',
203191
)
204-
grs.add_ignored_file_path(
192+
grs.add_file_path_to_skip(
205193
file_path='node_modules',
206194
)
207195

benchmarks/gitleaks.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
~/go/bin/gitleaks --repo-path=$REPO_PATH --config=gitleaks.toml

benchmarks/gitleaks.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
[[rules]]
2+
description = "AWS Manager ID"
3+
regex = '''(A3T[A-Z0-9]|AKIA|AGPA|AIDA|AROA|AIPA|ANPA|ANVA|ASIA)[A-Z0-9]{16}'''
4+
tags = ["key", "AWS"]

0 commit comments

Comments
 (0)