Skip to content

Commit 838f4f2

Browse files
committed
last-modified: use Bloom filters when available
Our 'git last-modified' performs a revision walk, and computes a diff at each point in the walk to figure out whether a given revision changed any of the paths it considers interesting.

When changed-path Bloom filters are available, we can avoid computing many such diffs. Before computing a diff, we first check if any of the remaining paths of interest were possibly changed at a given commit by consulting its Bloom filter. If any of those queries returns "maybe", we are resigned to compute the diff.

If none of those queries returned "maybe", we know that the given commit doesn't contain any changed paths which are interesting to us, so we can avoid computing the diff in this case.

Comparing the perf test results on git.git:

Test                                        HEAD~             HEAD
------------------------------------------------------------------------------------
8020.1: top-level last-modified             4.49(4.34+0.11)   2.22(2.05+0.09) -50.6%
8020.2: top-level recursive last-modified   5.64(5.45+0.11)   5.62(5.30+0.11) -0.4%
8020.3: subdir last-modified                0.11(0.06+0.04)   0.07(0.03+0.04) -36.4%

Based-on-patch-by: Taylor Blau <[email protected]>
Signed-off-by: Toon Claes <[email protected]>
1 parent 5c3b650 commit 838f4f2

File tree

3 files changed

+54
-3
lines changed

3 files changed

+54
-3
lines changed

builtin/last-modified.c

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#include "git-compat-util.h"
2+
#include "bloom.h"
23
#include "builtin.h"
4+
#include "commit-graph.h"
35
#include "commit.h"
46
#include "config.h"
57
#include "diff.h"
@@ -17,6 +19,7 @@
1719
/*
 * One path tracked in the `paths` hashmap of a `struct last_modified`.
 * Entries are allocated with FLEX_ALLOC_STR() so that the path string
 * lives inline at the end of the struct.
 */
struct last_modified_entry {
1820
/* Hashmap linkage; initialized with strhash() of `path`. */
struct hashmap_entry hashent;
1921
/* Object id copied from `p->two->oid` of the diff pair that created the entry. */
struct object_id oid;
22+
/*
 * Bloom key for `path`, filled by bloom_key_fill() only when
 * `lm->rev.bloom_filter_settings` is set. It is released with
 * bloom_key_clear() before the entry itself is freed.
 */
struct bloom_key key;
2023
/* The path itself; also the lookup key for the hashmap. */
const char path[FLEX_ARRAY];
2124
};
2225

@@ -41,6 +44,12 @@ struct last_modified {
4144

4245
/*
 * Release everything held by `lm`: the Bloom key of each path entry
 * still in `lm->paths`, the entries and the map itself, and finally the
 * revision-walk state in `lm->rev`.
 */
static void last_modified_release(struct last_modified *lm)
4346
{
47+
struct hashmap_iter iter;
48+
struct last_modified_entry *ent;
49+
50+
/*
 * Clear each key before the entries are freed below.
 * NOTE(review): presumably bloom_key_clear() frees memory owned by
 * the key that freeing the entry alone would leak -- confirm against
 * bloom.h.
 */
hashmap_for_each_entry(&lm->paths, &iter, ent, hashent)
51+
bloom_key_clear(&ent->key);
52+
4453
hashmap_clear_and_free(&lm->paths, struct last_modified_entry, hashent);
4554
release_revisions(&lm->rev);
4655
}
@@ -62,6 +71,9 @@ static void add_path_from_diff(struct diff_queue_struct *q,
6271

6372
FLEX_ALLOC_STR(ent, path, path);
6473
oidcpy(&ent->oid, &p->two->oid);
74+
if (lm->rev.bloom_filter_settings)
75+
bloom_key_fill(&ent->key, path, strlen(path),
76+
lm->rev.bloom_filter_settings);
6577
hashmap_entry_init(&ent->hashent, strhash(ent->path));
6678
hashmap_add(&lm->paths, &ent->hashent);
6779
}
@@ -138,6 +150,7 @@ static void mark_path(const char *path, const struct object_id *oid,
138150
last_modified_emit(data->lm, path, data->commit);
139151

140152
hashmap_remove(&data->lm->paths, &ent->hashent, path);
153+
bloom_key_clear(&ent->key);
141154
free(ent);
142155
}
143156

@@ -181,6 +194,27 @@ static void last_modified_diff(struct diff_queue_struct *q,
181194
}
182195
}
183196

197+
/*
 * Ask the changed-path Bloom filter of `origin` whether any path still
 * tracked in `lm->paths` was possibly changed by that commit.
 *
 * Returns false only when a filter exists and it rejects every tracked
 * path, in which case the caller can skip computing the diff for
 * `origin`. Returns true ("maybe") in every other case, including when
 * no Bloom filter settings or no filter for `origin` are available.
 */
static bool maybe_changed_path(struct last_modified *lm, struct commit *origin)
198+
{
199+
struct bloom_filter *filter;
200+
struct last_modified_entry *ent;
201+
struct hashmap_iter iter;
202+
203+
/* No Bloom filter settings: nothing to consult, so answer "maybe". */
if (!lm->rev.bloom_filter_settings)
204+
return true;
205+
206+
filter = get_bloom_filter(lm->rev.repo, origin);
207+
/* No filter for this commit: fall back to "maybe". */
if (!filter)
208+
return true;
209+
210+
/* A single possible hit is enough to require the diff. */
hashmap_for_each_entry(&lm->paths, &iter, ent, hashent) {
211+
if (bloom_filter_contains(filter, &ent->key,
212+
lm->rev.bloom_filter_settings))
213+
return true;
214+
}
215+
/* The filter rejected every remaining path. */
return false;
216+
}
217+
184218
static int last_modified_run(struct last_modified *lm)
185219
{
186220
struct last_modified_callback_data data = { .lm = lm };
@@ -196,14 +230,22 @@ static int last_modified_run(struct last_modified *lm)
196230
if (!data.commit)
197231
BUG("paths remaining beyond boundary in last-modified");
198232

233+
// TODO distinguish when boundary is the one touching paths and
234+
// beyond it
235+
199236
if (data.commit->object.flags & BOUNDARY) {
200237
diff_tree_oid(lm->rev.repo->hash_algo->empty_tree,
201238
&data.commit->object.oid, "",
202239
&lm->rev.diffopt);
203240
diff_flush(&lm->rev.diffopt);
204-
} else {
205-
log_tree_commit(&lm->rev, data.commit);
241+
242+
//break;
206243
}
244+
245+
if (!maybe_changed_path(lm, data.commit))
246+
continue;
247+
248+
log_tree_commit(&lm->rev, data.commit);
207249
}
208250

209251
return 0;
@@ -230,6 +272,8 @@ static int last_modified_init(struct last_modified *lm, struct repository *r,
230272
return argc;
231273
}
232274

275+
lm->rev.bloom_filter_settings = get_bloom_filter_settings(lm->rev.repo);
276+
233277
if (populate_paths_from_revs(lm) < 0)
234278
return error(_("unable to setup last-modified"));
235279

commit-graph.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -820,7 +820,12 @@ int corrected_commit_dates_enabled(struct repository *r)
820820

821821
struct bloom_filter_settings *get_bloom_filter_settings(struct repository *r)
822822
{
823-
struct commit_graph *g = r->objects->commit_graph;
823+
struct commit_graph *g;
824+
825+
if (!prepare_commit_graph(r))
826+
return NULL;
827+
828+
g = r->objects->commit_graph;
824829
while (g) {
825830
if (g->bloom_filter_settings)
826831
return g->bloom_filter_settings;

t/t8020-last-modified.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ test_expect_success 'limit last-modified traversal by commit' '
113113
EOF
114114
'
115115

116+
# TODO test exact at boundary
117+
116118
test_expect_success 'only last-modified files in the current tree' '
117119
git rm -rf a &&
118120
git commit -m "remove a" &&

0 commit comments

Comments
 (0)