From 7dbc5a82755d0e3f98ee9c5a148c683d39b5f076 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Thu, 19 Jun 2025 18:56:19 +0000 Subject: [PATCH] Speed up `codespell:ignore` check by skipping the regex in most cases The codespell codebase unsurprisingly spends a vast majority of its runtime in various regex-related code such as `search` and `finditer`. The best way to optimize runtime spent in regexes is to not do a regex in the first place, since the regex engine has a rather steep overhead over regular string primitives (that is at the cost of flexibility). If the regex rarely matches and there is a very easy static substring that can be used to rule out the match, then you can speed up the code by using `substring in string` as a conditional to skip the regex. This is assuming the regex is used enough for the performance to matter. An obvious choice here falls on the `codespell:ignore` regex, because it has a very distinctive substring in the form of `codespell:ignore`, which will rule out almost all lines that will not match. With this little trick, runtime goes from ~5.4s to ~4.5s on the corpus mentioned in #3419. --- codespell_lib/_codespell.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index afdd0cc508..4207b09131 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -60,7 +60,10 @@ uri_regex_def = ( r"(\b(?:https?|[ts]?ftp|file|git|smb)://[^\s]+(?=$|\s)|\b[\w.%+-]+@[\w.-]+\b)" ) -inline_ignore_regex = re.compile(r"[^\w\s]\s*codespell:ignore\b(\s+(?P<words>[\w,]*))?") +codespell_ignore_tag = "codespell:ignore" +inline_ignore_regex = re.compile( + rf"[^\w\s]\s*{codespell_ignore_tag}\b(\s+(?P<words>[\w,]*))?" +) USAGE = """ \t%prog [OPTIONS] [file1 file2 ... 
fileN] """ @@ -951,7 +954,9 @@ def parse_file( continue extra_words_to_ignore = set() - match = inline_ignore_regex.search(line) + match = ( + inline_ignore_regex.search(line) if codespell_ignore_tag in line else None + ) if match: extra_words_to_ignore = set( filter(None, (match.group("words") or "").split(","))