Skip to content

Commit cd5f6fc

Browse files
authored
BM25Retriever: escape regex metacharacters in getTermFrequency to prevent crashes (#8749)
2 parents 7bcc76d + 5fc0bca commit cd5f6fc

File tree

3 files changed

+27
-1
lines changed

3 files changed

+27
-1
lines changed

.changeset/major-lamps-walk.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"@langchain/community": patch
3+
---
4+
5+
BM25Retriever: escape regex metacharacters in getTermFrequency to prevent crashes

libs/langchain-community/src/retrievers/tests/bm25.test.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { expect, test } from "@jest/globals";
22
import { Document } from "@langchain/core/documents";
33
import { BM25Retriever } from "../bm25.js";
4+
import { getTermFrequency } from "../../utils/@furkantoprak/bm25/BM25.js";
45

56
test("BM25Retriever", async () => {
67
const docs = [
@@ -25,3 +26,20 @@ test("BM25Retriever", async () => {
2526
"The quick brown fox jumps over the lazy dog"
2627
);
2728
});
29+
30+
test("getTermFrequency escapes regex metacharacters", () => {
  // Markdown-style emphasis used to make `new RegExp(term)` throw
  // ("Nothing to repeat"), so first make sure the call is safe at all.
  const corpus =
    "**Version 1:** What is the country of origin for the person in question?";
  const term = "**Version 1:**";
  expect(() => getTermFrequency(term, corpus)).not.toThrow();

  // The term occurs exactly once; pin the exact count so that over-counting
  // is caught as well as crashes.
  expect(getTermFrequency(term, corpus)).toBe(1);

  // Other metacharacters must be matched literally, exactly once each.
  const corpus2 = "Does this match (maybe)? [yes] *stars* +plus+";
  expect(getTermFrequency("(maybe)?", corpus2)).toBe(1);
  expect(getTermFrequency("[yes]", corpus2)).toBe(1);
  expect(getTermFrequency("*stars*", corpus2)).toBe(1);
  expect(getTermFrequency("+plus+", corpus2)).toBe(1);

  // Negative cases: metacharacters must not keep their regex meaning.
  expect(getTermFrequency("a.c", "abc")).toBe(0);
  expect(getTermFrequency("a+", "aaa")).toBe(0);
});

libs/langchain-community/src/utils/@furkantoprak/bm25/BM25.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,10 @@ export const getWordCount = (corpus: string) => {
1212

1313
/**
 * Number of non-overlapping occurrences of a term in a string.
 *
 * The term is matched literally (as a plain substring), so queries that
 * contain RegExp metacharacters such as `*`, `(` or `[` are safe.
 *
 * @param term - Substring to look for; an empty term never matches.
 * @param corpus - Text to search; a falsy corpus is treated as empty.
 * @returns How many times `term` occurs in `corpus`.
 */
export const getTermFrequency = (term: string, corpus: string): number => {
  const needle = term || "";
  // An empty pattern would "match" at every position of the corpus and
  // report a meaningless frequency, so treat it as not found.
  if (needle === "") {
    return 0;
  }
  // Splitting on a literal separator yields one more piece than there are
  // occurrences, and needs no RegExp construction or escaping.
  return (corpus || "").split(needle).length - 1;
};
1720

1821
/** Inverse document frequency. */

0 commit comments

Comments
 (0)