File tree: 3 files changed, +27 -1 lines changed
Original file line number Diff line number Diff line change
1
+ ---
2
+ "@langchain/community": patch
3
+ ---
4
+
5
+ BM25Retriever: escape regex metacharacters in getTermFrequency to prevent crashes
Original file line number Diff line number Diff line change 1
1
import { expect , test } from "@jest/globals" ;
2
2
import { Document } from "@langchain/core/documents" ;
3
3
import { BM25Retriever } from "../bm25.js" ;
4
+ import { getTermFrequency } from "../../utils/@furkantoprak/bm25/BM25.js" ;
4
5
5
6
test ( "BM25Retriever" , async ( ) => {
6
7
const docs = [
@@ -25,3 +26,20 @@ test("BM25Retriever", async () => {
25
26
"The quick brown fox jumps over the lazy dog"
26
27
) ;
27
28
} ) ;
29
+
30
+ test ( "getTermFrequency escapes regex metacharacters" , ( ) => {
31
+ const corpus =
32
+ "**Version 1:** What is the country of origin for the person in question?" ;
33
+ const term = "**Version 1:**" ;
34
+
35
+ // Should not throw and should find at least one match
36
+ const freq = getTermFrequency ( term , corpus ) ;
37
+ expect ( freq ) . toBeGreaterThanOrEqual ( 1 ) ;
38
+
39
+ // Also test other metacharacters
40
+ const corpus2 = "Does this match (maybe)? [yes] *stars* +plus+" ;
41
+ expect ( getTermFrequency ( "(maybe)?" , corpus2 ) ) . toBeGreaterThanOrEqual ( 1 ) ;
42
+ expect ( getTermFrequency ( "[yes]" , corpus2 ) ) . toBeGreaterThanOrEqual ( 1 ) ;
43
+ expect ( getTermFrequency ( "*stars*" , corpus2 ) ) . toBeGreaterThanOrEqual ( 1 ) ;
44
+ expect ( getTermFrequency ( "+plus+" , corpus2 ) ) . toBeGreaterThanOrEqual ( 1 ) ;
45
+ } ) ;
Original file line number Diff line number Diff line change @@ -12,7 +12,10 @@ export const getWordCount = (corpus: string) => {
12
12
13
13
/** Number of occurrences of a word in a string. */
14
14
export const getTermFrequency = ( term : string , corpus : string ) => {
15
- return ( ( corpus || "" ) . match ( new RegExp ( term , "g" ) ) || [ ] ) . length ;
15
+ // Escape any RegExp metacharacters in the term so constructing a RegExp
16
+ // from user-provided or model-generated queries does not throw an error
17
+ const escaped = (term || "").replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
18
+ return ( ( corpus || "" ) . match ( new RegExp ( escaped , "g" ) ) || [ ] ) . length ;
16
19
} ;
17
20
18
21
/** Inverse document frequency. */
You can’t perform that action at this time.
0 commit comments