From cd3667ded5939cf430871f101c9b8c28b75c1d09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kry=C5=A1tof=20Tulinger?= Date: Sun, 17 Feb 2019 17:53:55 +0100 Subject: [PATCH 1/2] detecting regular expression literals in JavaScript fixes #2670 --- .../javascript/JavaScriptAnalyzer.java | 6 +-- .../javascript/JavaScriptSymbolTokenizer.lex | 22 ++++++++-- .../analysis/javascript/JavaScriptXref.lex | 30 ++++++++++++-- .../JavaScriptSymbolTokenizerTest.java | 40 +++++++++++++------ .../javascript/JavaScriptXrefTest.java | 22 ++++++++-- .../analysis/javascript/regexp_modifiers.js | 22 ++++++++++ .../javascript/regexp_modifiers_symbols.txt | 19 +++++++++ .../javascript/regexp_modifiers_xref.html | 29 ++++++++++++++ .../analysis/javascript/regexp_plain.js | 22 ++++++++++ .../javascript/regexp_plain_symbols.txt | 19 +++++++++ .../javascript/regexp_plain_xref.html | 29 ++++++++++++++ 11 files changed, 234 insertions(+), 26 deletions(-) create mode 100644 opengrok-indexer/src/test/resources/analysis/javascript/regexp_modifiers.js create mode 100644 opengrok-indexer/src/test/resources/analysis/javascript/regexp_modifiers_symbols.txt create mode 100644 opengrok-indexer/src/test/resources/analysis/javascript/regexp_modifiers_xref.html create mode 100644 opengrok-indexer/src/test/resources/analysis/javascript/regexp_plain.js create mode 100644 opengrok-indexer/src/test/resources/analysis/javascript/regexp_plain_symbols.txt create mode 100644 opengrok-indexer/src/test/resources/analysis/javascript/regexp_plain_xref.html diff --git a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/javascript/JavaScriptAnalyzer.java b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/javascript/JavaScriptAnalyzer.java index ed72055b763..249b167ed72 100644 --- a/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/javascript/JavaScriptAnalyzer.java +++ b/opengrok-indexer/src/main/java/org/opengrok/indexer/analysis/javascript/JavaScriptAnalyzer.java @@ -18,7 +18,7 @@ */ /* - * Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019, Oracle and/or its affiliates. All rights reserved. * Portions Copyright (c) 2017-2018, Chris Fraire . */ package org.opengrok.indexer.analysis.javascript; @@ -50,11 +50,11 @@ protected JavaScriptAnalyzer(AnalyzerFactory factory) { * Gets a version number to be used to tag processed documents so that * re-analysis can be re-done later if a stored version number is different * from the current implementation. - * @return 20180208_00 + * @return 20190217_00 */ @Override protected int getSpecializedVersionNo() { - return 20180208_00; // Edit comment above too! + return 20190217_00; // Edit comment above too! } /** diff --git a/opengrok-indexer/src/main/resources/analysis/javascript/JavaScriptSymbolTokenizer.lex b/opengrok-indexer/src/main/resources/analysis/javascript/JavaScriptSymbolTokenizer.lex index 48c9e07b7c8..f808029c674 100644 --- a/opengrok-indexer/src/main/resources/analysis/javascript/JavaScriptSymbolTokenizer.lex +++ b/opengrok-indexer/src/main/resources/analysis/javascript/JavaScriptSymbolTokenizer.lex @@ -18,7 +18,7 @@ */ /* - * Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019, Oracle and/or its affiliates. All rights reserved. * Portions Copyright (c) 2017, Chris Fraire . */ @@ -42,7 +42,7 @@ import org.opengrok.indexer.analysis.JFlexSymbolMatcher; %include CommonLexer.lexh %char -%state STRING COMMENT SCOMMENT QSTRING +%state STRING REGEXP_START REGEXP COMMENT SCOMMENT QSTRING %include JavaScript.lexh %% @@ -56,6 +56,13 @@ import org.opengrok.indexer.analysis.JFlexSymbolMatcher; {Number} {} \" { yybegin(STRING); } \' { yybegin(QSTRING); } + /* + * Literal regexps are in conflict with division "/" and are detected + * in javascript based on context and when ambiguous, the division has + * a higher precedence. We do a best-effort context matching for + * preceding "=" (variable), "(" (function call) or ":" (object). + */ + [:=(][ \t\r\n]*/\/ { yybegin(REGEXP_START); } "/*" { yybegin(COMMENT); } "//" { yybegin(SCOMMENT); } } @@ -65,6 +72,15 @@ import org.opengrok.indexer.analysis.JFlexSymbolMatcher; \" { yybegin(YYINITIAL); } } + { + \/ { yybegin(REGEXP); } +} + + { + \\[/] {} + \/[gimsuy]* { yybegin(YYINITIAL); } +} + { \\[\'\\] {} \' { yybegin(YYINITIAL); } @@ -78,6 +94,6 @@ import org.opengrok.indexer.analysis.JFlexSymbolMatcher; \n { yybegin(YYINITIAL);} } - { + { [^] {} } diff --git a/opengrok-indexer/src/main/resources/analysis/javascript/JavaScriptXref.lex b/opengrok-indexer/src/main/resources/analysis/javascript/JavaScriptXref.lex index 443fa806f5a..7de050885ee 100644 --- a/opengrok-indexer/src/main/resources/analysis/javascript/JavaScriptXref.lex +++ b/opengrok-indexer/src/main/resources/analysis/javascript/JavaScriptXref.lex @@ -18,7 +18,7 @@ */ /* - * Copyright (c) 2006, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2019, Oracle and/or its affiliates. All rights reserved. * Portions Copyright (c) 2017, Chris Fraire . */ @@ -68,7 +68,7 @@ File = [a-zA-Z]{FNameChar}* "." ([Jj][Ss] | [Xx][Mm][Ll] | [Cc][Oo][Nn][Ff] | [Tt][Xx][Tt] | [Hh][Tt][Mm][Ll]? | [Ii][Nn][Ii] | [Dd][Ii][Ff][Ff] | [Pp][Aa][Tt][Cc][Hh]) -%state STRING COMMENT SCOMMENT QSTRING +%state STRING REGEXP_START REGEXP COMMENT SCOMMENT QSTRING %include Common.lexh %include CommonURI.lexh @@ -98,7 +98,6 @@ File = [a-zA-Z]{FNameChar}* "." ([Jj][Ss] | onNonSymbolMatched(yytext(), yychar); onDisjointSpanChanged(null, yychar); } - \" { chkLOC(); yypush(STRING); @@ -121,6 +120,16 @@ File = [a-zA-Z]{FNameChar}* "." ([Jj][Ss] | onDisjointSpanChanged(HtmlConsts.COMMENT_CLASS, yychar); onNonSymbolMatched(yytext(), yychar); } + /* + * Literal regexps are in conflict with division "/" and are detected + * in javascript based on context and when ambiguous, the division has + * a higher precedence. We do a best-effort context matching for + * preceding "=" (variable), "(" (function call) or ":" (object). + */ + [:=(]{WhspChar}*/\/ { + yypush(REGEXP_START); + onNonSymbolMatched(yytext(), yychar); + } } { @@ -133,6 +142,19 @@ File = [a-zA-Z]{FNameChar}* "." ([Jj][Ss] | } } + { + \/ { + onDisjointSpanChanged(HtmlConsts.STRING_CLASS, yychar); + onNonSymbolMatched(yytext(), yychar); + yybegin(REGEXP); + } +} + + { + \\[/] { onNonSymbolMatched(yytext(), yychar); } + \/[gimsuy]* { chkLOC(); onNonSymbolMatched(yytext(), yychar); yypop(); } +} + { \\[\'\\] | \' {WhspChar}+ \' { chkLOC(); onNonSymbolMatched(yytext(), yychar); } @@ -154,7 +176,7 @@ File = [a-zA-Z]{FNameChar}* "." ([Jj][Ss] | } } - { + { {WhspChar}*{EOL} { onEndOfLineMatched(yytext(), yychar); } [[\s]--[\n]] { onNonSymbolMatched(yytext(), yychar); } [^\n] { chkLOC(); onNonSymbolMatched(yytext(), yychar); } diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/javascript/JavaScriptSymbolTokenizerTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/javascript/JavaScriptSymbolTokenizerTest.java index 124487495d9..e9fac70bb8a 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/javascript/JavaScriptSymbolTokenizerTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/javascript/JavaScriptSymbolTokenizerTest.java @@ -18,20 +18,21 @@ */ /* - * Copyright (c) 2010, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2019, Oracle and/or its affiliates. All rights reserved. * Portions Copyright (c) 2017, Chris Fraire . */ package org.opengrok.indexer.analysis.javascript; +import static org.junit.Assert.assertNotNull; +import static org.opengrok.indexer.util.CustomAssertions.assertSymbolStream; + import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; -import static org.junit.Assert.assertNotNull; import org.junit.Test; -import static org.opengrok.indexer.util.CustomAssertions.assertSymbolStream; /** * Tests the {@link JavaScriptSymbolTokenizer} class. @@ -40,29 +41,44 @@ public class JavaScriptSymbolTokenizerTest { /** * Test sample.js v. samplesymbols.txt + * * @throws java.lang.Exception thrown on error */ @Test public void testJavaScriptSymbolStream() throws Exception { + testSymbols("analysis/javascript/sample.js", "analysis/javascript/samplesymbols.txt"); + } + + @Test + public void testRegexpWithModifiersSymbols() throws Exception { + testSymbols("analysis/javascript/regexp_modifiers.js", "analysis/javascript/regexp_modifiers_symbols.txt"); + } + + @Test + public void testRegexpSymbols() throws Exception { + testSymbols("analysis/javascript/regexp_plain.js", "analysis/javascript/regexp_plain_symbols.txt"); + } + + private void testSymbols(String codeResource, String symbolsResource) throws Exception { InputStream jsres = getClass().getClassLoader().getResourceAsStream( - "analysis/javascript/sample.js"); - assertNotNull("despite sample.js as resource,", jsres); + codeResource); + assertNotNull(String.format("Unable to find %s as a resource", codeResource), jsres); InputStream symres = getClass().getClassLoader().getResourceAsStream( - "analysis/javascript/samplesymbols.txt"); - assertNotNull("despite samplesymbols.txt as resource,", symres); + symbolsResource); + assertNotNull(String.format("Unable to find %s as a resource", symbolsResource), symres); List expectedSymbols = new ArrayList<>(); - try (BufferedReader wdsr = new BufferedReader(new InputStreamReader( - symres, "UTF-8"))) { + try (BufferedReader wdsr = new BufferedReader(new InputStreamReader(symres, "UTF-8"))) { String line; while ((line = wdsr.readLine()) != null) { int hasho = line.indexOf('#'); - if (hasho != -1) line = line.substring(0, hasho); + if (hasho != -1) { + line = line.substring(0, hasho); + } expectedSymbols.add(line.trim()); } } - assertSymbolStream(JavaScriptSymbolTokenizer.class, jsres, - expectedSymbols); + assertSymbolStream(JavaScriptSymbolTokenizer.class, jsres, expectedSymbols); } } diff --git a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/javascript/JavaScriptXrefTest.java b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/javascript/JavaScriptXrefTest.java index af086a0f35b..317b7b319ef 100644 --- a/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/javascript/JavaScriptXrefTest.java +++ b/opengrok-indexer/src/test/java/org/opengrok/indexer/analysis/javascript/JavaScriptXrefTest.java @@ -18,17 +18,17 @@ */ /* - * Copyright (c) 2012, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2019, Oracle and/or its affiliates. All rights reserved. * Portions Copyright (c) 2017, 2019, Chris Fraire . */ package org.opengrok.indexer.analysis.javascript; +import static org.opengrok.indexer.util.StreamUtils.readTagsFromResource; + +import java.io.IOException; import org.junit.Test; import org.opengrok.indexer.analysis.XrefTestBase; -import java.io.IOException; - -import static org.opengrok.indexer.util.StreamUtils.readTagsFromResource; /** * Tests the {@link JavaScriptXref} class. @@ -49,4 +49,18 @@ public void shouldCloseTruncatedStringSpan() throws IOException { "analysis/javascript/truncated.js", "analysis/javascript/truncated_xref.html", null, 1); } + + @Test + public void shouldDetectRegularExpressionWithoutModifiers() throws IOException { + writeAndCompare(new JavaScriptAnalyzerFactory(), + "analysis/javascript/regexp_plain.js", + "analysis/javascript/regexp_plain_xref.html", null, 14); + } + + @Test + public void shouldDetectRegularExpressionWithModifiers() throws IOException { + writeAndCompare(new JavaScriptAnalyzerFactory(), + "analysis/javascript/regexp_modifiers.js", + "analysis/javascript/regexp_modifiers_xref.html", null, 14); + } } diff --git a/opengrok-indexer/src/test/resources/analysis/javascript/regexp_modifiers.js b/opengrok-indexer/src/test/resources/analysis/javascript/regexp_modifiers.js new file mode 100644 index 00000000000..0c862b15fb4 --- /dev/null +++ b/opengrok-indexer/src/test/resources/analysis/javascript/regexp_modifiers.js @@ -0,0 +1,22 @@ +function escapeLuceneCharacters1(term) { + // must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \ + var pattern = /([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/gms; + + return term.replace(pattern, "\\$1"); +} + +function escapeLuceneCharacters2(term) { + // must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \ + var pattern = { + pattern: /([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/gms + }; + + return term.replace(pattern, "\\$1"); +} + +function escapeLuceneCharacters3(term) { + // must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \ + var pattern = new RegExp(/([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/gms); + + return term.replace(pattern, "\\$1"); +} diff --git a/opengrok-indexer/src/test/resources/analysis/javascript/regexp_modifiers_symbols.txt b/opengrok-indexer/src/test/resources/analysis/javascript/regexp_modifiers_symbols.txt new file mode 100644 index 00000000000..0e946b74d70 --- /dev/null +++ b/opengrok-indexer/src/test/resources/analysis/javascript/regexp_modifiers_symbols.txt @@ -0,0 +1,19 @@ +escapeLuceneCharacters1 +term +pattern +term +replace +pattern +escapeLuceneCharacters2 +term +pattern +pattern +term +replace +pattern +escapeLuceneCharacters3 +term +pattern +term +replace +pattern diff --git a/opengrok-indexer/src/test/resources/analysis/javascript/regexp_modifiers_xref.html b/opengrok-indexer/src/test/resources/analysis/javascript/regexp_modifiers_xref.html new file mode 100644 index 00000000000..5db2b291a63 --- /dev/null +++ b/opengrok-indexer/src/test/resources/analysis/javascript/regexp_modifiers_xref.html @@ -0,0 +1,29 @@ + + + + +sampleFile - OpenGrok cross reference for /sampleFile +1function escapeLuceneCharacters1(term) { +2 // must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \ +3 var pattern = /([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/gms; +4 +5 return term.replace(pattern, "\\$1"); +6} +7 +8function escapeLuceneCharacters2(term) { +9 // must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \ +10 var pattern = { +11 pattern: /([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/gms +12 }; +13 +14 return term.replace(pattern, "\\$1"); +15} +16 +17function escapeLuceneCharacters3(term) { +18 // must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \ +19 var pattern = new RegExp(/([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/gms); +20 +21 return term.replace(pattern, "\\$1"); +22} +23 + diff --git a/opengrok-indexer/src/test/resources/analysis/javascript/regexp_plain.js b/opengrok-indexer/src/test/resources/analysis/javascript/regexp_plain.js new file mode 100644 index 00000000000..d3adec4c849 --- /dev/null +++ b/opengrok-indexer/src/test/resources/analysis/javascript/regexp_plain.js @@ -0,0 +1,22 @@ +function escapeLuceneCharacters1(term) { + // must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \ + var pattern = /([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/; + + return term.replace(pattern, "\\$1"); +} + +function escapeLuceneCharacters2(term) { + // must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \ + var pattern = { + pattern: /([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/ + }; + + return term.replace(pattern, "\\$1"); +} + +function escapeLuceneCharacters3(term) { + // must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \ + var pattern = new RegExp(/([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/); + + return term.replace(pattern, "\\$1"); +} diff --git a/opengrok-indexer/src/test/resources/analysis/javascript/regexp_plain_symbols.txt b/opengrok-indexer/src/test/resources/analysis/javascript/regexp_plain_symbols.txt new file mode 100644 index 00000000000..0e946b74d70 --- /dev/null +++ b/opengrok-indexer/src/test/resources/analysis/javascript/regexp_plain_symbols.txt @@ -0,0 +1,19 @@ +escapeLuceneCharacters1 +term +pattern +term +replace +pattern +escapeLuceneCharacters2 +term +pattern +pattern +term +replace +pattern +escapeLuceneCharacters3 +term +pattern +term +replace +pattern diff --git a/opengrok-indexer/src/test/resources/analysis/javascript/regexp_plain_xref.html b/opengrok-indexer/src/test/resources/analysis/javascript/regexp_plain_xref.html new file mode 100644 index 00000000000..8f2ec0378b5 --- /dev/null +++ b/opengrok-indexer/src/test/resources/analysis/javascript/regexp_plain_xref.html @@ -0,0 +1,29 @@ + + + + +sampleFile - OpenGrok cross reference for /sampleFile +1function escapeLuceneCharacters1(term) { +2 // must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \ +3 var pattern = /([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/; +4 +5 return term.replace(pattern, "\\$1"); +6} +7 +8function escapeLuceneCharacters2(term) { +9 // must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \ +10 var pattern = { +11 pattern: /([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/ +12 }; +13 +14 return term.replace(pattern, "\\$1"); +15} +16 +17function escapeLuceneCharacters3(term) { +18 // must escape: + - && || ! ( ) { } [ ] ^ " ~ * ? : \ +19 var pattern = new RegExp(/([\+\-\!\(\)\{\}\[\]\^\"\~\*\?\:\\]|&&|\|\|)/); +20 +21 return term.replace(pattern, "\\$1"); +22} +23 + From 0cf0feeb961657bde8460bf16b4a76cb6fd5fb6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kry=C5=A1tof=20Tulinger?= Date: Mon, 18 Feb 2019 08:44:01 +0100 Subject: [PATCH 2/2] adding a regexp_start to the others --- .../resources/analysis/javascript/JavaScriptSymbolTokenizer.lex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opengrok-indexer/src/main/resources/analysis/javascript/JavaScriptSymbolTokenizer.lex b/opengrok-indexer/src/main/resources/analysis/javascript/JavaScriptSymbolTokenizer.lex index f808029c674..210ea50ccab 100644 --- a/opengrok-indexer/src/main/resources/analysis/javascript/JavaScriptSymbolTokenizer.lex +++ b/opengrok-indexer/src/main/resources/analysis/javascript/JavaScriptSymbolTokenizer.lex @@ -94,6 +94,6 @@ import org.opengrok.indexer.analysis.JFlexSymbolMatcher; \n { yybegin(YYINITIAL);} } - { + { [^] {} }