Merge PR #422: Add support for nested block comments

nene · web-flow · commit 6e4380f779d6 · 2022-09-02T22:04:16.000+03:00
diff --git a/src/languages/postgresql/postgresql.formatter.ts b/src/languages/postgresql/postgresql.formatter.ts
@@ -332,6 +332,7 @@ export default class PostgreSqlFormatter extends Formatter {
       reservedPhrases,
       reservedKeywords: keywords,
       reservedFunctionNames: functions,
+      nestedBlockComments: true,
       extraParens: ['[]'],
       stringTypes: ['$$', { quote: "''", prefixes: ['B', 'E', 'X', 'U&'] }],
       identTypes: [{ quote: '""', prefixes: ['U&'] }],
diff --git a/src/languages/tsql/tsql.formatter.ts b/src/languages/tsql/tsql.formatter.ts
@@ -226,6 +226,7 @@ export default class TSqlFormatter extends Formatter {
       reservedPhrases,
       reservedKeywords: keywords,
       reservedFunctionNames: functions,
+      nestedBlockComments: true,
       stringTypes: [{ quote: "''", prefixes: ['N'] }],
       identTypes: [`""`, '[]'],
       identChars: { first: '#@', rest: '#@$' },
diff --git a/src/lexer/NestedComment.ts b/src/lexer/NestedComment.ts
@@ -0,0 +1,52 @@
+/* eslint-disable no-cond-assign */
+import { RegExpLike } from 'src/lexer/TokenizerEngine';
+
+const START = /\/\*/uy; // matches: /*
+const MIDDLE = /([^/*]|\*[^/]|\/[^*])+/uy; // matches text NOT containing /* or */
+const END = /\*\//uy; // matches: */
+
+/**
+ * An object mimicking a regular expression,
+ * for matching nested block-comments.
+ */
+export class NestedComment implements RegExpLike {
+  public lastIndex: number = 0;
+
+  public exec(input: string): string[] | null {
+    let result = '';
+    let match: string | null;
+    let nestLevel = 0;
+
+    if ((match = this.matchSection(START, input))) {
+      result += match;
+      nestLevel++;
+    } else {
+      return null;
+    }
+
+    while (nestLevel > 0) {
+      if ((match = this.matchSection(START, input))) {
+        result += match;
+        nestLevel++;
+      } else if ((match = this.matchSection(END, input))) {
+        result += match;
+        nestLevel--;
+      } else if ((match = this.matchSection(MIDDLE, input))) {
+        result += match;
+      } else {
+        return null;
+      }
+    }
+
+    return [result];
+  }
+
+  private matchSection(regex: RegExp, input: string): string | null {
+    regex.lastIndex = this.lastIndex;
+    const matches = regex.exec(input);
+    if (matches) {
+      this.lastIndex += matches[0].length;
+    }
+    return matches ? matches[0] : null;
+  }
+}
diff --git a/src/lexer/Tokenizer.ts b/src/lexer/Tokenizer.ts
@@ -4,6 +4,7 @@ import { ParamTypes, TokenizerOptions } from 'src/lexer/TokenizerOptions';
 import TokenizerEngine, { TokenRule } from 'src/lexer/TokenizerEngine';
 import { escapeRegExp } from 'src/lexer/regexUtil';
 import { equalizeWhitespace, Optional } from 'src/utils';
+import { NestedComment } from './NestedComment';
 
 type OptionalTokenRule = Optional<TokenRule, 'regex'>;
 
@@ -32,7 +33,7 @@ export default class Tokenizer {
     return this.validRules([
       {
         type: TokenType.BLOCK_COMMENT,
-        regex: /(\/\*[^]*?(?:\*\/|$))/uy,
+        regex: cfg.nestedBlockComments ? new NestedComment() : /(\/\*[^]*?\*\/)/uy,
       },
       {
         type: TokenType.LINE_COMMENT,
diff --git a/src/lexer/TokenizerEngine.ts b/src/lexer/TokenizerEngine.ts
@@ -1,9 +1,17 @@
 import { Token, TokenType } from 'src/lexer/token';
 import { WHITESPACE_REGEX } from './regexUtil';
 
+export interface RegExpLike {
+  lastIndex: number;
+  exec(input: string): string[] | null;
+}
+
 export interface TokenRule {
   type: TokenType;
-  regex: RegExp;
+  // Normally a RegExp object.
+  // But to allow for more complex matching logic,
+  // an object can be given that implements a RegExpLike interface.
+  regex: RegExpLike;
   // Called with the raw string that was matched
   text?: (rawText: string) => string;
   key?: (rawText: string) => string;
diff --git a/src/lexer/TokenizerOptions.ts b/src/lexer/TokenizerOptions.ts
@@ -75,6 +75,8 @@ export interface TokenizerOptions {
   paramTypes?: ParamTypes;
   // Line comment types to support, defaults to --
   lineCommentTypes?: string[];
+  // True to allow for nested /* /* block comments */ */
+  nestedBlockComments?: boolean;
   // Additional characters to support in identifiers
   identChars?: IdentChars;
   // Additional characters to support in named parameters
diff --git a/test/features/comments.ts b/test/features/comments.ts
@@ -4,6 +4,7 @@ import { FormatFn } from 'src/sqlFormatter';
 
 interface CommentsConfig {
   hashComments?: boolean;
+  nestedBlockComments?: boolean;
 }
 
 export default function supportsComments(format: FormatFn, opts: CommentsConfig = {}) {
@@ -149,15 +150,14 @@ export default function supportsComments(format: FormatFn, opts: CommentsConfig
     expect(result).toBe('SELECT\n  *\nFROM\n  -- line comment 1\n  MyTable -- line comment 2');
   });
 
-  it('formats query that ends with open comment', () => {
+  it('does not detect unclosed comment as a comment', () => {
     const result = format(`
       SELECT count(*)
-      /*Comment
+      /*SomeComment
     `);
     expect(result).toBe(dedent`
       SELECT
-        count(*)
-        /*Comment
+        count(*) / * SomeComment
     `);
   });
 
@@ -172,4 +172,17 @@ export default function supportsComments(format: FormatFn, opts: CommentsConfig
       `);
     });
   }
+
+  if (opts.nestedBlockComments) {
+    it('supports nested block comments', () => {
+      const result = format('SELECT alpha /* /* commment */ */ FROM beta');
+      expect(result).toBe(dedent`
+        SELECT
+          alpha
+          /* /* commment */ */
+        FROM
+          beta
+      `);
+    });
+  }
 }
diff --git a/test/postgresql.test.ts b/test/postgresql.test.ts
@@ -32,7 +32,7 @@ describe('PostgreSqlFormatter', () => {
   const format: FormatFn = (query, cfg = {}) => originalFormat(query, { ...cfg, language });
 
   behavesLikeSqlFormatter(format);
-  supportsComments(format);
+  supportsComments(format, { nestedBlockComments: true });
   supportsCreateView(format, { orReplace: true, materialized: true });
   supportsCreateTable(format, { ifNotExists: true });
   supportsDropTable(format, { ifExists: true });
diff --git a/test/tsql.test.ts b/test/tsql.test.ts
@@ -30,7 +30,7 @@ describe('TSqlFormatter', () => {
   const format: FormatFn = (query, cfg = {}) => originalFormat(query, { ...cfg, language });
 
   behavesLikeSqlFormatter(format);
-  supportsComments(format);
+  supportsComments(format, { nestedBlockComments: true });
   supportsCreateView(format, { materialized: true });
   supportsCreateTable(format);
   supportsDropTable(format, { ifExists: true });
diff --git a/test/unit/NestedComment.test.ts b/test/unit/NestedComment.test.ts
@@ -0,0 +1,63 @@
+import { NestedComment } from 'src/lexer/NestedComment';
+
+describe('NestedComment', () => {
+  const match = (input: string, index: number) => {
+    const re = new NestedComment();
+    re.lastIndex = index;
+    return re.exec(input);
+  };
+
+  it('matches comment at the start of a string', () => {
+    expect(match('/* comment */ blah...', 0)).toEqual(['/* comment */']);
+  });
+
+  it('matches empty comment block', () => {
+    expect(match('/**/ blah...', 0)).toEqual(['/**/']);
+  });
+
+  it('matches comment containing * and / characters', () => {
+    expect(match('/** // */ blah...', 0)).toEqual(['/** // */']);
+  });
+
+  it('matches only first comment, when two comments in row', () => {
+    expect(match('/*com1*//*com2*/ blah...', 0)).toEqual(['/*com1*/']);
+  });
+
+  it('matches comment in the middle of a string', () => {
+    expect(match('hello /* comment */ blah...', 6)).toEqual(['/* comment */']);
+  });
+
+  it('does not match a comment when index not set to its start position', () => {
+    expect(match('hello /* comment */ blah...', 1)).toEqual(null);
+  });
+
+  it('does not match unclosed comment', () => {
+    expect(match('/* comment blah...', 0)).toEqual(null);
+  });
+
+  it('does not match unopened comment', () => {
+    expect(match(' comment */ blah...', 0)).toEqual(null);
+  });
+
+  it('matches a nested comment', () => {
+    expect(match('/* some /* nested */ comment */ blah...', 0)).toEqual([
+      '/* some /* nested */ comment */',
+    ]);
+  });
+
+  it('matches a multi-level nested comment', () => {
+    expect(match('/* some /* /* nested */ */ comment */ blah...', 0)).toEqual([
+      '/* some /* /* nested */ */ comment */',
+    ]);
+  });
+
+  it('matches multiple nested comments', () => {
+    expect(match('/* some /* n1 */ and /* n2 */ coms */ blah...', 0)).toEqual([
+      '/* some /* n1 */ and /* n2 */ coms */',
+    ]);
+  });
+
+  it('does not match an inproperly nested comment', () => {
+    expect(match('/* some /* comment blah...', 0)).toEqual(null);
+  });
+});

Original file line number	Diff line number	Diff line change
`@@ -4,6 +4,7 @@ import { ParamTypes, TokenizerOptions } from 'src/lexer/TokenizerOptions';`
`4`	`4`	`import TokenizerEngine, { TokenRule } from 'src/lexer/TokenizerEngine';`
`5`	`5`	`import { escapeRegExp } from 'src/lexer/regexUtil';`
`6`	`6`	`import { equalizeWhitespace, Optional } from 'src/utils';`
	`7`	`+import { NestedComment } from './NestedComment';`
`7`	`8`
`8`	`9`	`type OptionalTokenRule = Optional<TokenRule, 'regex'>;`
`9`	`10`
`@@ -32,7 +33,7 @@ export default class Tokenizer {`
`32`	`33`	`return this.validRules([`
`33`	`34`	`{`
`34`	`35`	`type: TokenType.BLOCK_COMMENT,`
`35`		`- regex: /(\/\*[^]*?(?:\*\/|$))/uy,`
	`36`	`+ regex: cfg.nestedBlockComments ? new NestedComment() : /(\/\*[^]*?\*\/)/uy,`
`36`	`37`	`},`
`37`	`38`	`{`
`38`	`39`	`type: TokenType.LINE_COMMENT,`