diff --git a/packages/json-joy/src/util/diff/__tests__/str.spec.ts b/packages/json-joy/src/util/diff/__tests__/str.spec.ts index 417c2d493f..8e606d490d 100644 --- a/packages/json-joy/src/util/diff/__tests__/str.spec.ts +++ b/packages/json-joy/src/util/diff/__tests__/str.spec.ts @@ -1,6 +1,150 @@ -import {PATCH_OP_TYPE, type Patch, diff, diffEdit, overlap, normalize, apply, src, dst, invert} from '../str'; +import {PATCH_OP_TYPE, type Patch, diff, diffEdit, overlap, normalize, apply, src, dst, invert, pfx, sfx} from '../str'; import {assertPatch} from './util'; +describe('pfx()', () => { + test('finds common prefixes', () => { + expect(pfx('abc', 'b')).toEqual(0); + expect(pfx('abc', 'a')).toEqual(1); + expect(pfx('abc', 'ab')).toEqual(2); + expect(pfx('abc', 'abc')).toEqual(3); + expect(pfx('abc', 'abcd')).toEqual(3); + expect(pfx('abc', 'abcde')).toEqual(3); + expect(pfx('๐Ÿ‘จโ€๐Ÿณ', '๐Ÿ‘จโ€๐Ÿณ')).toEqual(5); + expect(pfx('๐Ÿ‘จโ€๐Ÿณ', '๐Ÿ‘จโ€๐Ÿณchef')).toEqual(5); + expect(pfx('๐Ÿ‘จโ€๐Ÿณchef', '๐Ÿ‘จโ€๐Ÿณ')).toEqual(5); + expect(pfx('๐Ÿ‘จโ€๐Ÿณ๐Ÿ‘จโ€๐Ÿณ', '๐Ÿ‘จโ€๐Ÿณ')).toEqual(5); + expect('๐Ÿ‘จโ€๐Ÿณchef'.slice(0, 5)).toBe('๐Ÿ‘จโ€๐Ÿณ'); + }); + + test('handles grapheme clusters with ZWJ (Zero Width Joiner)', () => { + const family = '๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ'; + expect(pfx(family, family)).toEqual(11); + expect(pfx(family + 'abc', family)).toEqual(11); + expect(pfx(family + 'abc', family + 'xyz')).toEqual(11); + expect(pfx('prefix' + family, 'prefix' + family)).toEqual(6 + 11); + const womanTech = '๐Ÿ‘ฉ๐Ÿฝโ€๐Ÿ’ป'; + expect(pfx(womanTech, womanTech)).toEqual(7); + expect(pfx(womanTech + 'code', womanTech)).toEqual(7); + expect(pfx('hello' + womanTech, 'hello' + womanTech)).toEqual(5 + 7); + }); + + test('handles flag emojis (regional indicators)', () => { + const usFlag = '๐Ÿ‡บ๐Ÿ‡ธ'; + const ukFlag = '๐Ÿ‡ฌ๐Ÿ‡ง'; + expect(pfx(usFlag, usFlag)).toEqual(4); + expect(pfx(usFlag + 'USA', usFlag)).toEqual(4); + expect(pfx(usFlag, ukFlag)).toEqual(0); + expect(pfx('hello' + usFlag, 'hello' + usFlag)).toEqual(5 + 4); + }); + + test('handles combining diacritical marks', () => { + const combining = 'e\u0301'; // e + combining acute accent + expect(pfx(combining, combining)).toEqual(2); + expect(pfx(combining + 'llo', combining)).toEqual(2); + expect(pfx('hello' + combining, 'hello' + combining)).toEqual(5 + 2); + + // Multiple combining marks + const multiCombining = 'a\u0301\u0302\u0303'; + expect(pfx(multiCombining, multiCombining)).toEqual(4); + }); + + test('handles variation selectors', () => { + const heartText = 'โค\uFE0E'; // text style + const heartEmoji = 'โค\uFE0F'; // emoji style + expect(pfx(heartText, heartText)).toEqual(2); + expect(pfx(heartEmoji, heartEmoji)).toEqual(2); + expect(pfx(heartText, heartEmoji)).toEqual(1); // Only the base character matches + }); + + test('handles mixed grapheme clusters', () => { + const chef = '๐Ÿ‘จโ€๐Ÿณ'; + const family = '๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ'; + const combined = chef + family; + expect(pfx(combined, combined)).toEqual(16); + expect(pfx(combined + 'text', combined)).toEqual(16); + expect(pfx('abc' + combined, 'abc' + combined)).toEqual(3 + 16); + }); +}); + +describe('sfx()', () => { + test('finds common suffixes', () => { + expect(sfx('abc', 'b')).toEqual(0); + expect(sfx('abc', 'c')).toEqual(1); + expect(sfx('abc', 'bc')).toEqual(2); + expect(sfx('abc', 'abc')).toEqual(3); + expect(sfx('abc', '_abc')).toEqual(3); + expect(sfx('abc', 'abcd')).toEqual(0); + expect(sfx('๐Ÿ‘จโ€๐Ÿณ', '๐Ÿ‘จโ€๐Ÿณ')).toEqual(5); + expect(sfx('๐Ÿ‘จโ€๐Ÿณ', '๐Ÿ‘จโ€๐Ÿณchef')).toEqual(0); + expect(sfx('๐Ÿ‘จโ€๐Ÿณchef', '๐Ÿ‘จโ€๐Ÿณ')).toEqual(0); + expect(sfx('๐Ÿ‘จโ€๐Ÿณ', 'chef๐Ÿ‘จโ€๐Ÿณ')).toEqual(5); + expect(sfx('chef๐Ÿ‘จโ€๐Ÿณ', '๐Ÿ‘จโ€๐Ÿณ')).toEqual(5); + expect(sfx('๐Ÿ‘จโ€๐Ÿณ๐Ÿ‘จโ€๐Ÿณ', '๐Ÿ‘จโ€๐Ÿณ')).toEqual(5); + }); + + test('handles grapheme clusters with ZWJ (Zero Width Joiner)', () => { + const family = '๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ'; + expect(sfx(family, family)).toEqual(11); + expect(sfx('abc' + family, family)).toEqual(11); + expect(sfx('xyz' + family, 'abc' + family)).toEqual(11); + expect(sfx(family + 'suffix', family + 'suffix')).toEqual(6 + 11); + const womanTech = '๐Ÿ‘ฉ๐Ÿฝโ€๐Ÿ’ป'; + expect(sfx(womanTech, womanTech)).toEqual(7); + expect(sfx('code' + womanTech, womanTech)).toEqual(7); + expect(sfx(womanTech + 'hello', womanTech + 'hello')).toEqual(5 + 7); + }); + + test('handles flag emojis (regional indicators)', () => { + const usFlag = '๐Ÿ‡บ๐Ÿ‡ธ'; + const ukFlag = '๐Ÿ‡ฌ๐Ÿ‡ง'; + expect(sfx(usFlag, usFlag)).toEqual(4); + expect(sfx('USA' + usFlag, usFlag)).toEqual(4); + expect(sfx(usFlag, ukFlag)).toEqual(0); + expect(sfx(usFlag + 'hello', usFlag + 'hello')).toEqual(5 + 4); + }); + + test('handles combining diacritical marks', () => { + const combining = 'e\u0301'; // e + combining acute accent + expect(sfx(combining, combining)).toEqual(2); + expect(sfx('ell' + combining, combining)).toEqual(2); + expect(sfx(combining + 'hello', combining + 'hello')).toEqual(5 + 2); + const multiCombining = 'a\u0301\u0302\u0303'; // a with multiple accents + expect(sfx(multiCombining, multiCombining)).toEqual(4); + expect(sfx('text' + multiCombining, multiCombining)).toEqual(4); + }); + + test('handles variation selectors', () => { + const heartText = 'โค\uFE0E'; // text style + const heartEmoji = 'โค\uFE0F'; // emoji style + expect(sfx(heartText, heartText)).toEqual(2); + expect(sfx(heartEmoji, heartEmoji)).toEqual(2); + expect(sfx(heartText, heartEmoji)).toEqual(0); + expect(sfx('love' + heartEmoji, heartEmoji)).toEqual(2); + }); + + test('handles mixed grapheme clusters', () => { + const chef = '๐Ÿ‘จโ€๐Ÿณ'; + const family = '๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ'; + const combined = family + chef; + expect(sfx(combined, combined)).toEqual(16); + expect(sfx('text' + combined, combined)).toEqual(16); + expect(sfx(combined + 'abc', combined + 'abc')).toEqual(3 + 16); + }); + + test('does not split grapheme clusters at boundaries', () => { + const chef = '๐Ÿ‘จโ€๐Ÿณ'; + const family = '๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ'; + + // Ensure we don't split in the middle of a grapheme cluster + expect(sfx('x' + chef, chef)).toEqual(5); // full chef emoji + expect(sfx('xy' + family, family)).toEqual(11); // full family emoji + + // When the suffix is part of a larger grapheme, it should not match partially + expect(sfx('๐Ÿ‘จโ€๐Ÿณ๐Ÿ‘ฉ', '๐Ÿ‘ฉ')).toEqual(2); // Just the woman emoji at end + expect(sfx('text๐Ÿ‘จโ€๐Ÿณ', '๐Ÿ‘จโ€๐Ÿณ')).toEqual(5); // Full chef emoji + }); +}); + describe('normalize()', () => { test('joins consecutive same type operations', () => { expect( @@ -240,6 +384,75 @@ describe('diff()', () => { assertPatch('a๐Ÿ™ƒb', 'a๐Ÿ‘‹b'); }); + test('grapheme clusters with ZWJ (Zero Width Joiner)', () => { + const chef = '๐Ÿ‘จโ€๐Ÿณ'; + const family = '๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ'; + const womanTech = '๐Ÿ‘ฉ๐Ÿฝโ€๐Ÿ’ป'; + assertPatch(chef, family); + assertPatch(family, chef); + assertPatch(womanTech, chef); + assertPatch('hello', 'hello' + chef); + assertPatch('hello', chef + 'hello'); + assertPatch('hello world', 'hello' + family + 'world'); + assertPatch('hello' + chef, 'hello'); + assertPatch(chef + 'hello', 'hello'); + assertPatch('hello' + family + 'world', 'helloworld'); + assertPatch(chef + family, family + chef); + assertPatch('a' + chef + 'b' + family + 'c', 'x' + family + 'y' + chef + 'z'); + assertPatch('The ' + chef + ' cooks', 'A ' + chef + ' bakes'); + assertPatch('Team: ' + family, 'Group: ' + womanTech); + }); + + test('flag emojis (regional indicators)', () => { + const ruFlag = '๐Ÿ‡ท๐Ÿ‡บ'; + const chFlag = '๐Ÿ‡จ๐Ÿ‡ณ'; + const inFlag = '๐Ÿ‡ฎ๐Ÿ‡ณ'; + assertPatch(ruFlag, chFlag); + assertPatch(chFlag, inFlag); + assertPatch('Made in ' + ruFlag, 'Made in ' + chFlag); + assertPatch(ruFlag + ' USA', chFlag + ' UK'); + assertPatch('Hello ' + ruFlag + ' world', 'Hello ' + inFlag + ' world'); + assertPatch(ruFlag + chFlag, chFlag + ruFlag); + assertPatch('Flags: ' + ruFlag + chFlag + inFlag, 'Flags: ' + inFlag + chFlag + ruFlag); + }); + + test('combining diacritical marks', () => { + const combining1 = 'e\u0301'; + const combining2 = 'e\u0300'; + const precomposed = 'รฉ'; + assertPatch(combining1, combining2); + assertPatch(combining1, precomposed); + assertPatch(precomposed, combining1); + assertPatch('cafe\u0301', 'cafรฉ'); + assertPatch('naรฏve', 'naive'); + assertPatch('rรฉsumรฉ', 'resume'); + const multiCombining = 'a\u0301\u0302\u0303'; + assertPatch('test' + multiCombining, 'test'); + assertPatch('test', 'test' + multiCombining); + }); + + test('variation selectors', () => { + const heartText = 'โค\uFE0E'; // text style + const heartEmoji = 'โค\uFE0F'; // emoji style + assertPatch(heartText, heartEmoji); + assertPatch(heartEmoji, heartText); + assertPatch('I ' + heartText + ' code', 'I ' + heartEmoji + ' code'); + assertPatch('Love ' + heartEmoji, 'Love ' + heartText); + }); + + test('complex grapheme clusters in real scenarios', () => { + const chef = '๐Ÿ‘จโ€๐Ÿณ'; + const family = '๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ'; + const womanTech = '๐Ÿ‘ฉ๐Ÿฝโ€๐Ÿ’ป'; + const usFlag = '๐Ÿ‡บ๐Ÿ‡ธ'; + assertPatch('Hey ' + chef + ', dinner ready?', 'Hi ' + womanTech + ', code ready?'); + assertPatch(family + ' going to ' + usFlag, family + ' staying home'); + assertPatch( + 'The ' + chef + ' from ' + usFlag + ' is amazing', + 'A ' + womanTech + ' from ' + usFlag + ' is brilliant', + ); + }); + test('same strings', () => { assertPatch('', ''); assertPatch('1', '1'); @@ -331,6 +544,33 @@ describe('diffEdit()', () => { assertDiffEdit('aaa', 'bbb', 'ccc'); assertDiffEdit('1', '2', '3'); }); + + test('handles grapheme cluster inserts and deletes', () => { + const chef = '๐Ÿ‘จโ€๐Ÿณ'; + const family = '๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ'; + const womanTech = '๐Ÿ‘ฉ๐Ÿฝโ€๐Ÿ’ป'; + const usFlag = '๐Ÿ‡บ๐Ÿ‡ธ'; + + // Insert grapheme clusters + assertDiffEdit('', chef, ''); + assertDiffEdit('Hello ', chef, ''); + assertDiffEdit('', chef, ' world'); + assertDiffEdit('Hello ', chef, ' world'); + assertDiffEdit('Team: ', family, ' rocks!'); + + // Insert multiple grapheme clusters + assertDiffEdit('', chef + family, ''); + assertDiffEdit('Coders: ', womanTech + chef, ' win'); + + // Insert with flags + assertDiffEdit('Made in ', usFlag, ''); + assertDiffEdit('', usFlag, ' USA'); + + // Combining characters + const combining = 'e\u0301'; + assertDiffEdit('caf', combining, ''); + assertDiffEdit('', combining, ' accent'); + }); }); describe('overlap()', () => { @@ -353,6 +593,21 @@ describe('overlap()', () => { expect(overlap('abc', 'abc')).toEqual(3); expect(overlap('a', 'a')).toEqual(1); }); + + test('handles grapheme clusters', () => { + const chef = '๐Ÿ‘จโ€๐Ÿณ'; + const family = '๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ'; + + // Overlap with grapheme clusters + expect(overlap('hello' + chef, chef + 'world')).toEqual(5); + expect(overlap('abc' + family, family + 'xyz')).toEqual(11); + + // No overlap when grapheme differs + expect(overlap('hello' + chef, family + 'world')).toEqual(0); + + // Text overlap with grapheme clusters + expect(overlap('prefix' + chef, chef + 'suffix')).toEqual(5); + }); }); describe('Unicode edge cases', () => { @@ -404,6 +659,18 @@ describe('Unicode edge cases', () => { assertPatch(nfd, nfc); assertPatch(`hello ${nfc}`, `hello ${nfd}`); }); + + test('handles complex emoji with ZWJ sequences', () => { + const chefEmoji = '๐Ÿ‘จโ€๐Ÿณ'; // chef emoji (man + ZWJ + cooking) + const src = chefEmoji; + const dst = 'chef' + chefEmoji; + const patch = normalize(diff(src, dst)); + assertPatch(src, dst, patch); + expect(patch).toEqual([ + [PATCH_OP_TYPE.INS, 'chef'], + [PATCH_OP_TYPE.EQL, chefEmoji], + ]); + }); }); describe('Algorithm edge cases', () => { diff --git a/packages/json-joy/src/util/diff/str.ts b/packages/json-joy/src/util/diff/str.ts index 07e9b2d605..26cd4fcddc 100644 --- a/packages/json-joy/src/util/diff/str.ts +++ b/packages/json-joy/src/util/diff/str.ts @@ -388,7 +388,7 @@ const diffNoCommonAffix = (src: string, dst: string): Patch => { * @param txt2 Second string. * @return The number of characters common to the start of each string. */ -export const pfx = (txt1: string, txt2: string) => { +export const pfx = (txt1: string, txt2: string): number => { if (!txt1 || !txt2 || txt1.charAt(0) !== txt2.charAt(0)) return 0; let min = 0; let max = Math.min(txt1.length, txt2.length); @@ -427,9 +427,35 @@ export const sfx = (txt1: string, txt2: string): number => { } else max = mid; mid = Math.floor((max - min) / 2 + min); } - const code = txt1.charCodeAt(txt1.length - mid); - const isSurrogatePairEnd = code >= 0xd800 && code <= 0xdbff; - if (isSurrogatePairEnd) mid--; + // Check if we're splitting a surrogate pair or combining character sequence + // We need to check the character BEFORE the matched suffix to see if we're + // splitting a grapheme cluster. + if (mid > 0 && mid < txt1.length) { + const boundaryPos = txt1.length - mid - 1; + const code = txt1.charCodeAt(boundaryPos); + const isHighSurrogate = code >= 0xd800 && code <= 0xdbff; + const isCombining = + code === 0x200d || // ZWJ + (code >= 0xfe00 && code <= 0xfe0f) || // Variation selectors + (code >= 0x0300 && code <= 0x036f); // Combining diacritical marks + + if (isHighSurrogate || isCombining) { + // We're splitting a grapheme cluster. Walk backwards to include the full cluster. + mid--; + while (mid > 0) { + const pos = txt1.length - mid - 1; + if (pos < 0) break; + const prevCode = txt1.charCodeAt(pos); + const isPrevHighSurrogate = prevCode >= 0xd800 && prevCode <= 0xdbff; + const isPrevCombining = + prevCode === 0x200d || + (prevCode >= 0xfe00 && prevCode <= 0xfe0f) || + (prevCode >= 0x0300 && prevCode <= 0x036f); + if (!isPrevHighSurrogate && !isPrevCombining) break; + mid--; + } + } + } return mid; };