Skip to content

Commit ea1e2a1

Browse files
committed
lib: add utf16 fast path for TextDecoder
1 parent ca90ab1 commit ea1e2a1

File tree

3 files changed

+83
-65
lines changed

3 files changed

+83
-65
lines changed

lib/internal/encoding.js

Lines changed: 46 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ const { FastBuffer } = require('internal/buffer');
2020
const {
2121
ERR_ENCODING_NOT_SUPPORTED,
2222
ERR_INVALID_ARG_TYPE,
23+
ERR_ENCODING_INVALID_ENCODED_DATA,
2324
ERR_INVALID_THIS,
24-
ERR_NO_ICU,
2525
} = require('internal/errors').codes;
2626
const kSingleByte = Symbol('single-byte');
2727
const kHandle = Symbol('handle');
@@ -30,11 +30,11 @@ const kEncoding = Symbol('encoding');
3030
const kDecoder = Symbol('decoder');
3131
const kChunk = Symbol('chunk');
3232
const kFatal = Symbol('kFatal');
33-
const kUTF8FastPath = Symbol('kUTF8FastPath');
33+
const kUnicode = Symbol('kUnicode');
3434
const kIgnoreBOM = Symbol('kIgnoreBOM');
3535

3636
const { isSinglebyteEncoding, createSinglebyteDecoder } = require('internal/encoding/single-byte');
37-
const { unfinishedBytesUtf8, mergePrefixUtf8 } = require('internal/encoding/util');
37+
const { unfinishedBytes, mergePrefix } = require('internal/encoding/util');
3838

3939
const {
4040
getConstructorOf,
@@ -419,11 +419,33 @@ if (hasIntl) {
419419

420420
const kBOMSeen = Symbol('BOM seen');
421421

422-
let StringDecoder;
423-
function lazyStringDecoder() {
424-
if (StringDecoder === undefined)
425-
({ StringDecoder } = require('string_decoder'));
426-
return StringDecoder;
422+
function decodeUTF16bufferLE(le, ignoreBom, fatal, encoding) {
423+
let suffix = '';
424+
if (le.length % 2 !== 0) {
425+
if (fatal) throw new ERR_ENCODING_INVALID_ENCODED_DATA(encoding, undefined);
426+
le = le.subarray(0, -1);
427+
suffix = '\ufffd';
428+
}
429+
if (le.length === 0) return suffix;
430+
let res = le.ucs2Slice();
431+
if (!ignoreBom && res[0] === '\ufeff') res = StringPrototypeSlice(res, 1);
432+
if (!fatal) return res.toWellFormed() + suffix;
433+
if (!res.isWellFormed()) throw new ERR_ENCODING_INVALID_ENCODED_DATA(encoding, undefined);
434+
return res;
435+
}
436+
437+
function decodeUTF16le(input, ignoreBom, fatal) {
438+
const le = parseInput(input);
439+
return decodeUTF16bufferLE(le, ignoreBom, fatal, 'utf-16le');
440+
}
441+
442+
/**
 * Decode `input` as UTF-16BE.
 * The bytes are copied into a fresh buffer (the caller's data is left as is),
 * every complete byte pair is swapped in place, and the result is decoded by
 * the shared little-endian decoder.
 */
function decodeUTF16be(input, ignoreBom, fatal) {
  const source = parseInput(input);
  const copy = new FastBuffer(source.length);
  copy.set(source);

  // swap16() needs an even length, so a dangling last byte is left out of the swap
  let pairs = copy;
  if (copy.length % 2 !== 0) {
    pairs = copy.subarray(0, -1);
  }
  pairs.swap16();

  return decodeUTF16bufferLE(copy, ignoreBom, fatal, 'utf-16be');
}
428450

429451
class TextDecoder {
@@ -446,33 +468,29 @@ class TextDecoder {
446468
this[kEncoding] = enc;
447469
this[kIgnoreBOM] = Boolean(options?.ignoreBOM);
448470
this[kFatal] = Boolean(options?.fatal);
449-
this[kUTF8FastPath] = false;
471+
this[kUnicode] = undefined;
450472
this[kHandle] = undefined;
451473
this[kSingleByte] = undefined; // Does not care about streaming or BOM
452474
this[kChunk] = null; // A copy of previous streaming tail or null
453475

454476
if (enc === 'utf-8') {
455-
this[kUTF8FastPath] = true;
477+
this[kUnicode] = decodeUTF8;
478+
this[kBOMSeen] = false;
479+
} else if (enc === 'utf-16le') {
480+
this[kUnicode] = decodeUTF16le;
481+
this[kBOMSeen] = false;
482+
} else if (enc === 'utf-16be') {
483+
this[kUnicode] = decodeUTF16be;
456484
this[kBOMSeen] = false;
457485
} else if (isSinglebyteEncoding(enc)) {
458486
this[kSingleByte] = createSinglebyteDecoder(enc, this[kFatal]);
459-
} else {
460-
this.#prepareConverter(); // Need to throw early if we don't support the encoding
461-
}
462-
}
463-
464-
#prepareConverter() {
465-
if (hasIntl) {
487+
} if (hasIntl) {
466488
let icuEncoding = this[kEncoding];
467489
if (icuEncoding === 'gbk') icuEncoding = 'gb18030'; // 10.1.1. GBK's decoder is gb18030's decoder
468490
const handle = icuGetConverter(icuEncoding, this[kFlags]);
469491
if (handle === undefined)
470492
throw new ERR_ENCODING_NOT_SUPPORTED(this[kEncoding]);
471493
this[kHandle] = handle;
472-
} else if (this[kEncoding] === 'utf-16le') {
473-
if (this[kFatal]) throw new ERR_NO_ICU('"fatal" option');
474-
this[kHandle] = new (lazyStringDecoder())(this[kEncoding]);
475-
this[kBOMSeen] = false;
476494
} else {
477495
throw new ERR_ENCODING_NOT_SUPPORTED(this[kEncoding]);
478496
}
@@ -485,19 +503,19 @@ class TextDecoder {
485503
if (this[kSingleByte]) return this[kSingleByte](parseInput(input));
486504

487505
const stream = options?.stream;
488-
if (this[kUTF8FastPath]) {
506+
if (this[kUnicode]) {
489507
const chunk = this[kChunk];
490508
const ignoreBom = this[kIgnoreBOM] || this[kBOMSeen];
491509
if (!stream) {
492510
this[kBOMSeen] = false;
493-
if (!chunk) return decodeUTF8(input, ignoreBom, this[kFatal]);
511+
if (!chunk) return this[kUnicode](input, ignoreBom, this[kFatal]);
494512
}
495513

496514
let u = parseInput(input);
497515
if (u.length === 0 && stream) return ''; // no state change
498516
let prefix;
499517
if (chunk) {
500-
const merged = mergePrefixUtf8(u, this[kChunk]);
518+
const merged = mergePrefix(u, this[kChunk], this[kEncoding]);
501519
if (u.length < 3) {
502520
u = merged; // Might be unfinished, but fully consumed old u
503521
} else {
@@ -510,7 +528,7 @@ class TextDecoder {
510528
}
511529

512530
if (stream) {
513-
const trail = unfinishedBytesUtf8(u, u.length);
531+
const trail = unfinishedBytes(u, u.length, this[kEncoding]);
514532
if (trail > 0) {
515533
this[kChunk] = new FastBuffer(u.subarray(-trail)); // copy
516534
if (!prefix && trail === u.length) return ''; // No further state change
@@ -519,8 +537,8 @@ class TextDecoder {
519537
}
520538

521539
try {
522-
const res = (prefix ? decodeUTF8(prefix, ignoreBom, this[kFatal]) : '') +
523-
decodeUTF8(u, ignoreBom || prefix, this[kFatal]);
540+
const res = (prefix ? this[kUnicode](prefix, ignoreBom, this[kFatal]) : '') +
541+
this[kUnicode](u, ignoreBom || prefix, this[kFatal]);
524542

525543
// "BOM seen" is set on the current decode call only if it did not error,
526544
// in "serialize I/O queue" after decoding
@@ -541,22 +559,7 @@ class TextDecoder {
541559
return icuDecode(this[kHandle], input, flags, this[kEncoding]);
542560
}
543561

544-
input = parseInput(input);
545-
546-
let result = stream ? this[kHandle].write(input) : this[kHandle].end(input);
547-
548-
if (result.length > 0 && !this[kBOMSeen] && !this[kIgnoreBOM]) {
549-
// If the very first result in the stream is a BOM, and we are not
550-
// explicitly told to ignore it, then we discard it.
551-
if (result[0] === '\ufeff') {
552-
result = StringPrototypeSlice(result, 1);
553-
}
554-
this[kBOMSeen] = true;
555-
}
556-
557-
if (!stream) this[kBOMSeen] = false;
558-
559-
return result;
562+
// Unreachable
560563
}
561564
}
562565

lib/internal/encoding/util.js

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,39 +7,54 @@ const {
77
Uint8Array,
88
} = primordials;
99

10-
1110
/**
1211
* Get a number of last bytes in an Uint8Array `data` ending at `len` that don't
1312
* form a codepoint yet, but can be a part of a single codepoint on more data.
14-
* @param {Uint8Array} data Uint8Array of potentially UTF-8 bytes
13+
* @param {Uint8Array} data Uint8Array of potentially UTF-8/UTF-16 bytes
1514
* @param {number} len Position to look behind from
16-
* @returns {number} Number of unfinished potentially valid UTF-8 bytes ending at position `len`
15+
* @param {string} enc Encoding to use: utf-8, utf-16le, or utf-16be
16+
* @returns {number} Number (0-3) of unfinished potentially valid UTF bytes ending at position `len`
1717
*/
18-
function unfinishedBytesUtf8(data, len) {
19-
// 0-3
20-
let pos = 0;
21-
while (pos < 2 && pos < len && (data[len - pos - 1] & 0xc0) === 0x80) pos++; // Go back 0-2 trailing bytes
22-
if (pos === len) return 0; // no space for lead
23-
const lead = data[len - pos - 1];
24-
if (lead < 0xc2 || lead > 0xf4) return 0; // not a lead
25-
if (pos === 0) return 1; // Nothing to recheck, we have only lead, return it. 2-byte must return here
26-
if (lead < 0xe0 || (lead < 0xf0 && pos >= 2)) return 0; // 2-byte, or 3-byte or less and we already have 2 trailing
27-
const lower = lead === 0xf0 ? 0x90 : lead === 0xe0 ? 0xa0 : 0x80;
28-
const upper = lead === 0xf4 ? 0x8f : lead === 0xed ? 0x9f : 0xbf;
29-
const next = data[len - pos];
30-
return next >= lower && next <= upper ? pos + 1 : 0;
18+
function unfinishedBytes(data, len, enc) {
  if (enc === 'utf-8') {
    // Walk back over at most two continuation bytes (10xxxxxx)
    let tail = 0;
    for (; tail < 2 && tail < len; tail++) {
      if ((data[len - tail - 1] & 0xc0) !== 0x80) break;
    }
    if (tail === len) return 0; // Only continuation bytes, no room for a lead

    const lead = data[len - tail - 1];
    if (lead < 0xc2 || lead > 0xf4) return 0; // Not a valid lead byte
    if (tail === 0) return 1; // Bare lead byte; 2-byte sequences always end up here
    if (lead < 0xe0) return 0; // 2-byte lead that already has its continuation
    if (lead < 0xf0 && tail >= 2) return 0; // 3-byte sequence is already complete

    // The first continuation byte has a narrower valid range for some leads
    let min = 0x80;
    if (lead === 0xf0) min = 0x90;
    else if (lead === 0xe0) min = 0xa0;
    let max = 0xbf;
    if (lead === 0xf4) max = 0x8f;
    else if (lead === 0xed) max = 0x9f;

    const second = data[len - tail];
    if (second >= min && second <= max) return tail + 1;
    return 0;
  }

  if (enc === 'utf-16le' || enc === 'utf-16be') {
    const odd = len % 2; // A dangling single byte counts as 1 unfinished byte
    if (len < 2) return odd;

    // Read the last complete 16-bit code unit in the requested byte order
    const second = len - odd - 1;
    const first = second - 1;
    const hi = enc === 'utf-16le' ? data[second] : data[first];
    const lo = enc === 'utf-16le' ? data[first] : data[second];
    const unit = (hi << 8) | lo;

    // A lead surrogate still waits for its trail surrogate: 2 more unfinished bytes
    const isLeadSurrogate = unit >= 0xd8_00 && unit < 0xdc_00;
    return isLeadSurrogate ? odd + 2 : odd;
  }
}
3246

3347
/**
3448
* Merge prefix `chunk` with `data` and return new combined prefix.
3549
* For data.length < 3, fully consumes data and can return unfinished data,
3650
* otherwise returns a prefix with no unfinished bytes
37-
* @param {Uint8Array} data Uint8Array of potentially UTF-8 bytes
51+
* @param {Uint8Array} data Uint8Array of potentially UTF-8/UTF-16 bytes
3852
* @param {Uint8Array} chunk Prefix to prepend before `data`
53+
* @param {string} enc Encoding to use: utf-8, utf-16le, or utf-16be
3954
* @returns {Uint8Array} If data.length >= 3: an Uint8Array containing `chunk` and a slice of `data`
40-
* so that the result has no unfinished UTF-8 codepoints. If data.length < 3: concat(chunk, data).
55+
* so that the result has no unfinished codepoints. If data.length < 3: concat(chunk, data).
4156
*/
42-
function mergePrefixUtf8(data, chunk) {
57+
function mergePrefix(data, chunk, enc) {
4358
if (data.length === 0) return chunk;
4459
if (data.length < 3) {
4560
// No reason to bruteforce offsets, also it's possible this doesn't yet end the sequence
@@ -57,7 +72,7 @@ function mergePrefixUtf8(data, chunk) {
5772
// Stop at the first offset where unfinished bytes reaches 0 or fits into data
5873
// If that doesn't happen (data too short), just concat chunk and data completely (above)
5974
for (let i = 1; i <= 3; i++) {
60-
const unfinished = unfinishedBytesUtf8(temp, chunk.length + i); // 0-3
75+
const unfinished = unfinishedBytes(temp, chunk.length + i, enc); // 0-3
6176
if (unfinished <= i) {
6277
// Always reachable at 3, but we still need 'unfinished' value for it
6378
const add = i - unfinished; // 0-3
@@ -69,4 +84,4 @@ function mergePrefixUtf8(data, chunk) {
6984
return null;
7085
}
7186

72-
module.exports = { unfinishedBytesUtf8, mergePrefixUtf8 };
87+
module.exports = { unfinishedBytes, mergePrefix };

test/parallel/test-whatwg-encoding-custom-textdecoder.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ assert(TextDecoder);
101101
}
102102

103103
// Test TextDecoder, UTF-16be
104-
if (common.hasIntl) {
104+
{
105105
const dec = new TextDecoder('utf-16be');
106106
const res = dec.decode(Buffer.from('test€', 'utf-16le').swap16());
107107
assert.strictEqual(res, 'test€');

0 commit comments

Comments
 (0)