diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
index 8ec1af1bd7a..2f2c0389ab4 100644
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
@@ -5913,7 +5913,7 @@ void MacroAssembler::fill_words(Register base, Register cnt, Register value)
// - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
// return the number of characters copied.
// - java/lang/StringUTF16.compress
-// return zero (0) if copy fails, otherwise 'len'.
+// return index of non-latin1 character if copy fails, otherwise 'len'.
//
// This version always returns the number of characters copied, and does not
// clobber the 'len' register. A successful copy will complete with the post-
@@ -6130,15 +6130,15 @@ address MacroAssembler::byte_array_inflate(Register src, Register dst, Register
}
// Compress char[] array to byte[].
+// Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
+// Return the array length if every element in array can be encoded,
+// otherwise, the index of first non-latin1 (> 0xff) character.
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
Register res,
FloatRegister tmp0, FloatRegister tmp1,
FloatRegister tmp2, FloatRegister tmp3,
FloatRegister tmp4, FloatRegister tmp5) {
encode_iso_array(src, dst, len, res, false, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
- // Adjust result: res == len ? len : 0
- cmp(len, res);
- csel(res, res, zr, EQ);
}
// java.math.round(double a)
diff --git a/src/hotspot/cpu/ppc/ppc.ad b/src/hotspot/cpu/ppc/ppc.ad
index bc7dba082da..f8f435c3522 100644
--- a/src/hotspot/cpu/ppc/ppc.ad
+++ b/src/hotspot/cpu/ppc/ppc.ad
@@ -12799,16 +12799,8 @@ instruct string_compress(rarg1RegP src, rarg2RegP dst, iRegIsrc len, iRegIdst re
ins_cost(300);
format %{ "String Compress $src,$dst,$len -> $result \t// KILL $tmp1, $tmp2, $tmp3, $tmp4, $tmp5" %}
ins_encode %{
- Label Lskip, Ldone;
- __ li($result$$Register, 0);
- __ string_compress_16($src$$Register, $dst$$Register, $len$$Register, $tmp1$$Register,
- $tmp2$$Register, $tmp3$$Register, $tmp4$$Register, $tmp5$$Register, Ldone);
- __ rldicl_($tmp1$$Register, $len$$Register, 0, 64-3); // Remaining characters.
- __ beq(CCR0, Lskip);
- __ string_compress($src$$Register, $dst$$Register, $tmp1$$Register, $tmp2$$Register, Ldone);
- __ bind(Lskip);
- __ mr($result$$Register, $len$$Register);
- __ bind(Ldone);
+ __ encode_iso_array($src$$Register, $dst$$Register, $len$$Register, $tmp1$$Register, $tmp2$$Register,
+ $tmp3$$Register, $tmp4$$Register, $tmp5$$Register, $result$$Register, false);
%}
ins_pipe(pipe_class_default);
%}
diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
index a83be3b8f75..a234c4888e1 100644
--- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
+++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
@@ -1768,14 +1768,12 @@ void C2_MacroAssembler::byte_array_inflate_v(Register src, Register dst, Registe
}
// Compress char[] array to byte[].
-// result: the array length if every element in array can be encoded; 0, otherwise.
+// Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
+// result: the array length if every element in array can be encoded,
+// otherwise, the index of first non-latin1 (> 0xff) character.
void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Register len,
Register result, Register tmp) {
- Label done;
encode_iso_array_v(src, dst, len, result, tmp, false);
- beqz(len, done);
- mv(result, zr);
- bind(done);
}
// Intrinsic for
@@ -1783,7 +1781,7 @@ void C2_MacroAssembler::char_array_compress_v(Register src, Register dst, Regist
// - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
// return the number of characters copied.
// - java/lang/StringUTF16.compress
-// return zero (0) if copy fails, otherwise 'len'.
+// return index of non-latin1 character if copy fails, otherwise 'len'.
//
// This version always returns the number of characters copied. A successful
// copy will complete with the post-condition: 'res' == 'len', while an
diff --git a/src/hotspot/cpu/s390/s390.ad b/src/hotspot/cpu/s390/s390.ad
index 89c2d538092..e45609bce34 100644
--- a/src/hotspot/cpu/s390/s390.ad
+++ b/src/hotspot/cpu/s390/s390.ad
@@ -10177,7 +10177,7 @@ instruct string_compress(iRegP src, iRegP dst, iRegI result, iRegI len, iRegI tm
format %{ "String Compress $src->$dst($len) -> $result" %}
ins_encode %{
__ string_compress($result$$Register, $src$$Register, $dst$$Register, $len$$Register,
- $tmp$$Register, false, false);
+ $tmp$$Register, true, false);
%}
ins_pipe(pipe_class_dummy);
%}
diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
index 31096d07ca2..6575b435232 100644
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp
@@ -8837,15 +8837,19 @@ void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Registe
#undef BLOCK_COMMENT
// Compress char[] array to byte[].
-// ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
+// Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
+// Return the array length if every element in array can be encoded,
+// otherwise, the index of first non-latin1 (> 0xff) character.
// @IntrinsicCandidate
-// private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
+// public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
// for (int i = 0; i < len; i++) {
-// int c = src[srcOff++];
-// if (c >>> 8 != 0) {
-// return 0;
+// char c = src[srcOff];
+// if (c > 0xff) {
+// return i; // return index of non-latin1 char
// }
-// dst[dstOff++] = (byte)c;
+// dst[dstOff] = (byte)c;
+// srcOff++;
+// dstOff++;
// }
// return len;
// }
@@ -8853,7 +8857,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
XMMRegister tmp1Reg, XMMRegister tmp2Reg,
XMMRegister tmp3Reg, XMMRegister tmp4Reg,
Register tmp5, Register result, KRegister mask1, KRegister mask2) {
- Label copy_chars_loop, return_length, return_zero, done;
+ Label copy_chars_loop, done, reset_sp, copy_tail;
// rsi: src
// rdi: dst
@@ -8868,28 +8872,28 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
assert(len != result, "");
// save length for return
- push(len);
+ movl(result, len);
if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
- Label copy_32_loop, copy_loop_tail, below_threshold;
+ Label copy_32_loop, copy_loop_tail, below_threshold, reset_for_copy_tail;
// alignment
Label post_alignment;
- // if length of the string is less than 16, handle it in an old fashioned way
+ // if length of the string is less than 32, handle it the old fashioned way
testl(len, -32);
jcc(Assembler::zero, below_threshold);
// First check whether a character is compressible ( <= 0xFF).
// Create mask to test for Unicode chars inside zmm vector
- movl(result, 0x00FF);
- evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
+ movl(tmp5, 0x00FF);
+ evpbroadcastw(tmp2Reg, tmp5, Assembler::AVX_512bit);
testl(len, -64);
- jcc(Assembler::zero, post_alignment);
+ jccb(Assembler::zero, post_alignment);
movl(tmp5, dst);
andl(tmp5, (32 - 1));
@@ -8898,18 +8902,19 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
// bail out when there is nothing to be done
testl(tmp5, 0xFFFFFFFF);
- jcc(Assembler::zero, post_alignment);
+ jccb(Assembler::zero, post_alignment);
// ~(~0 << len), where len is the # of remaining elements to process
- movl(result, 0xFFFFFFFF);
- shlxl(result, result, tmp5);
- notl(result);
- kmovdl(mask2, result);
+ movl(len, 0xFFFFFFFF);
+ shlxl(len, len, tmp5);
+ notl(len);
+ kmovdl(mask2, len);
+ movl(len, result);
evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
ktestd(mask1, mask2);
- jcc(Assembler::carryClear, return_zero);
+ jcc(Assembler::carryClear, copy_tail);
evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
@@ -8924,7 +8929,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
movl(tmp5, len);
andl(tmp5, (32 - 1)); // tail count (in chars)
andl(len, ~(32 - 1)); // vector count (in chars)
- jcc(Assembler::zero, copy_loop_tail);
+ jccb(Assembler::zero, copy_loop_tail);
lea(src, Address(src, len, Address::times_2));
lea(dst, Address(dst, len, Address::times_1));
@@ -8934,55 +8939,60 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
evpcmpuw(mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
kortestdl(mask1, mask1);
- jcc(Assembler::carryClear, return_zero);
+ jccb(Assembler::carryClear, reset_for_copy_tail);
// All elements in current processed chunk are valid candidates for
// compression. Write a truncated byte elements to the memory.
evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
addptr(len, 32);
- jcc(Assembler::notZero, copy_32_loop);
+ jccb(Assembler::notZero, copy_32_loop);
bind(copy_loop_tail);
// bail out when there is nothing to be done
testl(tmp5, 0xFFFFFFFF);
- jcc(Assembler::zero, return_length);
+ jcc(Assembler::zero, done);
movl(len, tmp5);
// ~(~0 << len), where len is the # of remaining elements to process
- movl(result, 0xFFFFFFFF);
- shlxl(result, result, len);
- notl(result);
+ movl(tmp5, 0xFFFFFFFF);
+ shlxl(tmp5, tmp5, len);
+ notl(tmp5);
- kmovdl(mask2, result);
+ kmovdl(mask2, tmp5);
evmovdquw(tmp1Reg, mask2, Address(src, 0), /*merge*/ false, Assembler::AVX_512bit);
evpcmpw(mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /*signed*/ false, Assembler::AVX_512bit);
ktestd(mask1, mask2);
- jcc(Assembler::carryClear, return_zero);
+ jcc(Assembler::carryClear, copy_tail);
evpmovwb(Address(dst, 0), mask2, tmp1Reg, Assembler::AVX_512bit);
- jmp(return_length);
+ jmp(done);
+
+ bind(reset_for_copy_tail);
+ lea(src, Address(src, tmp5, Address::times_2));
+ lea(dst, Address(dst, tmp5, Address::times_1));
+ subptr(len, tmp5);
+ jmp(copy_chars_loop);
bind(below_threshold);
}
if (UseSSE42Intrinsics) {
- Label copy_32_loop, copy_16, copy_tail;
+ Label copy_32_loop, copy_16, copy_tail_sse, reset_for_copy_tail;
- movl(result, len);
+ // vectored compression
+ testl(len, 0xfffffff8);
+ jcc(Assembler::zero, copy_tail);
movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
+ movdl(tmp1Reg, tmp5);
+ pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
- // vectored compression
- andl(len, 0xfffffff0); // vector count (in chars)
- andl(result, 0x0000000f); // tail count (in chars)
- testl(len, len);
- jcc(Assembler::zero, copy_16);
+ andl(len, 0xfffffff0);
+ jccb(Assembler::zero, copy_16);
// compress 16 chars per iter
- movdl(tmp1Reg, tmp5);
- pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
pxor(tmp4Reg, tmp4Reg);
lea(src, Address(src, len, Address::times_2));
@@ -8995,59 +9005,60 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
por(tmp4Reg, tmp3Reg);
ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector
- jcc(Assembler::notZero, return_zero);
+ jccb(Assembler::notZero, reset_for_copy_tail);
packuswb(tmp2Reg, tmp3Reg); // only ASCII chars; compress each to 1 byte
movdqu(Address(dst, len, Address::times_1), tmp2Reg);
addptr(len, 16);
- jcc(Assembler::notZero, copy_32_loop);
+ jccb(Assembler::notZero, copy_32_loop);
// compress next vector of 8 chars (if any)
bind(copy_16);
- movl(len, result);
- andl(len, 0xfffffff8); // vector count (in chars)
- andl(result, 0x00000007); // tail count (in chars)
- testl(len, len);
- jccb(Assembler::zero, copy_tail);
+ // len = 0
+ testl(result, 0x00000008); // check if there's a block of 8 chars to compress
+ jccb(Assembler::zero, copy_tail_sse);
- movdl(tmp1Reg, tmp5);
- pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
pxor(tmp3Reg, tmp3Reg);
movdqu(tmp2Reg, Address(src, 0));
ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
- jccb(Assembler::notZero, return_zero);
+ jccb(Assembler::notZero, reset_for_copy_tail);
packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte
movq(Address(dst, 0), tmp2Reg);
addptr(src, 16);
addptr(dst, 8);
+ jmpb(copy_tail_sse);
- bind(copy_tail);
+ bind(reset_for_copy_tail);
+ movl(tmp5, result);
+ andl(tmp5, 0x0000000f);
+ lea(src, Address(src, tmp5, Address::times_2));
+ lea(dst, Address(dst, tmp5, Address::times_1));
+ subptr(len, tmp5);
+ jmpb(copy_chars_loop);
+
+ bind(copy_tail_sse);
movl(len, result);
+ andl(len, 0x00000007); // tail count (in chars)
}
// compress 1 char per iter
+ bind(copy_tail);
testl(len, len);
- jccb(Assembler::zero, return_length);
+ jccb(Assembler::zero, done);
lea(src, Address(src, len, Address::times_2));
lea(dst, Address(dst, len, Address::times_1));
negptr(len);
bind(copy_chars_loop);
- load_unsigned_short(result, Address(src, len, Address::times_2));
- testl(result, 0xff00); // check if Unicode char
- jccb(Assembler::notZero, return_zero);
- movb(Address(dst, len, Address::times_1), result); // ASCII char; compress to 1 byte
+ load_unsigned_short(tmp5, Address(src, len, Address::times_2));
+ testl(tmp5, 0xff00); // check if Unicode char
+ jccb(Assembler::notZero, reset_sp);
+ movb(Address(dst, len, Address::times_1), tmp5); // ASCII char; compress to 1 byte
increment(len);
- jcc(Assembler::notZero, copy_chars_loop);
+ jccb(Assembler::notZero, copy_chars_loop);
- // if compression succeeded, return length
- bind(return_length);
- pop(result);
- jmpb(done);
-
- // if compression failed, return 0
- bind(return_zero);
- xorl(result, result);
- addptr(rsp, wordSize);
+ // add len then return (len will be zero if compress succeeded, otherwise negative)
+ bind(reset_sp);
+ addl(result, len);
bind(done);
}
diff --git a/src/java.base/share/classes/java/lang/AbstractStringBuilder.java b/src/java.base/share/classes/java/lang/AbstractStringBuilder.java
index 902a7c61ebd..e472ba94550 100644
--- a/src/java.base/share/classes/java/lang/AbstractStringBuilder.java
+++ b/src/java.base/share/classes/java/lang/AbstractStringBuilder.java
@@ -1675,11 +1675,10 @@ void getBytes(byte[] dst, int dstBegin, byte coder) {
/* for readObject() */
void initBytes(char[] value, int off, int len) {
if (String.COMPACT_STRINGS) {
- this.value = StringUTF16.compress(value, off, len);
- if (this.value != null) {
- this.coder = LATIN1;
- return;
- }
+ byte[] val = StringUTF16.compress(value, off, len);
+ this.coder = StringUTF16.coderFromArrayLen(val, len);
+ this.value = val;
+ return;
}
this.coder = UTF16;
this.value = StringUTF16.toBytes(value, off, len);
@@ -1720,6 +1719,9 @@ private final void putCharsAt(int index, CharSequence s, int off, int end) {
val[j++] = (byte)c;
} else {
inflate();
+ // store c to make sure it has a UTF16 char
+ StringUTF16.putChar(this.value, j++, c);
+ i++;
StringUTF16.putCharsSB(this.value, j, s, i, end);
return;
}
@@ -1807,6 +1809,10 @@ private final void appendChars(CharSequence s, int off, int end) {
} else {
count = j;
inflate();
+ // Store c to make sure sb has a UTF16 char
+ StringUTF16.putChar(this.value, j++, c);
+ count = j;
+ i++;
StringUTF16.putCharsSB(this.value, j, s, i, end);
count += end - i;
return;
@@ -1918,6 +1924,10 @@ public AbstractStringBuilder repeat(int codePoint, int count) {
*
* If {@code cs} is {@code null}, then the four characters
* {@code "null"} are repeated into this sequence.
+ *
+ * The contents are unspecified if the {@code CharSequence}
+ * is modified during the method call or an exception is thrown
+ * when accessing the {@code CharSequence}.
*
* @param cs a {@code CharSequence}
* @param count number of times to copy
diff --git a/src/java.base/share/classes/java/lang/String.java b/src/java.base/share/classes/java/lang/String.java
index cd8995e18e4..5216c9903b9 100644
--- a/src/java.base/share/classes/java/lang/String.java
+++ b/src/java.base/share/classes/java/lang/String.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1994, 2023, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1994, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -344,12 +344,10 @@ public String(int[] codePoints, int offset, int count) {
return;
}
if (COMPACT_STRINGS) {
- byte[] val = StringLatin1.toBytes(codePoints, offset, count);
- if (val != null) {
- this.coder = LATIN1;
- this.value = val;
- return;
- }
+ byte[] val = StringUTF16.compress(codePoints, offset, count);
+ this.coder = StringUTF16.coderFromArrayLen(val, count);
+ this.value = val;
+ return;
}
this.coder = UTF16;
this.value = StringUTF16.toBytes(codePoints, offset, count);
@@ -541,47 +539,43 @@ private String(Charset charset, byte[] bytes, int offset, int length) {
this.coder = LATIN1;
return;
}
- int sl = offset + length;
- byte[] dst = new byte[length];
- if (dp > 0) {
- System.arraycopy(bytes, offset, dst, 0, dp);
- offset += dp;
- }
- while (offset < sl) {
- int b1 = bytes[offset++];
+ // Decode with a stable copy, to be the result if the decoded length is the same
+ byte[] latin1 = Arrays.copyOfRange(bytes, offset, offset + length);
+ int sp = dp; // first dp bytes are already in the copy
+ while (sp < length) {
+ int b1 = latin1[sp++];
if (b1 >= 0) {
- dst[dp++] = (byte)b1;
+ latin1[dp++] = (byte)b1;
continue;
}
- if ((b1 & 0xfe) == 0xc2 && offset < sl) { // b1 either 0xc2 or 0xc3
- int b2 = bytes[offset];
+ if ((b1 & 0xfe) == 0xc2 && sp < length) { // b1 either 0xc2 or 0xc3
+ int b2 = latin1[sp];
if (b2 < -64) { // continuation bytes are always negative values in the range -128 to -65
- dst[dp++] = (byte)decode2(b1, b2);
- offset++;
+ latin1[dp++] = (byte)decode2(b1, b2);
+ sp++;
continue;
}
}
// anything not a latin1, including the REPL
// we have to go with the utf16
- offset--;
+ sp--;
break;
}
- if (offset == sl) {
- if (dp != dst.length) {
- dst = Arrays.copyOf(dst, dp);
+ if (sp == length) {
+ if (dp != latin1.length) {
+ latin1 = Arrays.copyOf(latin1, dp);
}
- this.value = dst;
+ this.value = latin1;
this.coder = LATIN1;
return;
}
- byte[] buf = StringUTF16.newBytesFor(length);
- StringLatin1.inflate(dst, 0, buf, 0, dp);
- dst = buf;
- dp = decodeUTF8_UTF16(bytes, offset, sl, dst, dp, true);
+ byte[] utf16 = StringUTF16.newBytesFor(length);
+ StringLatin1.inflate(latin1, 0, utf16, 0, dp);
+ dp = decodeUTF8_UTF16(latin1, sp, length, utf16, dp, true);
if (dp != length) {
- dst = Arrays.copyOf(dst, dp << 1);
+ utf16 = Arrays.copyOf(utf16, dp << 1);
}
- this.value = dst;
+ this.value = utf16;
this.coder = UTF16;
} else { // !COMPACT_STRINGS
byte[] dst = StringUTF16.newBytesFor(length);
@@ -653,12 +647,10 @@ private String(Charset charset, byte[] bytes, int offset, int length) {
char[] ca = new char[en];
int clen = ad.decode(bytes, offset, length, ca);
if (COMPACT_STRINGS) {
- byte[] bs = StringUTF16.compress(ca, 0, clen);
- if (bs != null) {
- value = bs;
- coder = LATIN1;
- return;
- }
+ byte[] val = StringUTF16.compress(ca, 0, clen);
+ this.coder = StringUTF16.coderFromArrayLen(val, clen);
+ this.value = val;
+ return;
}
coder = UTF16;
value = StringUTF16.toBytes(ca, 0, clen);
@@ -684,12 +676,10 @@ private String(Charset charset, byte[] bytes, int offset, int length) {
throw new Error(x);
}
if (COMPACT_STRINGS) {
- byte[] bs = StringUTF16.compress(ca, 0, caLen);
- if (bs != null) {
- value = bs;
- coder = LATIN1;
- return;
- }
+ byte[] val = StringUTF16.compress(ca, 0, caLen);
+ this.coder = StringUTF16.coderFromArrayLen(val, caLen);
+ this.value = val;
+ return;
}
coder = UTF16;
value = StringUTF16.toBytes(ca, 0, caLen);
@@ -827,10 +817,9 @@ private static String newStringNoRepl1(byte[] src, Charset cs) {
throw new IllegalArgumentException(x);
}
if (COMPACT_STRINGS) {
- byte[] bs = StringUTF16.compress(ca, 0, caLen);
- if (bs != null) {
- return new String(bs, LATIN1);
- }
+ byte[] val = StringUTF16.compress(ca, 0, caLen);
+ byte coder = StringUTF16.coderFromArrayLen(val, caLen);
+ return new String(val, coder);
}
return new String(StringUTF16.toBytes(ca, 0, caLen), UTF16);
}
@@ -4750,15 +4739,18 @@ void getBytes(byte[] dst, int srcPos, int dstBegin, byte coder, int length) {
}
/*
- * Package private constructor. Trailing Void argument is there for
+ * Private constructor. Trailing Void argument is there for
* disambiguating it against other (public) constructors.
*
* Stores the char[] value into a byte[] that each byte represents
* the8 low-order bits of the corresponding character, if the char[]
* contains only latin1 character. Or a byte[] that stores all
* characters in their byte sequences defined by the {@code StringUTF16}.
+ *
+ * <p>The contents of the string are unspecified if the character array
+ * is modified during string construction.
*/
- String(char[] value, int off, int len, Void sig) {
+ private String(char[] value, int off, int len, Void sig) {
if (len == 0) {
this.value = "".value;
this.coder = "".coder;
@@ -4766,11 +4758,9 @@ void getBytes(byte[] dst, int srcPos, int dstBegin, byte coder, int length) {
}
if (COMPACT_STRINGS) {
byte[] val = StringUTF16.compress(value, off, len);
- if (val != null) {
- this.value = val;
- this.coder = LATIN1;
- return;
- }
+ this.coder = StringUTF16.coderFromArrayLen(val, len);
+ this.value = val;
+ return;
}
this.coder = UTF16;
this.value = StringUTF16.toBytes(value, off, len);
@@ -4779,6 +4769,9 @@ void getBytes(byte[] dst, int srcPos, int dstBegin, byte coder, int length) {
/*
* Package private constructor. Trailing Void argument is there for
* disambiguating it against other (public) constructors.
+ *
+ * <p>The contents of the string are unspecified if the {@code StringBuilder}
+ * is modified during string construction.
*/
String(AbstractStringBuilder asb, Void sig) {
byte[] val = asb.getValue();
@@ -4789,12 +4782,9 @@ void getBytes(byte[] dst, int srcPos, int dstBegin, byte coder, int length) {
} else {
// only try to compress val if some characters were deleted.
if (COMPACT_STRINGS && asb.maybeLatin1) {
- byte[] buf = StringUTF16.compress(val, 0, length);
- if (buf != null) {
- this.coder = LATIN1;
- this.value = buf;
- return;
- }
+ this.value = StringUTF16.compress(val, 0, length);
+ this.coder = StringUTF16.coderFromArrayLen(this.value, length);
+ return;
}
this.coder = UTF16;
this.value = Arrays.copyOfRange(val, 0, length << 1);
diff --git a/src/java.base/share/classes/java/lang/StringLatin1.java b/src/java.base/share/classes/java/lang/StringLatin1.java
index 7c12e5711b3..7f901e4717c 100644
--- a/src/java.base/share/classes/java/lang/StringLatin1.java
+++ b/src/java.base/share/classes/java/lang/StringLatin1.java
@@ -47,8 +47,12 @@ public static char charAt(byte[] value, int index) {
return (char)(value[index] & 0xff);
}
+ public static boolean canEncode(char cp) {
+ return cp <= 0xff;
+ }
+
public static boolean canEncode(int cp) {
- return cp >>> 8 == 0;
+ return cp >=0 && cp <= 0xff;
}
public static int length(byte[] value) {
diff --git a/src/java.base/share/classes/java/lang/StringUTF16.java b/src/java.base/share/classes/java/lang/StringUTF16.java
index 73d85863990..314ddae1340 100644
--- a/src/java.base/share/classes/java/lang/StringUTF16.java
+++ b/src/java.base/share/classes/java/lang/StringUTF16.java
@@ -33,7 +33,6 @@
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import jdk.internal.util.ArraysSupport;
-import jdk.internal.vm.annotation.DontInline;
import jdk.internal.vm.annotation.ForceInline;
import jdk.internal.vm.annotation.IntrinsicCandidate;
@@ -42,15 +41,23 @@
final class StringUTF16 {
+ // Return a new byte array for a UTF16-coded string for len chars
+ // Throw an exception if out of range
public static byte[] newBytesFor(int len) {
+ return new byte[newBytesLength(len)];
+ }
+
+ // Check the size of a UTF16-coded string
+ // Throw an exception if out of range
+ public static int newBytesLength(int len) {
if (len < 0) {
throw new NegativeArraySizeException();
}
- if (len > MAX_LENGTH) {
+ if (len >= MAX_LENGTH) {
throw new OutOfMemoryError("UTF16 String size is " + len +
", should be less than " + MAX_LENGTH);
}
- return new byte[len << 1];
+ return len << 1;
}
@IntrinsicCandidate
@@ -147,6 +154,13 @@ public static char[] toChars(byte[] value) {
return dst;
}
+ /**
+ * {@return an encoded byte[] for the UTF16 characters in char[]}
+ * No checking is done on the characters, some may or may not be latin1.
+ * @param value a char array
+ * @param off an offset
+ * @param len a length
+ */
@IntrinsicCandidate
public static byte[] toBytes(char[] value, int off, int len) {
byte[] val = newBytesFor(len);
@@ -157,20 +171,209 @@ public static byte[] toBytes(char[] value, int off, int len) {
return val;
}
- public static byte[] compress(char[] val, int off, int len) {
- byte[] ret = new byte[len];
- if (compress(val, off, ret, 0, len) == len) {
- return ret;
+ // Branch-free way to get the coder from a byte array returned from compress,
+ // which may be either latin1 or UTF16-coded: the sign bit of (len - value.length).
+ // Equivalent to (len == value.length) ? LATIN1 : UTF16
+ @ForceInline
+ static byte coderFromArrayLen(byte[] value, int len) {
+ return (byte) ((len - value.length) >>> Integer.SIZE - 1);
+ }
+
+ /**
+ * {@return a compact strings byte array compressed from the char array (containing UTF16)}
+ * If all the chars are LATIN1, it returns an array with length == count,
+ * otherwise, it contains UTF16 characters.
+ *
+ * A UTF16 array is returned *only* if at least 1 non-latin1 character is present.
+ * This must be true even if the input array is modified while this method is executing.
+ * This is assured by copying the characters while checking for latin1.
+ * If all characters are latin1, a byte array with length equal to count is returned,
+ * indicating all latin1 chars. The scan may be implemented as an intrinsic,
+ * which returns the index of the first non-latin1 character.
+ * When the first non-latin1 character is found, it switches to creating a new
+ * UTF16 buffer, and all {@code count} input characters are copied to it
+ * in a single pass.
+ * The index of the known non-latin1 character is checked in the copy; if it is latin1,
+ * the input has been changed. In this case, a second attempt is made to compress to
+ * latin1 from the copy made in the first pass to the originally allocated latin1 buffer.
+ * If it succeeds the return value is latin1, otherwise, the utf16 value is returned.
+ * In this unusual case, the result is correct for the snapshot of the value.
+ * The resulting string contents are unspecified if the input array is modified during this
+ * operation, but it is ensured that at least 1 non-latin1 character is present in
+ * the non-latin1 buffer.
+ *
+ * @param val a char array
+ * @param off starting offset
+ * @param count count of chars to be compressed, {@code count > 0}
+ */
+ @ForceInline
+ public static byte[] compress(final char[] val, final int off, final int count) {
+ byte[] latin1 = new byte[count];
+ int ndx = compress(val, off, latin1, 0, count);
+ if (ndx != count) {
+ // Switch to UTF16
+ byte[] utf16 = toBytes(val, off, count);
+ // If the original character that was found to be non-latin1 is latin1 in the copy,
+ // try to make a latin1 string from the copy
+ if (getChar(utf16, ndx) > 0xff
+ || compress(utf16, 0, latin1, 0, count) != count) {
+ return utf16;
+ }
}
- return null;
+ return latin1; // latin1 success
+ }
+
+ /**
+ * {@return a compact strings byte array compressed from the internal byte array (containing UTF16)}
+ * If all the chars are LATIN1, it returns an array with length == count,
+ * otherwise, it contains UTF16 characters.
+ *
+ * Refer to the description of the algorithm in {@link #compress(char[], int, int)}.
+ *
+ * @param val a byte array with UTF16 coding
+ * @param off starting offset
+ * @param count count of chars to be compressed, {@code count > 0}
+ */
+ public static byte[] compress(final byte[] val, final int off, final int count) {
+ byte[] latin1 = new byte[count];
+ int ndx = compress(val, off, latin1, 0, count);
+ if (ndx != count) {// Switch to UTF16
+ byte[] utf16 = Arrays.copyOfRange(val, off << 1, newBytesLength(off + count));
+ // If the original character that was found to be non-latin1 is latin1 in the copy,
+ // try to make a latin1 string from the copy
+ if (getChar(utf16, ndx) > 0xff
+ || compress(utf16, 0, latin1, 0, count) != count) {
+ return utf16;
+ }
+ }
+ return latin1; // latin1 success
}
- public static byte[] compress(byte[] val, int off, int len) {
- byte[] ret = new byte[len];
- if (compress(val, off, ret, 0, len) == len) {
- return ret;
+ /**
+ * {@return a compact strings byte array compressed from the code points}
+ * If all the chars are LATIN1, returns an array with length == count.
+ * If not, a new byte array is allocated and code points converted to UTF16.
+ * The algorithm is similar to that of {@link #compress(char[], int, int)}.
+ *
+ * The resulting encoding is attempted in several steps:
+ *
+ * - If no non-latin1 characters are found, the encoding is latin1
+ * - If an estimate of the number of characters needed to represent the codepoints is
+ * equal to the string length, they are all BMP with at least 1 UTF16 character
+ * and are copied to the result.
+ * - The extractCodepoints method is called to carefully expand surrogates.
+ *
+ *
+ * @param val an int array of code points
+ * @param off starting offset
+ * @param count count of code points to be compressed, {@code count > 0}
+ */
+ public static byte[] compress(final int[] val, int off, final int count) {
+ // Optimistically copy all latin1 code points to the destination
+ byte[] latin1 = new byte[count];
+ final int end = off + count;
+ for (int ndx = 0; ndx < count; ndx++, off++) {
+ int cp = val[off];
+ if (cp >= 0 && cp <= 0xff) {
+ latin1[ndx] = (byte)cp;
+ } else {
+ // Pass 1: Compute precise size of char[]; see extractCodepoints for caveat
+ int estSize = ndx + computeCodePointSize(val, off, end);
+
+ // Pass 2: Switch to UTF16
+ // cp = val[off] is at least one code point known to be UTF16
+ byte[] utf16 = newBytesFor(estSize);
+ if (ndx > 0) {
+ StringLatin1.inflate(latin1, 0, utf16, 0, ndx); // inflate latin1 bytes
+ }
+
+ if (estSize == count) {
+ // Based on the computed size, all remaining code points are BMP and
+ // can be copied without checking again
+ putChar(utf16, ndx, cp); // ensure utf16 has a UTF16 char
+ off++;
+ for (int i = ndx + 1; i < count; i++, off++) {
+ putChar(utf16, i, val[off]);
+ }
+ } else {
+ // Some codepoint is a surrogate pair
+ utf16 = extractCodepoints(val, off, end, utf16, ndx);
+
+ // If the character that was found to be UTF16 is no longer UTF16 in the copy,
+ // try to make a latin1 string from the copy
+ if (getChar(utf16, ndx) <= 0xff &&
+ compress(utf16, 0, latin1, 0, count) == count) {
+ return latin1; // latin1 success
+ }
+ }
+ return utf16;
+ }
}
- return null;
+ return latin1; // Latin1 success
+ }
+
+ // Extract code points into chars in the byte array
+ //
+ // Guard against possible races with the input array changing between the previous
+ // computation of the required output size and storing the bmp or surrogates.
+ // If a BMP code point is changed to a supplementary code point it would require 2 chars
+ // in the output. Changing a supplementary char to BMP would reduce the size.
+ // If the utf16 destination is not large enough, it is resized to fit the
+ // remaining codepoints assuming they occupy 2 characters.
+ // The destination may be copied to return exactly the final length.
+ // The additional allocations and compression only occur if the input array is modified.
+ private static byte[] extractCodepoints(int[] val, int off, int end, byte[] dst, int dstOff) {
+ while (off < end) {
+ // Compute a minimum estimate of the number of characters that can be put into the dst
+ // given the current codepoint and the number of remaining codepoints
+ int codePoint = val[off]; // read each codepoint from val only once
+ int dstLimit = dstOff
+ + Character.charCount(codePoint)
+ + (end - off - 1);
+ if (dstLimit > (dst.length >> 1)) {
+ // Resize to hold the remaining codepoints assuming they are all surrogates.
+ // By resizing to the maximum that might be needed, only a single resize will occur.
+ // dstLimit includes only a single char per codepoint, pad with an additional for each.
+ int maxRemaining = dstLimit + (end - off - 1);
+ dst = Arrays.copyOf(dst, newBytesLength(maxRemaining));
+ }
+ // Efficiently copy as many codepoints as fit within the current estimated limit
+ // The dst has at least enough space for the current codepoint.
+ while (true) {
+ if (Character.isBmpCodePoint(codePoint)) {
+ putChar(dst, dstOff++, codePoint);
+ } else {
+ putChar(dst, dstOff++, Character.highSurrogate(codePoint));
+ putChar(dst, dstOff++, Character.lowSurrogate(codePoint));
+ }
+ off++;
+ if (dstOff + 2 > dstLimit)
+ break; // no space for another surrogate; recompute limit
+ codePoint = val[off];
+ }
+ }
+ if (dstOff != (dst.length >> 1)) {
+ // Truncate to actual length; should only occur if a codepoint was racily
+ // changed from a surrogate to a BMP character.
+ return Arrays.copyOf(dst, newBytesLength(dstOff));
+ }
+ return dst;
+ }
+
+ // Compute the number of chars needed for the code points from off to end-1; throws IllegalArgumentException if one is invalid
+ private static int computeCodePointSize(int[] val, int off, int end) {
+ int n = end - off;
+ while (off < end) {
+ int codePoint = val[off++];
+ if (Character.isBmpCodePoint(codePoint)) {
+ continue;
+ } else if (Character.isValidCodePoint(codePoint)) {
+ n++;
+ } else {
+ throw new IllegalArgumentException(Integer.toString(codePoint));
+ }
+ }
+ return n;
}
// compressedCopy char[] -> byte[]
@@ -178,9 +381,8 @@ public static byte[] compress(byte[] val, int off, int len) {
public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
for (int i = 0; i < len; i++) {
char c = src[srcOff];
- if (c > 0xFF) {
- len = 0;
- break;
+ if (c > 0xff) {
+ return i; // return index of non-latin1 char
}
dst[dstOff] = (byte)c;
srcOff++;
@@ -196,9 +398,8 @@ public static int compress(byte[] src, int srcOff, byte[] dst, int dstOff, int l
checkBoundsOffCount(srcOff, len, src);
for (int i = 0; i < len; i++) {
char c = getChar(src, srcOff);
- if (c > 0xFF) {
- len = 0;
- break;
+ if (c > 0xff) {
+ return i; // return index of non-latin1 char
}
dst[dstOff] = (byte)c;
srcOff++;
@@ -207,31 +408,14 @@ public static int compress(byte[] src, int srcOff, byte[] dst, int dstOff, int l
return len;
}
+ // Create the UTF16 buffer for !COMPACT_STRINGS
public static byte[] toBytes(int[] val, int index, int len) {
final int end = index + len;
- // Pass 1: Compute precise size of char[]
- int n = len;
- for (int i = index; i < end; i++) {
- int cp = val[i];
- if (Character.isBmpCodePoint(cp))
- continue;
- else if (Character.isValidCodePoint(cp))
- n++;
- else throw new IllegalArgumentException(Integer.toString(cp));
- }
- // Pass 2: Allocate and fill in pair
+ int n = computeCodePointSize(val, index, end);
+
byte[] buf = newBytesFor(n);
- for (int i = index, j = 0; i < end; i++, j++) {
- int cp = val[i];
- if (Character.isBmpCodePoint(cp)) {
- putChar(buf, j, cp);
- } else {
- putChar(buf, j++, Character.highSurrogate(cp));
- putChar(buf, j, Character.lowSurrogate(cp));
- }
- }
- return buf;
- }
+ return extractCodepoints(val, index, end, buf, 0);
+ }
public static byte[] toBytes(char c) {
byte[] result = new byte[2];
@@ -652,10 +836,9 @@ public static String replace(byte[] value, char oldChar, char newChar) {
if (String.COMPACT_STRINGS &&
!StringLatin1.canEncode(oldChar) &&
StringLatin1.canEncode(newChar)) {
- byte[] val = compress(buf, 0, len);
- if (val != null) {
- return new String(val, LATIN1);
- }
+ byte[] res = StringUTF16.compress(buf, 0, len);
+ byte coder = StringUTF16.coderFromArrayLen(res, len);
+ return new String(res, coder);
}
return new String(buf, UTF16);
}
@@ -770,10 +953,9 @@ public static String replace(byte[] value, int valLen, boolean valLat1,
if (String.COMPACT_STRINGS && replLat1 && !targLat1) {
// combination 6
- byte[] lat1Result = compress(result, 0, resultLen);
- if (lat1Result != null) {
- return new String(lat1Result, LATIN1);
- }
+ byte[] res = StringUTF16.compress(result, 0, resultLen);
+ byte coder = StringUTF16.coderFromArrayLen(res, resultLen);
+ return new String(res, coder); // combination 6
}
return new String(result, UTF16);
}
@@ -837,7 +1019,7 @@ public static String toLowerCase(String str, byte[] value, Locale locale) {
bits |= cp;
putChar(result, i, cp);
}
- if (bits > 0xFF) {
+ if (bits < 0 || bits > 0xff) {
return new String(result, UTF16);
} else {
return newString(result, 0, len);
@@ -938,7 +1120,7 @@ public static String toUpperCase(String str, byte[] value, Locale locale) {
bits |= cp;
putChar(result, i, cp);
}
- if (bits > 0xFF) {
+ if (bits < 0 || bits > 0xff) {
return new String(result, UTF16);
} else {
return newString(result, 0, len);
@@ -1167,10 +1349,9 @@ public static String newString(byte[] val, int index, int len) {
return "";
}
if (String.COMPACT_STRINGS) {
- byte[] buf = compress(val, index, len);
- if (buf != null) {
- return new String(buf, LATIN1);
- }
+ byte[] res = StringUTF16.compress(val, index, len);
+ byte coder = StringUTF16.coderFromArrayLen(res, len);
+ return new String(res, coder);
}
int last = index + len;
return new String(Arrays.copyOfRange(val, index << 1, last << 1), UTF16);
@@ -1501,8 +1682,8 @@ public static int lastIndexOfLatin1(byte[] src, int srcCount,
private static native boolean isBigEndian();
- static final int HI_BYTE_SHIFT;
- static final int LO_BYTE_SHIFT;
+ private static final int HI_BYTE_SHIFT;
+ private static final int LO_BYTE_SHIFT;
static {
if (isBigEndian()) {
HI_BYTE_SHIFT = 8;
diff --git a/test/hotspot/jtreg/compiler/intrinsics/string/TestStringConstructionIntrinsics.java b/test/hotspot/jtreg/compiler/intrinsics/string/TestStringConstructionIntrinsics.java
new file mode 100644
index 00000000000..8bec9822462
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/intrinsics/string/TestStringConstructionIntrinsics.java
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test
+ * @bug 8311906
+ * @summary Validates String constructor intrinsics using varied input data.
+ * @key randomness
+ * @library /compiler/patches /test/lib
+ * @build java.base/java.lang.Helper
+ * @run main/othervm/timeout=1200 -Xbatch -XX:CompileThreshold=100 compiler.intrinsics.string.TestStringConstructionIntrinsics
+ */
+/*
+ * @test
+ * @bug 8311906
+ * @summary Validates String constructor intrinsic for AVX3 works with and without
+ * AVX3Threshold=0
+ * @key randomness
+ * @library /compiler/patches /test/lib
+ * @build java.base/java.lang.Helper
+ * @requires vm.cpu.features ~= ".*avx512.*"
+ * @run main/othervm/timeout=1200 -Xbatch -XX:CompileThreshold=100 -XX:UseAVX=3 compiler.intrinsics.string.TestStringConstructionIntrinsics
+ * @run main/othervm/timeout=1200 -Xbatch -XX:CompileThreshold=100 -XX:UseAVX=3 -XX:+UnlockDiagnosticVMOptions -XX:AVX3Threshold=0 compiler.intrinsics.string.TestStringConstructionIntrinsics
+ */
+
+package compiler.intrinsics.string;
+
+import java.lang.Helper;
+import java.util.Random;
+
+import jdk.test.lib.Utils;
+
+public class TestStringConstructionIntrinsics {
+
+ private static byte[] bytes = new byte[2 * (4096 + 32)];
+
+ private static char[] chars = new char[4096 + 32];
+
+ // Used as a scratch buffer, sized to accommodate inflated data
+ private static byte[] dst = new byte[bytes.length * 2];
+
+ private static final Random RANDOM = Utils.getRandomInstance();
+
+ /**
+ * Fill the UTF16-encoded bytes test array: latin1 chars in [off, off + len), non-latin1
+ * "canary" chars elsewhere, plus nonLatin1 random non-latin1 chars at or after off + nlOffset.
+ */
+ public static void initializeBytes(int off, int len, int nonLatin1, int nlOffset) {
+ int maxLen = bytes.length >> 1;
+ assert (len + off < maxLen);
+ // insert "canary" (non-latin1) values before offset
+ for (int i = 0; i < off; i++) {
+ Helper.putCharSB(bytes, i, ((i + 15) & 0x7F) | 0x180);
+ }
+ // fill the array segment
+ for (int i = off; i < len + off; i++) {
+ Helper.putCharSB(bytes, i, ((i - off + 15) & 0xFF));
+ }
+ if (nonLatin1 != 0) {
+ // modify a number of disparate indexes inside the segment to be non-latin1
+ for (int i = 0; i < nonLatin1; i++) {
+ int idx = off + RANDOM.nextInt(len - nlOffset) + nlOffset;
+ Helper.putCharSB(bytes, idx, ((idx + 15) & 0x7F) | 0x180); // write at the random idx, not the loop counter
+ }
+ }
+ // insert "canary" non-latin1 values after array segment
+ for (int i = len + off; i < maxLen; i++) {
+ Helper.putCharSB(bytes, i, ((i + 15) & 0x7F) | 0x180);
+ }
+ }
+
+ /**
+ * Fill the chars test array: latin1 chars in [off, off + len), non-latin1 "canary" chars
+ * elsewhere, plus nonLatin1 random non-latin1 chars at or after off + nlOffset.
+ */
+ public static void initializeChars(int off, int len, int nonLatin1, int nlOffset) {
+ assert (len + off <= chars.length);
+ // insert "canary" non-latin1 values before offset
+ for (int i = 0; i < off; ++i) {
+ chars[i] = (char) (((i + 15) & 0x7F) | 0x180);
+ }
+ // fill the array segment
+ for (int i = off; i < len + off; ++i) {
+ chars[i] = (char) (((i - off + 15) & 0xFF));
+ }
+ if (nonLatin1 != 0) {
+ // modify a number of disparate chars inside the
+ // segment to be non-latin1 (the same index may be picked more than once).
+ for (int i = 0; i < nonLatin1; i++) {
+ int idx = off + RANDOM.nextInt(len - nlOffset) + nlOffset;
+ chars[idx] = (char) (0x180 | chars[idx]);
+ }
+ }
+ // insert "canary" non-latin1 values after array segment
+ for (int i = len + off; i < chars.length; ++i) {
+ chars[i] = (char) (((i + 15) & 0x7F) | 0x180);
+ }
+ }
+
+ /**
+ * Test different array segment sizes, offsets, and number of non-latin1
+ * chars, using the UTF16-encoded bytes array as the source.
+ */
+ public static void testConstructBytes() throws Exception {
+ for (int off = 0; off < 16; off++) { // starting offset of array segment
+ // Test all array segment sizes 1-63
+ for (int len = 1; len < 64; len++) {
+ testConstructBytes(off, len, 0, 0);
+ testConstructBytes(off, len, 1, 0);
+ testConstructBytes(off, len, RANDOM.nextInt(30) + 2, 0);
+ }
+ // Test a random selection of sizes between 64 and 4099, inclusive
+ for (int i = 0; i < 20; i++) {
+ int len = 64 + RANDOM.nextInt(4100 - 64);
+ testConstructBytes(off, len, 0, 0);
+ testConstructBytes(off, len, 1, 0);
+ testConstructBytes(off, len, RANDOM.nextInt(len) + 2, 0);
+ }
+ for (int len : new int[] { 128, 2048 }) {
+ // intended: non-latin1 chars only in a 1-63 char tail (nlOffset = len)
+ int tail = RANDOM.nextInt(63) + 1;
+ int ng = RANDOM.nextInt(tail) + 1;
+ testConstructBytes(off, len + tail, ng, len);
+ }
+ }
+ }
+
+ private static void testConstructBytes(int off, int len, int ng, int ngOffset) throws Exception {
+ assert (len + off < bytes.length);
+ initializeBytes(off, len, ng, ngOffset);
+ byte[] dst = new byte[bytes.length]; // local scratch buffer; shadows the static dst
+
+ int calculated = Helper.compress(bytes, off, dst, 0, len);
+ int expected = compress(bytes, off, dst, 0, len);
+ if (calculated != expected) {
+ if (expected != len && ng >= 0 && calculated >= 0 && calculated < expected) {
+ // allow intrinsics to return early with a lower value,
+ // but only if we're not expecting the full length (no
+ // non-latin1 chars)
+ return;
+ }
+ throw new Exception("Failed testConstructBytes: " + "offset: " + off + " "
+ + "length: " + len + " " + "return: " + calculated + " expected: " + expected + " negatives: "
+ + ng + " offset: " + ngOffset);
+ }
+ }
+
+ private static int compress(byte[] src, int srcOff, byte[] dst, int dstOff, int len) {
+ // Plain-Java reference: copy latin1 chars until len or the first char > 0xff
+ int copied = 0;
+ while (copied < len) {
+ char ch = Helper.charAt(src, srcOff + copied);
+ if (ch > 0xff) {
+ return copied; // index of the first non-latin1 char
+ }
+ dst[dstOff + copied++] = (byte) ch;
+ }
+ return len;
+ }
+
+ /**
+ * Test different array segment sizes, offsets, and number of non-latin1
+ * chars, using the chars array as the source.
+ */
+ public static void testConstructChars() throws Exception {
+ for (int off = 0; off < 16; off++) { // starting offset of array segment
+ // Test all array segment sizes 1-63
+ for (int len = 1; len < 64; len++) {
+ testConstructChars(off, len, 0, 0);
+ testConstructChars(off, len, 1, 0);
+ testConstructChars(off, len, RANDOM.nextInt(30) + 2, 0);
+ }
+ // Test a random selection of sizes between 64 and 4099, inclusive
+ for (int i = 0; i < 20; i++) {
+ int len = 64 + RANDOM.nextInt(4100 - 64);
+ testConstructChars(off, len, 0, 0);
+ testConstructChars(off, len, 1, 0);
+ testConstructChars(off, len, RANDOM.nextInt(len) + 2, 0);
+ }
+ for (int len : new int[] { 128, 2048 }) {
+ // test with non-latin1 chars only in a 1-63 char tail (nlOffset = len)
+ int tail = RANDOM.nextInt(63) + 1;
+ int ng = RANDOM.nextInt(tail) + 1;
+ testConstructChars(off, len + tail, ng, len);
+ }
+ }
+ }
+
+ private static void testConstructChars(int off, int len, int nonLatin1, int nlOffset) throws Exception {
+ assert (len + off <= chars.length); // the segment must fit in chars, the array actually used here
+ initializeChars(off, len, nonLatin1, nlOffset);
+ // compare StringUTF16.compress (via Helper, possibly intrinsified) with the plain-Java reference
+ int calculated = Helper.compress(chars, off, dst, 0, len);
+ int expected = compress(chars, off, dst, 0, len);
+ if (calculated != expected) {
+ if (expected != len && nonLatin1 >= 0 && calculated >= 0 && calculated < expected) {
+ // allow intrinsics to return early with a lower value,
+ // but only if we're not expecting the full length (no
+ // non-latin1 chars)
+ return;
+ }
+ throw new Exception("Failed testConstructChars: " + "offset: " + off + " "
+ + "length: " + len + " " + "return: " + calculated + " expected: " + expected + " non-latin1: "
+ + nonLatin1 + " offset: " + nlOffset);
+ }
+ }
+
+ private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
+ // Plain-Java reference: copy latin1 chars until len or the first char > 0xff
+ int copied = 0;
+ while (copied < len) {
+ char ch = src[srcOff + copied];
+ if (ch > 0xff) {
+ return copied; // index of the first non-latin1 char
+ }
+ dst[dstOff + copied++] = (byte) ch;
+ }
+ return len;
+ }
+
+ public void run() throws Exception {
+ // repeat both test groups 200 times so the compress calls eventually get compiled and the intrinsic inlined
+ for (int j = 0; j < 200; ++j) {
+ testConstructBytes();
+ testConstructChars();
+ }
+ }
+
+ public static void main(String[] args) throws Exception {
+ (new TestStringConstructionIntrinsics()).run();
+ System.out.println("string construction intrinsics validated");
+ }
+}
diff --git a/test/hotspot/jtreg/compiler/patches/java.base/java/lang/Helper.java b/test/hotspot/jtreg/compiler/patches/java.base/java/lang/Helper.java
index 49cb89b6f7f..a24d7b98ada 100644
--- a/test/hotspot/jtreg/compiler/patches/java.base/java/lang/Helper.java
+++ b/test/hotspot/jtreg/compiler/patches/java.base/java/lang/Helper.java
@@ -44,6 +44,11 @@ public static byte[] compressByte(byte[] src, int srcOff, int dstSize, int dstOf
return dst;
}
+ @jdk.internal.vm.annotation.ForceInline
+ public static int compress(byte[] src, int srcOff, byte[] dst, int dstOff, int len) {
+ return StringUTF16.compress(src, srcOff, dst, dstOff, len);
+ }
+
@jdk.internal.vm.annotation.ForceInline
public static byte[] compressChar(char[] src, int srcOff, int dstSize, int dstOff, int len) {
byte[] dst = new byte[dstSize];
@@ -51,6 +56,11 @@ public static byte[] compressChar(char[] src, int srcOff, int dstSize, int dstOf
return dst;
}
+ @jdk.internal.vm.annotation.ForceInline
+ public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
+ return StringUTF16.compress(src, srcOff, dst, dstOff, len);
+ }
+
@jdk.internal.vm.annotation.ForceInline
public static byte[] inflateByte(byte[] src, int srcOff, int dstSize, int dstOff, int len) {
byte[] dst = new byte[dstSize];
diff --git a/test/jdk/java/lang/String/Chars.java b/test/jdk/java/lang/String/Chars.java
index ab6771b8e0b..035a7a9de27 100644
--- a/test/jdk/java/lang/String/Chars.java
+++ b/test/jdk/java/lang/String/Chars.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -22,10 +22,12 @@
*/
/*
- @test
- @bug 8054307
- @summary test chars() and codePoints()
-*/
+ * @test
+ * @bug 8054307 8311906 8321514
+ * @summary test String chars() and codePoints()
+ * @run main/othervm -XX:+CompactStrings Chars
+ * @run main/othervm -XX:-CompactStrings Chars
+ */
import java.util.Arrays;
import java.util.Random;
@@ -44,6 +46,8 @@ public static void main(String[] args) {
cc[j] = (char)(ccExp[j] = cpExp[j] = r.nextInt(0x80));
}
testChars(cc, ccExp);
+ testCharsSubrange(cc, ccExp);
+ testIntsSubrange(ccExp);
testCPs(cc, cpExp);
// bmp without surrogates
@@ -51,6 +55,7 @@ public static void main(String[] args) {
cc[j] = (char)(ccExp[j] = cpExp[j] = r.nextInt(0x8000));
}
testChars(cc, ccExp);
+ testCharsSubrange(cc, ccExp);
testCPs(cc, cpExp);
// bmp with surrogates
@@ -69,6 +74,8 @@ public static void main(String[] args) {
}
cpExp = Arrays.copyOf(cpExp, k);
testChars(cc, ccExp);
+ testCharsSubrange(cc, ccExp);
+ testIntsSubrange(ccExp);
testCPs(cc, cpExp);
}
}
@@ -76,14 +83,56 @@ public static void main(String[] args) {
static void testChars(char[] cc, int[] expected) {
String str = new String(cc);
if (!Arrays.equals(expected, str.chars().toArray())) {
- throw new RuntimeException("chars/codePoints() failed!");
+ throw new RuntimeException("testChars failed!");
+ }
+ }
+
+ static void testCharsSubrange(char[] cc, int[] expected) {
+ int[] offsets = { 7, 31 }; // offsets to test
+ int LENGTH = 13;
+ for (int i = 0; i < offsets.length; i++) {
+ int offset = Math.max(0, offsets[i]); // confine to the input array
+ int count = Math.min(LENGTH, cc.length - offset);
+ String str = new String(cc, offset, count);
+ int[] actual = str.chars().toArray();
+ int errOffset = Arrays.mismatch(actual, 0, actual.length,
+ expected, offset, offset + count);
+ if (errOffset >= 0) {
+ System.err.printf("expected[%d] (%d) != actual[%d] (%d)%n",
+ offset + errOffset, expected[offset + errOffset],
+ errOffset, actual[errOffset]);
+ System.err.println("expected: " + Arrays.toString(expected));
+ System.err.println("actual: " + Arrays.toString(actual));
+ throw new RuntimeException("testCharsSubrange failed!");
+ }
+ }
+ }
+
+ static void testIntsSubrange(int[] expected) {
+ int[] offsets = { 7, 31 }; // offsets to test
+ int LENGTH = 13;
+ for (int i = 0; i < offsets.length; i++) {
+ int offset = Math.max(0, offsets[i]); // confine to the input array
+ int count = Math.min(LENGTH, expected.length - offset);
+ String str = new String(expected, offset, count);
+ int[] actual = str.chars().toArray();
+ int errOffset = Arrays.mismatch(actual, 0, actual.length,
+ expected, offset, offset + count);
+ if (errOffset >= 0) {
+ System.err.printf("expected[%d] (%d) != actual[%d] (%d)%n",
+ offset + errOffset, expected[offset + errOffset],
+ errOffset, actual[errOffset]);
+ System.err.println("expected: " + Arrays.toString(expected));
+ System.err.println("actual: " + Arrays.toString(actual));
+ throw new RuntimeException("testIntsSubrange failed!");
+ }
}
}
static void testCPs(char[] cc, int[] expected) {
String str = new String(cc);
if (!Arrays.equals(expected, str.codePoints().toArray())) {
- throw new RuntimeException("chars/codePoints() failed!");
+ throw new RuntimeException("testCPs failed!");
}
}
}
diff --git a/test/jdk/java/lang/String/CompactString/MaxSizeUTF16String.java b/test/jdk/java/lang/String/CompactString/MaxSizeUTF16String.java
new file mode 100644
index 00000000000..530171e0b64
--- /dev/null
+++ b/test/jdk/java/lang/String/CompactString/MaxSizeUTF16String.java
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.nio.charset.StandardCharsets;
+
+/*
+ * @test
+ * @bug 8077559 8321180
+ * @summary Tests Compact String for maximum size strings
+ * @requires os.maxMemory >= 8g & vm.bits == 64
+ * @requires vm.flagless
+ * @run junit/othervm -XX:+CompactStrings -Xmx8g MaxSizeUTF16String
+ * @run junit/othervm -XX:-CompactStrings -Xmx8g MaxSizeUTF16String
+ * @run junit/othervm -Xcomp -Xmx8g MaxSizeUTF16String
+ */
+
+public class MaxSizeUTF16String {
+
+ private final static int MAX_UTF16_STRING_LENGTH = Integer.MAX_VALUE / 2;
+
+ private final static String EXPECTED_OOME_MESSAGE = "UTF16 String size is";
+ private final static String EXPECTED_VM_LIMIT_MESSAGE = "Requested array size exceeds VM limit";
+ private final static String UNEXPECTED_JAVA_HEAP_SPACE = "Java heap space";
+
+ // Create a zero-filled byte array of byteSize that starts with the UTF-8 encoding of U+0100 (non-latin1)
+ private static byte[] generateUTF8Data(int byteSize) {
+ byte[] nonAscii = "\u0100".getBytes(StandardCharsets.UTF_8);
+ byte[] arr = new byte[byteSize];
+ System.arraycopy(nonAscii, 0, arr, 0, nonAscii.length); // non-latin1 at start
+ return arr;
+ }
+
+ // Create a zero-filled char array of the given size that starts with U+0100 (non-latin1)
+ private static char[] generateCharData(int size) {
+ char[] nonAscii = "\u0100".toCharArray();
+ char[] arr = new char[size];
+ System.arraycopy(nonAscii, 0, arr, 0, nonAscii.length); // non-latin1 at start
+ return arr;
+ }
+
+ @Test
+ public void testMaxUTF8() {
+ // Overly large UTF-8 data with 1 non-latin1 char
+ final byte[] large_utf8_bytes = generateUTF8Data(MAX_UTF16_STRING_LENGTH + 1);
+ int[] sizes = new int[] {
+ MAX_UTF16_STRING_LENGTH + 1,
+ MAX_UTF16_STRING_LENGTH,
+ MAX_UTF16_STRING_LENGTH - 1};
+ for (int size : sizes) {
+ System.err.println("Checking max UTF16 string len: " + size);
+ try {
+ // Use only part of the UTF-8 byte array
+ new String(large_utf8_bytes, 0, size, StandardCharsets.UTF_8);
+ if (size >= MAX_UTF16_STRING_LENGTH) {
+ fail("Expected OutOfMemoryError with message prefix: " + EXPECTED_OOME_MESSAGE);
+ }
+ } catch (OutOfMemoryError ex) {
+ if (ex.getMessage().equals(UNEXPECTED_JAVA_HEAP_SPACE)) {
+ // Insufficient heap size
+ throw ex;
+ }
+ if (!ex.getMessage().startsWith(EXPECTED_OOME_MESSAGE) &&
+ !ex.getMessage().startsWith(EXPECTED_VM_LIMIT_MESSAGE)) {
+ fail("Failed: Not the OutOfMemoryError expected", ex);
+ }
+ }
+ }
+ }
+
+ @Test
+ public void testMaxCharArray() {
+ // Overly large char array with 1 non-latin1 char
+ final char[] large_char_array = generateCharData(MAX_UTF16_STRING_LENGTH + 1);
+ int[] sizes = new int[]{
+ MAX_UTF16_STRING_LENGTH + 1,
+ MAX_UTF16_STRING_LENGTH,
+ MAX_UTF16_STRING_LENGTH - 1};
+ for (int size : sizes) {
+ System.err.println("Checking max UTF16 string len: " + size);
+ try {
+ // Use only part of the large char array
+ new String(large_char_array, 0, size);
+ if (size >= MAX_UTF16_STRING_LENGTH) {
+ fail("Expected OutOfMemoryError with message prefix: " + EXPECTED_OOME_MESSAGE);
+ }
+ } catch (OutOfMemoryError ex) {
+ if (ex.getMessage().equals(UNEXPECTED_JAVA_HEAP_SPACE)) {
+ // Insufficient heap size
+ throw ex;
+ }
+ if (!ex.getMessage().startsWith(EXPECTED_OOME_MESSAGE) &&
+ !ex.getMessage().startsWith(EXPECTED_VM_LIMIT_MESSAGE)) {
+ throw new RuntimeException("Wrong exception message: " + ex.getMessage(), ex);
+ }
+ }
+ }
+ }
+}
diff --git a/test/jdk/java/lang/String/StringRacyConstructor.java b/test/jdk/java/lang/String/StringRacyConstructor.java
new file mode 100644
index 00000000000..bfec99da75e
--- /dev/null
+++ b/test/jdk/java/lang/String/StringRacyConstructor.java
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package test.java.lang.String;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.Arrays;
+import java.util.ConcurrentModificationException;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.condition.EnabledIf;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.MethodSource;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
+
+/*
+ * @test
+ * @bug 8311906
+ * @modules java.base/java.lang:open
+ * @summary check String's racy constructors
+ * @run junit/othervm -XX:+CompactStrings test.java.lang.String.StringRacyConstructor
+ * @run junit/othervm -XX:-CompactStrings test.java.lang.String.StringRacyConstructor
+ */
+
+public class StringRacyConstructor {
+ private static final byte LATIN1 = 0;
+ private static final byte UTF16 = 1;
+
+ private static final Field STRING_CODER_FIELD;
+ private static final Field SB_CODER_FIELD;
+ private static final boolean COMPACT_STRINGS;
+
+ static {
+ try {
+ STRING_CODER_FIELD = String.class.getDeclaredField("coder");
+ STRING_CODER_FIELD.setAccessible(true);
+ SB_CODER_FIELD = Class.forName("java.lang.AbstractStringBuilder").getDeclaredField("coder");
+ SB_CODER_FIELD.setAccessible(true);
+ COMPACT_STRINGS = isCompactStrings();
+ } catch (NoSuchFieldException ex ) {
+ throw new ExceptionInInitializerError(ex);
+ } catch (ClassNotFoundException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /** {@return true iff CompactStrings are enabled}
+ * The value is read reflectively from the static field String.COMPACT_STRINGS. */
+ public static boolean isCompactStrings() {
+ try {
+ Field compactStringField = String.class.getDeclaredField("COMPACT_STRINGS");
+ compactStringField.setAccessible(true);
+ return compactStringField.getBoolean(null);
+ } catch (NoSuchFieldException ex) {
+ throw new ExceptionInInitializerError(ex);
+ } catch (IllegalAccessException iae) {
+ throw new AssertionError(iae);
+ }
+ }
+
+ // Return the coder for the String
+ private static int coder(String s) {
+ try {
+ return STRING_CODER_FIELD.getByte(s);
+ } catch (IllegalAccessException iae) {
+ throw new AssertionError(iae);
+ }
+ }
+
+ // Return the coder for the StringBuilder
+ private static int sbCoder(StringBuilder sb) {
+ try {
+ return SB_CODER_FIELD.getByte(sb);
+ } catch (IllegalAccessException iae) {
+ throw new AssertionError(iae);
+ }
+ }
+
+ // Return a summary of the internals of the String
+ // The coder and indicate if the coder matches the string contents
+ private static String inspectString(String s) {
+ try {
+ char[] chars = s.toCharArray();
+ String r = new String(chars);
+
+ boolean invalidCoder = coder(s) != coder(r);
+ String coder = STRING_CODER_FIELD.getByte(s) == 0 ? "isLatin1" : "utf16";
+ return (invalidCoder ? "INVALID CODER" : "" ) + " \"" + s + "\", coder: " + coder;
+ } catch (IllegalAccessException ex ) {
+ return "EXCEPTION: " + ex.getMessage();
+ }
+ }
+
+ /**
+ * {@return true if the coder matches the presence/lack of UTF16 characters}
+ * If it returns false, the coder and the contents have failed the precondition for string.
+ * @param orig a string
+ */
+ private static boolean validCoder(String orig) {
+ if (!COMPACT_STRINGS) {
+ assertEquals(UTF16, coder(orig), "Non-COMPACT STRINGS coder must be UTF16");
+ }
+ int accum = 0;
+ for (int i = 0; i < orig.length(); i++)
+ accum |= orig.charAt(i);
+ byte expectedCoder = (accum < 256) ? LATIN1 : UTF16;
+ return expectedCoder == coder(orig);
+ }
+
+ // Check a StringBuilder for consistency of coder and latin1 vs UTF16
+ private static boolean validCoder(StringBuilder orig) {
+ int accum = 0;
+ for (int i = 0; i < orig.length(); i++)
+ accum |= orig.charAt(i);
+ byte expectedCoder = (accum < 256) ? LATIN1 : UTF16;
+ return expectedCoder == sbCoder(orig);
+ }
+
+ @Test
+ @EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings")
+ public void checkStringRange() {
+ char[] chars = {'a', 'b', 'c', 0xff21, 0xff22, 0xff23};
+ String orig = new String(chars);
+ char[] xx = orig.toCharArray();
+ String stringFromChars = new String(xx);
+ assertEquals(orig, stringFromChars, "mixed chars");
+ assertTrue(validCoder(stringFromChars), "invalid coder"
+ + ", invalid coder: " + inspectString(stringFromChars));
+ }
+
+ private static List<String> strings() { // argument source for the @MethodSource("strings") tests
+ return List.of("01234", " ");
+ }
+
+ @ParameterizedTest
+ @MethodSource("strings")
+ @EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings")
+ public void racyString(String orig) {
+ String racyString = racyStringConstruction(orig);
+ // The contents are indeterminate due to the race
+ assertTrue(validCoder(racyString), orig + " string invalid"
+ + ", racyString: " + inspectString(racyString));
+ }
+
+ @ParameterizedTest
+ @MethodSource("strings")
+ @EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings")
+ public void racyCodePoint(String orig) {
+ String iffyString = racyStringConstructionCodepoints(orig);
+ // The contents are indeterminate due to the race
+ assertTrue(validCoder(iffyString), "invalid coder in non-deterministic string"
+ + ", orig:" + inspectString(orig)
+ + ", iffyString: " + inspectString(iffyString));
+ }
+
+ @ParameterizedTest
+ @MethodSource("strings")
+ @EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings")
+ public void racyCodePointSurrogates(String orig) {
+ String iffyString = racyStringConstructionCodepointsSurrogates(orig);
+ // The contents are indeterminate due to the race
+ if (!orig.equals(iffyString))
+ System.err.println("orig: " + orig + ", iffy: " + iffyString + Arrays.toString(iffyString.codePoints().toArray()));
+ assertTrue(validCoder(iffyString), "invalid coder in non-deterministic string"
+ + ", orig:" + inspectString(orig)
+ + ", iffyString: " + inspectString(iffyString));
+ }
+
+ // Test StringUTF16.compress(char[], int, byte[], int, int), invoked reflectively:
+ // it must return the index of the first non-latin1 char, or the length if there is none.
+ @Test
+ public void verifyUTF16CopyBytes()
+ throws ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException {
+ Class<?> stringUTF16 = Class.forName("java.lang.StringUTF16");
+ Method mCompressChars = stringUTF16.getDeclaredMethod("compress",
+ char[].class, int.class, byte[].class, int.class, int.class);
+ mCompressChars.setAccessible(true);
+
+ // First warmup the intrinsic and check 1 case
+ char[] chars = {'a', 'b', 'c', 0xff21, 0xff22, 0xff23};
+ byte[] bytes = new byte[chars.length];
+ int printWarningCount = 0;
+
+ for (int i = 0; i < 1_000_000; i++) { // repeat to get C2 to kick in
+ // Copy only latin1 chars from UTF-16 converted prefix (3 chars -> 3 bytes)
+ int intResult = (int) mCompressChars.invoke(null, chars, 0, bytes, 0, chars.length);
+ if (intResult == 0) {
+ if (printWarningCount == 0) {
+ printWarningCount = 1;
+ System.err.println("Intrinsic for StringUTF16.compress returned 0, may not have been updated.");
+ }
+ } else {
+ assertEquals(3, intResult, "return length not-equal, iteration: " + i);
+ }
+ }
+
+ // Exhaustively check compress returning the correct index of the non-latin1 char.
+ final int SIZE = 48;
+ final byte FILL_BYTE = 'R';
+ chars = new char[SIZE];
+ bytes = new byte[chars.length];
+ for (int i = 0; i < SIZE; i++) { // Every starting index
+ for (int j = i; j < SIZE; j++) { // Every location of non-latin1
+ Arrays.fill(chars, 'A');
+ Arrays.fill(bytes, FILL_BYTE);
+ chars[j] = 0xFF21;
+ int intResult = (int) mCompressChars.invoke(null, chars, i, bytes, 0, chars.length - i);
+ assertEquals(j - i, intResult, "compress found wrong index");
+ assertEquals(FILL_BYTE, bytes[j], "extra character stored");
+ }
+ }
+
+ }
+
+ // Check that a racily constructed "hello" has a valid coder; its interned copy and the literals only appear in the failure message
+ @Test
+ @EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings")
+ public void checkConcatAndIntern() {
+ var helloWorld = "hello world";
+ String helloToo = racyStringConstruction("hell".concat("o")); // racy copy of the concatenated "hello"
+ String o = helloToo.intern(); // NOTE(review): runs before the "hello" literal below is assigned; the ordering looks deliberate — confirm
+ var hello = "hello";
+ assertTrue(validCoder(helloToo), "startsWith: "
+ + ", hell: " + inspectString(helloToo)
+ + ", o: " + inspectString(o)
+ + ", hello: " + inspectString(hello)
+ + ", hello world: " + inspectString(helloWorld));
+ }
+
+    // Trim a racily constructed " " and require the trimmed result to have a valid coder
+    @Test
+    @EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings")
+    public void racyEmptyString() {
+        final String trimmedRacy = racyStringConstruction(" ").trim();
+        final boolean coderOk = validCoder(trimmedRacy);
+        final String details = "empty string invalid coder" + ", trimmed: " + inspectString(trimmedRacy);
+        assertTrue(coderOk, details);
+    }
+
+    // Appending a user implemented CharSequence whose charAt() throws must not
+    // leave the StringBuilder with an invalid coder
+    @Test
+    @EnabledIf("test.java.lang.String.StringRacyConstructor#isCompactStrings")
+    void charSequenceException() {
+        final StringBuilder builder = new StringBuilder();
+        final ThrowingCharSequence faulty = new ThrowingCharSequence("A\u2030\uFFFD");
+        try {
+            builder.append(faulty);
+            fail("An IllegalArgumentException should have been thrown");
+        } catch (IllegalArgumentException expected) {
+            // expected: charAt() throws for the U+FFFD in the sequence
+        }
+        assertTrue(validCoder(builder), "invalid coder in StringBuilder");
+    }
+
+    /**
+     * Given a latin-1 String, attempts to create a copy that is incorrectly
+     * encoded as UTF-16, by constructing Strings from a char[] that another thread
+     * keeps modifying. Gives up after more than 1,000,000 attempts and then
+     * returns the String built last.
+     */
+    public static String racyStringConstruction(String original) throws ConcurrentModificationException {
+        if (original.chars().max().getAsInt() >= 256) {
+            throw new IllegalArgumentException("Only work with latin-1 Strings");
+        }
+        final char[] shared = original.toCharArray();
+
+        // Writer thread: toggles bit 8 of the first char, so it keeps switching
+        // between a latin-1 and a non-latin-1 value until interrupted
+        final Thread flipper = new Thread(() -> {
+            while (!Thread.interrupted()) {
+                shared[0] ^= 256;
+            }
+        });
+        flipper.start();
+
+        // Keep building Strings from the changing array until one shows the race
+        // (first char is latin-1, yet the String differs from the original)
+        String candidate;
+        int attempts = 0;
+        do {
+            attempts++;
+            candidate = new String(shared);
+        } while ((candidate.charAt(0) >= 256 || original.equals(candidate)) && attempts <= 1_000_000);
+
+        flipper.interrupt();
+        try {
+            flipper.join();
+        } catch (InterruptedException ie) {
+            // ignore interrupt
+        }
+        return candidate;
+    }
+
+    /**
+     * Given a latin-1 String, attempts to create a copy that is incorrectly
+     * encoded as UTF-16 by going through the code point constructor
+     * {@code String(int[], int, int)} while another thread mutates the array.
+     * Returns the first String showing the race, or the String built on the
+     * last attempt once more than 1,000,000 attempts have been made.
+     * @throws IllegalArgumentException if {@code original} contains a char >= 256
+     */
+    public static String racyStringConstructionCodepoints(String original) throws ConcurrentModificationException {
+        final boolean latin1Only = original.chars().max().getAsInt() < 256;
+        if (!latin1Only) {
+            throw new IllegalArgumentException("Can only work with latin-1 Strings");
+        }
+
+        // One code point per char of the original
+        final int[] cps = original.chars().toArray();
+        final int count = cps.length;
+
+        // Writer thread: toggles bit 8 of the first code point so that it keeps
+        // switching between a latin-1 and a non-latin-1 value
+        final Thread flipper = new Thread(() -> {
+            while (!Thread.interrupted()) {
+                cps[0] ^= 256;
+            }
+        });
+        flipper.start();
+
+        // Meanwhile keep constructing Strings from the array being modified
+        for (int attempt = 1; ; attempt++) {
+            final String built = new String(cps, 0, count);
+            final boolean raced = built.charAt(0) < 256 && !original.equals(built);
+            if (raced || attempt > 1_000_000) {
+                flipper.interrupt();
+                try {
+                    flipper.join();
+                } catch (InterruptedException ie) {
+                    // ignore interrupt
+                }
+                return built;
+            }
+        }
+    }
+
+    /**
+     * Returns a String built from a code point array whose first element is racily
+     * toggled between its latin-1 value and a supplementary code point, which is
+     * encoded as a high and low surrogate pair. Construction is retried until the
+     * result's length differs from the original's or more than 1,000,000 attempts
+     * have been made; the String built last is returned.
+     *
+     * @throws IllegalArgumentException if {@code original} contains a char >= 256
+     */
+    public static String racyStringConstructionCodepointsSurrogates(String original) throws ConcurrentModificationException {
+        if (original.chars().max().getAsInt() >= 256) {
+            throw new IllegalArgumentException(
+                    "Can only work with latin-1 Strings");
+        }
+
+        final int[] codePoints = original.chars().toArray();
+        final int expectedLength = original.length();
+
+        // Writer thread: toggles bit 16 of the first code point, so it alternates
+        // between the latin-1 value and a code point needing a surrogate pair
+        final Thread toggler = new Thread(() -> {
+            while (!Thread.interrupted()) {
+                codePoints[0] ^= 0x10000;
+            }
+        });
+        toggler.start();
+
+        // Build Strings from the changing array until the length gives the race away
+        String result;
+        int tries = 0;
+        do {
+            tries++;
+            result = new String(codePoints, 0, codePoints.length);
+        } while (result.length() == expectedLength && tries <= 1_000_000);
+
+        toggler.interrupt();
+        try {
+            toggler.join();
+        } catch (InterruptedException ie) {
+            // ignore interrupt
+        }
+        return result;
+    }
+
+    // CharSequence view over a String that fails on demand: charAt() throws
+    // IllegalArgumentException for U+FFFD (the replacement character), so the
+    // position of U+FFFD in the backing string decides when the exception occurs.
+    static class ThrowingCharSequence implements CharSequence {
+        private final String backing;
+
+        ThrowingCharSequence(String aString) {
+            backing = aString;
+        }
+
+        @Override
+        public int length() {
+            return backing.length();
+        }
+
+        @Override
+        public char charAt(int index) {
+            final char c = backing.charAt(index);
+            if (c != 0xFFFD) {
+                return c;
+            }
+            throw new IllegalArgumentException("Replacement character at index " + index);
+        }
+
+        // Not used; ignores the requested range and returns this whole sequence
+        @Override
+        public CharSequence subSequence(int start, int end) {
+            return this;
+        }
+    }
+}
diff --git a/test/jdk/java/nio/file/Files/ReadWriteString.java b/test/jdk/java/nio/file/Files/ReadWriteString.java
index 885cbb771dc..538cc870fa8 100644
--- a/test/jdk/java/nio/file/Files/ReadWriteString.java
+++ b/test/jdk/java/nio/file/Files/ReadWriteString.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -29,8 +29,10 @@
import java.nio.charset.UnmappableCharacterException;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.US_ASCII;
-import static java.nio.charset.StandardCharsets.UTF_16;
import static java.nio.charset.StandardCharsets.UTF_8;
+import static java.nio.charset.StandardCharsets.UTF_16;
+import static java.nio.charset.StandardCharsets.UTF_16BE;
+import static java.nio.charset.StandardCharsets.UTF_16LE;
import java.nio.file.Files;
import java.nio.file.OpenOption;
import java.nio.file.Path;
@@ -40,15 +42,15 @@
import java.util.Arrays;
import java.util.Random;
import java.util.concurrent.Callable;
+import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
import static org.testng.Assert.fail;
-import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
/* @test
- * @bug 8201276 8205058 8209576 8287541 8288589
+ * @bug 8201276 8205058 8209576 8287541 8288589 8325590
* @build ReadWriteString PassThroughFileSystem
* @run testng ReadWriteString
* @summary Unit test for methods for Files readString and write methods.
@@ -61,10 +63,15 @@ public class ReadWriteString {
// data for text files
final String TEXT_UNICODE = "\u201CHello\u201D";
final String TEXT_ASCII = "ABCDEFGHIJKLMNOPQRSTUVWXYZ\n abcdefghijklmnopqrstuvwxyz\n 1234567890\n";
+ final static String TEXT_PERSON_CART_WHEELING = "\ud83e\udd38";
private static final String JA_STRING = "\u65e5\u672c\u8a9e\u6587\u5b57\u5217";
private static final Charset WINDOWS_1252 = Charset.forName("windows-1252");
private static final Charset WINDOWS_31J = Charset.forName("windows-31j");
+ private static final Charset UTF_32 = Charset.forName("utf-32");
+ private static final Charset UTF_32BE = Charset.forName("utf-32be");
+ private static final Charset UTF_32LE = Charset.forName("utf-32le");
+
static byte[] data = getData();
static byte[] getData() {
@@ -154,7 +161,16 @@ public Object[][] getReadString() {
{testFiles[1], TEXT_ASCII, US_ASCII, US_ASCII},
{testFiles[1], TEXT_ASCII, US_ASCII, UTF_8},
{testFiles[1], TEXT_UNICODE, UTF_8, null},
- {testFiles[1], TEXT_UNICODE, UTF_8, UTF_8}
+ {testFiles[1], TEXT_UNICODE, UTF_8, UTF_8},
+ {testFiles[1], TEXT_ASCII, US_ASCII, ISO_8859_1},
+ {testFiles[1], TEXT_PERSON_CART_WHEELING, UTF_16, UTF_16},
+ {testFiles[1], TEXT_PERSON_CART_WHEELING, UTF_16BE, UTF_16BE},
+ {testFiles[1], TEXT_PERSON_CART_WHEELING, UTF_16LE, UTF_16LE},
+ {testFiles[1], TEXT_PERSON_CART_WHEELING, UTF_32, UTF_32},
+ {testFiles[1], TEXT_PERSON_CART_WHEELING, UTF_32BE, UTF_32BE},
+ {testFiles[1], TEXT_PERSON_CART_WHEELING, UTF_32LE, UTF_32LE},
+ {testFiles[1], TEXT_PERSON_CART_WHEELING, WINDOWS_1252, WINDOWS_1252},
+ {testFiles[1], TEXT_PERSON_CART_WHEELING, WINDOWS_31J, WINDOWS_31J}
};
}
@@ -304,6 +320,21 @@ public void testMalformedReadBytes(byte[] data, Charset csRead, Class c) {
try {
c.call();
diff --git a/test/micro/org/openjdk/bench/java/lang/StringConstructor.java b/test/micro/org/openjdk/bench/java/lang/StringConstructor.java
index 1509d6b798f..e9ed0022eda 100644
--- a/test/micro/org/openjdk/bench/java/lang/StringConstructor.java
+++ b/test/micro/org/openjdk/bench/java/lang/StringConstructor.java
@@ -21,11 +21,13 @@
* questions.
*/
-package micro.org.openjdk.bench.java.lang;
+package org.openjdk.bench.java.lang;
import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.Blackhole;
import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
import java.util.concurrent.TimeUnit;
@State(Scope.Thread)
@@ -36,45 +38,115 @@
@Fork(3)
public class StringConstructor {
- @Param({"7", "64"})
- public int size;
-
- // Offset to use for ranged newStrings
- @Param("1")
- public int offset;
- private byte[] array;
-
- @Setup
- public void setup() {
- if (offset > size) {
- offset = size;
- }
- array = "a".repeat(size).getBytes(StandardCharsets.UTF_8);
- }
-
- @Benchmark
- public String newStringFromArray() {
- return new String(array);
- }
-
- @Benchmark
- public String newStringFromArrayWithCharset() {
- return new String(array, StandardCharsets.UTF_8);
- }
-
- @Benchmark
- public String newStringFromArrayWithCharsetName() throws Exception {
- return new String(array, StandardCharsets.UTF_8.name());
- }
-
- @Benchmark
- public String newStringFromRangedArray() {
- return new String(array, offset, array.length - offset);
- }
-
- @Benchmark
- public String newStringFromRangedArrayWithCharset() {
- return new String(array, offset, array.length - offset, StandardCharsets.UTF_8);
- }
+ private static final char INTEROBANG = 0x2030; // non-latin-1 char planted into the "Mixed" inputs. NOTE(review): U+2030 is PER MILLE SIGN, the interrobang is U+203D — confirm name/value
+ // Fixed offset to use for ranged newStrings
+ public final int offset = 1;
+
+ @Param({"7", "64"})
+ public int size; // number of 'a' characters in the base input built by setup()
+
+ private byte[] array; // UTF-8 bytes of the base input
+ private char[] chars; // base input as chars, all latin-1
+ private char[] charsMixedBegin; // copy of chars with INTEROBANG at index 0
+ private char[] charsMixedSmall; // copy of chars with INTEROBANG at index min(length - 1, 7)
+ private char[] charsMixedEnd; // size + 7 chars of 'a' with INTEROBANG in the last slot
+ private int[] codePointsLatin1; // chars widened to ints
+ private int[] codePointsMixedBegin; // widened chars with INTEROBANG at index 0
+ private int[] codePointsMixedSmall; // widened chars with INTEROBANG at index min(length - 1, 7)
+
+    private static int[] intCopyOfChars(char[] chars, int newLength) { // widening copy; slots past chars.length stay 0
+        final int[] widened = new int[newLength];
+        final int copied = Math.min(chars.length, newLength);
+        for (int k = 0; k < copied; k++) { widened[k] = chars[k]; }
+        return widened;
+    }
+
+    @Setup
+    public void setup() {
+        final String latin1 = "a".repeat(size); // base input: size latin-1 chars
+        array = latin1.getBytes(StandardCharsets.UTF_8);
+        chars = latin1.toCharArray();
+        final int smallIdx = Math.min(array.length - 1, 7); // where the "MixedSmall" inputs get their non-latin-1 char
+        charsMixedBegin = Arrays.copyOf(chars, array.length);
+        charsMixedBegin[0] = INTEROBANG;
+        charsMixedSmall = Arrays.copyOf(chars, array.length);
+        charsMixedSmall[smallIdx] = INTEROBANG;
+        charsMixedEnd = new char[size + 7]; // longer input whose only non-latin-1 char is the last one
+        Arrays.fill(charsMixedEnd, 'a');
+        charsMixedEnd[charsMixedEnd.length - 1] = INTEROBANG;
+        codePointsLatin1 = intCopyOfChars(chars, array.length);
+        codePointsMixedBegin = intCopyOfChars(chars, array.length);
+        codePointsMixedBegin[0] = INTEROBANG;
+        codePointsMixedSmall = intCopyOfChars(chars, array.length);
+        codePointsMixedSmall[smallIdx] = INTEROBANG;
+    }
+
+ @Benchmark // String(byte[]) over the whole byte array, no charset argument
+ public String newStringFromBytes() {
+ return new String(array);
+ }
+
+ @Benchmark // String(byte[], int, int) over the bytes from offset to the end of the array
+ public String newStringFromBytesRanged() {
+ return new String(array, offset, array.length - offset);
+ }
+
+ @Benchmark // ranged byte[] constructor (offset to end) with an explicit UTF_8 Charset object
+ public String newStringFromBytesRangedWithCharsetUTF8() {
+ return new String(array, offset, array.length - offset, StandardCharsets.UTF_8);
+ }
+
+ @Benchmark // whole byte array with an explicit UTF_8 Charset object
+ public String newStringFromBytesWithCharsetUTF8() {
+ return new String(array, StandardCharsets.UTF_8);
+ }
+
+ @Benchmark // whole byte array with the charset passed by name (a String), hence the declared Exception
+ public String newStringFromBytesWithCharsetNameUTF8() throws Exception {
+ return new String(array, StandardCharsets.UTF_8.name());
+ }
+
+ @Benchmark // String(char[]) where every char is the latin-1 'a'
+ public String newStringFromCharsLatin1() {
+ return new String(chars);
+ }
+
+ @Benchmark // String(char[]) where the first char is the non-latin-1 INTEROBANG
+ public String newStringFromCharsMixedBegin() {
+ return new String(charsMixedBegin);
+ }
+
+ @Benchmark // String(char[]) where INTEROBANG sits at index min(length - 1, 7)
+ public String newStringFromCharsMixedSmall() {
+ return new String(charsMixedSmall);
+ }
+
+ @Benchmark // String(char[]) over size + 7 chars where only the last one is the non-latin-1 INTEROBANG
+ public String newStringFromCharsMixedEnd() {
+ return new String(charsMixedEnd);
+ }
+
+ @Benchmark // three String(char[]) calls per invocation: INTEROBANG first, INTEROBANG at a small index, all latin-1
+ @CompilerControl(CompilerControl.Mode.DONT_INLINE)
+ public void newStringFromCharsMixedAll(Blackhole bh) {
+ bh.consume(new String(charsMixedBegin));
+ bh.consume(new String(charsMixedSmall));
+ bh.consume(new String(chars));
+ }
+
+ @Benchmark // String(int[], int, int) over the full array of latin-1 code points
+ public String newStringFromCodePointRangedLatin1() {
+ return new String(codePointsLatin1, 0, codePointsLatin1.length);
+ }
+
+ @Benchmark // String(int[], int, int) over the full array whose first code point is INTEROBANG
+ public String newStringFromCodePointRangedMixedBegin() {
+ return new String(codePointsMixedBegin, 0, codePointsMixedBegin.length);
+ }
+
+ @Benchmark // String(int[], int, int) over the full array with INTEROBANG at index min(length - 1, 7)
+ public String newStringFromCodePointRangedMixedSmall() {
+ return new String(codePointsMixedSmall, 0, codePointsMixedSmall.length);
+ }
}