Skip to content

Commit ade1196

Browse files
authored
HBASE-30073 Test fixes for some flappers and a reproducible error (#8057) (#8082) (#8092)
Signed-off-by: Charles Connell <cconnell@apache.org>
1 parent 328b31a commit ade1196

File tree

8 files changed

+137
-27
lines changed

8 files changed

+137
-27
lines changed

hbase-common/src/test/java/org/apache/hadoop/hbase/HBaseJupiterExtension.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ public class HBaseJupiterExtension implements InvocationInterceptor, BeforeAllCa
8484

8585
private static final Map<String, Duration> TAG_TO_TIMEOUT =
8686
ImmutableMap.of(SmallTests.TAG, Duration.ofMinutes(3), MediumTests.TAG, Duration.ofMinutes(6),
87-
LargeTests.TAG, Duration.ofMinutes(13), IntegrationTests.TAG, Duration.ZERO);
87+
LargeTests.TAG, Duration.ofMinutes(20), IntegrationTests.TAG, Duration.ZERO);
8888

8989
private static final String EXECUTOR = "executor";
9090

hbase-compression/pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,11 @@
4646
<artifactId>hbase-resource-bundle</artifactId>
4747
<optional>true</optional>
4848
</dependency>
49+
<dependency>
50+
<groupId>org.junit.jupiter</groupId>
51+
<artifactId>junit-jupiter-api</artifactId>
52+
<scope>test</scope>
53+
</dependency>
4954
</dependencies>
5055

5156
<build>

hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,13 @@ private void createSubDirAndSystemProperty(String propertyName, Path parent, Str
489489

490490
String sysValue = System.getProperty(propertyName);
491491

492-
if (sysValue != null) {
492+
// Check if directory sharing should be disabled for this test.
493+
// Tests that run with high parallelism and don't need shared directories can set this
494+
// to avoid race conditions where one test's tearDown() deletes directories another test
495+
// is still using.
496+
boolean disableSharing = conf.getBoolean("hbase.test.disable-directory-sharing", false);
497+
498+
if (sysValue != null && !disableSharing) {
493499
// There is already a value set. So we do nothing but hope
494500
// that there will be no conflicts
495501
LOG.info("System.getProperty(\"" + propertyName + "\") already set to: " + sysValue
@@ -502,7 +508,7 @@ private void createSubDirAndSystemProperty(String propertyName, Path parent, Str
502508
}
503509
conf.set(propertyName, sysValue);
504510
} else {
505-
// Ok, it's not set, so we create it as a subdirectory
511+
// Ok, it's not set (or sharing is disabled), so we create it as a subdirectory
506512
createSubDir(propertyName, parent, subDirName);
507513
System.setProperty(propertyName, conf.get(propertyName));
508514
}

hbase-server/src/test/java/org/apache/hadoop/hbase/TestZooKeeper.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@ public static void setUpBeforeClass() throws Exception {
7777
conf.setInt(HConstants.ZK_SESSION_TIMEOUT, 1000);
7878
conf.setClass(HConstants.HBASE_MASTER_LOADBALANCER_CLASS, MockLoadBalancer.class,
7979
LoadBalancer.class);
80-
TEST_UTIL.startMiniDFSCluster(2);
8180
}
8281

8382
@AfterAll

hbase-server/src/test/java/org/apache/hadoop/hbase/client/AbstractTestAsyncTableScan.java

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
import java.util.stream.Stream;
4747
import org.apache.hadoop.conf.Configuration;
4848
import org.apache.hadoop.hbase.ConnectionRule;
49+
import org.apache.hadoop.hbase.HBaseConfiguration;
4950
import org.apache.hadoop.hbase.HBaseTestingUtility;
5051
import org.apache.hadoop.hbase.MatcherPredicate;
5152
import org.apache.hadoop.hbase.MiniClusterRule;
@@ -73,8 +74,21 @@
7374
public abstract class AbstractTestAsyncTableScan {
7475

7576
protected static final OpenTelemetryClassRule OTEL_CLASS_RULE = OpenTelemetryClassRule.create();
76-
protected static final MiniClusterRule MINI_CLUSTER_RULE = MiniClusterRule.newBuilder()
77-
.setMiniClusterOption(StartMiniClusterOption.builder().numWorkers(3).build()).build();
77+
78+
private static Configuration createConfiguration() {
79+
// Use HBaseConfiguration.create() instead of new Configuration() to properly load
80+
// hbase-default.xml which contains required default values (e.g. for log cleaner plugins)
81+
Configuration conf = HBaseConfiguration.create();
82+
// Disable directory sharing to prevent race conditions when tests run in parallel.
83+
// Each test instance gets its own isolated directories to avoid one test's tearDown()
84+
// deleting directories another parallel test is still using.
85+
conf.setBoolean("hbase.test.disable-directory-sharing", true);
86+
return conf;
87+
}
88+
89+
protected static final MiniClusterRule MINI_CLUSTER_RULE =
90+
MiniClusterRule.newBuilder().setConfiguration(createConfiguration())
91+
.setMiniClusterOption(StartMiniClusterOption.builder().numWorkers(3).build()).build();
7892

7993
protected static final ConnectionRule CONN_RULE =
8094
ConnectionRule.createAsyncConnectionRule(MINI_CLUSTER_RULE::createAsyncConnection);

hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestRollbackSCP.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,11 @@ public static void tearDownAfterClass() throws IOException {
136136
@Before
137137
public void setUp() throws IOException {
138138
UTIL.ensureSomeNonStoppedRegionServersAvailable(2);
139+
// Surefire reruns failed tests in the same JVM without re-running @BeforeClass. Reset injection
140+
// state so compareAndSet in persistToMeta can succeed again and kill-before-store flags clear.
141+
INJECTED.set(false);
142+
ProcedureTestingUtility.setKillAndToggleBeforeStoreUpdateInRollback(
143+
UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor(), false);
139144
}
140145

141146
private ServerCrashProcedure getSCPForServer(ServerName serverName) throws IOException {

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegion.java

Lines changed: 98 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,10 @@ public void setup() throws IOException {
259259
TEST_UTIL = HBaseTestingUtility.createLocalHTU();
260260
FILESYSTEM = TEST_UTIL.getTestFileSystem();
261261
CONF = TEST_UTIL.getConfiguration();
262+
// Disable directory sharing to prevent race conditions when tests run in parallel.
263+
// Each test instance gets its own isolated directories to avoid one test's tearDown()
264+
// deleting directories another parallel test is still using.
265+
CONF.setBoolean("hbase.test.disable-directory-sharing", true);
262266
NettyAsyncFSWALConfigHelper.setEventLoopConfig(CONF, GROUP, NioSocketChannel.class);
263267
dir = TEST_UTIL.getDataTestDir("TestHRegion").toString();
264268
method = name.getMethodName();
@@ -1352,18 +1356,23 @@ public void testGetWhileRegionClose() throws IOException {
13521356
threads[i].start();
13531357
}
13541358
} finally {
1359+
done.set(true);
1360+
for (GetTillDoneOrException t : threads) {
1361+
if (t != null) {
1362+
try {
1363+
t.join(5000);
1364+
} catch (InterruptedException e) {
1365+
e.printStackTrace();
1366+
}
1367+
}
1368+
}
13551369
if (this.region != null) {
13561370
HBaseTestingUtility.closeRegionAndWAL(this.region);
13571371
this.region = null;
13581372
}
13591373
}
1360-
done.set(true);
1374+
// Check for errors after threads have been stopped
13611375
for (GetTillDoneOrException t : threads) {
1362-
try {
1363-
t.join();
1364-
} catch (InterruptedException e) {
1365-
e.printStackTrace();
1366-
}
13671376
if (t.e != null) {
13681377
LOG.info("Exception=" + t.e);
13691378
assertFalse("Found a NPE in " + t.getName(), t.e instanceof NullPointerException);
@@ -1391,7 +1400,7 @@ class GetTillDoneOrException extends Thread {
13911400

13921401
@Override
13931402
public void run() {
1394-
while (!this.done.get()) {
1403+
while (!this.done.get() && !Thread.currentThread().isInterrupted()) {
13951404
try {
13961405
assertTrue(region.get(g).size() > 0);
13971406
this.count.incrementAndGet();
@@ -4517,7 +4526,7 @@ public void checkNoError() {
45174526
@Override
45184527
public void run() {
45194528
done = false;
4520-
while (!done) {
4529+
while (!done && !Thread.currentThread().isInterrupted()) {
45214530
synchronized (this) {
45224531
try {
45234532
wait();
@@ -4730,7 +4739,7 @@ public void checkNoError() {
47304739
@Override
47314740
public void run() {
47324741
done = false;
4733-
while (!done) {
4742+
while (!done && !Thread.currentThread().isInterrupted()) {
47344743
try {
47354744
for (int r = 0; r < numRows; r++) {
47364745
byte[] row = Bytes.toBytes("row" + r);
@@ -5264,7 +5273,7 @@ public void testParallelIncrementWithMemStoreFlush() throws Exception {
52645273
Runnable flusher = new Runnable() {
52655274
@Override
52665275
public void run() {
5267-
while (!incrementDone.get()) {
5276+
while (!incrementDone.get() && !Thread.currentThread().isInterrupted()) {
52685277
try {
52695278
region.flush(true);
52705279
} catch (Exception e) {
@@ -5280,13 +5289,38 @@ public void run() {
52805289
long expected = (long) threadNum * incCounter;
52815290
Thread[] incrementers = new Thread[threadNum];
52825291
Thread flushThread = new Thread(flusher);
5292+
flushThread.setName("FlushThread-" + method);
52835293
for (int i = 0; i < threadNum; i++) {
52845294
incrementers[i] = new Thread(new Incrementer(this.region, incCounter));
52855295
incrementers[i].start();
52865296
}
52875297
flushThread.start();
5288-
for (int i = 0; i < threadNum; i++) {
5289-
incrementers[i].join();
5298+
try {
5299+
for (int i = 0; i < threadNum; i++) {
5300+
incrementers[i].join();
5301+
}
5302+
5303+
incrementDone.set(true);
5304+
flushThread.join();
5305+
5306+
Get get = new Get(Incrementer.incRow);
5307+
get.addColumn(Incrementer.family, Incrementer.qualifier);
5308+
get.readVersions(1);
5309+
Result res = this.region.get(get);
5310+
List<Cell> kvs = res.getColumnCells(Incrementer.family, Incrementer.qualifier);
5311+
5312+
// we just got the latest version
5313+
assertEquals(1, kvs.size());
5314+
Cell kv = kvs.get(0);
5315+
assertEquals(expected, Bytes.toLong(kv.getValueArray(), kv.getValueOffset()));
5316+
} finally {
5317+
// Ensure flush thread is stopped even if test fails or times out
5318+
incrementDone.set(true);
5319+
flushThread.interrupt();
5320+
flushThread.join(5000); // Wait up to 5 seconds for thread to stop
5321+
if (flushThread.isAlive()) {
5322+
LOG.warn("Flush thread did not stop within timeout for test " + method);
5323+
}
52905324
}
52915325

52925326
incrementDone.set(true);
@@ -5349,7 +5383,7 @@ public void testParallelAppendWithMemStoreFlush() throws Exception {
53495383
Runnable flusher = new Runnable() {
53505384
@Override
53515385
public void run() {
5352-
while (!appendDone.get()) {
5386+
while (!appendDone.get() && !Thread.currentThread().isInterrupted()) {
53535387
try {
53545388
region.flush(true);
53555389
} catch (Exception e) {
@@ -5369,13 +5403,41 @@ public void run() {
53695403
}
53705404
Thread[] appenders = new Thread[threadNum];
53715405
Thread flushThread = new Thread(flusher);
5406+
flushThread.setName("FlushThread-" + method);
53725407
for (int i = 0; i < threadNum; i++) {
53735408
appenders[i] = new Thread(new Appender(this.region, appendCounter));
53745409
appenders[i].start();
53755410
}
53765411
flushThread.start();
5377-
for (int i = 0; i < threadNum; i++) {
5378-
appenders[i].join();
5412+
try {
5413+
for (int i = 0; i < threadNum; i++) {
5414+
appenders[i].join();
5415+
}
5416+
5417+
appendDone.set(true);
5418+
flushThread.join();
5419+
5420+
Get get = new Get(Appender.appendRow);
5421+
get.addColumn(Appender.family, Appender.qualifier);
5422+
get.readVersions(1);
5423+
Result res = this.region.get(get);
5424+
List<Cell> kvs = res.getColumnCells(Appender.family, Appender.qualifier);
5425+
5426+
// we just got the latest version
5427+
assertEquals(1, kvs.size());
5428+
Cell kv = kvs.get(0);
5429+
byte[] appendResult = new byte[kv.getValueLength()];
5430+
System.arraycopy(kv.getValueArray(), kv.getValueOffset(), appendResult, 0,
5431+
kv.getValueLength());
5432+
assertArrayEquals(expected, appendResult);
5433+
} finally {
5434+
// Ensure flush thread is stopped even if test fails or times out
5435+
appendDone.set(true);
5436+
flushThread.interrupt();
5437+
flushThread.join(5000); // Wait up to 5 seconds for thread to stop
5438+
if (flushThread.isAlive()) {
5439+
LOG.warn("Flush thread did not stop within timeout for test " + method);
5440+
}
53795441
}
53805442

53815443
appendDone.set(true);
@@ -7337,7 +7399,7 @@ public void testMutateRowInParallel() throws Exception {
73377399
// Writer thread
73387400
Thread writerThread = new Thread(() -> {
73397401
try {
7340-
while (true) {
7402+
while (!Thread.currentThread().isInterrupted()) {
73417403
// If all the reader threads finish, then stop the writer thread
73427404
if (latch.await(0, TimeUnit.MILLISECONDS)) {
73437405
return;
@@ -7362,15 +7424,19 @@ public void testMutateRowInParallel() throws Exception {
73627424
.addColumn(fam1, q3, tsIncrement + 1, Bytes.toBytes(1L))
73637425
.addColumn(fam1, q4, tsAppend + 1, Bytes.toBytes("a")) });
73647426
}
7427+
} catch (InterruptedException e) {
7428+
// Test interrupted, exit gracefully
7429+
Thread.currentThread().interrupt();
73657430
} catch (Exception e) {
73667431
assertionError.set(new AssertionError(e));
73677432
}
73687433
});
7434+
writerThread.setName("WriterThread-" + method);
73697435
writerThread.start();
73707436

73717437
// Reader threads
73727438
for (int i = 0; i < numReaderThreads; i++) {
7373-
new Thread(() -> {
7439+
Thread readerThread = new Thread(() -> {
73747440
try {
73757441
for (int j = 0; j < 10000; j++) {
73767442
// Verify the values
@@ -7399,13 +7465,24 @@ public void testMutateRowInParallel() throws Exception {
73997465
}
74007466

74017467
latch.countDown();
7402-
}).start();
7468+
});
7469+
readerThread.setName("ReaderThread-" + i + "-" + method);
7470+
readerThread.start();
74037471
}
74047472

7405-
writerThread.join();
7473+
try {
7474+
writerThread.join();
74067475

7407-
if (assertionError.get() != null) {
7408-
throw assertionError.get();
7476+
if (assertionError.get() != null) {
7477+
throw assertionError.get();
7478+
}
7479+
} finally {
7480+
// Ensure writer thread is stopped on test timeout
7481+
writerThread.interrupt();
7482+
writerThread.join(5000);
7483+
if (writerThread.isAlive()) {
7484+
LOG.warn("Writer thread did not stop within timeout for test " + method);
7485+
}
74097486
}
74107487
}
74117488

hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationBase.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,10 @@ protected static void loadData(String prefix, byte[] row, byte[] familyName) thr
183183
protected static void setupConfig(HBaseTestingUtility util, String znodeParent) {
184184
Configuration conf = util.getConfiguration();
185185
conf.set(HConstants.ZOOKEEPER_ZNODE_PARENT, znodeParent);
186+
// Disable directory sharing to prevent race conditions when tests run in parallel.
187+
// Each test instance gets its own isolated directories to avoid one test's tearDown()
188+
// deleting directories another parallel test is still using.
189+
conf.setBoolean("hbase.test.disable-directory-sharing", true);
186190
// We don't want too many edits per batch sent to the ReplicationEndpoint to trigger
187191
// sufficient number of events. But we don't want to go too low because
188192
// HBaseInterClusterReplicationEndpoint partitions entries into batches and we want

0 commit comments

Comments (0)