Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions conf/cassandra.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2048,6 +2048,11 @@ batch_size_fail_threshold: 50KiB
# Log WARN on any batches not of type LOGGED than span across more partitions than this limit
unlogged_batch_across_partitions_warn_threshold: 10

# GCInspector configs:
# For collectors such as ShenandoahGC and ZGC, some GC events do not involve STW (stop-the-world) pauses;
# these are called concurrent phases. Operators might find it reasonable to use lower thresholds for events
# that require STW pauses and higher thresholds for concurrent phases.
#
# GC Pauses greater than 200 ms will be logged at INFO level
# This threshold can be adjusted to minimize logging if necessary
# Min unit: ms
Expand All @@ -2059,6 +2064,17 @@ unlogged_batch_across_partitions_warn_threshold: 10
# Min unit: ms
# gc_warn_threshold: 1000ms

# GC concurrent phases longer than 1000 ms will be logged at INFO level
# This threshold can be adjusted to minimize logging if necessary
# Min unit: ms
# gc_concurrent_phase_log_threshold: 1000ms

# GC concurrent phases longer than gc_concurrent_phase_warn_threshold will be logged at WARN level
# Adjust the threshold based on your application throughput requirement. Setting it to 0
# will deactivate WARN-level logging of concurrent phases.
# Min unit: ms
# gc_concurrent_phase_warn_threshold: 2000ms

# Maximum size of any value in SSTables. Safety measure to detect SSTable corruption
# early. Any value size larger than this threshold will result into marking an SSTable
# as corrupted. This should be positive and less than 2GiB.
Expand Down
16 changes: 16 additions & 0 deletions conf/cassandra_latest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1913,6 +1913,11 @@ batch_size_fail_threshold: 50KiB
# Log WARN on any batches not of type LOGGED than span across more partitions than this limit
unlogged_batch_across_partitions_warn_threshold: 10

# GCInspector configs:
# For collectors such as ShenandoahGC and ZGC, some GC events do not involve STW (stop-the-world) pauses.
# Such events are called concurrent phases. Operators might find it reasonable to use lower thresholds
# for events that require STW pauses and higher thresholds for concurrent phases.
#
# GC Pauses greater than 200 ms will be logged at INFO level
# This threshold can be adjusted to minimize logging if necessary
# Min unit: ms
Expand All @@ -1924,6 +1929,17 @@ unlogged_batch_across_partitions_warn_threshold: 10
# Min unit: ms
# gc_warn_threshold: 1000ms

# GC concurrent phases longer than 1000 ms will be logged at INFO level
# This threshold can be adjusted to minimize logging if necessary
# Min unit: ms
# gc_concurrent_phase_log_threshold: 1000ms

# GC concurrent phases longer than gc_concurrent_phase_warn_threshold will be logged at WARN level
# Adjust the threshold based on your application throughput requirement. Setting it to 0
# will deactivate WARN-level logging of concurrent phases.
# Min unit: ms
# gc_concurrent_phase_warn_threshold: 2000ms

# Maximum size of any value in SSTables. Safety measure to detect SSTable corruption
# early. Any value size larger than this threshold will result into marking an SSTable
# as corrupted. This should be positive and less than 2GiB.
Expand Down
2 changes: 2 additions & 0 deletions src/java/org/apache/cassandra/config/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -593,6 +593,8 @@ public static class SSTableConfig
public volatile DurationSpec.IntMillisecondsBound gc_log_threshold = new DurationSpec.IntMillisecondsBound("200ms");
@Replaces(oldName = "gc_warn_threshold_in_ms", converter = Converters.MILLIS_DURATION_INT, deprecated = true)
public volatile DurationSpec.IntMillisecondsBound gc_warn_threshold = new DurationSpec.IntMillisecondsBound("1s");
// Duration thresholds for GC concurrent phases (GC events without stop-the-world pauses), kept separate from
// gc_log_threshold / gc_warn_threshold above. Defaults are 1s for INFO logging and 2s for WARN logging;
// GCInspector skips the WARN check when the warn threshold is 0.
public volatile DurationSpec.IntMillisecondsBound gc_concurrent_phase_log_threshold = new DurationSpec.IntMillisecondsBound("1s");
public volatile DurationSpec.IntMillisecondsBound gc_concurrent_phase_warn_threshold = new DurationSpec.IntMillisecondsBound("2s");

// TTL for different types of trace events.
@Replaces(oldName = "tracetype_query_ttl", converter = Converters.SECONDS_DURATION, deprecated=true)
Expand Down
80 changes: 74 additions & 6 deletions src/java/org/apache/cassandra/config/DatabaseDescriptor.java
Original file line number Diff line number Diff line change
Expand Up @@ -1220,6 +1220,22 @@ else if (conf.max_value_size.toMebibytes() >= 2048)
// run audit logging options through sanitation and validation
if (conf.audit_logging_options != null)
setAuditLoggingOptions(conf.audit_logging_options);

try
{
// Run through the validation by passing the current values back to their setters, so we are sure that they are valid.
// We catch IllegalArgumentException and translate it to ConfigurationException to comply with the
// rest of the logic in this method. These setters are also called from GCInspectorMXBean, where IllegalArgumentException
// is thrown on purpose, instead of ConfigurationException, when arguments are invalid.
DatabaseDescriptor.setGCLogThreshold((int) DatabaseDescriptor.getGCLogThreshold());
DatabaseDescriptor.setGCWarnThreshold((int) DatabaseDescriptor.getGCWarnThreshold());
DatabaseDescriptor.setGCConcurrentPhaseLogThreshold(DatabaseDescriptor.getGCConcurrentPhaseLogThreshold());
DatabaseDescriptor.setGCConcurrentPhaseWarnThreshold(DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold());
}
catch (IllegalArgumentException ex)
{
throw new ConfigurationException(ex.getMessage());
}
}

@VisibleForTesting
Expand Down Expand Up @@ -4719,14 +4735,17 @@ public static long getGCLogThreshold()
return conf.gc_log_threshold.toMilliseconds();
}

public static void setGCLogThreshold(int gcLogThreshold)
public static void setGCLogThreshold(int threshold)
{
conf.gc_log_threshold = new DurationSpec.IntMillisecondsBound(gcLogThreshold);
}
if (threshold <= 0)
throw new IllegalArgumentException("Threshold value for gc_log_threshold must be greater than 0");

public static EncryptionContext getEncryptionContext()
{
return encryptionContext;
long gcWarnThresholdInMs = getGCWarnThreshold();
if (gcWarnThresholdInMs != 0 && threshold > gcWarnThresholdInMs)
throw new IllegalArgumentException("Threshold value for gc_log_threshold (" + threshold + ") must be less than gc_warn_threshold which is currently "
+ gcWarnThresholdInMs);

conf.gc_log_threshold = new DurationSpec.IntMillisecondsBound(threshold);
}

public static long getGCWarnThreshold()
Expand All @@ -4736,9 +4755,58 @@ public static long getGCWarnThreshold()

/**
 * Validates and stores a new value for {@code gc_warn_threshold}.
 *
 * @param threshold new threshold in milliseconds; {@code 0} is accepted without being compared
 *                  against {@code gc_log_threshold}
 * @throws IllegalArgumentException if {@code threshold} is negative, or is non-zero and not strictly
 *                                  greater than the current {@code gc_log_threshold}
 */
public static void setGCWarnThreshold(int threshold)
{
    if (threshold < 0)
        throw new IllegalArgumentException("Threshold value for gc_warn_threshold must be greater than or equal to 0");

    final long currentLogThreshold = getGCLogThreshold();
    final boolean nonZero = threshold != 0;
    if (nonZero && threshold <= currentLogThreshold)
    {
        final String message = "Threshold value for gc_warn_threshold (" + threshold + ") must be greater than gc_log_threshold which is currently "
                               + currentLogThreshold;
        throw new IllegalArgumentException(message);
    }

    conf.gc_warn_threshold = new DurationSpec.IntMillisecondsBound(threshold);
}

/**
 * @return the currently configured {@code gc_concurrent_phase_log_threshold}, in milliseconds
 */
public static int getGCConcurrentPhaseLogThreshold()
{
    final DurationSpec.IntMillisecondsBound logThreshold = conf.gc_concurrent_phase_log_threshold;
    return logThreshold.toMilliseconds();
}

/**
 * Validates and stores a new value for {@code gc_concurrent_phase_log_threshold}.
 *
 * @param threshold new threshold in milliseconds; must be positive
 * @throws IllegalArgumentException if {@code threshold} is not positive, or if
 *                                  {@code gc_concurrent_phase_warn_threshold} is non-zero and
 *                                  {@code threshold} is not strictly less than it
 */
public static void setGCConcurrentPhaseLogThreshold(int threshold)
{
    if (threshold <= 0)
        throw new IllegalArgumentException("Threshold value for gc_concurrent_phase_log_threshold must be greater than 0");

    long gcConcurrentPhaseWarnThresholdInMs = getGCConcurrentPhaseWarnThreshold();
    // A warn threshold of 0 is skipped here. Otherwise the log threshold must be strictly lower, which mirrors
    // setGCConcurrentPhaseWarnThreshold rejecting a warn threshold that is <= the log threshold.
    if (gcConcurrentPhaseWarnThresholdInMs != 0 && threshold >= gcConcurrentPhaseWarnThresholdInMs)
        throw new IllegalArgumentException("Threshold value for gc_concurrent_phase_log_threshold (" + threshold + ") must be less than gc_concurrent_phase_warn_threshold which is currently "
                                           + gcConcurrentPhaseWarnThresholdInMs);

    conf.gc_concurrent_phase_log_threshold = new DurationSpec.IntMillisecondsBound(threshold);
}

/**
 * @return the currently configured {@code gc_concurrent_phase_warn_threshold}, in milliseconds
 */
public static int getGCConcurrentPhaseWarnThreshold()
{
    final DurationSpec.IntMillisecondsBound warnThreshold = conf.gc_concurrent_phase_warn_threshold;
    return warnThreshold.toMilliseconds();
}

/**
 * Validates and stores a new value for {@code gc_concurrent_phase_warn_threshold}.
 *
 * @param threshold new threshold in milliseconds; {@code 0} is accepted without being compared
 *                  against {@code gc_concurrent_phase_log_threshold}
 * @throws IllegalArgumentException if {@code threshold} is negative, or is non-zero and not strictly
 *                                  greater than the current {@code gc_concurrent_phase_log_threshold}
 */
public static void setGCConcurrentPhaseWarnThreshold(int threshold)
{
    if (threshold < 0)
        throw new IllegalArgumentException("Threshold value for gc_concurrent_phase_warn_threshold must be greater than or equal to 0");

    final long currentLogThreshold = getGCConcurrentPhaseLogThreshold();
    final boolean nonZero = threshold != 0;
    if (nonZero && threshold <= currentLogThreshold)
    {
        final String message = "Threshold value for gc_concurrent_phase_warn_threshold (" + threshold + ") must be greater than gc_concurrent_phase_log_threshold which is currently "
                               + currentLogThreshold;
        throw new IllegalArgumentException(message);
    }

    conf.gc_concurrent_phase_warn_threshold = new DurationSpec.IntMillisecondsBound(threshold);
}

/**
 * @return the {@code encryptionContext} currently held by this class
 */
public static EncryptionContext getEncryptionContext()
{
    return encryptionContext;
}

public static boolean isCDCEnabled()
{
return conf.cdc_enabled;
Expand Down
89 changes: 65 additions & 24 deletions src/java/org/apache/cassandra/service/GCInspector.java
Original file line number Diff line number Diff line change
Expand Up @@ -287,23 +287,56 @@ public void handleNotification(final Notification notification, final Object han
if (state.compareAndSet(prev, new State(duration, bytes, prev)))
break;
}

if (getGcWarnThresholdInMs() != 0 && duration > getGcWarnThresholdInMs())
logger.warn(sb.toString());
else if (duration > getGcLogThresholdInMs())
logger.info(sb.toString());
else if (logger.isTraceEnabled())
logger.trace(sb.toString());

if (duration > this.getStatusThresholdInMs())
StatusLogger.log();
if (isConcurrentPhase(info.getGcCause(), info.getGcName()))
{
if (getGcConcurrentPhaseWarnThresholdInMs() != 0 && duration > getGcConcurrentPhaseWarnThresholdInMs())
logger.warn(sb.toString());
else if (duration > getGcConcurrentPhaseLogThresholdInMs())
logger.info(sb.toString());
else if (logger.isTraceEnabled())
logger.trace(sb.toString());

if (duration > this.getConcurrentStatusThresholdInMs())
StatusLogger.log();
}
else
{
if (getGcWarnThresholdInMs() != 0 && duration > getGcWarnThresholdInMs())
logger.warn(sb.toString());
else if (duration > getGcLogThresholdInMs())
logger.info(sb.toString());
else if (logger.isTraceEnabled())
logger.trace(sb.toString());

if (duration > this.getStatusThresholdInMs())
StatusLogger.log();
}

// if we just finished an old gen collection and we're still using a lot of memory, try to reduce the pressure
if (gcState.assumeGCIsOldGen)
LifecycleTransaction.rescheduleFailedDeletions();
}
}

/**
 * Decides whether a GC notification describes a concurrent phase rather than a stop-the-world pause,
 * based on the cause and collector name reported with the notification.
 *
 * @param cause the reported GC cause; may be {@code null}
 * @param name  the reported collector name; may be {@code null}, in which case only {@code cause} is considered
 * @return {@code true} if the cause is "No GC", the name is "G1 Concurrent GC", or the name ends with " Cycles"
 */
static boolean isConcurrentPhase(String cause, String name)
{
    // Mostly taken from: https://github.com/Netflix/spectator/blob/v1.7.x/spectator-ext-gc/src/main/java/com/netflix/spectator/gc/GcLogger.java
    // So far the only indicator known is that the cause will be reported as "No GC"
    // when using CMS.
    //
    // For ZGC, behavior was changed in JDK17: https://bugs.openjdk.java.net/browse/JDK-8265136
    // For ZGC in older versions, there is no way to accurately get the amount of time
    // in STW pauses.
    //
    // For G1, a new bean is added in JDK20 to indicate time spent in concurrent phases:
    // https://bugs.openjdk.org/browse/JDK-8297247

    if ("No GC".equals(cause)) // CMS
        return true;

    // Without a collector name there is nothing further to match on; avoid dereferencing null below.
    if (name == null)
        return false;

    return "G1 Concurrent GC".equals(name) // G1 in JDK20+
           || name.endsWith(" Cycles");    // Shenandoah, ZGC
}


public State getTotalSinceLastCheck()
{
return state.getAndSet(new State());
Expand Down Expand Up @@ -378,14 +411,6 @@ private static long getFieldValue(Field field, boolean isAtomicLong)

public void setGcWarnThresholdInMs(long threshold)
{
long gcLogThresholdInMs = getGcLogThresholdInMs();
if (threshold < 0)
throw new IllegalArgumentException("Threshold must be greater than or equal to 0");
if (threshold != 0 && threshold <= gcLogThresholdInMs)
throw new IllegalArgumentException("Threshold must be greater than gcLogThresholdInMs which is currently "
+ gcLogThresholdInMs);
if (threshold > Integer.MAX_VALUE)
throw new IllegalArgumentException("Threshold must be less than Integer.MAX_VALUE");
DatabaseDescriptor.setGCWarnThreshold((int)threshold);
}

Expand All @@ -396,15 +421,27 @@ public long getGcWarnThresholdInMs()

public void setGcLogThresholdInMs(long threshold)
{
if (threshold <= 0)
throw new IllegalArgumentException("Threshold must be greater than 0");
DatabaseDescriptor.setGCLogThreshold((int) threshold);
}

long gcWarnThresholdInMs = getGcWarnThresholdInMs();
if (gcWarnThresholdInMs != 0 && threshold > gcWarnThresholdInMs)
throw new IllegalArgumentException("Threshold must be less than gcWarnThresholdInMs which is currently "
+ gcWarnThresholdInMs);
/**
 * @return the value reported by {@link DatabaseDescriptor#getGCConcurrentPhaseWarnThreshold()}
 */
public int getGcConcurrentPhaseWarnThresholdInMs()
{
    final int warnThreshold = DatabaseDescriptor.getGCConcurrentPhaseWarnThreshold();
    return warnThreshold;
}

DatabaseDescriptor.setGCLogThreshold((int) threshold);
/**
 * Forwards the new concurrent-phase warn threshold to
 * {@link DatabaseDescriptor#setGCConcurrentPhaseWarnThreshold(int)}; no additional checks are done here.
 *
 * @param threshold the value handed to the descriptor setter unchanged
 */
public void setGcConcurrentPhaseWarnThresholdInMs(int threshold)
{
    DatabaseDescriptor.setGCConcurrentPhaseWarnThreshold(threshold);
}

/**
 * @return the value reported by {@link DatabaseDescriptor#getGCConcurrentPhaseLogThreshold()}
 */
public int getGcConcurrentPhaseLogThresholdInMs()
{
    final int logThreshold = DatabaseDescriptor.getGCConcurrentPhaseLogThreshold();
    return logThreshold;
}

/**
 * Forwards the new concurrent-phase log threshold to
 * {@link DatabaseDescriptor#setGCConcurrentPhaseLogThreshold(int)}; no additional checks are done here.
 *
 * @param threshold the value handed to the descriptor setter unchanged
 */
public void setGcConcurrentPhaseLogThresholdInMs(int threshold)
{
    DatabaseDescriptor.setGCConcurrentPhaseLogThreshold(threshold);
}

public long getGcLogThresholdInMs()
Expand All @@ -417,4 +454,8 @@ public long getStatusThresholdInMs()
return getGcWarnThresholdInMs() != 0 ? getGcWarnThresholdInMs() : getGcLogThresholdInMs();
}

/**
 * Duration above which a concurrent GC phase triggers a {@code StatusLogger} dump.
 *
 * @return the concurrent-phase warn threshold when it is non-zero, otherwise the concurrent-phase log threshold
 */
public long getConcurrentStatusThresholdInMs()
{
    // Read the warn threshold once: it can be changed at runtime through the MXBean setter, and reading it
    // twice could pass the non-zero check and then return 0 if it is updated in between.
    int warnThreshold = getGcConcurrentPhaseWarnThresholdInMs();
    return warnThreshold != 0 ? warnThreshold : getGcConcurrentPhaseLogThresholdInMs();
}
}
4 changes: 4 additions & 0 deletions src/java/org/apache/cassandra/service/GCInspectorMXBean.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ public interface GCInspectorMXBean
void setGcWarnThresholdInMs(long threshold);
long getGcWarnThresholdInMs();
void setGcLogThresholdInMs(long threshold);
int getGcConcurrentPhaseLogThresholdInMs();
void setGcConcurrentPhaseWarnThresholdInMs(int threshold);
int getGcConcurrentPhaseWarnThresholdInMs();
void setGcConcurrentPhaseLogThresholdInMs(int threshold);
long getGcLogThresholdInMs();
long getStatusThresholdInMs();
}
Loading