Skip to content

Distribute scheduling and experiment with throttling #422

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 11 commits into
base: main
Choose a base branch
from
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pathlib
import re
import sys
from enum import StrEnum
from typing import TYPE_CHECKING, Annotated, Any, List, Optional, Union

Expand All @@ -25,6 +26,7 @@
from contentctl.objects.config import CustomApp

import datetime
import random
from functools import cached_property

from contentctl.enrichments.cve_enrichment import CveEnrichmentObj
Expand Down Expand Up @@ -52,11 +54,48 @@
from contentctl.objects.rba import RBAObject, RiskScoreValue_Type
from contentctl.objects.security_content_object import SecurityContentObject
from contentctl.objects.test_group import TestGroup
from contentctl.objects.throttling import Throttling
from contentctl.objects.unit_test import UnitTest

# Those AnalyticsTypes that we do not test via contentctl
SKIPPED_ANALYTICS_TYPES: set[str] = {AnalyticsType.Correlation}

import questionary

Check failure on line 63 in contentctl/objects/abstract_security_content_objects/detection_abstract.py

View workflow job for this annotation

GitHub Actions / lint

Ruff (E402)

contentctl/objects/abstract_security_content_objects/detection_abstract.py:63:1: E402 Module level import not at top of file

# Interactively gather build-time scheduling/throttling experiment settings.
# NOTE(review): these prompts run at module import time, which blocks any
# non-interactive (CI) build that imports this module -- consider moving them
# behind an explicit initialization call.
try:
    # Percentage (0-100) of detections that will ship enabled by default.
    PERCENTAGE_OF_SEARCHES_TO_ENABLE_BY_DEFAULT = int(
        questionary.text(
            "Enter the percentage of searches (as a whole number integer) you want to enable by default",
            default="0",
        ).ask()
    )

    # Whether each detection gets an exact cron start minute ('Yes') or
    # always starts at minute 0 and relies on maximal scheduler skew ('No').
    EXACT_START_MINUTE: bool = questionary.confirm(
        "Shall we assign EXACT start minute for each detection? \n'Yes' "
        "will assign an exact start minute, while 'No' will assign a "
        "minute of '0' and result in maximal skew"
    ).ask()

    if EXACT_START_MINUTE:
        DETERMINISTIC_START_TIMES: bool = questionary.confirm(
            "Shall we deterministically spread detection start times between 0-59 minutes,"
            " with their start times the same between different builds? \nChoosing 'Yes' will "
            "mean that some minutes have more searches scheduled than other minutes.\n"
            "Choosing 'No' will mean that the start time for a specific search can change from build to build"
        ).ask()
    else:
        # If we are not starting at an exact minute, then we always start at
        # minute 0, which leaves the final start time up to the Splunk
        # scheduler -- so deterministic spreading does not apply.
        DETERMINISTIC_START_TIMES = False

except Exception as e:
    # Any failure (non-integer input, aborted prompt, no TTY) is fatal for
    # the build.
    print(f"Issue getting answers for the build. Quitting... \n{e}")
    sys.exit(1)

# Round-robin counter shared by the calculated_cron properties so that start
# minutes are spread as evenly as possible across all scheduled content.
GLOBAL_COUNTER = -1
random.seed(42)  # For reproducibility in tests


class Detection_Abstract(SecurityContentObject):
name: str = Field(..., max_length=CONTENTCTL_MAX_SEARCH_NAME_LENGTH)
Expand All @@ -70,6 +109,94 @@
known_false_positives: str = Field(..., min_length=4)
rba: Optional[RBAObject] = Field(default=None)

@computed_field
@property
def statistically_disabled(self) -> str:
global GLOBAL_COUNTER
"""
Returns a string that indicates whether the detection is statistically disabled.
This is used to determine whether or not in test app builds, for the purposes
of performance testing, this detection should be enabled by default or not.
"""

# Convert the UUID and mod by 100, letting us set probability of this
# search being enabled between 0 and 100

# Remember, the name of this field is disabled, so 0 means the search
# should be "enabled" and 1 means disabled. Kind of feels backwards.
if random.randint(0, 99) < PERCENTAGE_OF_SEARCHES_TO_ENABLE_BY_DEFAULT:
return "false"
else:
return "true"

@computed_field
@property
def calculated_cron(self) -> str:
global GLOBAL_COUNTER
"""
Returns the cron expression for the detection.
Read the docs here to have a better understranding of what cron
expressions are skewable (and good or bad candidates for skewing):
https://docs.splunk.com/Documentation/SplunkCloud/latest/Report/Skewscheduledreportstarttimes#How_the_search_schedule_affects_the_potential_schedule_offset

"""
"""
# Convert the UUID, which is unique per detection, to an integer.
uuid_as_int = int(self.id)
name_hash = hash(self.name)

# Then, mod this by 60. This should give us a fairly random distribution from 0-60
MIN_TIME = 0
MAX_TIME = 59
TIME_DIFF = (MAX_TIME + 1) - MIN_TIME

# We do this instead of imply using randrandge or similar because using the UUID makes
# generation of the cron schedule deterministic, which is useful for testing different
# windows. For example, there is a good chance we may get another request to not have
# things starts within the first 5 minutes, given that many other searches are scheduled
# in ES to kick off at that time.
new_start_minute = name_hash % TIME_DIFF

# Every cron schedule for an ESCU Search is 0 * * * *, we we will just substitute what
# we generated above, ignoring what is actually in the deploymnet
"""

GLOBAL_COUNTER += 1
if not EXACT_START_MINUTE:
if self.type is AnalyticsType.TTP:
return self.deployment.scheduling.cron_schedule.format(minute="*")
else:
return self.deployment.scheduling.cron_schedule.format(minute="0")
print("\nEXACT START MINUTE IS NOT SUPPORTED ANYMORE.\n")
sys.exit(1)
if DETERMINISTIC_START_TIMES:
sys.exit(1)
uuid_as_int = int(self.id)
if self.type is AnalyticsType.TTP:
# TTP run every 15 minutes, so mod this by 15
start_minute = uuid_as_int % 15
else:
start_minute = uuid_as_int % 60

Check failure on line 179 in contentctl/objects/abstract_security_content_objects/detection_abstract.py

View workflow job for this annotation

GitHub Actions / lint

Ruff (F841)

contentctl/objects/abstract_security_content_objects/detection_abstract.py:179:17: F841 Local variable `start_minute` is assigned to but never used

# The spacing of the above implementation winds up being quite poor, maybe because
# our sample size is too small to approach a uniform distribution.
# So just use an int and mod it

# Try our best to spread these as evenly as possible
#

if self.type is AnalyticsType.TTP:
minute_start = GLOBAL_COUNTER % 15
minute_stop = minute_start + 45

return self.deployment.scheduling.cron_schedule.format(
minute_range=f"{minute_start}-{minute_stop}"
)

return self.deployment.scheduling.cron_schedule.format(
minute=GLOBAL_COUNTER % 60
)

@computed_field
@property
def risk_score(self) -> RiskScoreValue_Type:
Expand Down Expand Up @@ -804,22 +931,40 @@
return self

@model_validator(mode="after")
def ensureThrottlingFieldsExist(self):
def automaticallyCreateThrottling(self, default_throttling_period: str = "3600s"):
"""
If throttling is not explicitly configured, then automatically create
it from the risk and threat objects defined in the RBA config.


For throttling to work properly, the fields to throttle on MUST
exist in the search itself. If not, then we cannot apply the throttling
"""
if self.tags.throttling is None:
# No throttling configured for this detection
return self

# Automatically add throttling fields based on the risk and threat objects
if self.rba is None:
# Cannot add any throttling because there is no RBA config
return self

self.tags.throttling = Throttling(
fields=[ro.field for ro in self.rba.risk_objects] # type: ignore
+ [to.field for to in self.rba.threat_objects], # type: ignore
period=default_throttling_period, # provide a default period in line with the argument to this function
)

missing_fields: list[str] = [
field for field in self.tags.throttling.fields if field not in self.search
]
if len(missing_fields) > 0:
raise ValueError(
f"The following throttle fields were missing from the search: {missing_fields}"
print(
f"\nThe following throttle fields were missing from the search [{self.name}]. This is just a warning for now since this is an experimental feature: {missing_fields}\n"
)
return self
# raise ValueError(
# f"The following throttle fields were missing from the search [{self.name}]: {missing_fields}"
# )

else:
# All throttling fields present in search
Expand Down
69 changes: 69 additions & 0 deletions contentctl/objects/baseline.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
model_serializer,
)

from contentctl.objects.abstract_security_content_objects.detection_abstract import (
DETERMINISTIC_START_TIMES,
EXACT_START_MINUTE,
GLOBAL_COUNTER,
)
from contentctl.objects.baseline_tags import BaselineTags
from contentctl.objects.config import CustomApp
from contentctl.objects.constants import (
Expand All @@ -39,6 +44,70 @@
deployment: Deployment = Field({})
status: ContentStatus

@computed_field
@property
def calculated_cron(self) -> str:
global GLOBAL_COUNTER
"""
Returns the cron expression for the detection.
Read the docs here to have a better understranding of what cron
expressions are skewable (and good or bad candidates for skewing):
https://docs.splunk.com/Documentation/SplunkCloud/latest/Report/Skewscheduledreportstarttimes#How_the_search_schedule_affects_the_potential_schedule_offset

"""
"""
# Convert the UUID, which is unique per detection, to an integer.
uuid_as_int = int(self.id)
name_hash = hash(self.name)

# Then, mod this by 60. This should give us a fairly random distribution from 0-60
MIN_TIME = 0
MAX_TIME = 59
TIME_DIFF = (MAX_TIME + 1) - MIN_TIME

# We do this instead of imply using randrandge or similar because using the UUID makes
# generation of the cron schedule deterministic, which is useful for testing different
# windows. For example, there is a good chance we may get another request to not have
# things starts within the first 5 minutes, given that many other searches are scheduled
# in ES to kick off at that time.
new_start_minute = name_hash % TIME_DIFF

# Every cron schedule for an ESCU Search is 0 * * * *, we we will just substitute what
# we generated above, ignoring what is actually in the deploymnet
"""

GLOBAL_COUNTER += 1
if not EXACT_START_MINUTE:
return self.deployment.scheduling.cron_schedule.format(minute="0")

print("\nEXACT START MINUTE IS NOT SUPPORTED ANYMORE.\n")
import sys

sys.exit(1)
if DETERMINISTIC_START_TIMES:
sys.exit(1)
uuid_as_int = int(self.id)
start_minute = uuid_as_int % 60

Check failure on line 90 in contentctl/objects/baseline.py

View workflow job for this annotation

GitHub Actions / lint

Ruff (F841)

contentctl/objects/baseline.py:90:13: F841 Local variable `start_minute` is assigned to but never used

# The spacing of the above implementation winds up being quite poor, maybe because
# our sample size is too small to approach a uniform distribution.
# So just use an int and mod it

# Try our best to spread these as evenly as possible
#

if self.type is AnalyticsType.TTP:

Check failure on line 99 in contentctl/objects/baseline.py

View workflow job for this annotation

GitHub Actions / lint

Ruff (F821)

contentctl/objects/baseline.py:99:25: F821 Undefined name `AnalyticsType`
minute_start = GLOBAL_COUNTER % 15
minute_stop = minute_start + 45

return self.deployment.scheduling.cron_schedule.format(
minute_range=f"{minute_start}-{minute_stop}"
)

return self.deployment.scheduling.cron_schedule.format(
minute=GLOBAL_COUNTER % 60
)

@field_validator("status", mode="after")
@classmethod
def NarrowStatus(cls, status: ContentStatus) -> ContentStatus:
Expand Down
5 changes: 4 additions & 1 deletion contentctl/objects/throttling.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pydantic import BaseModel, Field, field_validator
from typing import Annotated

from pydantic import BaseModel, Field, computed_field, field_validator


# Alert Suppression/Throttling settings have been taken from
# https://docs.splunk.com/Documentation/Splunk/9.2.2/Admin/Savedsearchesconf
Expand Down Expand Up @@ -28,6 +29,8 @@ def no_spaces_in_fields(cls, v: list[str]) -> list[str]:
)
return v

@computed_field
@property
def conf_formatted_fields(self) -> str:
"""
TODO:
Expand Down
2 changes: 1 addition & 1 deletion contentctl/output/templates/savedsearches_baselines.j2
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ action.escu.analytic_story = {{ objectListToNameList(detection.tags.analytic_sto
action.escu.analytic_story = []
{% endif %}
action.escu.data_models = {{ detection.datamodel | tojson }}
cron_schedule = {{ detection.deployment.scheduling.cron_schedule }}
cron_schedule = {{ detection.calculated_cron }}
enableSched = 1
dispatch.earliest_time = {{ detection.deployment.scheduling.earliest_time }}
dispatch.latest_time = {{ detection.deployment.scheduling.latest_time }}
Expand Down
6 changes: 3 additions & 3 deletions contentctl/output/templates/savedsearches_detections.j2
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ action.risk.param._risk = {{ detection.risk | tojson }}
action.risk.param._risk_score = 0
action.risk.param.verbose = 0
{% endif %}
cron_schedule = {{ detection.deployment.scheduling.cron_schedule }}
cron_schedule = {{ detection.calculated_cron }}
dispatch.earliest_time = {{ detection.deployment.scheduling.earliest_time }}
dispatch.latest_time = {{ detection.deployment.scheduling.latest_time }}
action.correlationsearch.enabled = 1
Expand Down Expand Up @@ -70,7 +70,7 @@ action.sendtophantom.param.sensitivity = {{ detection.deployment.alert_action.ph
action.sendtophantom.param.severity = {{ detection.deployment.alert_action.phantom.severity | custom_jinja2_enrichment_filter(detection) }}
{% endif %}
alert.digest_mode = 1
disabled = {{ (not detection.enabled_by_default) | lower }}
disabled = {{ detection.statistically_disabled }}
enableSched = 1
allow_skew = 100%
counttype = number of events
Expand All @@ -80,7 +80,7 @@ realtime_schedule = 0
is_visible = false
{% if detection.tags.throttling %}
alert.suppress = true
alert.suppress.fields = {{ detection.tags.throttling.conf_formatted_fields() }}
alert.suppress.fields = {{ detection.tags.throttling.conf_formatted_fields }}
alert.suppress.period = {{ detection.tags.throttling.period }}
{% endif %}
search = {{ detection.search | escapeNewlines() }}
Expand Down
Loading