Skip to content
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 78 additions & 8 deletions gprofiler/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
from typing import Iterable, Optional, Type, cast

import configargparse
from granulate_utils.exceptions import AlreadyInCgroup, UnsupportedCGroupV2
from granulate_utils.linux.cgroups.cpu_cgroup import CpuCgroup
from granulate_utils.linux.cgroups.memory_cgroup import MemoryCgroup
from granulate_utils.linux.ns import is_running_in_init_pid
from granulate_utils.linux.process import is_process_running
from granulate_utils.metadata import Metadata
Expand Down Expand Up @@ -602,6 +605,30 @@ def parse_cmd_args() -> configargparse.Namespace:
" beginning of a session.",
)

parser.add_argument(
"--limit-memory",
default=(1 << 30), # 1Gi, same as in the k8s DaemonSet
dest="memory_limit",
type=int,
help="Limit on the memory used by gProfiler."
)

parser.add_argument(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do I specify no CPU limit? E.g. `--limit-cpu none` should be a possible parameter (not necessarily in this exact form), but you should be able to limit ONLY the memory or ONLY the CPU.

"--limit-cpu",
default=0.5, # 500m, same as in the k8s DaemonSet
dest="cpu_limit",
type=float,
help="Limit on the cpu used by gProfiler."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
help="Limit on the cpu used by gProfiler."
help="Limit on the cpu used by gProfiler. Units are cores and the default is %(default)s"

same for memory (with appropriate units)

)

parser.add_argument(
"--no-cgroups",
action="store_true",
dest="disable_cgroups",
default=False,
help="Disable the cgroups changes.",
)

args = parser.parse_args()

args.perf_inject = args.nodejs_mode == "perf"
Expand Down Expand Up @@ -679,12 +706,6 @@ def verify_preconditions(args: configargparse.Namespace) -> None:
)
sys.exit(1)

if args.log_usage and get_run_mode() not in ("k8s", "container"):
# TODO: we *can* move into another cpuacct cgroup, to let this work also when run as a standalone
# executable.
print("--log-usage is available only when run as a container!", file=sys.stderr)
sys.exit(1)


def setup_signals() -> None:
# When we run under staticx & PyInstaller, both of them forward (some of the) signals to gProfiler.
Expand Down Expand Up @@ -723,6 +744,45 @@ def init_pid_file(pid_file: str) -> None:
Path(pid_file).write_text(str(os.getpid()))


def set_limits(cpu: float, memory: int) -> str:
    """Move gProfiler into its own cpu and memory cgroups and apply limits.

    :param cpu: CPU limit, handed to ``CpuCgroup.set_cpu_limit_cores``.
    :param memory: memory limit, handed to ``MemoryCgroup.set_limit_in_bytes``.
    :return: the ``cgroup`` attribute of the cpu cgroup after the limits were
        applied, or ``"/"`` when cgroup v2 is in use and nothing was changed.
    """
    # Review note from the PR thread: no "checking support" debug log here,
    # only the negative (unsupported) case is reported.
    try:
        cpu_cgroup = CpuCgroup()
        memory_cgroup = MemoryCgroup()
    except UnsupportedCGroupV2:
        logger.error("cgroup v2 is not supported by gProfiler, cpu and memory limits wouldn't be set.")
        # The function is declared to return ``str`` and its result is used as
        # a cgroup path; a bare ``return`` would leak ``None`` to the caller.
        # Fall back to the root cgroup since no new cgroup was entered.
        return "/"

    pid = os.getpid()

    logger.debug("Prepare gProfiler cpu cgroup.")
    try:
        cpu_cgroup.move_to_cgroup("gprofiler", pid)
    except AlreadyInCgroup:
        # Not fatal: the limit below is still applied.
        logger.warning("gProfiler have already a cpu group.")

    logger.debug("Set cpu limit in the cgroup.")
    cpu_cgroup.set_cpu_limit_cores(cpu)

    logger.debug("Prepare gProfiler memory cgroup.")
    try:
        memory_cgroup.move_to_cgroup("gprofiler", pid)
    except AlreadyInCgroup:
        # Not fatal: the limit below is still applied.
        logger.warning("gProfiler have already a memory group.")

    logger.debug("Set memory limit in the cgroup.")
    memory_cgroup.set_limit_in_bytes(memory)

    return cpu_cgroup.cgroup


def setup_usage_logger(log_usage: bool, cgroup: str) -> UsageLoggerInterface:
    """Build the usage logger selected by ``log_usage``.

    :param log_usage: when true, usage is logged for ``cgroup``.
    :param cgroup: cgroup passed to ``CgroupsUsageLogger``.
    :return: a ``CgroupsUsageLogger`` if ``log_usage`` is set, otherwise a
        ``NoopUsageLogger``.
    """
    return CgroupsUsageLogger(logger, cgroup) if log_usage else NoopUsageLogger()


def main() -> None:
args = parse_cmd_args()
verify_preconditions(args)
Expand All @@ -738,10 +798,20 @@ def main() -> None:
remote_logs_handler,
)

# check if there is no kill switch for managing cgroups
# TODO(Creatone): Check the containerized scenario.
cgroup = "/" # assume we run in the root cgroup (when containerized, that's our view)
if not args.disable_cgroups and get_run_mode() not in ("k8s", "container"):
logger.info(f"Trying to set resource limits, cpu='{args.cpu_limit}' and memory='{args.memory_limit}'.")
try:
cgroup = set_limits(args.cpu_limit, args.memory_limit)
except Exception:
logger.exception("Failed to set resource limits, continuing anyway")

setup_signals()
reset_umask()
# assume we run in the root cgroup (when containerized, that's our view)
usage_logger = CgroupsUsageLogger(logger, "/") if args.log_usage else NoopUsageLogger()

usage_logger = CgroupsUsageLogger(logger, cgroup) if args.log_usage else NoopUsageLogger()

try:
init_pid_file(args.pid_file)
Expand Down
17 changes: 14 additions & 3 deletions gprofiler/usage_loggers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,17 @@

import psutil

CGROUPFS_ROOT = "/sys/fs/cgroup" # TODO extract from /proc/mounts, this may change
from granulate_utils.linux.cgroups.cgroup import find_v1_hierarchies, find_v2_hierarchy


# TODO(Creatone): Move it to granulate-utils.
def _obtain_cgroup_controller_path(cgroup: str, controller: str) -> str:
    """Build the filesystem path of ``cgroup`` under the given controller.

    :param cgroup: cgroup path that is appended to the hierarchy path.
    :param controller: controller name, e.g. ``cpuacct`` or ``memory``.
    :return: the hierarchy path for ``controller`` joined with ``cgroup``.
    """
    v1_hierarchies = find_v1_hierarchies()

    # NOTE(review): exactly one v1 hierarchy is treated as "cgroup v2 in use".
    # The files read by the usage loggers (e.g. cpuacct.usage) presumably do
    # not exist under v2 - confirm, and consider raising here instead.
    if len(v1_hierarchies) == 1:
        return f"{find_v2_hierarchy()}/{controller}{cgroup}"

    # The requested controller must have a hierarchy of its own.
    assert controller in v1_hierarchies
    return f"{v1_hierarchies[controller]}{cgroup}"
Copy link
Contributor

@Jongy Jongy Oct 28, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this will work now - because cgroups v2 files are different. For example - there is no cpuacct.usage file, but cpu.stat file which contains a usage_usec field.

I created a ticket for cgroups v2 support. Until that's done, I suggest you raise an exception here if v2 is in use.

Also I see that granulate-utils does this check: len(get_cgroups(os.getpid())) == 1 (in _verify_preconditions). It's subtly different from what you did here (it checks the controllers available for this process instead of the mounted controllers). I think it'll produce the same results, but let's be consistent and use the same check here (you can export it to a function in granulate-utils - generally, if you need to make changes in granulate-utils, you can open a PR there as well, and in the gProfiler repo you just update the revision of the submodule to point to your PR revision)



class UsageLoggerInterface:
Expand All @@ -30,7 +40,8 @@ class CpuUsageLogger(UsageLoggerInterface):

def __init__(self, logger: logging.LoggerAdapter, cgroup: str):
self._logger = logger
self._cpuacct_usage = Path(f"{CGROUPFS_ROOT}{cgroup}cpuacct/cpuacct.usage")
cpu_root = _obtain_cgroup_controller_path(cgroup, 'cpuacct')
self._cpuacct_usage = Path(os.path.join(cpu_root, "cpuacct.usage"))
self._last_usage: Optional[int] = None
self._last_ts: Optional[float] = None

Expand Down Expand Up @@ -78,7 +89,7 @@ class MemoryUsageLogger(UsageLoggerInterface):

def __init__(self, logger: logging.LoggerAdapter, cgroup: str):
self._logger = logger
memory_root = f"{CGROUPFS_ROOT}{cgroup}memory"
memory_root = _obtain_cgroup_controller_path(cgroup, 'memory')
self._memory_usage = Path(os.path.join(memory_root, "memory.usage_in_bytes"))
self._memory_watermark = Path(os.path.join(memory_root, "memory.max_usage_in_bytes"))
self._last_usage: Optional[int] = None
Expand Down