diff --git a/deepeval/benchmarks/human_eval/human_eval.py b/deepeval/benchmarks/human_eval/human_eval.py
index fe3a3c887..9f4cbcc81 100644
--- a/deepeval/benchmarks/human_eval/human_eval.py
+++ b/deepeval/benchmarks/human_eval/human_eval.py
@@ -1,4 +1,10 @@
-from typing import List, Optional, Dict
+import ast
+import logging
+import os
+import subprocess
+import sys
+import tempfile
+from typing import Dict, List, Optional
 
 from deepeval.dataset import Golden
 from deepeval.benchmarks.base_benchmark import (
@@ -11,6 +17,127 @@ from deepeval.telemetry import capture_benchmark_run
 
 
+logger = logging.getLogger(__name__)
+
+SAFE_IMPORT_MODULES = {
+    "math",
+    "collections",
+    "itertools",
+    "string",
+    "re",
+    "functools",
+    "typing",
+    "heapq",
+    "bisect",
+    "copy",
+    "operator",
+}
+
+
+def _is_code_safe_for_humaneval(code_str: str) -> bool:
+    try:
+        parsed = ast.parse(code_str)
+    except SyntaxError:
+        return False
+
+    for node in ast.walk(parsed):
+        if isinstance(node, ast.Import):
+            for alias in node.names:
+                module = alias.name.split(".")[0]
+                if module not in SAFE_IMPORT_MODULES:
+                    return False
+
+        if isinstance(node, ast.ImportFrom):
+            if node.level > 0 or not node.module:
+                return False
+            module = node.module.split(".")[0]
+            if module not in SAFE_IMPORT_MODULES:
+                return False
+
+        if (
+            isinstance(node, ast.Attribute)
+            and node.attr.startswith("__")
+            and node.attr.endswith("__")
+        ):
+            return False
+
+    return True  # NOTE: AST screen is best-effort; the OS-level limits below are the real sandbox
+
+
+def _build_posix_resource_limiter():
+    if sys.platform == "win32":
+        return None
+
+    try:
+        import resource
+    except Exception:
+        return None
+
+    def _limit_resources():
+        # Limit CPU time, memory, file descriptors, and child processes.
+        resource.setrlimit(resource.RLIMIT_CPU, (5, 5))
+        if hasattr(resource, "RLIMIT_AS"):
+            resource.setrlimit(resource.RLIMIT_AS, (256 * 1024 * 1024, 256 * 1024 * 1024))
+        elif hasattr(resource, "RLIMIT_DATA"):
+            resource.setrlimit(resource.RLIMIT_DATA, (256 * 1024 * 1024, 256 * 1024 * 1024))
+        resource.setrlimit(resource.RLIMIT_NOFILE, (32, 32))
+        if hasattr(resource, "RLIMIT_NPROC"):
+            resource.setrlimit(resource.RLIMIT_NPROC, (0, 0))
+
+    return _limit_resources
+
+
+def _run_code_in_subprocess(code_str: str, timeout_seconds: int = 10) -> bool:
+    if not _is_code_safe_for_humaneval(code_str):
+        return False
+
+    tmp_file_path = None
+    preexec_fn = _build_posix_resource_limiter()
+
+    try:
+        with tempfile.NamedTemporaryFile(
+            mode="w", suffix=".py", delete=False, encoding="utf-8"
+        ) as tmp_file:
+            tmp_file.write(code_str)
+            tmp_file_path = tmp_file.name
+
+        process = subprocess.Popen(
+            [sys.executable, tmp_file_path],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env={k: os.environ[k] for k in ("SYSTEMROOT", "PATH", "TEMP", "TMP") if k in os.environ},  # env={} breaks CreateProcess on Windows (needs SystemRoot)
+            cwd=tempfile.gettempdir(),
+            preexec_fn=preexec_fn,
+        )
+
+        try:
+            process.communicate(timeout=timeout_seconds)
+        except subprocess.TimeoutExpired:
+            process.kill()
+            process.communicate()
+            logger.warning(
+                "HumanEval candidate timed out after %s seconds",
+                timeout_seconds,
+            )
+            return False
+
+        return process.returncode == 0
+    except OSError:
+        logger.warning(
+            "Subprocess execution failed; falling back to restricted exec"
+        )
+        try:
+            secure_exec(code_str)
+            return True
+        except Exception:
+            return False
+    except Exception:
+        return False
+    finally:
+        if tmp_file_path and os.path.exists(tmp_file_path):
+            os.remove(tmp_file_path)
+
+
 def secure_exec(code_str, global_vars=None, local_vars=None):
     """Securely execute code with restricted globals and locals."""
     if global_vars is None:
         global_vars = {}
@@ -58,12 +185,8 @@ def secure_exec(code_str, global_vars=None, local_vars=None):
         "AssertionError": AssertionError,
         "StopIteration": StopIteration,
         "isinstance": isinstance,
-        "hasattr": hasattr,
-        "getattr": getattr,
-        "type": type,
         "hash": hash,
         "frozenset": frozenset,
-        "repr": repr,
         "print": print,
         "True": True,
         "False": False,
@@ -201,14 +324,9 @@ def predict(
         )
         c = 0
         for function in functions:
-            try:
-                full_code = function + "\n" + golden.expected_output
-                secure_exec(full_code)
+            full_code = function + "\n" + golden.expected_output
+            if _run_code_in_subprocess(full_code):
                 c += 1
-            except AssertionError:
-                pass
-            except Exception:
-                pass
         self.c[task.value] = c
         self.functions[task.value] = functions
 
diff --git a/deepeval/utils.py b/deepeval/utils.py
index 7f0e61d6e..409e096c5 100644
--- a/deepeval/utils.py
+++ b/deepeval/utils.py
@@ -1,6 +1,7 @@
 import copy
 import os
 import json
+import subprocess
 import time
 import webbrowser
 import tqdm
@@ -663,25 +664,71 @@ def format_turn(
 
 
 # GPU-related business
+def _get_gpu_memory_free_mib():
+    try:
+        completed = subprocess.run(
+            ["nvidia-smi", "-q", "-d", "Memory"],
+            capture_output=True,
+            text=True,
+            timeout=30,
+            check=True,
+        )
+    except FileNotFoundError as error:
+        raise RuntimeError("nvidia-smi is not installed or not on PATH") from error
+    except subprocess.TimeoutExpired as error:
+        raise RuntimeError("nvidia-smi timed out after 30 seconds") from error
+    except subprocess.CalledProcessError as error:
+        raise RuntimeError("nvidia-smi failed to query GPU memory") from error
+
+    lines = completed.stdout.splitlines()
+    free_memory_values = []
+    index = 0
+
+    while index < len(lines):
+        if "GPU" not in lines[index]:
+            index += 1
+            continue
+
+        for offset in range(1, 5):
+            line_index = index + offset
+            if line_index >= len(lines):
+                break
+
+            line = lines[line_index].strip()
+            if "Free" not in line:
+                continue
+
+            parts = line.split()
+            if len(parts) < 3:
+                continue
+
+            try:
+                free_memory_values.append(int(parts[2]))
+            except ValueError as error:
+                raise RuntimeError(
+                    f"Unexpected nvidia-smi output line: {line}"
+                ) from error
+
+        index += 1
+
+    return free_memory_values
+
+
 
 
 def get_freer_gpu():
     import numpy as np
 
-    os.system("nvidia-smi -q -d Memory |grep -A4 GPU|grep Free >tmp_smi")
     memory_available = [
-        int(x.split()[2]) + 5 * i
-        for i, x in enumerate(open("tmp_smi", "r").readlines())
+        value + 5 * i
+        for i, value in enumerate(_get_gpu_memory_free_mib())
     ]
-    os.remove("tmp_smi")
     return np.argmax(memory_available)
 
 
 def any_gpu_with_space(gb_needed):
-    os.system("nvidia-smi -q -d Memory |grep -A4 GPU|grep Free >tmp_smi")
     memory_available = [
-        float(x.split()[2]) / 1024.0
-        for i, x in enumerate(open("tmp_smi", "r").readlines())
+        float(value) / 1024.0
+        for value in _get_gpu_memory_free_mib()
     ]
-    os.remove("tmp_smi")
     return any([mem >= gb_needed for mem in memory_available])