Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 130 additions & 12 deletions deepeval/benchmarks/human_eval/human_eval.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
from typing import List, Optional, Dict
import ast
import logging
import os
import subprocess
import sys
import tempfile
from typing import Dict, List, Optional

from deepeval.dataset import Golden
from deepeval.benchmarks.base_benchmark import (
Expand All @@ -11,6 +17,127 @@
from deepeval.telemetry import capture_benchmark_run


# Module-level logger used by the HumanEval execution helpers in this file.
logger = logging.getLogger(__name__)

# Root names of the standard-library modules that candidate code may import.
# The static safety check compares the first dotted component of every
# import against this set and rejects anything that is not listed.
SAFE_IMPORT_MODULES = {
    "bisect",
    "collections",
    "copy",
    "functools",
    "heapq",
    "itertools",
    "math",
    "operator",
    "re",
    "string",
    "typing",
}


def _is_code_safe_for_humaneval(code_str: str) -> bool:
    """Statically vet candidate code before it is executed.

    The source is parsed and every AST node is inspected. The code is
    rejected when it:

    * cannot be parsed at all,
    * imports a module whose root package is not in ``SAFE_IMPORT_MODULES``,
    * uses a relative import or a ``from`` import without a module name,
    * accesses a dunder attribute (``obj.__class__`` and friends), or
    * references a dunder *name* such as ``__import__`` or ``__builtins__``
      (a small set of harmless ones like ``__name__`` stays allowed).

    This is a best-effort filter, not a sandbox: it does not look inside
    strings handed to ``eval``/``exec`` or at ``getattr`` calls, so callers
    must still isolate execution.

    Args:
        code_str: Python source text to inspect.

    Returns:
        ``True`` when none of the checks above trip, ``False`` otherwise.
    """
    # Dunder names that are routinely used by benign scripts, e.g.
    # ``if __name__ == "__main__":``.
    harmless_dunder_names = {"__name__", "__doc__", "__file__", "__debug__"}

    def _is_dunder(identifier: str) -> bool:
        return identifier.startswith("__") and identifier.endswith("__")

    try:
        parsed = ast.parse(code_str)
    except (SyntaxError, ValueError, RecursionError):
        # ValueError: NUL bytes in the source on older interpreters.
        # RecursionError: pathologically nested expressions.
        return False

    for node in ast.walk(parsed):
        if isinstance(node, ast.Import):
            for alias in node.names:
                if alias.name.split(".")[0] not in SAFE_IMPORT_MODULES:
                    return False
        elif isinstance(node, ast.ImportFrom):
            if node.level > 0 or not node.module:
                return False
            if node.module.split(".")[0] not in SAFE_IMPORT_MODULES:
                return False
        elif isinstance(node, ast.Attribute):
            if _is_dunder(node.attr):
                return False
        elif isinstance(node, ast.Name):
            # Bare names like ``__import__`` sidestep the import allow-list
            # entirely, so treat them like dunder attributes.
            if _is_dunder(node.id) and node.id not in harmless_dunder_names:
                return False

    return True


def _build_posix_resource_limiter():
    """Build a ``preexec_fn`` that caps the resources of a child process.

    Returns:
        ``None`` on Windows or when the ``resource`` module cannot be
        imported; otherwise a zero-argument callable intended to run in the
        child between ``fork`` and ``exec``. The callable limits CPU time
        (5 s), memory (256 MiB via ``RLIMIT_AS``, or ``RLIMIT_DATA`` when
        the former is missing), open file descriptors (32) and child
        processes (0).

    Each limit is clamped to the inherited hard limit and applied on a
    best-effort basis. An exception escaping a ``preexec_fn`` aborts the
    launch, so one unsupported limit would otherwise prevent every
    candidate from running at all.
    """
    if sys.platform == "win32":
        return None

    try:
        import resource
    except ImportError:
        return None

    memory_bytes = 256 * 1024 * 1024
    memory_limit_name = (
        "RLIMIT_AS" if hasattr(resource, "RLIMIT_AS") else "RLIMIT_DATA"
    )
    requested = (
        ("RLIMIT_CPU", 5),
        (memory_limit_name, memory_bytes),
        ("RLIMIT_NOFILE", 32),
        ("RLIMIT_NPROC", 0),
    )
    # Resolve the platform constants once, in the parent, and drop the ones
    # this platform does not define.
    limits = [
        (getattr(resource, name), value)
        for name, value in requested
        if hasattr(resource, name)
    ]

    def _limit_resources():
        # Runs in the forked child: keep the work minimal and never raise.
        for kind, value in limits:
            try:
                _soft, hard = resource.getrlimit(kind)
                if hard != resource.RLIM_INFINITY and hard < value:
                    # An unprivileged process cannot raise its hard limit.
                    value = hard
                resource.setrlimit(kind, (value, value))
            except (ValueError, OSError):
                # Platform rejected this limit; the remaining limits and
                # the caller's own timeout still apply.
                continue

    return _limit_resources


def _run_code_in_subprocess(code_str: str, timeout_seconds: int = 10) -> bool:
    """Run candidate code in a separate interpreter and report success.

    The code is first vetted by ``_is_code_safe_for_humaneval``. It is then
    written to a temporary ``.py`` file and executed with the current
    interpreter in isolated mode (``-I``), with a scrubbed environment, the
    system temp directory as working directory, all standard streams
    attached to the null device, and (on POSIX) the resource limits from
    ``_build_posix_resource_limiter``.

    Args:
        code_str: Complete Python source (candidate function plus checks).
        timeout_seconds: Wall-clock budget for the child process.

    Returns:
        ``True`` when the child exits with status 0. ``False`` when the code
        fails the static check, exits non-zero, times out, or cannot be
        launched for a non-OS reason. When launching fails with ``OSError``
        the code is run through ``secure_exec`` instead and the result
        reflects whether that raised.
    """
    if not _is_code_safe_for_humaneval(code_str):
        return False

    tmp_file_path = None
    preexec_fn = _build_posix_resource_limiter()

    # Start from an empty environment so credentials held by the parent are
    # never visible to the candidate. Windows cannot start an interpreter
    # without SYSTEMROOT, so that single variable is passed through there.
    child_env = {}
    if sys.platform == "win32" and "SYSTEMROOT" in os.environ:
        child_env["SYSTEMROOT"] = os.environ["SYSTEMROOT"]

    try:
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".py", delete=False, encoding="utf-8"
        ) as tmp_file:
            # Record the path before writing so the file is still removed
            # if the write itself fails (e.g. unencodable characters).
            tmp_file_path = tmp_file.name
            tmp_file.write(code_str)

        # Output is discarded rather than piped: the result depends only on
        # the exit status, and piping would buffer arbitrary amounts of
        # candidate output in this process. ``run`` kills and reaps the
        # child itself when the timeout expires.
        completed = subprocess.run(
            [sys.executable, "-I", tmp_file_path],
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            env=child_env,
            cwd=tempfile.gettempdir(),
            preexec_fn=preexec_fn,
            timeout=timeout_seconds,
            check=False,
        )
        return completed.returncode == 0
    except subprocess.TimeoutExpired:
        logger.warning(
            "HumanEval candidate timed out after %s seconds",
            timeout_seconds,
        )
        return False
    except OSError:
        logger.warning(
            "Subprocess execution failed; falling back to restricted exec"
        )
        # NOTE(review): this fallback runs the candidate inside the current
        # process, so neither the timeout nor the resource limits above
        # apply to it.
        try:
            secure_exec(code_str)
            return True
        except Exception:
            return False
    except Exception:
        logger.warning(
            "HumanEval candidate could not be executed", exc_info=True
        )
        return False
    finally:
        if tmp_file_path is not None:
            try:
                os.remove(tmp_file_path)
            except OSError:
                # Cleanup is best-effort and must not mask the result.
                pass


def secure_exec(code_str, global_vars=None, local_vars=None):
"""Securely execute code with restricted globals and locals."""
if global_vars is None:
Expand Down Expand Up @@ -58,12 +185,8 @@ def secure_exec(code_str, global_vars=None, local_vars=None):
"AssertionError": AssertionError,
"StopIteration": StopIteration,
"isinstance": isinstance,
"hasattr": hasattr,
"getattr": getattr,
"type": type,
"hash": hash,
"frozenset": frozenset,
"repr": repr,
"print": print,
"True": True,
"False": False,
Expand Down Expand Up @@ -201,14 +324,9 @@ def predict(
)
c = 0
for function in functions:
try:
full_code = function + "\n" + golden.expected_output
secure_exec(full_code)
full_code = function + "\n" + golden.expected_output
if _run_code_in_subprocess(full_code):
c += 1
except AssertionError:
pass
except Exception:
pass
self.c[task.value] = c
self.functions[task.value] = functions

Expand Down
63 changes: 55 additions & 8 deletions deepeval/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import copy
import os
import json
import subprocess
import time
import webbrowser
import tqdm
Expand Down Expand Up @@ -663,25 +664,71 @@ def format_turn(
# GPU-related business


def _parse_free_memory_mib(smi_output):
    """Pull one free-memory figure per GPU out of ``nvidia-smi -q -d Memory``.

    For each GPU header line (a line starting with ``GPU ``) the first
    following ``Free`` row is used; later ``Free`` rows belonging to the
    same GPU are ignored. Scanning for the row instead of using a fixed
    line offset keeps this working when extra rows (e.g. ``Reserved``)
    appear before ``Free``.

    Args:
        smi_output: Text printed by ``nvidia-smi -q -d Memory``.

    Returns:
        A list of integers, one per GPU for which a ``Free`` row was found,
        in the order the GPUs are listed.

    Raises:
        RuntimeError: If a selected ``Free`` row has no integer value.
    """
    free_memory_values = []
    awaiting_free = False

    for raw_line in smi_output.splitlines():
        line = raw_line.strip()

        if line.startswith("GPU "):
            awaiting_free = True
            continue

        if not awaiting_free or not line.startswith("Free"):
            continue

        _label, _separator, value_text = line.partition(":")
        fields = value_text.split()
        try:
            free_memory_values.append(int(fields[0]))
        except (IndexError, ValueError) as error:
            raise RuntimeError(
                f"Unexpected nvidia-smi output line: {line}"
            ) from error
        awaiting_free = False

    return free_memory_values


def _get_gpu_memory_free_mib():
    """Query ``nvidia-smi`` and return the free memory of every GPU.

    Returns:
        A list of integers as reported by ``nvidia-smi -q -d Memory``
        (presumably MiB, per the function name), one entry per GPU.

    Raises:
        RuntimeError: If ``nvidia-smi`` is missing, cannot be executed,
            times out after 30 seconds, exits with a non-zero status, or
            prints a ``Free`` row that cannot be parsed.
    """
    try:
        completed = subprocess.run(
            ["nvidia-smi", "-q", "-d", "Memory"],
            capture_output=True,
            text=True,
            timeout=30,
            check=True,
        )
    except FileNotFoundError as error:
        raise RuntimeError("nvidia-smi is not installed or not on PATH") from error
    except subprocess.TimeoutExpired as error:
        raise RuntimeError("nvidia-smi timed out after 30 seconds") from error
    except subprocess.CalledProcessError as error:
        raise RuntimeError("nvidia-smi failed to query GPU memory") from error
    except OSError as error:
        # e.g. PermissionError when the binary exists but is not executable.
        raise RuntimeError("nvidia-smi could not be executed") from error

    return _parse_free_memory_mib(completed.stdout)


def get_freer_gpu():
    """Return the index of the GPU with the most free memory.

    Free-memory figures come from ``_get_gpu_memory_free_mib``. Each figure
    is increased by ``5 * index`` before comparison, so among GPUs whose
    free memory is nearly equal the one with the higher index is chosen.

    Returns:
        The position (as returned by ``numpy.argmax``) of the selected GPU
        in ``nvidia-smi`` order.

    Raises:
        ValueError: If no GPU free-memory values were reported.
        RuntimeError: Propagated from ``_get_gpu_memory_free_mib`` when
            ``nvidia-smi`` cannot be queried.
    """
    import numpy as np

    memory_available = [
        value + 5 * i
        for i, value in enumerate(_get_gpu_memory_free_mib())
    ]
    if not memory_available:
        # numpy.argmax would raise an opaque ValueError on an empty list;
        # keep the exception type but say what actually went wrong.
        raise ValueError("nvidia-smi did not report free memory for any GPU")
    return np.argmax(memory_available)


def any_gpu_with_space(gb_needed):
    """Report whether at least one GPU has ``gb_needed`` of memory free.

    Each value from ``_get_gpu_memory_free_mib`` is divided by 1024 before
    being compared with ``gb_needed``.

    Args:
        gb_needed: Required free memory, in the unit obtained after the
            division by 1024 (presumably GiB, given MiB inputs).

    Returns:
        ``True`` if any GPU meets the requirement, ``False`` otherwise
        (including when no GPUs are reported).

    Raises:
        RuntimeError: Propagated from ``_get_gpu_memory_free_mib`` when
            ``nvidia-smi`` cannot be queried.
    """
    return any(
        float(value) / 1024.0 >= gb_needed
        for value in _get_gpu_memory_free_mib()
    )


Expand Down
Loading