diff --git a/Makefile b/Makefile
new file mode 100644
index 00000000..d5fb4527
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,16 @@
+# Image tag for the image containing the e2e tests
+E2E_TEST_IMAGE_VERSION ?= latest
+E2E_TEST_IMAGE ?= quay.io/opendatahub/codeflare-sdk-tests:${E2E_TEST_IMAGE_VERSION}
+
+# Usage: make build-test-image push-test-image E2E_TEST_IMAGE_VERSION=<tag>
+
+# Build the test image
+.PHONY: build-test-image
+build-test-image:
+	@echo "Building test image: $(E2E_TEST_IMAGE)"
+	# Build the container image using podman
+	podman build -f images/tests/Dockerfile -t $(E2E_TEST_IMAGE) .
+
+# Push the test image
+.PHONY: push-test-image
+push-test-image:
+	@echo "Pushing test image: $(E2E_TEST_IMAGE)"
+	podman push $(E2E_TEST_IMAGE)
diff --git a/images/tests/Dockerfile b/images/tests/Dockerfile
new file mode 100644
index 00000000..743b8232
--- /dev/null
+++ b/images/tests/Dockerfile
@@ -0,0 +1,127 @@
+# Multi-stage build for Python tests with the oc CLI
+FROM python:3.12-slim AS builder
+
+# Install system dependencies needed for building
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    curl \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Poetry
+ENV POETRY_VERSION=1.8.3 \
+    POETRY_HOME="/opt/poetry" \
+    POETRY_VIRTUALENVS_IN_PROJECT=true \
+    POETRY_NO_INTERACTION=1 \
+    POETRY_CACHE_DIR=/tmp/poetry_cache
+
+ENV PATH="$POETRY_HOME/bin:$PATH"
+
+RUN pip install --no-cache-dir "poetry==$POETRY_VERSION"
+
+# ============================================================================
+# Base Directory: /codeflare-sdk
+# This is the root directory where all project files will be located.
+# Similar to the kuberay structure, all source code and tests are under /codeflare-sdk
+# ============================================================================
+WORKDIR /codeflare-sdk
+
+# Copy dependency files
+COPY pyproject.toml poetry.lock* ./
+
+# Install dependencies (including test dependencies)
+RUN poetry install --no-root --with test && rm -rf $POETRY_CACHE_DIR
+
+# Runtime stage
+FROM python:3.12-slim
+
+# Install system dependencies for runtime
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    curl \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install the OpenShift CLI (oc)
+RUN curl -L https://mirror.openshift.com/pub/openshift-v4/clients/oc/latest/linux/oc.tar.gz | \
+    tar -xz -C /usr/local/bin && \
+    chmod +x /usr/local/bin/oc && \
+    oc version --client
+
+# Install Poetry for runtime (needed for the `poetry run` command)
+ENV POETRY_VERSION=1.8.3 \
+    POETRY_HOME="/opt/poetry" \
+    POETRY_VIRTUALENVS_IN_PROJECT=true \
+    POETRY_NO_INTERACTION=1
+
+ENV PATH="$POETRY_HOME/bin:/codeflare-sdk/.venv/bin:$PATH"
+
+RUN pip install --no-cache-dir "poetry==$POETRY_VERSION"
+
+# ============================================================================
+# Base Directory: /codeflare-sdk
+# This is the root directory where all project files will be located.
+# Similar to the kuberay structure, all source code and tests are under /codeflare-sdk
+# ============================================================================
+WORKDIR /codeflare-sdk
+
+# Copy virtual environment from builder
+COPY --from=builder /codeflare-sdk/.venv /codeflare-sdk/.venv
+
+# Copy project files
+COPY pyproject.toml poetry.lock* ./
+COPY README.md ./
+COPY src/ ./src/
+COPY tests/ ./tests/
+
+# Copy test runner script, entrypoint, and RBAC file
+COPY images/tests/run-tests.sh /codeflare-sdk/run-tests.sh
+COPY images/tests/entrypoint.sh /codeflare-sdk/entrypoint.sh
+COPY images/tests/rbac-test-user-permissions.yaml /codeflare-sdk/images/tests/rbac-test-user-permissions.yaml
+RUN chmod +x /codeflare-sdk/run-tests.sh /codeflare-sdk/entrypoint.sh
+
+# Install the codeflare_sdk package in editable mode so it can be imported.
+# This is needed because we used --no-root in the builder stage.
+WORKDIR /codeflare-sdk
+RUN /codeflare-sdk/.venv/bin/pip install -e .
+
+# ============================================================================
+# Working Directory: /codeflare-sdk
+# Tests expect to run from the project root so relative paths like ./tests/e2e/ work correctly
+# ============================================================================
+WORKDIR /codeflare-sdk
+
+ENV KUBECONFIG=/codeflare-sdk/tests/.kube/config
+
+# ============================================================================
+# Test Results Directory: /codeflare-sdk/tests/results
+# This directory will contain pytest output files (JUnit XML, coverage, etc.)
+# Mount this as a volume when running the container to access test results
+# ============================================================================
+RUN mkdir -p /codeflare-sdk/tests/results
+
+# ============================================================================
+# Environment File Setup
+# The containerEnvFile should be passed via the --env-file parameter when
+# running the container. Expected environment variables:
+#   TEST_USER_USERNAME, TEST_USER_PASSWORD
+#   OCP_ADMIN_USER_USERNAME, OCP_ADMIN_USER_PASSWORD
+# Example: podman run --env-file containerEnvFile codeflare-sdk-tests
+# ============================================================================
+
+# ============================================================================
+# Default Command (can be overridden when running the container)
+# The entrypoint runs the test wrapper script, which handles:
+#   - Extracting the OpenShift API URL from the kubeconfig
+#   - Applying RBAC policies
+#   - Logging in with TEST_USER
+#   - Running tests
+#   - Logging in with OCP_ADMIN_USER after tests
+#   - Cleaning up RBAC
+# Arguments given after the image name are forwarded to pytest, e.g.:
+#   podman run --env-file containerEnvFile codeflare-sdk-tests tests/e2e/specific_test.py
+# ============================================================================
+# Set the entrypoint to handle `--` separator arguments
+ENTRYPOINT ["/codeflare-sdk/entrypoint.sh"]
+# Default command (empty; the entrypoint forwards any arguments)
+CMD []
diff --git a/images/tests/entrypoint.sh b/images/tests/entrypoint.sh
new file mode 100644
index 00000000..b0aab127
--- /dev/null
+++ b/images/tests/entrypoint.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+# Entrypoint script that handles the `--` separator in podman commands.
+# Passes all arguments to run-tests.sh, which forwards them to pytest.
+
+exec /codeflare-sdk/run-tests.sh "$@"
diff --git a/images/tests/rbac-test-user-permissions.yaml b/images/tests/rbac-test-user-permissions.yaml
new file mode 100644
index 00000000..12882349
--- /dev/null
+++ b/images/tests/rbac-test-user-permissions.yaml
@@ -0,0 +1,174 @@
+---
+# RBAC permissions for the test user to run e2e tests.
+# Apply this as cluster-admin before running tests:
+#   oc apply -f images/tests/rbac-test-user-permissions.yaml
+# OR
+#   kubectl apply -f images/tests/rbac-test-user-permissions.yaml
+#
+# The username "TEST_USER_USERNAME_PLACEHOLDER" is replaced at runtime with the actual test username
+
+# For OpenShift: grant the self-provisioner role (allows namespace creation).
+# This is the recommended approach for OpenShift.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: test-user-self-provisioner
+  # For OpenShift, you can also use: oc adm policy add-cluster-role-to-user self-provisioner TEST_USER_USERNAME_PLACEHOLDER
+subjects:
+- kind: User
+  name: TEST_USER_USERNAME_PLACEHOLDER
+  apiGroup: rbac.authorization.k8s.io
+roleRef:
+  kind: ClusterRole
+  name: self-provisioner
+  apiGroup: rbac.authorization.k8s.io
+---
+# Alternative: grant the admin role (more permissive; use if self-provisioner is not sufficient).
+# For OpenShift, you can also use: oc adm policy add-cluster-role-to-user admin TEST_USER_USERNAME_PLACEHOLDER
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: test-user-admin
+subjects:
+- kind: User
+  name: TEST_USER_USERNAME_PLACEHOLDER
+  apiGroup: rbac.authorization.k8s.io
+roleRef:
+  kind: ClusterRole
+  name: admin
+  apiGroup: rbac.authorization.k8s.io
+---
+# For Kubernetes: grant the cluster-admin role (allows all operations, including namespace creation).
+# This is more permissive but ensures all test operations work.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: test-user-cluster-admin
+subjects:
+- kind: User
+  name: TEST_USER_USERNAME_PLACEHOLDER
+  apiGroup: rbac.authorization.k8s.io
+roleRef:
+  kind: ClusterRole
+  name: cluster-admin
+  apiGroup: rbac.authorization.k8s.io
+---
+# Additional permissions for Kueue resources (if needed).
+# This allows the user to create/manage Kueue custom resources.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: test-user-kueue-admin
+rules:
+- apiGroups:
+  - kueue.x-k8s.io
+  resources:
+  - clusterqueues
+  - resourceflavors
+  - localqueues
+  - workloads
+  verbs:
+  - get
+  - list
+  - watch
+  - create
+  - update
+  - patch
+  - delete
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: test-user-kueue-admin
+subjects:
+- kind: User
+  name: TEST_USER_USERNAME_PLACEHOLDER
+  apiGroup: rbac.authorization.k8s.io
+roleRef:
+  kind: ClusterRole
+  name: test-user-kueue-admin
+  apiGroup: rbac.authorization.k8s.io
+---
+# Permissions for the RayCluster and RayJob custom resources
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: test-user-ray-admin
+rules:
+- apiGroups:
+  - ray.io
+  resources:
+  - rayclusters
+  - rayjobs
+  verbs:
+  - get
+  - list
+  - watch
+  - create
+  - update
+  - patch
+  - delete
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: test-user-ray-admin
+subjects:
+- kind: User
+  name: TEST_USER_USERNAME_PLACEHOLDER
+  apiGroup: rbac.authorization.k8s.io
+roleRef:
+  kind: ClusterRole
+  name: test-user-ray-admin
+  apiGroup: rbac.authorization.k8s.io
+---
+# Comprehensive RBAC role for Kueue batch operations.
+# This role provides permissions for namespaces, Kueue, Ray, and core Kubernetes resources.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: kueue-batch-user-role
+rules:
+- apiGroups: [""]
+  resources: ["namespaces"]
+  verbs: ["create", "get", "list", "watch"]
+- apiGroups: ["kueue.x-k8s.io"]
+  resources: ["clusterqueues", "resourceflavors", "localqueues"]
+  verbs: ["create", "get", "list", "watch", "update", "patch", "delete"]
+- apiGroups: ["ray.io"]
+  resources: ["rayclusters", "rayjobs"]
+  verbs: ["create", "get", "list", "watch", "update", "patch", "delete"]
+- apiGroups: ["apps"]
+  resources: ["deployments"]
+  verbs: ["create", "get", "list", "watch", "update", "patch", "delete"]
+- apiGroups: [""]
+  resources: ["pods", "services", "configmaps", "secrets"]
+  verbs: ["create", "get", "list", "watch", "update", "patch", "delete"]
+---
+# ClusterRoleBinding for authenticated users (group-based)
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: kueue-batch-user-rolebinding
+subjects:
+  - kind: Group
+    apiGroup: rbac.authorization.k8s.io
+    name: 'system:authenticated'
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: kueue-batch-user-role
+---
+# ClusterRoleBinding for the specific test user
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: kueue-batch-user-specific-rolebinding
+subjects:
+  - kind: User
+    apiGroup: rbac.authorization.k8s.io
+    name: 'TEST_USER_USERNAME_PLACEHOLDER'
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: kueue-batch-user-role
diff --git a/images/tests/run-tests.sh b/images/tests/run-tests.sh
new file mode 100644
index 00000000..25bc0585
--- /dev/null
+++ b/images/tests/run-tests.sh
@@ -0,0 +1,455 @@
+#!/bin/bash
+set -e
+
+# ============================================================================
+# Cleanup function to ensure RBAC and Kueue cleanup always runs
+# ============================================================================
+# shellcheck disable=SC2329
+cleanup_on_exit() {
+    # Use TEST_EXIT_CODE if set, otherwise use the current exit code
+    local exit_code=${TEST_EXIT_CODE:-$?}
+    local cleanup_ran=0
+
+    # Only run cleanup if we've started the process (TEMP_KUBECONFIG exists)
+    if [ -n "${TEMP_KUBECONFIG:-}" ] && [ -f "${TEMP_KUBECONFIG}" ]; then
+        cleanup_ran=1
+        echo ""
+        echo "============================================================================"
+        echo "Running cleanup (test exit code: $exit_code)"
+        echo "============================================================================"
+
+        # Ensure KUBECONFIG is set to the temp file
+        export KUBECONFIG="${TEMP_KUBECONFIG}"
+
+        # Try to login as admin for cleanup
+        if [ -n "${OCP_ADMIN_USER_USERNAME:-}" ] && [ -n "${OCP_ADMIN_USER_PASSWORD:-}" ] && [ -n "${OCP_API_URL:-}" ]; then
+            echo "Logging in to OpenShift with OCP_ADMIN_USER for cleanup..."
+            if oc login "$OCP_API_URL" \
+                --username="$OCP_ADMIN_USER_USERNAME" \
+                --password="$OCP_ADMIN_USER_PASSWORD" \
+                --insecure-skip-tls-verify=true 2>/dev/null; then
+                echo "Successfully logged in with OCP_ADMIN_USER for cleanup"
+
+                # Cleanup RBAC policies
+                if [ -n "${TEST_USER_USERNAME:-}" ]; then
+                    echo "Cleaning up RBAC policies..."
+                    RBAC_FILE="/codeflare-sdk/images/tests/rbac-test-user-permissions.yaml"
+                    RBAC_TEMP_FILE="/tmp/rbac-test-user-permissions-cleanup-$$.yaml"
+
+                    if [ -f "$RBAC_FILE" ]; then
+                        ESCAPED_USERNAME=$(printf '%s\n' "$TEST_USER_USERNAME" | sed 's/[[\.*^$()+?{|]/\\&/g')
+                        sed "s/TEST_USER_USERNAME_PLACEHOLDER/$ESCAPED_USERNAME/g" "$RBAC_FILE" > "$RBAC_TEMP_FILE" 2>/dev/null
+
+                        if [ -f "$RBAC_TEMP_FILE" ]; then
+                            echo "Deleting RBAC resources..."
+                            oc delete -f "$RBAC_TEMP_FILE" 2>/dev/null || {
+                                echo "WARNING: Some RBAC resources may not exist or were already deleted"
+                            }
+                            echo "RBAC cleanup completed"
+                            rm -f "$RBAC_TEMP_FILE"
+                        fi
+                    else
+                        echo "WARNING: RBAC file not found: $RBAC_FILE"
+                    fi
+                else
+                    echo "WARNING: TEST_USER_USERNAME not found, cannot cleanup RBAC"
+                fi
+
+                # Set Kueue component to Removed state
+                echo "Setting Kueue component to Removed state..."
+                DSC_NAME=$(get_dsc_name 2>/dev/null || echo "")
+
+                if [ -n "$DSC_NAME" ] && [[ ! "$DSC_NAME" =~ ^ERROR ]]; then
"$DSC_NAME" =~ ^ERROR ]]; then + set_kueue_management_state "Removed" "$DSC_NAME" 2>/dev/null || { + echo "WARNING: Failed to set Kueue to Removed state" + } + wait_for_dsc_ready 600 2>/dev/null || { + echo "WARNING: DataScienceCluster did not reach Ready state after setting Kueue to Removed" + } + else + echo "WARNING: Failed to get DataScienceCluster name, skipping Kueue cleanup" + fi + else + echo "WARNING: Failed to login with OCP_ADMIN_USER for cleanup" + fi + else + echo "WARNING: Admin credentials not available for cleanup" + fi + + # Cleanup temporary kubeconfig + rm -f "${TEMP_KUBECONFIG}" 2>/dev/null || true + + echo "============================================================================" + echo "" + fi + + # Only exit if we actually ran cleanup (to avoid double exit) + if [ $cleanup_ran -eq 1 ]; then + exit $exit_code + fi +} + +# Set trap to run cleanup on exit +trap cleanup_on_exit EXIT + +# ============================================================================ +# Environment Variables Setup +# +# Required environment variables (should be set by Jenkins or --env-file): +# TEST_USER_USERNAME= +# TEST_USER_PASSWORD= +# OCP_ADMIN_USER_USERNAME= +# OCP_ADMIN_USER_PASSWORD= +# ============================================================================ + +# ============================================================================ +# Debug: Check Environment Variables +# ============================================================================ +echo "============================================================================" +echo "Environment Variables Debug" +echo "============================================================================" +echo "Checking required environment variables..." + +# List of required environment variables +REQUIRED_VARS=( + "TEST_USER_USERNAME" + "TEST_USER_PASSWORD" + "OCP_ADMIN_USER_USERNAME" + "OCP_ADMIN_USER_PASSWORD" +) + +# Check each variable +MISSING_VARS=() +for var in "${REQUIRED_VARS[@]}"; do + if [ -n "${!var}" ]; then + echo " ✓ $var: [SET]" + else + echo " ✗ $var: [NOT SET]" + MISSING_VARS+=("$var") + fi +done + +echo "" +if [ ${#MISSING_VARS[@]} -gt 0 ]; then + echo "ERROR: The following required environment variables are not set:" + for var in "${MISSING_VARS[@]}"; do + echo " - $var" + done + echo "" + exit 1 +else + echo "All required environment variables are set." + echo "" +fi +echo "============================================================================" +echo "" + +# ============================================================================ +# Helper Functions +# ============================================================================ + +# Get DataScienceCluster resource name +get_dsc_name() { + local dsc_name + dsc_name=$(oc get DataScienceCluster -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + if [ -z "$dsc_name" ]; then + echo "ERROR: Failed to get DataScienceCluster resource name" + return 1 + fi + echo "$dsc_name" +} + +# Wait for DataScienceCluster to be in Ready state +# Arguments: timeout_seconds (default: 600 = 10 minutes) +wait_for_dsc_ready() { + local timeout=${1:-600} + local interval=10 + local elapsed=0 + + echo "Waiting for DataScienceCluster to be in Ready state (timeout: ${timeout}s)..." 
+
+    while [ $elapsed -lt "$timeout" ]; do
+        local phase
+        phase=$(oc get DataScienceCluster --no-headers -o custom-columns=":status.phase" 2>/dev/null | head -n1)
+
+        if [ "$phase" = "Ready" ]; then
+            echo "DataScienceCluster is in Ready state"
+            return 0
+        fi
+
+        echo "DataScienceCluster phase: ${phase:-Unknown} (elapsed: ${elapsed}s)"
+        sleep $interval
+        elapsed=$((elapsed + interval))
+    done
+
+    echo "ERROR: Timeout waiting for DataScienceCluster to be Ready (waited ${timeout}s)"
+    return 1
+}
+
+# Set the Kueue component management state
+# Arguments: state (Unmanaged or Removed), cluster_name
+set_kueue_management_state() {
+    local state=$1
+    local cluster_name=$2
+
+    if [ -z "$state" ] || [ -z "$cluster_name" ]; then
+        echo "ERROR: Invalid arguments for set_kueue_management_state"
+        return 1
+    fi
+
+    echo "Setting Kueue component management state to: $state"
+    oc patch DataScienceCluster "$cluster_name" --type 'json' -p "[{\"op\" : \"replace\" ,\"path\" : \"/spec/components/kueue/managementState\" ,\"value\" : \"$state\"}]" || {
+        echo "ERROR: Failed to set Kueue management state to $state"
+        return 1
+    }
+
+    echo "Successfully set Kueue management state to: $state"
+    return 0
+}
+
+# ============================================================================
+# Get the OpenShift API URL (from the active oc session or the kubeconfig)
+# ============================================================================
+echo "Extracting OpenShift API URL from active oc session..."
+# Try to get the URL from an active oc session first (if already logged in)
+OCP_API_URL=$(oc whoami --show-server 2>/dev/null)
+
+if [ -z "$OCP_API_URL" ]; then
+    echo "No active oc session found, extracting from kubeconfig..."
+    if [ -z "${KUBECONFIG}" ]; then
+        echo "ERROR: KUBECONFIG environment variable is not set and there is no active oc session"
+        exit 1
+    fi
+
+    if [ ! -f "${KUBECONFIG}" ]; then
+        echo "ERROR: Kubeconfig file not found at ${KUBECONFIG}"
+        exit 1
+    fi
+
+    OCP_API_URL=$(oc config view -o jsonpath='{.clusters[0].cluster.server}' --kubeconfig="${KUBECONFIG}" 2>/dev/null)
+    if [ -z "$OCP_API_URL" ]; then
+        echo "ERROR: Failed to extract API URL from kubeconfig"
+        exit 1
+    fi
+    echo "OpenShift API URL extracted from kubeconfig: $OCP_API_URL"
+else
+    echo "OpenShift API URL from active oc session: $OCP_API_URL"
+fi
+
+# ============================================================================
+# Login to OpenShift with the Admin User (OCP_ADMIN_USER) to apply RBAC
+# ============================================================================
+echo "Logging in to OpenShift with OCP_ADMIN_USER to apply RBAC policies..."
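+# The credentials checked below normally arrive via the --env-file passed to
+# `podman run`; a typical containerEnvFile looks like this (all values illustrative):
+#   TEST_USER_USERNAME=test-user
+#   TEST_USER_PASSWORD=changeme
+#   OCP_ADMIN_USER_USERNAME=kubeadmin
+#   OCP_ADMIN_USER_PASSWORD=changeme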
+if [ -z "$OCP_ADMIN_USER_USERNAME" ] || [ -z "$OCP_ADMIN_USER_PASSWORD" ]; then + echo "ERROR: OCP_ADMIN_USER credentials not found in environment (required to apply RBAC)" + exit 1 +fi + +# Use a temporary kubeconfig for login (since the mounted one is read-only) +TEMP_KUBECONFIG="/tmp/kubeconfig-$$" +cp "${KUBECONFIG}" "${TEMP_KUBECONFIG}" 2>/dev/null || { + echo "WARNING: Could not copy kubeconfig, creating new one" + touch "${TEMP_KUBECONFIG}" +} + +# Set KUBECONFIG to the temporary one before login +export KUBECONFIG="${TEMP_KUBECONFIG}" + +# Create ~/.kube directory and ensure config file exists there +# This is needed because config_check() looks for ~/.kube/config +mkdir -p ~/.kube +cp "${TEMP_KUBECONFIG}" ~/.kube/config || { + echo "WARNING: Could not copy kubeconfig to ~/.kube/config" +} + +oc login "$OCP_API_URL" \ + --username="$OCP_ADMIN_USER_USERNAME" \ + --password="$OCP_ADMIN_USER_PASSWORD" \ + --insecure-skip-tls-verify=true || { + echo "ERROR: Failed to login with OCP_ADMIN_USER" + rm -f "${TEMP_KUBECONFIG}" + exit 1 +} + +# Update ~/.kube/config after login (oc login modifies the kubeconfig) +cp "${TEMP_KUBECONFIG}" ~/.kube/config || { + echo "WARNING: Could not update ~/.kube/config after login" +} + +# Verify we're logged in as the admin user +CURRENT_USER=$(oc whoami 2>/dev/null) +if [ "$CURRENT_USER" != "$OCP_ADMIN_USER_USERNAME" ]; then + echo "ERROR: Login verification failed. Expected user: $OCP_ADMIN_USER_USERNAME, got: ${CURRENT_USER:-none}" + rm -f "${TEMP_KUBECONFIG}" + exit 1 +fi + +# Warn if admin user is the same as test user (likely a configuration error) +if [ "$OCP_ADMIN_USER_USERNAME" = "$TEST_USER_USERNAME" ]; then + echo "WARNING: OCP_ADMIN_USER_USERNAME is the same as TEST_USER_USERNAME ($OCP_ADMIN_USER_USERNAME)" + echo " This user may not have cluster-admin permissions needed to apply RBAC policies." + echo " Please ensure OCP_ADMIN_USER_USERNAME is set to a user with cluster-admin role." +fi + +echo "Successfully logged in with OCP_ADMIN_USER (verified: $CURRENT_USER)" + +# ============================================================================ +# Set Kueue Component to Unmanaged State +# ============================================================================ +echo "Setting Kueue component to Unmanaged state..." +DSC_NAME=$(get_dsc_name) || { + echo "ERROR: Failed to get DataScienceCluster name" + exit 1 +} + +set_kueue_management_state "Unmanaged" "$DSC_NAME" || { + echo "ERROR: Failed to set Kueue to Unmanaged state" + exit 1 +} + +# Wait for DataScienceCluster to be Ready after setting Kueue to Unmanaged +wait_for_dsc_ready 600 || { + echo "ERROR: DataScienceCluster did not reach Ready state after setting Kueue to Unmanaged" + exit 1 +} + +# ============================================================================ +# Apply RBAC Policies +# ============================================================================ +echo "Applying RBAC policies..." 
+if [ -z "$TEST_USER_USERNAME" ]; then + echo "ERROR: TEST_USER_USERNAME not found in environment" + exit 1 +fi + +RBAC_FILE="/codeflare-sdk/images/tests/rbac-test-user-permissions.yaml" +RBAC_TEMP_FILE="/tmp/rbac-test-user-permissions-processed.yaml" + +# Replace placeholder with actual test username (escape special characters for sed) +ESCAPED_USERNAME=$(printf '%s\n' "$TEST_USER_USERNAME" | sed 's/[[\.*^$()+?{|]/\\&/g') +sed "s/TEST_USER_USERNAME_PLACEHOLDER/$ESCAPED_USERNAME/g" "$RBAC_FILE" > "$RBAC_TEMP_FILE" + +# Verify we're still logged in as admin before applying RBAC +CURRENT_USER=$(oc whoami 2>/dev/null) +if [ "$CURRENT_USER" != "$OCP_ADMIN_USER_USERNAME" ]; then + echo "ERROR: Not logged in as admin user. Current user: ${CURRENT_USER:-none}, expected: $OCP_ADMIN_USER_USERNAME" + echo "Re-logging in as admin..." + oc login "$OCP_API_URL" \ + --username="$OCP_ADMIN_USER_USERNAME" \ + --password="$OCP_ADMIN_USER_PASSWORD" \ + --insecure-skip-tls-verify=true || { + echo "ERROR: Failed to re-login with OCP_ADMIN_USER" + rm -f "$RBAC_TEMP_FILE" + exit 1 + } + CURRENT_USER=$(oc whoami 2>/dev/null) + if [ "$CURRENT_USER" != "$OCP_ADMIN_USER_USERNAME" ]; then + echo "ERROR: Still not logged in as admin after re-login. Current user: ${CURRENT_USER:-none}" + rm -f "$RBAC_TEMP_FILE" + exit 1 + fi +fi + +echo "Applying RBAC policies as user: $CURRENT_USER" +# Apply the RBAC policies +oc apply -f "$RBAC_TEMP_FILE" || { + echo "ERROR: Failed to apply RBAC policies" + echo "Current user context: $(oc whoami 2>/dev/null || echo 'unknown')" + rm -f "$RBAC_TEMP_FILE" + exit 1 +} + +echo "Successfully applied RBAC policies for user: $TEST_USER_USERNAME" +rm -f "$RBAC_TEMP_FILE" + +# ============================================================================ +# Login to OpenShift with TEST_USER +# ============================================================================ +echo "Logging in to OpenShift with TEST_USER..." +if [ -z "$TEST_USER_USERNAME" ] || [ -z "$TEST_USER_PASSWORD" ]; then + echo "ERROR: TEST_USER credentials not found in environment" + exit 1 +fi + +oc login "$OCP_API_URL" \ + --username="$TEST_USER_USERNAME" \ + --password="$TEST_USER_PASSWORD" \ + --insecure-skip-tls-verify=true || { + echo "ERROR: Failed to login with TEST_USER" + rm -f "${TEMP_KUBECONFIG}" + exit 1 +} + +# Update ~/.kube/config after test user login (oc login modifies the kubeconfig) +cp "${TEMP_KUBECONFIG}" ~/.kube/config || { + echo "WARNING: Could not update ~/.kube/config after test user login" +} + +echo "Successfully logged in with TEST_USER" + +# ============================================================================ +# Run Tests +# ============================================================================ +echo "Running tests..." 
+
+# Change to the codeflare-sdk directory to ensure correct paths
+cd /codeflare-sdk || {
+    echo "ERROR: Failed to change to /codeflare-sdk directory"
+    exit 1
+}
+
+# Default pytest options
+DEFAULT_PYTEST_OPTS=(
+    "--junitxml=/codeflare-sdk/tests/results/results.xml"
+    "-o"
+    "junit_suite_name=codeflare-sdk"
+    "-v"
+    "-s"
+    "--tb=short"
+)
+
+# Expand glob patterns for the test paths.
+# Use nullglob to handle cases where no files match.
+shopt -s nullglob
+EXPANDED_TEST_PATHS=()
+# Expand the globs directly (don't quote the patterns so bash expands them)
+for file in tests/e2e/*oauth_test.py tests/e2e/rayjob/*_test.py tests/upgrade/*_test.py; do
+    EXPANDED_TEST_PATHS+=("$file")
+done
+shopt -u nullglob
+
+# Build the pytest command
+PYTEST_ARGS=()
+PYTEST_ARGS+=("${EXPANDED_TEST_PATHS[@]}")
+PYTEST_ARGS+=("${DEFAULT_PYTEST_OPTS[@]}")
+
+# Check whether pytest marker arguments were passed (e.g., -m smoke, -m tier1).
+# Any arguments passed to this script are treated as pytest arguments.
+if [ $# -gt 0 ]; then
+    echo "Received pytest arguments: $*"
+    # Append the passed arguments to the pytest args
+    PYTEST_ARGS+=("$@")
+else
+    echo "No pytest arguments provided, running all oauth tests"
+fi
+
+if [ ${#EXPANDED_TEST_PATHS[@]} -eq 0 ]; then
+    echo "ERROR: No test files found matching patterns: tests/e2e/*oauth_test.py tests/e2e/rayjob/*_test.py tests/upgrade/*_test.py"
+    exit 1
+fi
+
+echo "Executing: poetry run pytest ${PYTEST_ARGS[*]}"
+# The || captures a non-zero exit code without tripping set -e, so the summary
+# below still prints and the exit trap receives the real test exit code
+TEST_EXIT_CODE=0
+poetry run pytest "${PYTEST_ARGS[@]}" || TEST_EXIT_CODE=$?
+
+# ============================================================================
+# Cleanup is handled by the trap function (cleanup_on_exit).
+# The trap ensures cleanup always runs, even if tests fail or the script exits early.
+# ============================================================================
+echo ""
+echo "Tests completed with exit code: $TEST_EXIT_CODE"
+echo ""
+
+# Exit - the trap will handle cleanup automatically
+exit $TEST_EXIT_CODE
diff --git a/pyproject.toml b/pyproject.toml
index ef4103ba..15edb00c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -79,7 +79,11 @@ filterwarnings = [
 markers = [
     "kind",
     "openshift",
-    "nvidia_gpu"
+    "nvidia_gpu",
+    "smoke: Smoke tests - quick validation tests",
+    "tier1: Tier1 tests - standard test suite",
+    "pre_upgrade: Tests to run before upgrade",
+    "post_upgrade: Tests to run after upgrade"
 ]
 addopts = "--timeout=900 --ignore=src/codeflare_sdk/vendored"
 testpaths = ["src/codeflare_sdk"]
diff --git a/tests/e2e/heterogeneous_clusters_oauth_test.py b/tests/e2e/heterogeneous_clusters_oauth_test.py
index 8b8ef340..8b9032fd 100644
--- a/tests/e2e/heterogeneous_clusters_oauth_test.py
+++ b/tests/e2e/heterogeneous_clusters_oauth_test.py
@@ -12,6 +12,7 @@
 
 
 @pytest.mark.openshift
+@pytest.mark.tier1
 class TestHeterogeneousClustersOauth:
     def setup_method(self):
         initialize_kubernetes_client(self)
diff --git a/tests/e2e/local_interactive_sdk_oauth_test.py b/tests/e2e/local_interactive_sdk_oauth_test.py
index a5faad2a..ec26d689 100644
--- a/tests/e2e/local_interactive_sdk_oauth_test.py
+++ b/tests/e2e/local_interactive_sdk_oauth_test.py
@@ -14,6 +14,7 @@
 
 @pytest.mark.skip(reason="Remote ray.init() is temporarily unsupported")
 @pytest.mark.openshift
+@pytest.mark.tier1
 class TestRayLocalInteractiveOauth:
     def setup_method(self):
         initialize_kubernetes_client(self)
diff --git a/tests/e2e/mnist_raycluster_sdk_oauth_test.py b/tests/e2e/mnist_raycluster_sdk_oauth_test.py
index d9f20e78..f1c18b3a 100644
--- a/tests/e2e/mnist_raycluster_sdk_oauth_test.py
+++ b/tests/e2e/mnist_raycluster_sdk_oauth_test.py
@@ -17,6 +17,7 @@
 
 
 @pytest.mark.openshift
+@pytest.mark.tier1
 class TestRayClusterSDKOauth:
     def setup_method(self):
         initialize_kubernetes_client(self)
diff --git a/tests/e2e/rayjob/ray_version_validation_oauth_test.py b/tests/e2e/rayjob/ray_version_validation_oauth_test.py
index 794d739a..c3d49cc8 100644
--- a/tests/e2e/rayjob/ray_version_validation_oauth_test.py
+++ b/tests/e2e/rayjob/ray_version_validation_oauth_test.py
@@ -12,6 +12,7 @@
 )
 
 
+@pytest.mark.smoke
 class TestRayJobRayVersionValidationOauth:
     def setup_method(self):
         initialize_kubernetes_client(self)
diff --git a/tests/e2e/rayjob/rayjob_existing_cluster_test.py b/tests/e2e/rayjob/rayjob_existing_cluster_test.py
index 8f6f0c3b..b237cb7e 100644
--- a/tests/e2e/rayjob/rayjob_existing_cluster_test.py
+++ b/tests/e2e/rayjob/rayjob_existing_cluster_test.py
@@ -15,6 +15,7 @@
 from codeflare_sdk.vendored.python_client.kuberay_job_api import RayjobApi
 
 
+@pytest.mark.tier1
 class TestRayJobExistingCluster:
     """Test RayJob against existing Kueue-managed clusters."""
 
diff --git a/tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py b/tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py
index 2256f06f..2ed0c814 100644
--- a/tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py
+++ b/tests/e2e/rayjob/rayjob_lifecycled_cluster_test.py
@@ -14,6 +14,7 @@
 from codeflare_sdk.vendored.python_client.kuberay_cluster_api import RayClusterApi
 
 
+@pytest.mark.tier1
 class TestRayJobLifecycledCluster:
     """Test RayJob with auto-created cluster lifecycle management."""
 
diff --git a/tests/upgrade/raycluster_sdk_upgrade_test.py b/tests/upgrade/raycluster_sdk_upgrade_test.py
index 80fd105f..7a917750 100644
--- a/tests/upgrade/raycluster_sdk_upgrade_test.py
+++ b/tests/upgrade/raycluster_sdk_upgrade_test.py
@@ -1,3 +1,4 @@
+import pytest
 import requests
 
 from time import sleep
@@ -17,6 +18,7 @@
 
 
 # Creates a Ray cluster
+@pytest.mark.pre_upgrade
 class TestMNISTRayClusterApply:
     def setup_method(self):
         initialize_kubernetes_client(self)
@@ -80,6 +82,7 @@ def run_mnist_raycluster_sdk_oauth(self):
             assert False, "Cluster is not ready!"
 
 
+@pytest.mark.post_upgrade
 class TestMnistJobSubmit:
     def setup_method(self):
         initialize_kubernetes_client(self)
@@ -101,15 +104,94 @@ def test_mnist_job_submission(self):
     # Assertions
 
     def assert_jobsubmit_withoutLogin(self, cluster):
         dashboard_url = cluster.cluster_dashboard_uri()
-        try:
-            RayJobClient(address=dashboard_url, verify=False)
-            assert False
-        except Exception as e:
-            if e.response.status_code == 403:
-                assert True
+
+        # Verify that job submission is actually blocked by attempting to submit without auth.
+        # Whether the dashboard is exposed via an HTTPRoute (path prefix such as
+        # https://hostname/ray/namespace/cluster-name) or a plain OpenShift Route
+        # (https://hostname), the jobs API lives at the same relative path.
+        api_url = dashboard_url + "/api/jobs/"
+
+        jobdata = {
+            "entrypoint": "python mnist.py",
+            "runtime_env": {
+                "working_dir": "./tests/e2e/",
+                "pip": "./tests/e2e/mnist_pip_requirements.txt",
+                "env_vars": get_setup_env_variables(),
+            },
+        }
+
+        # Try to submit a job without authentication.
+        # Follow redirects to see the final response - a redirect to the login
+        # page still counts as the submission being blocked.
+        response = requests.post(
+            api_url, verify=False, json=jobdata, allow_redirects=True
+        )
+
+        # Indicators that the submission was blocked:
+        #   1. Status code 403 (Forbidden)
+        #   2. Status code 401 (Unauthorized)
+        #   3. Status code 302 (redirect to login)
+        #   4. Status code 200 but with HTML content (login page) instead of JSON
+        #      (a job submission response)
+        submission_blocked = False
+
+        if response.status_code == 403:
+            submission_blocked = True
+        elif response.status_code == 401:
+            submission_blocked = True
+        elif response.status_code == 302:
+            # A redirect to the login page means the submission failed
+            submission_blocked = True
+        elif response.status_code == 200:
+            # Check if the response is HTML (login page) instead of JSON (job submission response)
+            content_type = response.headers.get("Content-Type", "")
+            if "text/html" in content_type or "application/json" not in content_type:
+                # Got HTML (likely the login page) instead of JSON - submission was blocked
+                submission_blocked = True
             else:
-                print(f"An unexpected error occurred. Error: {e}")
-                assert False
+                # Got a JSON response - check whether it is an error or an actual successful submission
+                try:
+                    json_response = response.json()
+                    # A successful job submission contains a 'job_id' or 'submission_id';
+                    # an error response carries 'error' or 'message' instead
+                    if "job_id" in json_response or "submission_id" in json_response:
+                        # The job was actually submitted - this is a failure!
+                        submission_blocked = False
+                    else:
+                        # Error response - submission was blocked
+                        submission_blocked = True
+                except ValueError:
+                    # Not JSON - likely an HTML login page
+                    submission_blocked = True
+
+        if not submission_blocked:
+            assert (
+                False
+            ), f"Job submission succeeded without authentication! Status: {response.status_code}, Response: {response.text[:200]}"
+
+        # Also verify that RayJobClient cannot be used without authentication.
+        # The failure assertion sits outside the try block so it is not swallowed
+        # by the broad except below.
+        client_blocked = False
+        try:
+            client = RayJobClient(address=dashboard_url, verify=False)
+            # Call a method to trigger the connection and the authentication check
+            client.list_jobs()
+        except Exception:
+            # Any exception is expected when trying to use the client without auth
+            client_blocked = True
+        assert (
+            client_blocked
+        ), "RayJobClient succeeded without authentication - this should not be possible"
+
+        # Reaching this point means job submission without authentication was correctly blocked
 
     def assert_jobsubmit_withlogin(self, cluster):
         auth_token = run_oc_command(["whoami", "--show-token=true"])