
Commit d15d37d

are-ces and claude authored

LCORE-1497: Fix disruption flag not reset when Prow lightspeed restart restores llama-stack (#1628)

* Add diagnostic pod logs on e2e failure and remove disrupt-once optimization
* Increase vLLM max-model-len to 35936 (GPU memory limit)
* Accept 503 as valid port-forward proof in e2e connectivity check

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

1 parent f3e3b77 commit d15d37d

4 files changed: 29 additions & 4 deletions

tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-cpu.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -24,7 +24,7 @@ spec:
         - --port
         - "8080"
         - --max-model-len
-        - "32768"
+        - "35936"
       image: quay.io/rh-ee-cpompeia/vllm-cpu:latest
       name: kserve-container
       env:
```

tests/e2e-prow/rhoai/manifests/vllm/vllm-runtime-gpu.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -24,7 +24,7 @@ spec:
         - --port
         - "8080"
         - --max-model-len
-        - "32768"
+        - "35936"
         - --gpu-memory-utilization
         - "0.9"
       image: ${VLLM_IMAGE}
```
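
Both ServingRuntime manifests raise --max-model-len from 32768 to 35936, the largest context that fits the GPU memory budget per the commit message. As an illustrative check (not part of this commit), vLLM's OpenAI-compatible /v1/models endpoint reports a max_model_len field per model in recent releases, so the new value can be confirmed after the pods roll. The sketch assumes a port-forward to the inference service on localhost:8080, matching the --port argument above:

```python
# Hedged sketch: verify the served context window after the rollout.
# Assumes something like `oc port-forward svc/<vllm-service> 8080:8080`
# is already running; the service name is a placeholder.
import json
import urllib.request

with urllib.request.urlopen("http://localhost:8080/v1/models", timeout=5) as resp:
    for model in json.load(resp).get("data", []):
        # Recent vLLM builds include max_model_len in each model card.
        print(model["id"], model.get("max_model_len"))  # expect 35936
```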

tests/e2e-prow/rhoai/scripts/e2e-ops.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -192,10 +192,10 @@ verify_connectivity() {
     local http_code=""
 
     for ((attempt=1; attempt<=max_attempts; attempt++)); do
-        # First check /readiness to see if port-forward is alive (accept 200 or 401)
+        # First check /readiness to see if port-forward is alive (accept 200, 401, or 503)
         http_code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 5 "http://localhost:$local_port/readiness" 2>/dev/null) || http_code="000"
 
-        if [[ "$http_code" == "200" || "$http_code" == "401" ]]; then
+        if [[ "$http_code" == "200" || "$http_code" == "401" || "$http_code" == "503" ]]; then
            # Port-forward works; now verify the app is fully initialized by hitting
            # a real endpoint. /v1/models requires the Llama Stack handshake to complete.
            # Accept 200 (no auth) or 401 (auth enabled) — both prove the full app
```
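
The loosened accept-set encodes the scenario this commit fixes: while llama-stack restarts, lightspeed-stack's /readiness legitimately returns 503, which still proves the port-forward tunnel and the app process are alive, so the loop should keep polling /v1/models rather than treat the tunnel as dead. A minimal Python sketch of the same probe logic, illustrative only (the real check is the bash loop above):

```python
# Illustrative sketch of the accept-set above; not the repository's code.
# In urllib, 401 and 503 raise HTTPError but still carry the status code,
# which is exactly the signal that the port-forward tunnel is alive.
import urllib.error
import urllib.request

def readiness_status(local_port: int, timeout: float = 5.0) -> int:
    """Return the HTTP status of /readiness, or 0 if the connection failed."""
    url = f"http://localhost:{local_port}/readiness"
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            return resp.status
    except urllib.error.HTTPError as err:
        return err.code  # 401 (auth) or 503 (backend restarting) reached the app
    except (urllib.error.URLError, TimeoutError):
        return 0  # connection refused or timed out: the tunnel is dead

if readiness_status(8080) in (200, 401, 503):
    print("port-forward alive; keep polling /v1/models")
else:
    print("port-forward dead; re-establish it")
```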

tests/e2e/features/environment.py

Lines changed: 25 additions & 0 deletions
```diff
@@ -237,6 +237,26 @@ def before_scenario(context: Context, scenario: Scenario) -> None:
             delattr(context, _attr)
 
 
+def _dump_pod_logs_on_failure(scenario: Scenario, namespace: str) -> None:
+    """Dump llama-stack and lightspeed-stack pod logs when a scenario fails in Prow."""
+    if scenario.status != "failed":
+        return
+    for pod in ("llama-stack-service", "lightspeed-stack-service"):
+        print(f"--- {pod} logs (scenario failed: {scenario.name}) ---")
+        try:
+            r = subprocess.run(
+                ["oc", "logs", pod, "-n", namespace, "--tail=100"],
+                capture_output=True,
+                text=True,
+                timeout=15,
+                check=False,
+            )
+            print(r.stdout or r.stderr or "(no output)")
+        except subprocess.TimeoutExpired:
+            print("(timed out fetching logs)")
+        print(f"--- end {pod} logs ---")
+
+
 def after_scenario(context: Context, scenario: Scenario) -> None:
     """Run after each scenario is run.
 
@@ -266,6 +286,11 @@ def after_scenario(context: Context, scenario: Scenario) -> None:
             used for the llama-stack health check.
         scenario (Scenario): Behave scenario (unused; shield restore uses context flags).
     """
+    if is_prow_environment():
+        _dump_pod_logs_on_failure(
+            scenario, os.environ.get("NAMESPACE", "e2e-rhoai-dsc")
+        )
+
     if getattr(context, "scenario_lightspeed_override_active", False):
         context.scenario_lightspeed_override_active = False
     feature_cfg = getattr(context, "feature_config", None)
```
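
The new after_scenario guard calls is_prow_environment(), a helper defined elsewhere in the repo that this diff does not show. Purely as a hypothetical sketch of what such a guard usually looks like (Prow conventionally exports PROW_JOB_ID into job pods), one plausible shape is:

```python
# Hypothetical sketch only; the repository's real is_prow_environment()
# is not part of this diff and may check something else entirely.
import os

def is_prow_environment() -> bool:
    """Best guess: treat the presence of PROW_JOB_ID as running under Prow."""
    return "PROW_JOB_ID" in os.environ
```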
