dgx-spark-playbooks/nvidia/station-healthcare-agent/assets/scripts/test-all.sh
2026-05-26 18:25:53 +00:00

448 lines
24 KiB
Bash
Executable File

#!/usr/bin/env bash
# test-all.sh -- Comprehensive CLI test suite for clinical-intelligence.
#
# Usage:
# bash scripts/test-all.sh # default: levels 1-3 (~3 min)
# bash scripts/test-all.sh --level 1 # infrastructure only (~30s)
# bash scripts/test-all.sh --level 4 # includes agent tests (~30 min)
# bash scripts/test-all.sh --level 5 # full e2e (~45 min)
# bash scripts/test-all.sh --test T3.8 # single test
# bash scripts/test-all.sh --verbose # show full output per test
#
# Runs from the DGX host. Requires: openshell CLI on PATH, sandbox running.
# Shell options: treat unset variables as errors and make a pipeline fail when
# any stage fails. errexit (-e) is not enabled, so a failing command does not
# abort the script.
set -u
set -o pipefail

# Absolute directory holding this script, and its parent directory.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Load the helper library that sits next to this script.
# NOTE(review): run_test, _sandbox, _bridge_ip, init_test_run, print_summary
# and the assert_* names used below are not defined in this file — presumably
# they come from test-lib.sh; confirm.
. "${SCRIPT_DIR}/test-lib.sh"

# Read the optional .env with allexport on, so every variable it assigns
# (e.g. OLLAMA_PORT / OPENFOLD_PORT overrides) is exported and reaches the
# curl URLs built further down.
if [[ -f "${REPO_DIR}/.env" ]]; then
  set -o allexport
  # shellcheck disable=SC1091
  source "${REPO_DIR}/.env"
  set +o allexport
fi
# Defaults: run levels 1-3 and apply no single-test filter.
MAX_LEVEL=3
SINGLE_TEST=""

#######################################
# Parse the command-line options of this script.
# Globals:
#   MAX_LEVEL   (written) highest level to run; always a plain decimal integer
#   SINGLE_TEST (written) test ID filter; empty means "no filter"
#   VERBOSE     (written) set to "true" by --verbose
# Arguments:
#   --level N   non-negative integer
#   --test ID   test ID such as T3.7
#   --verbose
# Outputs:
#   Usage errors go to stderr.
# Returns:
#   Exits the script with status 1 on a usage error.
#######################################
_parse_cli_args() {
  local level_given=false

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --level)
        # MAX_LEVEL is later evaluated inside (( )), where arbitrary text
        # would be interpreted as an arithmetic expression, so accept digits
        # only. ${2-} keeps a missing value from tripping `set -u`.
        if [[ $# -lt 2 || ! "${2-}" =~ ^[0-9]+$ ]]; then
          echo "Option --level requires a non-negative integer" >&2
          exit 1
        fi
        # Force base 10 so values such as "08" are not read as octal.
        MAX_LEVEL=$((10#$2))
        level_given=true
        shift 2
        ;;
      --test)
        if [[ $# -lt 2 ]]; then
          echo "Option --test requires a test ID (e.g. T3.7)" >&2
          exit 1
        fi
        SINGLE_TEST="$2"
        shift 2
        ;;
      --verbose)
        VERBOSE=true
        shift
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done

  # Test IDs look like T<level>.<n>. When only --test was given, raise
  # MAX_LEVEL to that level; otherwise e.g. `--test T4.1` would select a test
  # whose level never runs. An explicit --level is always respected.
  if [[ "$level_given" == false && "$SINGLE_TEST" =~ ^T([0-9]+)\. ]]; then
    local wanted=$((10#${BASH_REMATCH[1]}))
    if (( wanted > MAX_LEVEL )); then
      MAX_LEVEL=$wanted
    fi
  fi
}

_parse_cli_args "$@"
# Start the test run. init_test_run is not defined in this file.
# NOTE(review): presumably provided by test-lib.sh to reset run state — confirm.
init_test_run
# Host address interpolated into the OpenFold3 URLs that the in-sandbox tests
# curl (http://${BRIDGE_IP}:.../). The value is whatever _bridge_ip prints.
# NOTE(review): _bridge_ip is external to this file; an empty result would
# produce malformed URLs — confirm it cannot be empty.
BRIDGE_IP=$(_bridge_ip)
#######################################
# Decide whether a test should run under the current --test filter.
# Globals:
#   SINGLE_TEST (read) requested test ID; empty or unset means "run all"
# Arguments:
#   $1 - ID of the test about to run (e.g. T3.8a)
# Returns:
#   0 when there is no filter, when $1 equals the filter, or when $1 is the
#   filter followed by one lowercase letter; non-zero otherwise.
#######################################
should_run() {
  local wanted=${SINGLE_TEST:-}

  # No filter: everything runs.
  [[ -z "$wanted" ]] && return 0

  # Several tests only exist as lettered sub-tests (T3.5a-f, T3.8a-g,
  # T3.12a-e), so a group ID such as T3.8 must select its lettered members.
  # Exactly one trailing letter is allowed, so T3.1 does not match T3.10 or
  # T3.12a.
  [[ "$1" == "$wanted" || "$1" == "$wanted"[a-z] ]]
}
# ═══════════════════════════════════════════════════════════════════════
# Level 1: Infrastructure Health (host-side, ~30 seconds)
# ═══════════════════════════════════════════════════════════════════════
#######################################
# Level 1: host-side infrastructure probes (Ollama, OpenFold3 NIM, GPU, FHIR).
# Each check is registered as
#   run_test ID DESCRIPTION COMMAND ASSERTION HINT [EXPECTED]
# and is skipped unless should_run accepts its ID.
# Globals:
#   OLLAMA_PORT   (read) Ollama port, default 11434
#   OPENFOLD_PORT (read) OpenFold3 port, default 8000
# Outputs:
#   Level banner on stdout, plus whatever run_test prints.
# Returns:
#   Status of the last `should_run ... && run_test ...` line.
#######################################
run_level1() {
  echo ""
  echo "═══ Level 1: Infrastructure Health ═══"
  echo ""

  # Every curl below carries -m (max time) so that a hung endpoint fails the
  # check instead of stalling the whole run.
  should_run "T1.1" && run_test "T1.1" "Ollama alive" \
    "curl -sf -m 10 http://localhost:${OLLAMA_PORT:-11434}/" \
    assert_exit_0 \
    "Ollama not running. Docker (default): make up. Host Ollama alternative: OLLAMA_HOST=0.0.0.0 ollama serve."

  # Lists the pulled models and prints FOUND when any name contains
  # "nemotron-3-super".
  should_run "T1.2" && run_test "T1.2" "Model available (nemotron-3-super)" \
    "curl -s -m 10 http://localhost:${OLLAMA_PORT:-11434}/api/tags | python3 -c \"import sys,json; names=[m['name'] for m in json.load(sys.stdin)['models']]; print('FOUND' if any('nemotron-3-super' in n for n in names) else 'MISSING')\"" \
    assert_contains \
    "Model not pulled. Run: ollama pull nemotron-3-super" \
    "FOUND"

  # Prints the completion text, or the literal EMPTY when the reply has no
  # content. NOTE(review): EMPTY is itself non-empty output, so with
  # assert_output_not_empty this seems to verify only that a parseable reply
  # came back — confirm that is intended.
  should_run "T1.3" && run_test "T1.3" "Ollama generates text (direct)" \
    "curl -sf -m 30 -X POST http://localhost:${OLLAMA_PORT:-11434}/v1/chat/completions -H 'Content-Type: application/json' -d '{\"model\":\"nemotron-3-super:120b-a12b\",\"messages\":[{\"role\":\"user\",\"content\":\"Say OK\"}],\"max_tokens\":5}' | python3 -c \"import sys,json; c=json.load(sys.stdin).get('choices',[{}])[0].get('message',{}).get('content',''); print(c if c else 'EMPTY')\"" \
    assert_output_not_empty \
    "Ollama can't generate. Check: curl localhost:${OLLAMA_PORT:-11434}/api/ps"

  should_run "T1.5" && run_test "T1.5" "OpenFold3 NIM ready" \
    "curl -sf -m 10 http://localhost:${OPENFOLD_PORT:-8000}/v1/health/ready" \
    assert_contains \
    "OpenFold3 not ready. Check: docker ps | grep openfold" \
    "ready"

  should_run "T1.6" && run_test "T1.6" "GPU accessible" \
    "nvidia-smi --query-gpu=memory.used,memory.total --format=csv,noheader 2>&1" \
    assert_contains \
    "GPU not accessible. Check NVIDIA driver." \
    "MiB"

  # Only the HTTP status code is printed (-o /dev/null -w '%{http_code}').
  should_run "T1.7" && run_test "T1.7" "FHIR server reachable" \
    "curl -sf -m 15 -o /dev/null -w '%{http_code}' https://r4.smarthealthit.org/metadata" \
    assert_equals \
    "FHIR unreachable. Check network/DNS." \
    "200"
}
# ═══════════════════════════════════════════════════════════════════════
# Level 2: OpenShell + Sandbox Health (~1 minute)
# ═══════════════════════════════════════════════════════════════════════
#######################################
# Level 2: OpenShell gateway, sandbox, port forward, and network reachability
# from inside the sandbox. Commands wrapped in _sandbox '...' are passed to
# the _sandbox helper as a single string.
# Each check is registered as
#   run_test ID DESCRIPTION COMMAND ASSERTION HINT [EXPECTED]
# and is skipped unless should_run accepts its ID.
# Globals:
#   BRIDGE_IP     (read) host address used in the in-sandbox OpenFold3 URLs
#   OPENFOLD_PORT (read) OpenFold3 port, default 8000
# Outputs:
#   Level banner on stdout, plus whatever run_test prints.
# Returns:
#   Status of the last `should_run ... && run_test ...` line.
#######################################
run_level2() {
  echo ""
  echo "═══ Level 2: OpenShell + Sandbox Health ═══"
  echo ""

  should_run "T2.1" && run_test "T2.1" "Gateway connected" \
    "openshell status 2>&1" \
    assert_contains \
    "Gateway down. Run: OPENSHELL_K3S_ARGS='--kubelet-arg=cgroup-driver=systemd' openshell gateway start" \
    "Connected"

  should_run "T2.2" && run_test "T2.2" "Sandbox exists and ready" \
    "openshell sandbox list 2>&1" \
    assert_contains \
    "Sandbox not found. Run: bash scripts/setup_sandbox.sh" \
    "Ready"

  should_run "T2.3" && run_test "T2.3" "Forward running on 18789" \
    "openshell forward list 2>&1" \
    assert_contains \
    "Forward dead. Run: openshell forward stop 18789 clinical-sandbox; openshell forward start -d 18789 clinical-sandbox" \
    "running"

  # Tries the health path first and falls back to "/"; each attempt prints
  # its own HTTP status code.
  should_run "T2.3b" && run_test "T2.3b" "Gateway HTTP responding on 18789" \
    "curl -sf -m 5 -o /dev/null -w %{http_code} http://127.0.0.1:18789/__openclaw__/health 2>&1 || curl -sf -m 5 -o /dev/null -w %{http_code} http://127.0.0.1:18789/ 2>&1" \
    assert_contains \
    "Gateway HTTP not responding. Forward exists but no listener — re-run scripts/restart_sandbox.sh inside the sandbox or check /tmp/gw.log for the os.networkInterfaces() crash (needs openclaw-os-shim.js loaded via NODE_OPTIONS=--require)." \
    "200"

  # The quick in-sandbox probes below carry -m (max time) so that a silently
  # dropped connection fails the check instead of stalling the run.
  should_run "T2.4" && run_test "T2.4" "FHIR from sandbox (curl)" \
    "_sandbox 'curl -sf -m 15 https://r4.smarthealthit.org/Patient?_count=1 -o /dev/null -w %{http_code}'" \
    assert_equals \
    "FHIR blocked by sandbox policy. Check fhir section + python binary wildcards." \
    "200"

  should_run "T2.5" && run_test "T2.5" "Inference from sandbox" \
    "_sandbox 'curl -sk -m 10 https://inference.local/v1/models'" \
    assert_contains \
    "Inference not routed. Check: openshell inference get" \
    "nemotron"

  # OpenFold3 is addressed through ${OPENFOLD_PORT:-8000}, the same override
  # the host-side T1.5 probe uses, so a non-default port is tested
  # consistently on both sides.
  should_run "T2.6" && run_test "T2.6" "OpenFold3 health from sandbox" \
    "_sandbox 'curl -sf -m 10 http://${BRIDGE_IP}:${OPENFOLD_PORT:-8000}/v1/health/ready'" \
    assert_contains \
    "OpenFold3 health check blocked by sandbox. Check openfold3 policy." \
    "ready"

  # Sends a dummy body and looks only at the status code: anything but 403
  # passes.
  should_run "T2.6b" && run_test "T2.6b" "OpenFold3 predict endpoint reachable from sandbox" \
    "_sandbox 'curl -s -m 10 -o /dev/null -w %{http_code} -X POST -H \"Content-Type: application/json\" -d \"{\\\"dummy\\\": true}\" http://${BRIDGE_IP}:${OPENFOLD_PORT:-8000}/biology/openfold/openfold3/predict'" \
    assert_not_contains \
    "OpenFold3 predict blocked (HTTP 403). Sandbox policy may have L7 rules that break plain HTTP." \
    "403"

  # Real prediction request; -w appends the status code on its own line and
  # tail -1 keeps just that line. No -m here: the request runs an actual
  # prediction whose duration is not known from this file.
  should_run "T2.6c" && run_test "T2.6c" "OpenFold3 predict accepts POST from sandbox" \
    "_sandbox 'curl -s -w \"\n%{http_code}\" -X POST -H \"Content-Type: application/json\" -d \"{\\\"inputs\\\":[{\\\"input_id\\\":\\\"test\\\",\\\"molecules\\\":[{\\\"type\\\":\\\"protein\\\",\\\"id\\\":\\\"A\\\",\\\"sequence\\\":\\\"MKTVRQERLKSIVRI\\\",\\\"msa\\\":{\\\"main\\\":{\\\"a3m\\\":{\\\"alignment\\\":\\\">q\\\\nMKTVRQERLKSIVRI\\\",\\\"format\\\":\\\"a3m\\\"}}}}],\\\"output_format\\\":\\\"pdb\\\"}]}\" http://${BRIDGE_IP}:${OPENFOLD_PORT:-8000}/biology/openfold/openfold3/predict 2>&1 | tail -1'" \
    assert_contains \
    "OpenFold3 predict endpoint rejected POST from sandbox. Check sandbox policy and OpenFold3 NIM status." \
    "200"

  # Negative test: the request must NOT succeed, so an EXIT_CODE=0 in the
  # output is the failure condition.
  should_run "T2.7" && run_test "T2.7" "Outbound traffic blocked (security)" \
    "_sandbox 'curl --max-time 3 https://google.com 2>&1; echo EXIT_CODE=\$?'" \
    assert_not_contains \
    "SECURITY FAILURE: Outbound traffic NOT blocked!" \
    "EXIT_CODE=0"

  should_run "T2.8" && run_test "T2.8" "Python packages available" \
    "_sandbox 'python -c \"import subprocess, json, pandas, matplotlib; print(\\\"OK\\\")\"'" \
    assert_contains \
    "Python packages missing. Sandbox may need recreation." \
    "OK"
}
# ═══════════════════════════════════════════════════════════════════════
# Level 3: OpenClaw Configuration Correctness (~2 minutes)
# ═══════════════════════════════════════════════════════════════════════
#######################################
# Level 3: OpenClaw configuration inside the sandbox — gateway process and
# log, model auth, skills, agents, IDENTITY.md, openclaw.json, deployed
# scripts, and a one-prompt smoke test.
# Each check is registered as
#   run_test ID DESCRIPTION COMMAND ASSERTION HINT [EXPECTED]
# and is skipped unless should_run accepts its ID. Commands wrapped in
# _sandbox '...' are passed to the _sandbox helper as ONE single-quoted
# string, so any quoting inside that string must use escaped double quotes.
# Outputs:
#   Level and section banners on stdout, plus whatever run_test prints.
# Returns:
#   Status of the last `should_run ... && run_test ...` line.
#######################################
run_level3() {
  # Letters appended to per-item test IDs (T3.5a, T3.5b, ...); the n-th list
  # item gets the n-th letter.
  local suffixes=abcdefghijklmnopqrstuvwxyz
  local idx tid agent skill

  echo ""
  echo "═══ Level 3: OpenClaw Configuration ═══"
  echo ""

  # -- Gateway process and logs --
  echo " --- Gateway ---"
  should_run "T3.1" && run_test "T3.1" "Gateway process alive" \
    "_sandbox 'pgrep -f openclaw-gateway > /dev/null && echo ALIVE || echo DEAD'" \
    assert_contains \
    "OpenClaw gateway not running. Restart it." \
    "ALIVE"

  should_run "T3.2" && run_test "T3.2" "Gateway model correct" \
    "_sandbox 'grep \"agent model\" /tmp/gw.log 2>/dev/null | tail -1'" \
    warn:assert_contains \
    "Wrong model. Check ~/.openclaw/openclaw.json" \
    "local-ollama/nemotron-3-super"

  # Counts error-looking lines in the first 50 log lines, ignoring those that
  # mention apply_patch.
  should_run "T3.3" && run_test "T3.3" "Gateway no errors" \
    "_sandbox 'head -50 /tmp/gw.log 2>/dev/null | grep -iE \"\\[error\\]|\\[fatal\\]|crashed|segfault\" | grep -cv apply_patch'" \
    assert_equals \
    "Gateway has startup errors. Run: _sandbox head -50 /tmp/gw.log" \
    "0"

  # -- Model and auth --
  echo " --- Model & Auth ---"
  should_run "T3.4" && run_test "T3.4" "Model auth OK (not missing)" \
    "_sandbox 'openclaw models list 2>&1 | grep nemotron | grep -c missing; true'" \
    assert_equals \
    "Auth profile missing. Recreate auth-profiles.json for all agents." \
    "0"

  # T3.5a..T3.5f: one auth-profile existence check per agent.
  local -a auth_agents=(main patient-data labs-vitals medications analyst molecular)
  for idx in "${!auth_agents[@]}"; do
    agent=${auth_agents[idx]}
    tid="T3.5${suffixes:idx:1}"
    should_run "$tid" && run_test "$tid" "Auth profile exists: $agent" \
      "_sandbox 'test -f ~/.openclaw/agents/${agent}/agent/auth-profiles.json && echo EXISTS || echo MISSING'" \
      assert_contains \
      "Auth profile missing for $agent. Rerun setup step 10." \
      "EXISTS"
  done

  should_run "T3.6" && run_test "T3.6" "Auth profile content valid" \
    "_sandbox 'cat ~/.openclaw/agents/main/agent/auth-profiles.json 2>/dev/null'" \
    assert_contains \
    "Auth profile malformed. Should contain version:1 and provider:local-ollama." \
    "local-ollama"

  # -- Skills --
  echo " --- Skills ---"
  should_run "T3.7" && run_test "T3.7" "Skills count (expect 7)" \
    "_sandbox 'openclaw skills list 2>&1 | grep -c openclaw-workspace'" \
    assert_equals \
    "Not all skills loaded. Redeploy to ~/.openclaw/workspace/skills/" \
    "7"

  # T3.8a..T3.8g: one "listed and ready" check per skill.
  local -a skills=(analysis-methods case-summary clinical-delegation clinical-knowledge cohort-compare fhir-basics molecular-viz)
  for idx in "${!skills[@]}"; do
    skill=${skills[idx]}
    tid="T3.8${suffixes:idx:1}"
    should_run "$tid" && run_test "$tid" "Skill loaded: $skill" \
      "_sandbox 'openclaw skills list 2>&1 | grep ${skill} | grep -c ready'" \
      assert_numeric_gt \
      "Skill $skill not loaded. Check ~/.openclaw/workspace/skills/${skill}/SKILL.md" \
      "0"
  done

  should_run "T3.9" && run_test "T3.9" "analysis-methods uses subprocess" \
    "_sandbox 'grep -c subprocess ~/.openclaw/workspace/skills/analysis-methods/SKILL.md 2>/dev/null'" \
    assert_numeric_gt \
    "analysis-methods skill still uses requests. Redeploy updated version." \
    "0"

  should_run "T3.10" && run_test "T3.10" "fhir-basics uses subprocess" \
    "_sandbox 'grep -c subprocess ~/.openclaw/workspace/skills/fhir-basics/SKILL.md 2>/dev/null'" \
    assert_numeric_gt \
    "fhir-basics skill still uses requests. Redeploy updated version." \
    "0"

  # -- Agents --
  echo " --- Agents ---"
  should_run "T3.11" && run_test "T3.11" "Agents count (expect >= 5)" \
    "_sandbox 'openclaw agents list 2>&1 | grep -c Workspace:'" \
    assert_numeric_gt \
    "Not all agents registered. Rerun setup step 9." \
    "4"

  # T3.12a..T3.12e: one registration check per sub-agent.
  local -a sub_agents=(patient-data labs-vitals medications analyst molecular)
  for idx in "${!sub_agents[@]}"; do
    agent=${sub_agents[idx]}
    tid="T3.12${suffixes:idx:1}"
    should_run "$tid" && run_test "$tid" "Agent registered: $agent" \
      "_sandbox 'openclaw agents list 2>&1 | grep -c ${agent}'" \
      assert_numeric_gt \
      "Agent $agent not registered." \
      "0"
  done

  # -- IDENTITY.md --
  echo " --- IDENTITY.md ---"
  should_run "T3.13" && run_test "T3.13" "IDENTITY.md exists" \
    "_sandbox 'test -f ~/.openclaw/workspace/IDENTITY.md && echo EXISTS || echo MISSING'" \
    assert_contains \
    "IDENTITY.md not deployed." \
    "EXISTS"

  should_run "T3.14" && run_test "T3.14" "IDENTITY.md header correct" \
    "_sandbox 'head -1 ~/.openclaw/workspace/IDENTITY.md'" \
    assert_contains \
    "IDENTITY.md has wrong header." \
    "Clinical Intelligence"

  should_run "T3.15" && run_test "T3.15" "IDENTITY.md has molecular delegation" \
    "_sandbox 'grep -c molecular ~/.openclaw/workspace/IDENTITY.md 2>/dev/null'" \
    assert_numeric_gt \
    "IDENTITY.md missing molecular agent delegation." \
    "0"

  # The multi-word pattern is wrapped in escaped double quotes. Single quotes
  # here would terminate the outer '...' and split the command into several
  # arguments before _sandbox ever sees it.
  should_run "T3.16" && run_test "T3.16" "IDENTITY.md has how-to-work section" \
    "_sandbox 'grep -c \"How to work\" ~/.openclaw/workspace/IDENTITY.md 2>/dev/null'" \
    assert_numeric_gt \
    "IDENTITY.md missing How to work section." \
    "0"

  should_run "T3.17" && run_test "T3.17" "IDENTITY.md has principles" \
    "_sandbox 'grep -c Principles ~/.openclaw/workspace/IDENTITY.md 2>/dev/null'" \
    assert_numeric_gt \
    "IDENTITY.md missing Principles section." \
    "0"

  # -- openclaw.json --
  echo " --- openclaw.json ---"
  should_run "T3.18" && run_test "T3.18" "Model in openclaw.json" \
    "_sandbox 'python3 -c \"import json,os; d=json.load(open(os.path.expanduser(\\\"~/.openclaw/openclaw.json\\\"))); print(d[\\\"agents\\\"][\\\"defaults\\\"][\\\"model\\\"])\"'" \
    assert_contains \
    "Wrong model in openclaw.json." \
    "local-ollama/nemotron-3-super"

  # Looks at the first entry of agents.list only.
  should_run "T3.19" && run_test "T3.19" "allowAgents includes molecular" \
    "_sandbox 'python3 -c \"import json,os; d=json.load(open(os.path.expanduser(\\\"~/.openclaw/openclaw.json\\\"))); a=d[\\\"agents\\\"][\\\"list\\\"][0][\\\"subagents\\\"][\\\"allowAgents\\\"]; print(\\\"OK\\\" if \\\"molecular\\\" in a else \\\"MISSING\\\")\"'" \
    assert_contains \
    "molecular not in allowAgents. Update openclaw.json." \
    "OK"

  # -- Scripts --
  echo " --- Scripts ---"
  should_run "T3.20" && run_test "T3.20" "build_viewer.py exists in sandbox" \
    "_sandbox 'test -f /sandbox/clinical-intelligence/scripts/build_viewer.py && echo EXISTS || echo MISSING'" \
    assert_contains \
    "build_viewer.py not uploaded to sandbox." \
    "EXISTS"

  should_run "T3.21" && run_test "T3.21" "build_viewer.py uses subprocess" \
    "_sandbox 'grep -c subprocess.run /sandbox/clinical-intelligence/scripts/build_viewer.py 2>/dev/null'" \
    assert_numeric_gt \
    "build_viewer.py still uses urllib.request. Deploy updated version." \
    "0"

  should_run "T3.22" && run_test "T3.22" "validate_and_run.py exists" \
    "_sandbox 'test -f /sandbox/clinical-intelligence/scripts/validate_and_run.py && echo EXISTS || echo MISSING'" \
    assert_contains \
    "validate_and_run.py not uploaded to sandbox." \
    "EXISTS"

  # -- Smoke test --
  echo " --- Smoke Test ---"
  # \$\$ reaches the command string as a literal $$, so it is expanded by the
  # shell that finally runs the command, not while this string is built.
  should_run "T3.23" && run_test "T3.23" "Agent responds to prompt" \
    "_sandbox 'cd /sandbox/clinical-intelligence && openclaw agent --local --session-id smoke-\$\$ --thinking off --message \"Say OK\" --timeout 60 2>&1 | tail -5'" \
    assert_contains \
    "Agent cannot respond. Check all Level 3 tests above first." \
    "OK"
}
# ═══════════════════════════════════════════════════════════════════════
# Level 4: Agent Functional Tests (~20-30 minutes)
# ═══════════════════════════════════════════════════════════════════════
#######################################
# Level 4: agent functional tests (cohort query, lab retrieval, code
# execution, molecular visualisation) and direct OpenFold3 / build_viewer.py
# checks, all executed through _sandbox.
# Each check is registered as
#   run_test ID DESCRIPTION COMMAND ASSERTION HINT [EXPECTED]
# and is skipped unless should_run accepts its ID.
# Globals:
#   BRIDGE_IP     (read) host address used in the in-sandbox OpenFold3 URLs
#   OPENFOLD_PORT (read) OpenFold3 port, default 8000
# Outputs:
#   Level and section banners on stdout, plus whatever run_test prints.
# Returns:
#   Status of the last `should_run ... && run_test ...` line.
#######################################
run_level4() {
  echo ""
  echo "═══ Level 4: Agent Functional Tests ═══"
  echo ""

  # In every agent command \$\$ reaches the command string as a literal $$,
  # which is expanded by the shell that finally runs the command.
  should_run "T4.1" && run_test "T4.1" "Cohort count (expect 47)" \
    "_sandbox 'cd /sandbox/clinical-intelligence && openclaw agent --local --session-id t41-\$\$ --thinking off --timeout 300 --message \"Find all diabetic patients and count them\" 2>&1 | tail -20'" \
    assert_contains \
    "Agent failed cohort query. Check FHIR access + analysis-methods skill." \
    "47"

  should_run "T4.2" && run_test "T4.2" "Lab retrieval (HbA1c value)" \
    "_sandbox 'cd /sandbox/clinical-intelligence && openclaw agent --local --session-id t42-\$\$ --thinking off --timeout 300 --message \"Get the latest HbA1c for patient 9eb43ac3-7c1e-4e25-94cd-4b2c43f7234e\" 2>&1 | tail -20'" \
    assert_output_not_empty \
    "Agent failed lab retrieval."

  should_run "T4.3" && run_test "T4.3" "Code execution (print 42)" \
    "_sandbox 'cd /sandbox/clinical-intelligence && openclaw agent --local --session-id t43-\$\$ --thinking off --timeout 120 --message \"Write a Python script that prints 42 and execute it\" 2>&1 | tail -10'" \
    assert_contains \
    "Agent cannot execute code." \
    "42"

  # Removes stale atorvastatin*.html first, runs the agent, then prints how
  # many matching files exist afterwards (ls ... | wc -l).
  should_run "T4.4" && run_test "T4.4" "Molecular visualization" \
    "_sandbox 'cd /sandbox/clinical-intelligence && rm -f ~/.openclaw/canvas/atorvastatin*.html && openclaw agent --local --session-id t44-\$\$ --thinking off --timeout 300 --message \"Show me the 3D structure of atorvastatin bound to its target HMG-CoA reductase\" 2>&1 | tail -10; ls -la ~/.openclaw/canvas/atorvastatin*.html 2>/dev/null | wc -l'" \
    assert_numeric_gt \
    "Molecular viz failed. Check OpenFold3 access + build_viewer.py." \
    "0"

  # -- OpenFold3 / molecular-viz tests --
  echo " --- OpenFold3 / Molecular Viz ---"
  # T4.5 and T4.6 address OpenFold3 through ${OPENFOLD_PORT:-8000}, the same
  # override the host-side readiness probe uses, so a non-default port is
  # tested consistently.
  should_run "T4.5" && run_test "T4.5" "OpenFold3 prediction response schema" \
    "_sandbox 'curl -sf --max-time 300 -X POST -H \"Content-Type: application/json\" -d \"{\\\"inputs\\\":[{\\\"input_id\\\":\\\"schema-test\\\",\\\"molecules\\\":[{\\\"type\\\":\\\"protein\\\",\\\"id\\\":\\\"A\\\",\\\"sequence\\\":\\\"FVNQHLCGSHLVEALYLVCGERGFFYTPKT\\\",\\\"msa\\\":{\\\"main\\\":{\\\"a3m\\\":{\\\"alignment\\\":\\\">q\\\\nFVNQHLCGSHLVEALYLVCGERGFFYTPKT\\\",\\\"format\\\":\\\"a3m\\\"}}}}],\\\"output_format\\\":\\\"pdb\\\"}]}\" http://${BRIDGE_IP}:${OPENFOLD_PORT:-8000}/biology/openfold/openfold3/predict | python3 -c \"import sys,json; r=json.load(sys.stdin); o=r[\\\"outputs\\\"][0][\\\"structures_with_scores\\\"][0]; assert \\\"structure\\\" in o, \\\"missing structure\\\"; assert \\\"confidence_score\\\" in o, \\\"missing confidence_score\\\"; assert \\\"complex_plddt_score\\\" in o, \\\"missing plddt\\\"; assert \\\"ptm_score\\\" in o, \\\"missing ptm\\\"; print(\\\"SCHEMA_OK\\\")\"'" \
    assert_contains \
    "OpenFold3 response missing expected fields (structure, confidence_score, plddt, ptm). Check NIM version." \
    "SCHEMA_OK"

  should_run "T4.6" && run_test "T4.6" "OpenFold3 confidence scores are numeric" \
    "_sandbox 'curl -sf --max-time 300 -X POST -H \"Content-Type: application/json\" -d \"{\\\"inputs\\\":[{\\\"input_id\\\":\\\"score-test\\\",\\\"molecules\\\":[{\\\"type\\\":\\\"protein\\\",\\\"id\\\":\\\"A\\\",\\\"sequence\\\":\\\"FVNQHLCGSHLVEALYLVCGERGFFYTPKT\\\",\\\"msa\\\":{\\\"main\\\":{\\\"a3m\\\":{\\\"alignment\\\":\\\">q\\\\nFVNQHLCGSHLVEALYLVCGERGFFYTPKT\\\",\\\"format\\\":\\\"a3m\\\"}}}}],\\\"output_format\\\":\\\"pdb\\\"}]}\" http://${BRIDGE_IP}:${OPENFOLD_PORT:-8000}/biology/openfold/openfold3/predict | python3 -c \"import sys,json; r=json.load(sys.stdin); o=r[\\\"outputs\\\"][0][\\\"structures_with_scores\\\"][0]; plddt=float(o[\\\"complex_plddt_score\\\"]); ptm=float(o[\\\"ptm_score\\\"]); conf=float(o[\\\"confidence_score\\\"]); iptm=float(o.get(\\\"iptm_score\\\",0)); print(f\\\"pLDDT={plddt:.1f} pTM={ptm:.2f} ipTM={iptm:.2f} conf={conf:.2f}\\\"); assert plddt > 0, \\\"pLDDT not positive\\\"; assert ptm >= 0, \\\"pTM negative\\\"; print(\\\"SCORES_OK\\\")\"'" \
    assert_contains \
    "Confidence scores not numeric or out of range. Check OpenFold3 prediction output." \
    "SCORES_OK"

  # Regenerates the metformin viewer, then inspects the HTML for 3Dmol,
  # PDB records, pLDDT and confidence text.
  # NOTE(review): only the host is passed to build_viewer.py; whether the
  # script honours a non-default OpenFold3 port is not visible here — confirm.
  should_run "T4.7" && run_test "T4.7" "build_viewer.py HTML output valid" \
    "_sandbox 'cd /sandbox/clinical-intelligence && rm -f ~/.openclaw/canvas/metformin_complex.html ~/.openclaw/canvas/metformin_complex.pdb && python3 scripts/build_viewer.py --drug metformin --openfold-host ${BRIDGE_IP} 2>&1; cat ~/.openclaw/canvas/metformin_complex.html 2>/dev/null | python3 -c \"import sys; html=sys.stdin.read(); checks=[\\\"3Dmol\\\" in html, \\\"ATOM\\\" in html or \\\"HETATM\\\" in html, \\\"pLDDT\\\" in html, \\\"confidence\\\" in html.lower()]; print(f\\\"3Dmol={checks[0]} PDB={checks[1]} pLDDT={checks[2]} conf={checks[3]}\\\"); print(\\\"HTML_OK\\\" if all(checks) else \\\"HTML_FAIL\\\")\"'" \
    assert_contains \
    "build_viewer.py HTML missing 3Dmol.js, PDB structure data, or confidence scores. Check script output." \
    "HTML_OK"

  # Prints the file size in bytes, or 0 when the file is absent; relies on
  # the file produced by T4.7.
  should_run "T4.8" && run_test "T4.8" "build_viewer.py creates file in canvas" \
    "_sandbox 'test -f ~/.openclaw/canvas/metformin_complex.html && stat -c %s ~/.openclaw/canvas/metformin_complex.html || echo 0'" \
    assert_numeric_gt \
    "build_viewer.py did not create output file in canvas dir. Run T4.7 first." \
    "1000"
}
# ═══════════════════════════════════════════════════════════════════════
# Level 5: End-to-End Integration (~15 minutes)
# ═══════════════════════════════════════════════════════════════════════
#######################################
# Level 5: end-to-end integration — a single cross-condition agent query
# (T5.1) executed through _sandbox.
# Outputs:
#   Level banner on stdout, plus whatever run_test prints.
# Returns:
#   should_run's non-zero status when T5.1 is filtered out, otherwise the
#   status of run_test.
#######################################
run_level5() {
  # Blank line, banner, blank line.
  printf '\n%s\n\n' "═══ Level 5: End-to-End Integration ═══"

  # Filtered out: propagate should_run's status and do nothing else.
  should_run "T5.1" || return

  # \$\$ reaches the command string as a literal $$, which is expanded by
  # the shell that finally runs the command.
  local -a t51=(
    "T5.1"
    "Cross-condition analysis (diabetes + hypertension + eGFR)"
    "_sandbox 'cd /sandbox/clinical-intelligence && openclaw agent --local --session-id t51-\$\$ --thinking off --timeout 600 --message \"Find all diabetic patients that also have hypertension. For the overlap, get their eGFR. Flag anyone with eGFR below 60 as kidney disease risk.\" 2>&1 | tail -30'"
    assert_contains
    "Cross-condition query failed. Run Level 4 tests individually to isolate."
    "24"
  )
  run_test "${t51[@]}"
}
# ═══════════════════════════════════════════════════════════════════════
# Main
# ═══════════════════════════════════════════════════════════════════════
# Banner: a blank line followed by the four box lines (the "Max level" line
# has no closing border character).
printf '\n'
printf '%s\n' \
  "╔════════════════════════════════════════════════╗" \
  "║ Clinical Intelligence Test Suite ║" \
  "║ Max level: $MAX_LEVEL" \
  "╚════════════════════════════════════════════════╝"

# Run every level whose number does not exceed MAX_LEVEL, lowest first.
for _level in 1 2 3 4 5; do
  if (( MAX_LEVEL >= _level )); then
    "run_level${_level}"
  fi
done

# Final report; its exit status becomes the script's exit status.
print_summary