#!/usr/bin/env python3
"""
LLM Benchmark Script: vLLM vs Ollama Performance Comparison

Compares performance metrics between vLLM and Ollama deployments.
"""

import argparse
import asyncio
import statistics
import sys
import time
from dataclasses import dataclass
from typing import Dict, List

import aiohttp


@dataclass
class BenchmarkResult:
    """Metrics from a single benchmark run against one service."""
    service: str
    model: str
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    response_time: float
    tokens_per_second: float
    first_token_time: float = 0.0  # reserved for streaming TTFT; not populated by this script
    error: str = ""


class LLMBenchmark:
    def __init__(self):
        # vLLM's OpenAI-compatible server and Ollama's native API, on their default local ports.
        self.vllm_url = "http://localhost:8001"
        self.ollama_url = "http://localhost:11434"

    async def test_vllm(self, session: aiohttp.ClientSession, prompt: str, max_tokens: int = 100) -> BenchmarkResult:
        """Send one completion request to vLLM's OpenAI-compatible endpoint and time it."""
        start_time = time.time()

        payload = {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": 0.7,
            "stream": False,
        }

        try:
            async with session.post(f"{self.vllm_url}/v1/completions", json=payload) as response:
                if response.status != 200:
                    error_text = await response.text()
                    return BenchmarkResult(
                        service="vLLM",
                        model="Llama-3.1-8B-Instruct",
                        prompt_tokens=0,
                        completion_tokens=0,
                        total_tokens=0,
                        response_time=0,
                        tokens_per_second=0,
                        error=f"HTTP {response.status}: {error_text}",
                    )

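                # For reference, vLLM's OpenAI-compatible endpoint reports token
                # counts in a "usage" object; an abridged, illustrative response:
                #   {"choices": [{"text": "..."}],
                #    "usage": {"prompt_tokens": 9, "completion_tokens": 100, "total_tokens": 109}}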
                result = await response.json()
                end_time = time.time()

                response_time = end_time - start_time
                usage = result.get("usage", {})
                prompt_tokens = usage.get("prompt_tokens", 0)
                completion_tokens = usage.get("completion_tokens", 0)
                total_tokens = usage.get("total_tokens", 0)

                # Throughput counts completion tokens only, not the prompt.
                tokens_per_second = completion_tokens / response_time if response_time > 0 else 0

                return BenchmarkResult(
                    service="vLLM",
                    model="Llama-3.1-8B-Instruct",
                    prompt_tokens=prompt_tokens,
                    completion_tokens=completion_tokens,
                    total_tokens=total_tokens,
                    response_time=response_time,
                    tokens_per_second=tokens_per_second,
                )

        except Exception as e:
            return BenchmarkResult(
                service="vLLM",
                model="Llama-3.1-8B-Instruct",
                prompt_tokens=0,
                completion_tokens=0,
                total_tokens=0,
                response_time=0,
                tokens_per_second=0,
                error=str(e),
            )

    async def test_ollama(self, session: aiohttp.ClientSession, prompt: str, max_tokens: int = 100) -> BenchmarkResult:
        """Send one generate request to Ollama's native API and time it."""
        start_time = time.time()

        payload = {
            "model": "llama3.1:8b",
            "prompt": prompt,
            "stream": False,
            "options": {
                "num_predict": max_tokens,
                "temperature": 0.7,
            },
        }

        try:
            async with session.post(f"{self.ollama_url}/api/generate", json=payload) as response:
                if response.status != 200:
                    error_text = await response.text()
                    return BenchmarkResult(
                        service="Ollama",
                        model="llama3.1:8b",
                        prompt_tokens=0,
                        completion_tokens=0,
                        total_tokens=0,
                        response_time=0,
                        tokens_per_second=0,
                        error=f"HTTP {response.status}: {error_text}",
                    )

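                # For reference, Ollama's non-streaming /api/generate reply carries
                # its own token counters; an abridged, illustrative response:
                #   {"response": "...", "done": true,
                #    "prompt_eval_count": 9, "eval_count": 100}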
                result = await response.json()
                end_time = time.time()

                response_time = end_time - start_time

                # Ollama response format: token counts come back as top-level counters.
                prompt_eval_count = result.get("prompt_eval_count", 0)
                eval_count = result.get("eval_count", 0)
                total_tokens = prompt_eval_count + eval_count

                tokens_per_second = eval_count / response_time if response_time > 0 else 0

                return BenchmarkResult(
                    service="Ollama",
                    model="llama3.1:8b",
                    prompt_tokens=prompt_eval_count,
                    completion_tokens=eval_count,
                    total_tokens=total_tokens,
                    response_time=response_time,
                    tokens_per_second=tokens_per_second,
                )

        except Exception as e:
            return BenchmarkResult(
                service="Ollama",
                model="llama3.1:8b",
                prompt_tokens=0,
                completion_tokens=0,
                total_tokens=0,
                response_time=0,
                tokens_per_second=0,
                error=str(e),
            )

    async def run_single_test(self, prompt: str, max_tokens: int = 100) -> tuple[BenchmarkResult, BenchmarkResult]:
        """Run a single test comparing both services"""
        async with aiohttp.ClientSession() as session:
            # Test both services concurrently; note that concurrent requests may
            # contend for the same GPU, so absolute numbers are best-effort.
            vllm_task = self.test_vllm(session, prompt, max_tokens)
            ollama_task = self.test_ollama(session, prompt, max_tokens)

            vllm_result, ollama_result = await asyncio.gather(vllm_task, ollama_task)
            return vllm_result, ollama_result

    async def run_benchmark(self, prompts: List[str], max_tokens: int = 100, runs_per_prompt: int = 3) -> Dict[str, List[BenchmarkResult]]:
        """Run the full benchmark: every prompt, repeated runs_per_prompt times."""
        results = {"vLLM": [], "Ollama": []}

        print(f"Running benchmark with {len(prompts)} prompts, {runs_per_prompt} runs each...")
        print(f"Max tokens per completion: {max_tokens}")
        print("=" * 60)

        for i, prompt in enumerate(prompts, 1):
            print(f"\nPrompt {i}/{len(prompts)}: {prompt[:50]}...")

            for run in range(runs_per_prompt):
                print(f" Run {run + 1}/{runs_per_prompt}...", end=" ")

                vllm_result, ollama_result = await self.run_single_test(prompt, max_tokens)

                results["vLLM"].append(vllm_result)
                results["Ollama"].append(ollama_result)

                # Print quick results
                if vllm_result.error:
                    print(f"vLLM: ERROR - {vllm_result.error}")
                else:
                    print(f"vLLM: {vllm_result.response_time:.2f}s ({vllm_result.tokens_per_second:.1f} tok/s)", end=" | ")

                if ollama_result.error:
                    print(f"Ollama: ERROR - {ollama_result.error}")
                else:
                    print(f"Ollama: {ollama_result.response_time:.2f}s ({ollama_result.tokens_per_second:.1f} tok/s)")

                # Small delay between runs to let the servers settle
                await asyncio.sleep(1)

        return results

    def analyze_results(self, results: Dict[str, List[BenchmarkResult]]):
        """Analyze and print benchmark results"""
        print("\n" + "=" * 80)
        print("BENCHMARK RESULTS ANALYSIS")
        print("=" * 80)

        for service_name, service_results in results.items():
            print(f"\n{service_name} Results:")
            print("-" * 40)

            # Separate successful runs from errors
            valid_results = [r for r in service_results if not r.error]
            error_results = [r for r in service_results if r.error]

            if error_results:
                print(f"Errors: {len(error_results)}/{len(service_results)}")
                for error in set(r.error for r in error_results):
                    print(f" - {error}")
                print()

            if not valid_results:
                print("No valid results to analyze.")
                continue

            # Calculate statistics
            response_times = [r.response_time for r in valid_results]
            tokens_per_second = [r.tokens_per_second for r in valid_results]
            completion_tokens = [r.completion_tokens for r in valid_results]

            print(f"Valid runs: {len(valid_results)}")
            print(f"Response time (avg): {statistics.mean(response_times):.3f}s")
            print(f"Response time (median): {statistics.median(response_times):.3f}s")
            print(f"Response time (min/max): {min(response_times):.3f}s / {max(response_times):.3f}s")
            print(f"Tokens/second (avg): {statistics.mean(tokens_per_second):.1f}")
            print(f"Tokens/second (median): {statistics.median(tokens_per_second):.1f}")
            print(f"Tokens/second (min/max): {min(tokens_per_second):.1f} / {max(tokens_per_second):.1f}")
            print(f"Completion tokens (avg): {statistics.mean(completion_tokens):.1f}")

        # Head-to-head comparison
        vllm_valid = [r for r in results["vLLM"] if not r.error]
        ollama_valid = [r for r in results["Ollama"] if not r.error]

        if vllm_valid and ollama_valid:
            print("\n" + "=" * 40)
            print("PERFORMANCE COMPARISON")
            print("=" * 40)

            vllm_avg_response = statistics.mean([r.response_time for r in vllm_valid])
            ollama_avg_response = statistics.mean([r.response_time for r in ollama_valid])

            vllm_avg_tokens_sec = statistics.mean([r.tokens_per_second for r in vllm_valid])
            ollama_avg_tokens_sec = statistics.mean([r.tokens_per_second for r in ollama_valid])

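            # Worked example: if vLLM averages 2.0 s per response and Ollama 3.0 s,
            # the branch below reports "vLLM is 1.50x FASTER" (3.0 / 2.0).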
            if vllm_avg_response < ollama_avg_response:
                speedup = ollama_avg_response / vllm_avg_response
                print(f"🏆 vLLM is {speedup:.2f}x FASTER in response time")
            else:
                speedup = vllm_avg_response / ollama_avg_response
                print(f"🏆 Ollama is {speedup:.2f}x FASTER in response time")

            if vllm_avg_tokens_sec > ollama_avg_tokens_sec:
                throughput_ratio = vllm_avg_tokens_sec / ollama_avg_tokens_sec
                print(f"🚀 vLLM has {throughput_ratio:.2f}x HIGHER throughput")
            else:
                throughput_ratio = ollama_avg_tokens_sec / vllm_avg_tokens_sec
                print(f"🚀 Ollama has {throughput_ratio:.2f}x HIGHER throughput")


def main():
    parser = argparse.ArgumentParser(description="Benchmark vLLM vs Ollama")
    parser.add_argument("--max-tokens", type=int, default=100, help="Max tokens per completion")
    parser.add_argument("--runs", type=int, default=3, help="Number of runs per prompt")
    parser.add_argument("--quick", action="store_true", help="Run quick test with fewer prompts")

    args = parser.parse_args()

    # Test prompts
    if args.quick:
        prompts = [
            "What is the capital of France?",
            "Explain quantum computing in simple terms.",
        ]
    else:
        prompts = [
            "What is the capital of France?",
            "Explain quantum computing in simple terms.",
            "Write a short story about a robot learning to paint.",
            "What are the benefits of renewable energy?",
            "Describe the process of photosynthesis.",
            "How does machine learning work?",
        ]

    benchmark = LLMBenchmark()

    try:
        results = asyncio.run(benchmark.run_benchmark(prompts, args.max_tokens, args.runs))
        benchmark.analyze_results(results)
    except KeyboardInterrupt:
        print("\nBenchmark interrupted by user.")
        sys.exit(1)
    except Exception as e:
        print(f"\nBenchmark failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()