"""
Local LLM Cost Calculator
=========================
Calculates real operating costs for local LLM inference including:
1. Electricity consumption during inference
2. Hardware amortization (GPU/CPU depreciation)
3. Optional cloud compute costs
No more fantasy $0.00 costs - local models have real expenses.
"""
import logging
import os
from enum import Enum
from typing import Optional
logger = logging.getLogger(__name__)
class CostPolicy(Enum):
"""Cost calculation policies for local LLMs"""
CALCULATE = "calculate" # Calculate real costs
NULL_FAIL = "null_fail" # Set to null and fail pipeline
ZERO_LEGACY = "zero_legacy" # Legacy $0.00 (deprecated)
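# CostPolicy accepts its string values directly, so configuration can stay
# string-based. A quick sanity sketch:
#
#   assert CostPolicy("null_fail") is CostPolicy.NULL_FAIL
#   # An unknown string raises ValueError, which is how LocalCostCalculator
#   # rejects a bad `policy` argument in __init__.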
class LocalCostCalculator:
"""
Calculate real operating costs for local LLM inference.
Cost components:
1. Electricity: GPU/CPU power consumption during inference
2. Hardware amortization: Depreciation of compute hardware
3. Cloud costs: If running on rented cloud infrastructure
"""
def __init__(
self,
policy: str = "calculate",
        electricity_rate_usd_per_kwh: Optional[float] = None,
        hardware_cost_usd: Optional[float] = None,
        hardware_lifespan_months: int = 36,
        gpu_tdp_watts: Optional[float] = None,
        cpu_tdp_watts: Optional[float] = None,
):
"""
Initialize cost calculator.
Args:
policy: "calculate", "null_fail", or "zero_legacy"
electricity_rate_usd_per_kwh: Local electricity rate (default: auto-detect)
hardware_cost_usd: Total hardware cost for amortization
hardware_lifespan_months: Hardware depreciation period
gpu_tdp_watts: GPU power consumption (default: auto-detect)
cpu_tdp_watts: CPU power consumption (default: auto-detect)
"""
self.policy = CostPolicy(policy)
        # Electricity pricing (USD per kWh); explicit None checks so that a
        # deliberate 0.0 override is not silently replaced by a default
        self.electricity_rate = (
            electricity_rate_usd_per_kwh
            if electricity_rate_usd_per_kwh is not None
            else self._get_default_electricity_rate()
        )
        # Hardware costs
        self.hardware_cost = (
            hardware_cost_usd if hardware_cost_usd is not None else self._estimate_hardware_cost()
        )
        self.hardware_lifespan_months = hardware_lifespan_months
        # Power consumption (watts)
        self.gpu_tdp = gpu_tdp_watts if gpu_tdp_watts is not None else self._estimate_gpu_power()
        self.cpu_tdp = cpu_tdp_watts if cpu_tdp_watts is not None else self._estimate_cpu_power()
logger.info(
f"LocalCostCalculator initialized: policy={policy}, "
f"electricity=${self.electricity_rate:.4f}/kWh, "
f"hardware=${self.hardware_cost:,.0f}, "
f"gpu={self.gpu_tdp}W, cpu={self.cpu_tdp}W",
)
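    # Construction sketch with explicit overrides (values are hypothetical;
    # omit any argument to fall back to env vars or auto-detection):
    #
    #   calc = LocalCostCalculator(
    #       policy="calculate",
    #       electricity_rate_usd_per_kwh=0.25,
    #       hardware_cost_usd=2700,
    #       gpu_tdp_watts=350,
    #   )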
def calculate_inference_cost(
self,
latency_ms: float,
tokens: int,
model: str,
provider: str = "ollama",
    ) -> float:
"""
Calculate the real cost of local LLM inference.
Args:
latency_ms: Inference time in milliseconds
tokens: Total tokens processed
            model: Model name, used to estimate hardware utilization
provider: Local provider (ollama, lm_studio, etc.)
        Returns:
            Estimated cost in USD (0.0 under the deprecated zero_legacy policy)
        Raises:
            ValueError: If the null_fail policy is enabled
"""
if self.policy == CostPolicy.NULL_FAIL:
raise ValueError(
f"Local LLM cost is null (policy=null_fail). "
f"Configure real cost calculation or use cloud models. "
f"Model: {model}, Tokens: {tokens}, Latency: {latency_ms}ms",
)
if self.policy == CostPolicy.ZERO_LEGACY:
logger.warning("Using deprecated zero cost policy for local LLMs")
return 0.0
# Calculate electricity cost
inference_time_hours = latency_ms / (1000 * 3600) # Convert ms to hours
# Estimate GPU utilization based on model size and provider
gpu_utilization = self._estimate_gpu_utilization(model, provider, tokens)
cpu_utilization = self._estimate_cpu_utilization(model, provider)
# Power consumption during inference
gpu_power_kwh = (self.gpu_tdp * gpu_utilization * inference_time_hours) / 1000
cpu_power_kwh = (self.cpu_tdp * cpu_utilization * inference_time_hours) / 1000
electricity_cost = (gpu_power_kwh + cpu_power_kwh) * self.electricity_rate
# Hardware amortization cost
# Spread hardware cost over expected lifespan and usage
        # Assume 24/7 usage: spreading the cost over the maximum number of
        # hours yields a lower-bound (optimistic) per-inference estimate
        hours_per_month = 24 * 30
total_hardware_hours = self.hardware_lifespan_months * hours_per_month
hardware_cost_per_hour = self.hardware_cost / total_hardware_hours
amortization_cost = hardware_cost_per_hour * inference_time_hours
total_cost = electricity_cost + amortization_cost
logger.debug(
f"Local cost breakdown: electricity=${electricity_cost:.6f}, "
f"amortization=${amortization_cost:.6f}, total=${total_cost:.6f} "
f"(model={model}, {tokens}tok, {latency_ms}ms)",
)
return round(total_cost, 6)
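    # Worked example of the formula above, using illustrative figures
    # (350 W GPU at 0.85 utilization, 120 W CPU at 0.30, $0.20/kWh,
    # $2,000 hardware over 36 months) for a 2,000 ms call:
    #   time         = 2000 / 3_600_000 h ≈ 0.000556 h
    #   electricity  = (0.350*0.85 + 0.120*0.30) kW × 0.000556 h × $0.20 ≈ $0.000037
    #   amortization = $2000 / (36 × 720 h) × 0.000556 h               ≈ $0.000043
    #   total        ≈ $0.00008 per call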
def _get_default_electricity_rate(self) -> float:
"""Get default electricity rate based on environment or region."""
# Try environment variable first
rate = os.environ.get("ORKA_ELECTRICITY_RATE_USD_KWH")
if rate:
try:
return float(rate)
            except ValueError:
                logger.warning("Ignoring invalid ORKA_ELECTRICITY_RATE_USD_KWH=%r", rate)
# Default rates by common regions (USD per kWh, 2025)
default_rates = {
"US": 0.16, # US average residential
"EU": 0.28, # EU average
"DE": 0.32, # Germany (high)
"NO": 0.10, # Norway (low, hydro)
"CN": 0.08, # China
"JP": 0.26, # Japan
"KR": 0.20, # South Korea
"AU": 0.25, # Australia
"CA": 0.13, # Canada
"UK": 0.31, # United Kingdom
}
# Try to detect region from environment or use conservative estimate
region = os.environ.get("ORKA_REGION", "EU")
return default_rates.get(region, 0.20) # Conservative global average
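    # Example: with ORKA_REGION=NO this returns 0.10, while an explicit
    # ORKA_ELECTRICITY_RATE_USD_KWH=0.12 takes precedence over any region;
    # an unknown region falls back to the 0.20 global estimate.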
def _estimate_hardware_cost(self) -> float:
"""Estimate total hardware cost for amortization."""
# Try environment variable
cost = os.environ.get("ORKA_HARDWARE_COST_USD")
if cost:
try:
return float(cost)
            except ValueError:
                logger.warning("Ignoring invalid ORKA_HARDWARE_COST_USD=%r", cost)
# Estimate based on detected GPU
try:
import GPUtil
gpus = GPUtil.getGPUs()
if gpus:
gpu_name = gpus[0].name.lower()
# Hardware cost estimates (USD, 2025 prices)
gpu_costs = {
"rtx 4090": 1800,
"rtx 4080": 1200,
"rtx 4070": 800,
"rtx 3090": 1000,
"rtx 3080": 700,
"a100": 15000,
"h100": 30000,
"v100": 8000,
"a6000": 5000,
"a5000": 2500,
"titan": 2500,
}
for name_pattern, cost in gpu_costs.items():
if name_pattern in gpu_name:
# Add estimated system cost (CPU, RAM, storage, etc.)
system_cost = cost * 0.5 # System typically 50% of GPU cost
return cost + system_cost
except ImportError:
pass
# Conservative default for unknown hardware
return 2000 # ~$2K total system cost
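    # Example: a detected "RTX 4090" yields 1800 + 0.5 × 1800 = $2,700 total
    # system cost; without GPUtil (or with an unrecognized GPU) the $2,000
    # fallback applies. Set ORKA_HARDWARE_COST_USD to bypass detection.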
def _estimate_gpu_power(self) -> float:
"""Estimate GPU power consumption in watts."""
# Try environment variable
power = os.environ.get("ORKA_GPU_TDP_WATTS")
if power:
try:
return float(power)
            except ValueError:
                logger.warning("Ignoring invalid ORKA_GPU_TDP_WATTS=%r", power)
# Try to detect GPU and estimate TDP
try:
import GPUtil
gpus = GPUtil.getGPUs()
if gpus:
gpu_name = gpus[0].name.lower()
# TDP estimates for common GPUs (watts)
gpu_tdp = {
"rtx 4090": 450,
"rtx 4080": 320,
"rtx 4070": 200,
"rtx 3090": 350,
"rtx 3080": 320,
"a100": 400,
"h100": 700,
"v100": 300,
"a6000": 300,
"a5000": 230,
"titan": 250,
}
for name_pattern, tdp in gpu_tdp.items():
if name_pattern in gpu_name:
return tdp
except ImportError:
pass
# Conservative default
return 250 # Typical high-end GPU
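    # Note: these figures are nameplate TDPs (sustained maximums). Actual draw
    # is scaled down by the utilization estimate in calculate_inference_cost;
    # e.g. an RTX 3090 (350 W) running an 8B model at 0.60 utilization is
    # billed as roughly 210 W.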
def _estimate_cpu_power(self) -> float:
"""Estimate CPU power consumption in watts."""
# Try environment variable
power = os.environ.get("ORKA_CPU_TDP_WATTS")
if power:
try:
return float(power)
            except ValueError:
                logger.warning("Ignoring invalid ORKA_CPU_TDP_WATTS=%r", power)
# Estimate based on CPU cores
try:
import psutil
            cpu_count = psutil.cpu_count(logical=False)  # Physical cores; may be None
            if cpu_count:
                # Estimate ~15W per physical core for modern CPUs under load
                return cpu_count * 15
except ImportError:
pass
# Conservative default
return 120 # Typical 8-core CPU
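    # Example: psutil reporting 8 physical cores gives 8 × 15 = 120 W, which
    # matches the fallback default used when psutil is unavailable.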
def _estimate_gpu_utilization(self, model: str, provider: str, tokens: int) -> float:
"""Estimate GPU utilization during inference (0-1)."""
# Larger models and more tokens = higher utilization
model_lower = model.lower()
        # Base utilization by model size; branches are ordered largest-first so
        # that e.g. "13b" matches before its "3b" substring
        if any(size in model_lower for size in ["70b", "72b", "405b"]):
            base_util = 0.95  # Large models max out GPU
        elif any(size in model_lower for size in ["30b", "32b", "33b", "34b"]):
            base_util = 0.85  # Medium-large models
        elif any(size in model_lower for size in ["13b", "14b", "15b"]):
            base_util = 0.70  # Medium models
        elif any(size in model_lower for size in ["7b", "8b", "9b"]):
            base_util = 0.60  # Small models
        elif any(size in model_lower for size in ["3b", "1b", "1.5b"]):
            base_util = 0.40  # Tiny models
else:
base_util = 0.70 # Unknown, assume medium
# Adjust for token count (more tokens = sustained load)
if tokens > 2000:
token_multiplier = 1.1
elif tokens > 1000:
token_multiplier = 1.05
else:
token_multiplier = 1.0
return min(1.0, base_util * token_multiplier)
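    # Example: "llama3:70b" with 2,500 tokens gives min(1.0, 0.95 × 1.1) = 1.0,
    # while "qwen2:1.5b" with 500 tokens gives 0.40 × 1.0 = 0.40.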
def _estimate_cpu_utilization(self, model: str, provider: str) -> float:
"""Estimate CPU utilization during inference (0-1)."""
# CPU usage depends on provider and model
if provider.lower() == "ollama":
return 0.30 # Ollama uses CPU for preprocessing
elif provider.lower() in ["lm_studio", "lmstudio"]:
return 0.25 # LM Studio optimized
else:
return 0.35 # Generic providers
# Global instance - can be configured via environment
_default_calculator: Optional[LocalCostCalculator] = None
def get_cost_calculator() -> LocalCostCalculator:
"""Get the global cost calculator instance."""
global _default_calculator
if _default_calculator is None:
policy = os.environ.get("ORKA_LOCAL_COST_POLICY", "calculate")
_default_calculator = LocalCostCalculator(policy=policy)
return _default_calculator
def calculate_local_llm_cost(
latency_ms: float,
tokens: int,
model: str,
provider: str = "ollama",
) -> float:
    """
    Calculate local LLM inference cost.
    Convenience function that uses the global calculator.
    Returns:
        Estimated cost in USD (0.0 under the deprecated zero_legacy policy)
    Raises:
        ValueError: If the null_fail policy is enabled
    """
calculator = get_cost_calculator()
return calculator.calculate_inference_cost(latency_ms, tokens, model, provider)
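# Minimal smoke test when run directly (a sketch; real latency and token
# counts should come from your inference runtime):
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    demo_cost = calculate_local_llm_cost(latency_ms=1500, tokens=800, model="llama3:8b")
    print(f"Estimated cost for the demo call: ${demo_cost:.6f}")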