Source code for orka.agents.local_cost_calculator

"""
Local LLM Cost Calculator
=========================

Calculates real operating costs for local LLM inference including:
1. Electricity consumption during inference
2. Hardware amortization (GPU/CPU depreciation)
3. Optional cloud compute costs

No more fantasy $0.00 costs - local models have real expenses.
"""

import logging
import os
from enum import Enum
from typing import Optional

logger = logging.getLogger(__name__)


class CostPolicy(Enum):
    """Cost calculation policies for local LLMs."""

    CALCULATE = "calculate"  # Calculate real costs
    NULL_FAIL = "null_fail"  # Set to null and fail pipeline
    ZERO_LEGACY = "zero_legacy"  # Legacy $0.00 (deprecated)


class LocalCostCalculator:
    """
    Calculate real operating costs for local LLM inference.

    Cost components:
    1. Electricity: GPU/CPU power consumption during inference
    2. Hardware amortization: Depreciation of compute hardware
    3. Cloud costs: If running on rented cloud infrastructure
    """

    def __init__(
        self,
        policy: str = "calculate",
        electricity_rate_usd_per_kwh: Optional[float] = None,
        hardware_cost_usd: Optional[float] = None,
        hardware_lifespan_months: int = 36,
        gpu_tdp_watts: Optional[float] = None,
        cpu_tdp_watts: Optional[float] = None,
    ):
        """
        Initialize cost calculator.

        Args:
            policy: "calculate", "null_fail", or "zero_legacy"
            electricity_rate_usd_per_kwh: Local electricity rate (default: auto-detect)
            hardware_cost_usd: Total hardware cost for amortization
            hardware_lifespan_months: Hardware depreciation period
            gpu_tdp_watts: GPU power consumption (default: auto-detect)
            cpu_tdp_watts: CPU power consumption (default: auto-detect)
        """
        self.policy = CostPolicy(policy)

        # Electricity pricing (USD per kWh)
        self.electricity_rate = (
            electricity_rate_usd_per_kwh or self._get_default_electricity_rate()
        )

        # Hardware costs
        self.hardware_cost = hardware_cost_usd or self._estimate_hardware_cost()
        self.hardware_lifespan_months = hardware_lifespan_months

        # Power consumption (watts)
        self.gpu_tdp = gpu_tdp_watts or self._estimate_gpu_power()
        self.cpu_tdp = cpu_tdp_watts or self._estimate_cpu_power()

        logger.info(
            f"LocalCostCalculator initialized: policy={policy}, "
            f"electricity=${self.electricity_rate:.4f}/kWh, "
            f"hardware=${self.hardware_cost:,.0f}, "
            f"gpu={self.gpu_tdp}W, cpu={self.cpu_tdp}W",
        )

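    # Construction sketch (parameter values are illustrative, not
    # recommendations; omitted parameters fall back to the auto-detection
    # heuristics below):
    #
    #     calc = LocalCostCalculator(
    #         policy="calculate",
    #         electricity_rate_usd_per_kwh=0.28,
    #         hardware_cost_usd=3000,
    #         gpu_tdp_watts=350,
    #     )
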
    def calculate_inference_cost(
        self,
        latency_ms: float,
        tokens: int,
        model: str,
        provider: str = "ollama",
    ) -> Optional[float]:
        """
        Calculate the real cost of local LLM inference.

        Args:
            latency_ms: Inference time in milliseconds
            tokens: Total tokens processed
            model: Model name for utilization estimation
            provider: Local provider (ollama, lm_studio, etc.)

        Returns:
            Cost in USD, or None if null_fail policy

        Raises:
            ValueError: If null_fail policy is enabled
        """
        if self.policy == CostPolicy.NULL_FAIL:
            raise ValueError(
                f"Local LLM cost is null (policy=null_fail). "
                f"Configure real cost calculation or use cloud models. "
                f"Model: {model}, Tokens: {tokens}, Latency: {latency_ms}ms",
            )

        if self.policy == CostPolicy.ZERO_LEGACY:
            logger.warning("Using deprecated zero cost policy for local LLMs")
            return 0.0

        # Calculate electricity cost
        inference_time_hours = latency_ms / (1000 * 3600)  # Convert ms to hours

        # Estimate GPU/CPU utilization based on model size and provider
        gpu_utilization = self._estimate_gpu_utilization(model, provider, tokens)
        cpu_utilization = self._estimate_cpu_utilization(model, provider)

        # Energy consumed during inference (kWh)
        gpu_power_kwh = (self.gpu_tdp * gpu_utilization * inference_time_hours) / 1000
        cpu_power_kwh = (self.cpu_tdp * cpu_utilization * inference_time_hours) / 1000

        electricity_cost = (gpu_power_kwh + cpu_power_kwh) * self.electricity_rate

        # Hardware amortization cost:
        # spread hardware cost over expected lifespan and usage
        hours_per_month = 24 * 30  # Assume 24/7 usage for a conservative estimate
        total_hardware_hours = self.hardware_lifespan_months * hours_per_month
        hardware_cost_per_hour = self.hardware_cost / total_hardware_hours
        amortization_cost = hardware_cost_per_hour * inference_time_hours

        total_cost = electricity_cost + amortization_cost

        logger.debug(
            f"Local cost breakdown: electricity=${electricity_cost:.6f}, "
            f"amortization=${amortization_cost:.6f}, total=${total_cost:.6f} "
            f"(model={model}, {tokens}tok, {latency_ms}ms)",
        )

        return round(total_cost, 6)

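    # Worked amortization example using this class's own defaults: a $2,000
    # system over 36 months of assumed 24/7 use is 36 * 720 = 25,920
    # machine-hours, i.e. ~$0.077/hour, so a 1.5 s inference adds roughly
    # $0.000032 of hardware depreciation on top of the electricity cost.
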
    def _get_default_electricity_rate(self) -> float:
        """Get default electricity rate based on environment or region."""
        # Try environment variable first
        rate = os.environ.get("ORKA_ELECTRICITY_RATE_USD_KWH")
        if rate:
            try:
                return float(rate)
            except ValueError:
                pass

        # Default rates by common regions (USD per kWh, 2025)
        default_rates = {
            "US": 0.16,  # US average residential
            "EU": 0.28,  # EU average
            "DE": 0.32,  # Germany (high)
            "NO": 0.10,  # Norway (low, hydro)
            "CN": 0.08,  # China
            "JP": 0.26,  # Japan
            "KR": 0.20,  # South Korea
            "AU": 0.25,  # Australia
            "CA": 0.13,  # Canada
            "UK": 0.31,  # United Kingdom
        }

        # Try to detect region from environment, or use a conservative estimate
        region = os.environ.get("ORKA_REGION", "EU")
        return default_rates.get(region, 0.20)  # Conservative global average

    def _estimate_hardware_cost(self) -> float:
        """Estimate total hardware cost for amortization."""
        # Try environment variable
        cost = os.environ.get("ORKA_HARDWARE_COST_USD")
        if cost:
            try:
                return float(cost)
            except ValueError:
                pass

        # Estimate based on detected GPU
        try:
            import GPUtil

            gpus = GPUtil.getGPUs()
            if gpus:
                gpu_name = gpus[0].name.lower()

                # Hardware cost estimates (USD, 2025 prices)
                gpu_costs = {
                    "rtx 4090": 1800,
                    "rtx 4080": 1200,
                    "rtx 4070": 800,
                    "rtx 3090": 1000,
                    "rtx 3080": 700,
                    "a100": 15000,
                    "h100": 30000,
                    "v100": 8000,
                    "a6000": 5000,
                    "a5000": 2500,
                    "titan": 2500,
                }

                for name_pattern, gpu_cost in gpu_costs.items():
                    if name_pattern in gpu_name:
                        # Add estimated system cost (CPU, RAM, storage, etc.)
                        system_cost = gpu_cost * 0.5  # System typically 50% of GPU cost
                        return gpu_cost + system_cost
        except ImportError:
            pass

        # Conservative default for unknown hardware
        return 2000  # ~$2K total system cost

    def _estimate_gpu_power(self) -> float:
        """Estimate GPU power consumption in watts."""
        # Try environment variable
        power = os.environ.get("ORKA_GPU_TDP_WATTS")
        if power:
            try:
                return float(power)
            except ValueError:
                pass

        # Try to detect GPU and estimate TDP
        try:
            import GPUtil

            gpus = GPUtil.getGPUs()
            if gpus:
                gpu_name = gpus[0].name.lower()

                # TDP estimates for common GPUs (watts)
                gpu_tdp = {
                    "rtx 4090": 450,
                    "rtx 4080": 320,
                    "rtx 4070": 200,
                    "rtx 3090": 350,
                    "rtx 3080": 320,
                    "a100": 400,
                    "h100": 700,
                    "v100": 300,
                    "a6000": 300,
                    "a5000": 230,
                    "titan": 250,
                }

                for name_pattern, tdp in gpu_tdp.items():
                    if name_pattern in gpu_name:
                        return tdp
        except ImportError:
            pass

        # Conservative default
        return 250  # Typical high-end GPU

    def _estimate_cpu_power(self) -> float:
        """Estimate CPU power consumption in watts."""
        # Try environment variable
        power = os.environ.get("ORKA_CPU_TDP_WATTS")
        if power:
            try:
                return float(power)
            except ValueError:
                pass

        # Estimate based on CPU cores
        try:
            import psutil

            cpu_count = psutil.cpu_count(logical=False)  # Physical cores
            if cpu_count:  # cpu_count() can return None
                # Estimate ~15W per physical core for modern CPUs under load
                return cpu_count * 15
        except ImportError:
            pass

        # Conservative default
        return 120  # Typical 8-core CPU

    def _estimate_gpu_utilization(self, model: str, provider: str, tokens: int) -> float:
        """Estimate GPU utilization during inference (0-1)."""
        # Larger models and more tokens = higher utilization
        model_lower = model.lower()

        # Base utilization by model size
        if any(size in model_lower for size in ["70b", "72b", "405b"]):
            base_util = 0.95  # Large models max out the GPU
        elif any(size in model_lower for size in ["30b", "32b", "34b"]):
            base_util = 0.85  # Medium-large models
        elif any(size in model_lower for size in ["13b", "14b", "15b"]):
            base_util = 0.70  # Medium models
        elif any(size in model_lower for size in ["7b", "8b", "9b"]):
            base_util = 0.60  # Small models
        elif any(size in model_lower for size in ["3b", "1b", "1.5b"]):
            base_util = 0.40  # Tiny models
        else:
            base_util = 0.70  # Unknown, assume medium

        # Adjust for token count (more tokens = sustained load)
        if tokens > 2000:
            token_multiplier = 1.1
        elif tokens > 1000:
            token_multiplier = 1.05
        else:
            token_multiplier = 1.0

        return min(1.0, base_util * token_multiplier)

    def _estimate_cpu_utilization(self, model: str, provider: str) -> float:
        """Estimate CPU utilization during inference (0-1)."""
        # CPU usage depends on provider and model
        if provider.lower() == "ollama":
            return 0.30  # Ollama uses CPU for preprocessing
        elif provider.lower() in ["lm_studio", "lmstudio"]:
            return 0.25  # LM Studio optimized
        else:
            return 0.35  # Generic providers


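# Utilization sketch for the estimators above (the model name is illustrative):
# "llama3:8b" matches the "8b" bucket (base_util = 0.60); with 2,500 tokens the
# 1.1 multiplier applies, giving min(1.0, 0.60 * 1.1) = 0.66 GPU utilization.
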
# Global instance - can be configured via environment
_default_calculator: Optional[LocalCostCalculator] = None


def get_cost_calculator() -> LocalCostCalculator:
    """Get the global cost calculator instance."""
    global _default_calculator
    if _default_calculator is None:
        policy = os.environ.get("ORKA_LOCAL_COST_POLICY", "calculate")
        _default_calculator = LocalCostCalculator(policy=policy)
    return _default_calculator


def calculate_local_llm_cost(
    latency_ms: float,
    tokens: int,
    model: str,
    provider: str = "ollama",
) -> Optional[float]:
    """
    Calculate local LLM inference cost.

    Convenience function that uses the global calculator.

    Returns:
        Cost in USD, or None if null_fail policy

    Raises:
        ValueError: If null_fail policy is enabled
    """
    calculator = get_cost_calculator()
    return calculator.calculate_inference_cost(latency_ms, tokens, model, provider)

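
# Example usage (a minimal sketch; model name, latency, and token count are
# illustrative):
#
#     from orka.agents.local_cost_calculator import calculate_local_llm_cost
#
#     cost = calculate_local_llm_cost(
#         latency_ms=1500,
#         tokens=512,
#         model="llama3:8b",
#         provider="ollama",
#     )
#     print(f"Estimated local inference cost: ${cost:.6f}")
#
# The global calculator reads ORKA_LOCAL_COST_POLICY on first use, so setting
# that variable to "null_fail" beforehand makes this call raise ValueError
# instead of returning an estimated cost.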