# Source code for lrs.inference.evaluator

"""Hybrid Expected Free Energy evaluation."""

from typing import Any, Callable, Dict, List, Optional

from lrs.core.free_energy import (
    calculate_expected_free_energy,
    PolicyEvaluation
)


[docs]
class HybridGEvaluator:
    """
    Evaluate policies using both LLM priors and mathematical statistics.

    G_hybrid = (1 - λ) * G_math + λ * G_llm

    Where:
    - G_math: Calculated from historical execution statistics
    - G_llm: Derived from LLM's self-assessed success prob and info gain
    - λ: Interpolation factor returned by ``lambda_fn(precision)``

    Intuition (with the default ``lambda_fn``, λ = 1 - precision):
    - Low precision → trust LLM more (world model unreliable, use semantics)
    - High precision → trust math more (world model accurate, use statistics)

    Examples:
        >>> evaluator = HybridGEvaluator()
        >>>
        >>> # LLM proposal with self-assessment
        >>> proposal = {
        ...     'policy': [tool_a, tool_b],
        ...     'llm_success_prob': 0.7,
        ...     'llm_info_gain': 0.4
        ... }
        >>>
        >>> # Evaluate with hybrid approach
        >>> G = evaluator.evaluate_hybrid(
        ...     proposal, state, preferences, precision=0.5
        ... )
    """

    def __init__(
        self,
        lambda_fn: Optional[Callable[[float], float]] = None,
        epistemic_weight: float = 1.0
    ):
        """
        Initialize hybrid evaluator.

        Args:
            lambda_fn: Function mapping precision → interpolation weight λ.
                      Default: λ = 1 - precision (trust LLM when uncertain).
                      The returned value is used as-is; it is not clamped.
            epistemic_weight: Weight for epistemic value in G calculation.
                      Applied to both the mathematical and the LLM-derived G.
        """
        self.epistemic_weight = epistemic_weight

        # Default lambda function: inverse of precision
        # Low precision → high λ → trust LLM
        # High precision → low λ → trust math
        if lambda_fn is None:
            self.lambda_fn = lambda p: 1.0 - p
        else:
            self.lambda_fn = lambda_fn

    def _hybrid_components(
        self,
        proposal: Dict[str, Any],
        state: Dict[str, Any],
        preferences: Dict[str, float],
        precision: float,
        historical_stats: Optional[Dict[str, Dict]] = None
    ) -> Dict[str, Any]:
        """
        Compute every term of the hybrid G for one proposal, exactly once.

        Shared by ``evaluate_hybrid`` and ``evaluate_all`` so that the
        components reported alongside an evaluation are the very values that
        were blended into ``G_hybrid`` (same ``epistemic_weight``, same λ).

        Args:
            proposal: Policy proposal; must contain 'policy'
            state: Current agent state
            preferences: Reward function
            precision: Current precision value, passed to ``lambda_fn``
            historical_stats: Optional execution history

        Returns:
            Dict with keys 'G_hybrid', 'G_math', 'G_llm' and 'lambda'

        Raises:
            KeyError: If ``proposal`` has no 'policy' entry
        """
        # Mathematical G, weighted with this evaluator's epistemic weight
        G_math = calculate_expected_free_energy(
            policy=proposal['policy'],
            state=state,
            preferences=preferences,
            historical_stats=historical_stats,
            epistemic_weight=self.epistemic_weight
        )

        # LLM-derived G from the proposal's self-assessment
        G_llm = self._calculate_llm_g(proposal, preferences)

        # Adaptive interpolation between the two estimates
        lambda_weight = self.lambda_fn(precision)
        G_hybrid = (1 - lambda_weight) * G_math + lambda_weight * G_llm

        return {
            'G_hybrid': G_hybrid,
            'G_math': G_math,
            'G_llm': G_llm,
            'lambda': lambda_weight
        }

    def evaluate_hybrid(
        self,
        proposal: Dict[str, Any],
        state: Dict[str, Any],
        preferences: Dict[str, float],
        precision: float,
        historical_stats: Optional[Dict[str, Dict]] = None
    ) -> float:
        """
        Evaluate policy using hybrid approach.

        Args:
            proposal: Policy proposal with 'policy', 'llm_success_prob', 'llm_info_gain'
            state: Current agent state
            preferences: Reward function
            precision: Current precision value
            historical_stats: Optional execution history

        Returns:
            Hybrid G value: (1 - λ) * G_math + λ * G_llm

        Raises:
            KeyError: If ``proposal`` has no 'policy' entry

        Examples:
            >>> G = evaluator.evaluate_hybrid(proposal, state, preferences, precision=0.3)
            >>> # Low precision → G weighted toward LLM's assessment
        """
        components = self._hybrid_components(
            proposal, state, preferences, precision, historical_stats
        )
        return components['G_hybrid']

    def _calculate_llm_g(
        self,
        proposal: Dict[str, Any],
        preferences: Dict[str, float]
    ) -> float:
        """
        Calculate G from LLM's self-assessment.

        Uses the LLM's estimated success probability and information gain
        to compute an Expected Free Energy value.

        Args:
            proposal: May contain 'llm_success_prob' and 'llm_info_gain';
                      each falls back to 0.5 when the key is absent
            preferences: Reward function; 'success' and 'error' entries are
                      read and default to 0.0 when absent

        Returns:
            G value derived from LLM estimates
        """
        # Extract LLM assessments (0.5 = uninformative fallback)
        success_prob = proposal.get('llm_success_prob', 0.5)
        info_gain = proposal.get('llm_info_gain', 0.5)

        # Epistemic value ≈ info_gain (from LLM)
        epistemic = info_gain * self.epistemic_weight

        # Pragmatic value ≈ expected reward (from LLM success prob)
        success_reward = preferences.get('success', 0.0)
        error_penalty = preferences.get('error', 0.0)
        pragmatic = success_prob * success_reward + (1 - success_prob) * error_penalty

        # G = Epistemic - Pragmatic
        return epistemic - pragmatic

    def evaluate_all(
        self,
        proposals: List[Dict[str, Any]],
        state: Dict[str, Any],
        preferences: Dict[str, float],
        precision: float,
        historical_stats: Optional[Dict[str, Dict]] = None
    ) -> List[PolicyEvaluation]:
        """
        Evaluate multiple proposals.

        Each proposal's G_math, G_llm and λ are computed once and reported in
        the evaluation's ``components`` together with the resulting G_hybrid,
        so the reported terms always reproduce ``total_G``.

        Args:
            proposals: List of policy proposals
            state: Current state
            preferences: Reward function
            precision: Current precision
            historical_stats: Execution history

        Returns:
            List of PolicyEvaluation objects, one per proposal, in input order

        Raises:
            KeyError: If any proposal has no 'policy' entry
        """
        evaluations = []

        for proposal in proposals:
            components = self._hybrid_components(
                proposal, state, preferences, precision, historical_stats
            )
            components['strategy'] = proposal.get('strategy', 'unknown')

            # LLM's own success estimate, or 0.5 when it gave none
            success_prob = proposal.get('llm_success_prob', 0.5)

            # NOTE(review): pragmatic_value carries the raw success probability
            # and epistemic_value the raw LLM info gain (not the weighted terms
            # used inside G_llm). Kept as in the original — confirm against
            # PolicyEvaluation consumers.
            evaluations.append(
                PolicyEvaluation(
                    epistemic_value=proposal.get('llm_info_gain', 0.5),
                    pragmatic_value=success_prob,
                    total_G=components['G_hybrid'],
                    expected_success_prob=success_prob,
                    components=components
                )
            )

        return evaluations





[docs]
def compare_math_vs_llm(
    proposal: Dict[str, Any],
    state: Dict[str, Any],
    preferences: Dict[str, float],
    historical_stats: Optional[Dict[str, Dict]] = None
) -> Dict[str, float]:
    """
    Report the mathematical and the LLM-derived G for one proposal side by side.

    A debugging aid: it shows the two raw estimates that the hybrid
    evaluator interpolates between, plus how far apart they are. The LLM
    estimate comes from a default-constructed ``HybridGEvaluator``.

    Args:
        proposal: Policy proposal with 'policy' and the LLM assessments
        state: Current state
        preferences: Reward function
        historical_stats: Execution history

    Returns:
        Dict with 'G_math', 'G_llm', and 'difference' (absolute gap)

    Examples:
        >>> comparison = compare_math_vs_llm(proposal, state, preferences)
        >>> print(f"Math G: {comparison['G_math']:.2f}")
        >>> print(f"LLM G: {comparison['G_llm']:.2f}")
        >>> print(f"Difference: {comparison['difference']:.2f}")
    """
    default_evaluator = HybridGEvaluator()

    # Statistics-based estimate
    math_estimate = calculate_expected_free_energy(
        proposal['policy'], state, preferences, historical_stats
    )

    # Estimate from the LLM's self-assessment
    llm_estimate = default_evaluator._calculate_llm_g(proposal, preferences)

    report = {'G_math': math_estimate, 'G_llm': llm_estimate}
    report['difference'] = abs(math_estimate - llm_estimate)
    return report