# TechScout/techscout/technology/evaluator.py
"""
Capability Evaluator for TechScout.
Evaluates how well each technology matches the user's capability need.
Produces structured capability-fit assessments.
This is Stage 5 of the Capability-Technology Matching pipeline.
"""
import json
import logging
from typing import List, Optional
from ..extraction.llm_client import OllamaClient
from ..capability.types import CapabilityNeed, CapabilityCriterion
from .types import (
GroupedTechnology,
CapabilityMatch,
CriterionResult,
EvaluatedTechnology,
)
logger = logging.getLogger(__name__)
class CapabilityEvaluator:
    """
    Evaluates technologies against capability needs (Stage 5 of the
    Capability-Technology Matching pipeline).

    For each technology, produces:
    - Per-criterion assessments (SUPPORTS/PARTIAL/DOES_NOT_SUPPORT/UNKNOWN)
    - Overall fit score (0-100)
    - Narrative explanation
    - Strengths, limitations, unknowns
    - Investigation recommendation
    """
    EVALUATION_PROMPT = """Evaluate whether this technology addresses the user's capability need.
USER'S CAPABILITY NEED:
{functional_need}
TECHNOLOGY:
Name: {tech_name}
Type: {tech_type}
Description: {description}
Capabilities: {capabilities}
Developer(s): {developers}
Maturity: TRL {trl_estimate} (Evidence: {trl_evidence})
EVALUATION CRITERIA:
{criteria_text}
INSTRUCTIONS:
For each criterion, assess:
- SUPPORTS: Technology clearly provides this capability
- PARTIAL: Technology partially addresses this, with limitations
- DOES_NOT_SUPPORT: Technology does not address this
- UNKNOWN: Insufficient information to determine
Respond with JSON:
{{
"criteria_results": [
{{
"criterion": "criterion text",
"weight": "must_have|should_have|nice_to_have",
"assessment": "SUPPORTS|PARTIAL|DOES_NOT_SUPPORT|UNKNOWN",
"evidence": "Explanation with evidence from technology description"
}}
],
"how_it_addresses_need": "2-3 sentences explaining how this technology could address the capability need",
"key_strengths": ["strength 1", "strength 2"],
"key_limitations": ["limitation 1", "limitation 2"],
"key_unknowns": ["unknown 1", "unknown 2"],
"investigation_worthy": true/false,
"investigation_rationale": "Why this technology is/isn't worth investigating further"
}}"""

    # Valid enum values for LLM-returned fields; anything else is normalized.
    _VALID_ASSESSMENTS = ("SUPPORTS", "PARTIAL", "DOES_NOT_SUPPORT", "UNKNOWN")
    _VALID_WEIGHTS = ("must_have", "should_have", "nice_to_have")
    # Maximum attainable points per criterion, keyed by weight.
    _MAX_POINTS = {"must_have": 30, "should_have": 15, "nice_to_have": 5}
    # Points earned for a (weight, assessment) pair. UNKNOWN is handled
    # separately (flat +5 "benefit of the doubt"); DOES_NOT_SUPPORT earns 0.
    _EARNED_POINTS = {
        ("must_have", "SUPPORTS"): 30,
        ("must_have", "PARTIAL"): 15,
        ("should_have", "SUPPORTS"): 15,
        ("should_have", "PARTIAL"): 8,
        ("nice_to_have", "SUPPORTS"): 5,
        ("nice_to_have", "PARTIAL"): 3,
    }

    def __init__(
        self,
        ollama_client: Optional[OllamaClient] = None,
        model: str = "mistral-nemo:12b"
    ):
        """
        Args:
            ollama_client: Client used for LLM calls; a default OllamaClient
                is created when none is supplied.
            model: Name of the model used for evaluation prompts.
        """
        self.client = ollama_client or OllamaClient()
        self.model = model

    def evaluate_all(
        self,
        technologies: List[GroupedTechnology],
        capability_need: CapabilityNeed,
        capability_criteria: List[CapabilityCriterion]
    ) -> List[EvaluatedTechnology]:
        """
        Evaluate all technologies against capability criteria.

        Args:
            technologies: List of grouped technologies to evaluate
            capability_need: Structured capability need
            capability_criteria: List of evaluation criteria

        Returns:
            List of evaluated technologies with fit assessments, sorted by
            fit score descending (best match first).
        """
        # Lazy %-style args keep formatting cost out of the logging call.
        logger.info("Evaluating %d technologies against capability criteria...", len(technologies))
        evaluated = [
            EvaluatedTechnology(
                technology=tech,
                capability_match=self._evaluate_technology(tech, capability_need, capability_criteria),
            )
            for tech in technologies
        ]
        # Sort by fit score descending
        evaluated.sort(key=lambda e: e.capability_match.fit_score, reverse=True)
        logger.info(
            "Evaluation complete. Top score: %s",
            evaluated[0].capability_match.fit_score if evaluated else 0,
        )
        return evaluated

    def _evaluate_technology(
        self,
        tech: GroupedTechnology,
        capability_need: CapabilityNeed,
        capability_criteria: List[CapabilityCriterion]
    ) -> CapabilityMatch:
        """
        Evaluate a single technology against capability criteria.

        Falls back to keyword heuristics when the LLM call fails or returns
        output that cannot be parsed into a JSON object.
        """
        # Build numbered criteria text for the prompt.
        criteria_text = ""
        for i, criterion in enumerate(capability_criteria, 1):
            criteria_text += f"{i}. [{criterion.weight.upper()}] {criterion.criterion}\n"
            criteria_text += f" Keywords: {', '.join(criterion.keywords[:5])}\n"
        # Build developers text
        developers_text = ", ".join(d.name for d in tech.developers) if tech.developers else "Unknown"
        prompt = self.EVALUATION_PROMPT.format(
            functional_need=capability_need.functional_need,
            tech_name=tech.canonical_name,
            tech_type=tech.technology_type,
            description=tech.description[:500],  # truncate to keep the prompt bounded
            capabilities="; ".join(tech.capabilities[:5]),
            developers=developers_text,
            trl_estimate=tech.trl_estimate or "Unknown",
            trl_evidence="; ".join(tech.trl_evidence[:3]) if tech.trl_evidence else "None",
            criteria_text=criteria_text
        )
        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.1,  # low temperature for consistent structured output
            format="json"
        )
        if not response.success:
            logger.warning("LLM evaluation failed for: %s", tech.canonical_name)
            return self._create_fallback_match(tech, capability_criteria)
        try:
            data = json.loads(response.content)
        except json.JSONDecodeError:
            # Some models wrap JSON in prose; try to salvage an embedded object.
            data = self.client.extract_json_from_text(response.content)
        # Guard against valid-but-unusable JSON (e.g. a bare list or string),
        # which would break the dict lookups downstream.
        if not isinstance(data, dict) or not data:
            return self._create_fallback_match(tech, capability_criteria)
        return self._build_capability_match(data, capability_criteria)

    def _build_capability_match(
        self,
        data: dict,
        capability_criteria: List[CapabilityCriterion]
    ) -> CapabilityMatch:
        """
        Build CapabilityMatch from the parsed LLM response.

        The weight echoed back by the LLM is cross-checked against the
        authoritative criteria list (matched on criterion text) so that a
        mislabelled weight cannot skew the fit score; previously the
        `capability_criteria` argument was accepted but never used.
        """
        # Authoritative weights keyed by normalized criterion text.
        weight_by_criterion = {
            c.criterion.strip().lower(): c.weight for c in capability_criteria
        }
        criteria_results = []
        results_data = data.get("criteria_results", [])
        if not isinstance(results_data, list):
            results_data = []  # tolerate a malformed LLM response shape
        for result in results_data:
            if not isinstance(result, dict):
                continue  # skip malformed entries instead of crashing
            assessment = result.get("assessment", "UNKNOWN")
            if assessment not in self._VALID_ASSESSMENTS:
                assessment = "UNKNOWN"
            criterion_text = result.get("criterion", "")
            # Prefer the real weight from the criteria list; fall back to the
            # LLM-reported weight, then to a neutral default.
            weight = weight_by_criterion.get(criterion_text.strip().lower())
            if weight not in self._VALID_WEIGHTS:
                weight = result.get("weight", "should_have")
            if weight not in self._VALID_WEIGHTS:
                weight = "should_have"
            criteria_results.append(CriterionResult(
                criterion=criterion_text,
                weight=weight,
                assessment=assessment,
                evidence=result.get("evidence", ""),
                source=result.get("source")
            ))
        # Calculate fit score and overall category from validated results.
        fit_score = self._calculate_fit_score(criteria_results)
        overall_fit = self._determine_overall_fit(fit_score, criteria_results)
        return CapabilityMatch(
            overall_fit=overall_fit,
            fit_score=fit_score,
            criteria_results=criteria_results,
            how_it_addresses_need=data.get("how_it_addresses_need", ""),
            # `or []` guards against an explicit null in the JSON.
            key_strengths=data.get("key_strengths") or [],
            key_limitations=data.get("key_limitations") or [],
            key_unknowns=data.get("key_unknowns") or [],
            # Coerce to a real bool; LLMs sometimes emit 1/0 or null here.
            investigation_worthy=bool(data.get("investigation_worthy", fit_score >= 50)),
            investigation_rationale=data.get("investigation_rationale", "")
        )

    def _calculate_fit_score(self, criteria_results: List[CriterionResult]) -> int:
        """
        Calculate fit score based on criteria results.

        Scoring (earned / possible points per criterion):
        - must_have:    SUPPORTS +30, PARTIAL +15, DOES_NOT_SUPPORT 0 (max 30)
        - should_have:  SUPPORTS +15, PARTIAL +8,  DOES_NOT_SUPPORT 0 (max 15)
        - nice_to_have: SUPPORTS +5,  PARTIAL +3,  DOES_NOT_SUPPORT 0 (max 5)
        - UNKNOWN: flat +5 regardless of weight (benefit of the doubt)
        Normalized to 0-100; returns a neutral 50 when there are no criteria.
        """
        if not criteria_results:
            return 50  # Neutral score if no criteria
        total_points = 0
        max_points = 0
        for result in criteria_results:
            # Unrecognized weights score like nice_to_have (matches the
            # original's catch-all else branch).
            weight = result.weight if result.weight in self._MAX_POINTS else "nice_to_have"
            max_points += self._MAX_POINTS[weight]
            if result.assessment == "UNKNOWN":
                total_points += 5  # benefit of the doubt
            else:
                # DOES_NOT_SUPPORT (and anything unexpected) earns 0.
                total_points += self._EARNED_POINTS.get((weight, result.assessment), 0)
        # Normalize to 0-100
        score = int((total_points / max_points) * 100) if max_points > 0 else 50
        return min(100, max(0, score))

    def _determine_overall_fit(
        self,
        score: int,
        criteria_results: List[CriterionResult]
    ) -> str:
        """
        Map a fit score plus criteria results to HIGH/MEDIUM/LOW/UNCERTAIN.

        - UNCERTAIN when more than half the assessments are UNKNOWN
        - LOW when two or more must_have criteria are unmet
        - capped at MEDIUM when exactly one must_have criterion is unmet
        - otherwise: HIGH (>=75), MEDIUM (>=50), LOW (<50)
        """
        # Check for must_have failures
        must_have_failures = sum(
            1 for r in criteria_results
            if r.weight == "must_have" and r.assessment == "DOES_NOT_SUPPORT"
        )
        # If most criteria are unknown, the evaluation is not trustworthy.
        unknown_count = sum(1 for r in criteria_results if r.assessment == "UNKNOWN")
        unknown_ratio = unknown_count / len(criteria_results) if criteria_results else 0
        if unknown_ratio > 0.5:
            return "UNCERTAIN"
        if must_have_failures >= 2:
            return "LOW"
        if must_have_failures == 1:
            # A single unmet must_have caps the category at MEDIUM.
            return "MEDIUM" if score >= 50 else "LOW"
        if score >= 75:
            return "HIGH"
        if score >= 50:
            return "MEDIUM"
        return "LOW"  # collapses the original's redundant >=25 / else branches

    def _create_fallback_match(
        self,
        tech: GroupedTechnology,
        capability_criteria: List[CapabilityCriterion]
    ) -> CapabilityMatch:
        """
        Create a fallback capability match using keyword heuristics.

        Used when the LLM call fails or its output cannot be parsed; scores
        each criterion by counting keyword hits in the technology's text.
        """
        # Simple keyword-based assessment over name + description + capabilities.
        tech_text = f"{tech.canonical_name} {tech.description} {' '.join(tech.capabilities)}".lower()
        criteria_results = []
        for criterion in capability_criteria:
            keyword_matches = sum(1 for kw in criterion.keywords if kw.lower() in tech_text)
            # 2+ hits -> SUPPORTS, 1 -> PARTIAL, 0 -> UNKNOWN
            if keyword_matches >= 2:
                assessment = "SUPPORTS"
            elif keyword_matches == 1:
                assessment = "PARTIAL"
            else:
                assessment = "UNKNOWN"
            criteria_results.append(CriterionResult(
                criterion=criterion.criterion,
                weight=criterion.weight,
                assessment=assessment,
                evidence=f"Heuristic: {keyword_matches} keyword matches",
                source=None
            ))
        fit_score = self._calculate_fit_score(criteria_results)
        overall_fit = self._determine_overall_fit(fit_score, criteria_results)
        return CapabilityMatch(
            overall_fit=overall_fit,
            fit_score=fit_score,
            criteria_results=criteria_results,
            how_it_addresses_need="Assessment based on keyword matching (LLM unavailable)",
            key_strengths=[],
            key_limitations=["Full LLM assessment unavailable"],
            key_unknowns=["Detailed capability analysis not performed"],
            investigation_worthy=fit_score >= 50,
            investigation_rationale="Based on keyword matching only"
        )