TechScout/techscout/extraction/scorer.py

322 lines
9.4 KiB
Python

"""
Result Scoring for TechScout.
Scores search results based on problem-solution fit using LLM analysis
and heuristic factors.
"""
import json
import logging
import math
from dataclasses import dataclass
from datetime import datetime
from typing import List, Dict, Any, Optional
from .llm_client import OllamaClient
from ..search.base import SearchResult
logger = logging.getLogger(__name__)
@dataclass
class ScoringWeights:
    """Weights for different scoring factors.

    The five factors are combined as a weighted sum in
    ResultScorer.score_results; the defaults sum to 1.0 so the final
    score stays in [0, 1] when every component score is in [0, 1].
    """
    technical_relevance: float = 0.35  # problem-solution fit (LLM or keyword heuristic)
    trl_match: float = 0.20            # closeness to the requested TRL range
    recency: float = 0.15              # decay based on published date
    source_authority: float = 0.15     # trust prior for the source type
    funding_indicator: float = 0.15    # award amount as a validation proxy
class ResultScorer:
    """
    Scores and ranks search results based on problem-solution fit.

    Combines an LLM-based (or keyword-heuristic fallback) relevance score
    with TRL-match, recency, source-authority, and funding heuristics into
    a single weighted final_score per result.
    """
    # Trust prior per result.source_type, used as the source_authority
    # component; source types missing from this map fall back to 0.5
    # in score_results.
    SOURCE_AUTHORITY = {
        "sbir": 0.9,
        "patent": 0.85,
        "contract": 0.8,
        "government": 0.85,
        "academic": 0.75,
        "news": 0.6,
        "web": 0.5,
    }
def __init__(
self,
ollama_client: Optional[OllamaClient] = None,
model: str = "mistral-nemo:12b",
weights: Optional[ScoringWeights] = None
):
self.client = ollama_client or OllamaClient()
self.model = model
self.weights = weights or ScoringWeights()
def score_results(
self,
results: List[SearchResult],
capability_gap: str,
target_trl: tuple = (4, 7),
use_llm: bool = True
) -> List[SearchResult]:
"""
Score all results and return sorted by final score.
Args:
results: List of search results to score
capability_gap: Original capability gap description
target_trl: Target TRL range (min, max)
use_llm: Whether to use LLM for relevance scoring
Returns:
Sorted list of results with scores populated
"""
if not results:
return []
# Score each result
for result in results:
# LLM-based relevance score
if use_llm:
result.relevance_score = self._score_relevance_llm(
result, capability_gap
)
else:
result.relevance_score = self._score_relevance_heuristic(
result, capability_gap
)
# Calculate component scores
trl_score = self._score_trl(result.trl_estimate, target_trl)
recency_score = self._score_recency(result.published_date)
authority_score = self.SOURCE_AUTHORITY.get(result.source_type, 0.5)
funding_score = self._score_funding(result.award_amount)
# Calculate final weighted score
result.final_score = (
self.weights.technical_relevance * result.relevance_score +
self.weights.trl_match * trl_score +
self.weights.recency * recency_score +
self.weights.source_authority * authority_score +
self.weights.funding_indicator * funding_score
)
# Sort by final score descending
results.sort(key=lambda x: x.final_score, reverse=True)
return results
def _score_relevance_llm(
self,
result: SearchResult,
capability_gap: str
) -> float:
"""Score relevance using LLM."""
prompt = f"""Rate how well this technology/solution addresses the capability gap.
CAPABILITY GAP:
{capability_gap}
POTENTIAL SOLUTION:
Title: {result.title}
Description: {result.snippet}
Organization: {result.organization or 'Unknown'}
Source: {result.source_type}
Rate the relevance from 0.0 to 1.0 where:
- 0.0-0.2: Not relevant
- 0.2-0.4: Tangentially related
- 0.4-0.6: Somewhat relevant
- 0.6-0.8: Highly relevant
- 0.8-1.0: Direct solution
Respond with JSON:
{{
"score": 0.0-1.0,
"rationale": "Brief explanation"
}}"""
response = self.client.generate(
prompt=prompt,
model=self.model,
temperature=0.1,
format="json"
)
if not response.success:
return self._score_relevance_heuristic(result, capability_gap)
try:
data = json.loads(response.content)
return min(1.0, max(0.0, float(data.get("score", 0.5))))
except (json.JSONDecodeError, ValueError, TypeError):
return self._score_relevance_heuristic(result, capability_gap)
def _score_relevance_heuristic(
self,
result: SearchResult,
capability_gap: str
) -> float:
"""Score relevance using keyword matching heuristics."""
# Simple keyword overlap scoring
gap_words = set(capability_gap.lower().split())
result_text = f"{result.title} {result.snippet}".lower()
result_words = set(result_text.split())
# Remove common words
stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
'would', 'could', 'should', 'may', 'might', 'must', 'shall',
'can', 'need', 'for', 'of', 'to', 'in', 'on', 'at', 'by',
'from', 'with', 'and', 'or', 'not', 'that', 'this', 'it'}
gap_words -= stopwords
result_words -= stopwords
if not gap_words:
return 0.5
overlap = len(gap_words & result_words)
score = overlap / len(gap_words)
return min(1.0, score)
def _score_trl(self, trl: Optional[int], target_range: tuple) -> float:
"""Score based on TRL match."""
if trl is None:
return 0.5 # Unknown TRL
min_trl, max_trl = target_range
if min_trl <= trl <= max_trl:
return 1.0
elif trl < min_trl:
# Too early stage
distance = min_trl - trl
return max(0.2, 1.0 - (distance * 0.2))
else:
# Too mature (might be less innovative)
distance = trl - max_trl
return max(0.4, 1.0 - (distance * 0.15))
def _score_recency(self, date_str: Optional[str]) -> float:
"""Score based on how recent the result is."""
if not date_str:
return 0.5
try:
# Try to parse various date formats
for fmt in ["%Y-%m-%d", "%Y-%m", "%Y", "%m/%d/%Y"]:
try:
date = datetime.strptime(date_str[:len(fmt.replace('%', '0'))], fmt)
break
except ValueError:
continue
else:
# Try to extract year
import re
year_match = re.search(r'20\d{2}', date_str)
if year_match:
date = datetime(int(year_match.group()), 6, 1)
else:
return 0.5
# Calculate age in years
age_days = (datetime.now() - date).days
age_years = age_days / 365.25
# Exponential decay with 2-year half-life
score = math.exp(-0.347 * age_years) # ln(2)/2 ≈ 0.347
return max(0.1, min(1.0, score))
except Exception:
return 0.5
def _score_funding(self, amount: Optional[float]) -> float:
"""Score based on funding amount (higher = more validated)."""
if amount is None:
return 0.5
# Log scale scoring
if amount <= 0:
return 0.3
elif amount < 100000: # < $100K
return 0.5
elif amount < 500000: # $100K - $500K
return 0.65
elif amount < 1000000: # $500K - $1M
return 0.75
elif amount < 5000000: # $1M - $5M
return 0.85
else: # > $5M
return 0.95
def batch_score_relevance(
self,
results: List[SearchResult],
capability_gap: str,
batch_size: int = 5
) -> List[float]:
"""
Score multiple results in batches for efficiency.
Args:
results: Results to score
capability_gap: Capability gap description
batch_size: How many to score per LLM call
Returns:
List of relevance scores
"""
scores = []
for i in range(0, len(results), batch_size):
batch = results[i:i + batch_size]
# Build batch prompt
items = []
for j, result in enumerate(batch):
items.append(f"""
Item {j + 1}:
Title: {result.title}
Description: {result.snippet[:200]}
Organization: {result.organization or 'Unknown'}""")
prompt = f"""Rate how well each item addresses this capability gap.
CAPABILITY GAP:
{capability_gap}
ITEMS TO SCORE:
{''.join(items)}
Respond with JSON:
{{
"scores": [
{{"item": 1, "score": 0.0-1.0}},
{{"item": 2, "score": 0.0-1.0}},
...
]
}}"""
response = self.client.generate(
prompt=prompt,
model=self.model,
temperature=0.1,
format="json"
)
if response.success:
try:
data = json.loads(response.content)
batch_scores = {s["item"]: s["score"] for s in data.get("scores", [])}
for j in range(len(batch)):
scores.append(batch_scores.get(j + 1, 0.5))
except (json.JSONDecodeError, KeyError):
scores.extend([0.5] * len(batch))
else:
scores.extend([0.5] * len(batch))
return scores