# TechScout/techscout/technology/extractor.py

"""
Technology Extractor for TechScout.

Extracts specific, nameable technologies from search results.
Filters out results that are just topic discussions or generic content.

This is Stage 3 of the Capability-Technology Matching pipeline.
"""

import json
import logging
import uuid
from typing import List, Optional, Tuple
from dataclasses import dataclass

from ..extraction.llm_client import OllamaClient
from ..search.base import SearchResult
from ..capability.types import CapabilityNeed, TechnologyIndicators
from .types import ExtractedTechnology

logger = logging.getLogger(__name__)


@dataclass
class ExtractionResult:
    """Result of technology extraction from search results.

    Produced by ``TechnologyExtractor.extract_all``.
    """
    # Technologies built from the results that yielded a named technology.
    technologies: List[ExtractedTechnology]
    # URLs (``SearchResult.url``) of results that did not yield a technology,
    # including low-priority results dropped by the heuristic pre-filter.
    discarded_ids: List[str]           # IDs of results that weren't technologies
    # Summary counters for the run (totals per priority tier, number of
    # technologies extracted, number of results discarded).
    extraction_stats: dict


class TechnologyExtractor:
    """
    Extracts specific technologies from search results.

    Uses a combination of:
    1. Heuristic pre-filtering (source type, keyword indicators)
    2. LLM-based extraction (for actual technology identification)
    """

    # Prompt used by _extract_single to classify ONE search result.
    # Filled with str.format(); the placeholders are functional_need,
    # source_type, title, snippet and organization. Literal JSON braces are
    # doubled ({{ }}) so that format() emits them unchanged.
    EXTRACTION_PROMPT = """Analyze this search result and determine if it describes a SPECIFIC TECHNOLOGY.

CONTEXT - User is looking for:
{functional_need}

SEARCH RESULT:
Source Type: {source_type}
Title: {title}
Content: {snippet}
Organization: {organization}

INSTRUCTIONS:
1. Does this describe a SPECIFIC, NAMEABLE technology?
   - YES examples: "SPAD Array Detector", "Quantum dot infrared sensor", "ML-based tracking algorithm"
   - NO examples: Topic discussions, overviews, problem descriptions, generic categories like "sensor technology"

2. If YES, extract technology details. If NO, explain why.

Respond with JSON:
{{
    "is_technology": true/false,
    "reason": "Why this is/isn't a specific technology",
    "technology": {{
        "name": "Specific technology name",
        "type": "sensor|algorithm|material|system|platform|device|method|software|other",
        "description": "What it is (1-2 sentences)",
        "capabilities": ["capability 1", "capability 2"],
        "mechanism": "How it works (if described)",
        "developer": "Organization name if mentioned",
        "developer_type": "company|university|government|lab|unknown",
        "trl_indicators": ["Phase II SBIR", "prototype demonstrated", etc.]
    }}
}}

If is_technology is false, omit the technology object."""

    # Prompt used by _extract_batch to classify SEVERAL results in one call.
    # Filled with str.format(); the placeholders are functional_need and
    # results_text (the numbered "Item N:" listing built by _extract_batch).
    # The model is asked to echo each item's number back as "item_number".
    # NOTE: unlike EXTRACTION_PROMPT, this schema does not request "mechanism"
    # or "developer_type"; _build_extracted_technology falls back to None and
    # "unknown" for those when they are absent.
    BATCH_EXTRACTION_PROMPT = """Analyze these search results and identify which describe SPECIFIC TECHNOLOGIES.

CONTEXT - User is looking for:
{functional_need}

SEARCH RESULTS:
{results_text}

For each result, determine if it describes a specific, nameable technology (not generic category or topic discussion).

Respond with JSON:
{{
    "extractions": [
        {{
            "item_number": 1,
            "is_technology": true/false,
            "reason": "brief explanation",
            "technology": {{...}} or null
        }},
        ...
    ]
}}

Technology object schema (when is_technology is true):
{{
    "name": "Specific technology name",
    "type": "sensor|algorithm|material|system|platform|device|method|software|other",
    "description": "What it is",
    "capabilities": ["cap1", "cap2"],
    "developer": "Organization if mentioned",
    "trl_indicators": ["evidence of maturity"]
}}"""

    # Source types more likely to describe actual technologies.
    # HIGH: always sent to the LLM by extract_all; technologies built from
    # these sources get extraction_confidence 0.8 instead of 0.6.
    HIGH_TECH_LIKELIHOOD_SOURCES = {"sbir", "patent"}
    # MEDIUM: always sent to the LLM by extract_all.
    MEDIUM_TECH_LIKELIHOOD_SOURCES = {"contract"}
    # LOW: not referenced by the code in this class -- extract_all treats every
    # source type that is neither HIGH nor MEDIUM as low priority, so this set
    # is informational only.
    LOW_TECH_LIKELIHOOD_SOURCES = {"news", "web", "academic"}

    def __init__(
        self,
        ollama_client: Optional[OllamaClient] = None,
        model: str = "mistral-nemo:12b"
    ):
        self.client = ollama_client or OllamaClient()
        self.model = model

    def extract_all(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed,
        technology_indicators: Optional[TechnologyIndicators] = None,
        batch_size: int = 5
    ) -> ExtractionResult:
        """
        Extract technologies from all search results.

        Args:
            results: List of search results to process
            capability_need: Structured capability need for context
            technology_indicators: Positive/negative indicators
            batch_size: How many results to process per LLM call

        Returns:
            ExtractionResult with extracted technologies and stats
        """
        logger.info(f"Extracting technologies from {len(results)} results...")

        technologies = []
        discarded_ids = []

        # Separate by likelihood of being a technology
        high_priority = []
        medium_priority = []
        low_priority = []

        for result in results:
            if result.source_type in self.HIGH_TECH_LIKELIHOOD_SOURCES:
                high_priority.append(result)
            elif result.source_type in self.MEDIUM_TECH_LIKELIHOOD_SOURCES:
                medium_priority.append(result)
            else:
                low_priority.append(result)

        # Process high priority (all)
        if high_priority:
            logger.info(f"Processing {len(high_priority)} high-priority results (SBIR/patents)...")
            techs, discarded = self._process_batch_results(
                high_priority, capability_need, batch_size
            )
            technologies.extend(techs)
            discarded_ids.extend(discarded)

        # Process medium priority (all)
        if medium_priority:
            logger.info(f"Processing {len(medium_priority)} medium-priority results (contracts)...")
            techs, discarded = self._process_batch_results(
                medium_priority, capability_need, batch_size
            )
            technologies.extend(techs)
            discarded_ids.extend(discarded)

        # Process low priority with pre-filtering
        if low_priority:
            # Apply heuristic filter first
            filtered_low = self._heuristic_filter(low_priority, technology_indicators)
            logger.info(f"Processing {len(filtered_low)}/{len(low_priority)} low-priority results (passed heuristic filter)...")

            if filtered_low:
                techs, discarded = self._process_batch_results(
                    filtered_low, capability_need, batch_size
                )
                technologies.extend(techs)
                discarded_ids.extend(discarded)

            # Add filtered-out results to discarded
            filtered_out_ids = [
                r.url for r in low_priority if r not in filtered_low
            ]
            discarded_ids.extend(filtered_out_ids)

        logger.info(f"Extracted {len(technologies)} technologies, discarded {len(discarded_ids)} results")

        return ExtractionResult(
            technologies=technologies,
            discarded_ids=discarded_ids,
            extraction_stats={
                "total_results": len(results),
                "high_priority_processed": len(high_priority),
                "medium_priority_processed": len(medium_priority),
                "low_priority_processed": len(low_priority),
                "technologies_extracted": len(technologies),
                "results_discarded": len(discarded_ids),
            }
        )

    def _heuristic_filter(
        self,
        results: List[SearchResult],
        indicators: Optional[TechnologyIndicators]
    ) -> List[SearchResult]:
        """Apply heuristic filtering to identify likely technology results."""
        if not indicators:
            # Default indicators
            positive = {"prototype", "demonstrated", "system", "device", "sensor",
                       "developed", "patent", "phase ii", "tested", "deployed"}
            negative = {"overview", "challenge", "review", "introduction", "survey",
                       "future", "potential", "could", "might", "problems"}
        else:
            positive = set(w.lower() for w in indicators.positive)
            negative = set(w.lower() for w in indicators.negative)

        filtered = []
        for result in results:
            text = f"{result.title} {result.snippet}".lower()

            # Count positive and negative indicators
            pos_count = sum(1 for term in positive if term in text)
            neg_count = sum(1 for term in negative if term in text)

            # Include if more positive than negative, or any positive found
            if pos_count > 0 and pos_count >= neg_count:
                filtered.append(result)
            elif pos_count > 0:
                filtered.append(result)

        return filtered

    def _process_batch_results(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed,
        batch_size: int
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """Process results in batches using LLM."""
        technologies = []
        discarded = []

        for i in range(0, len(results), batch_size):
            batch = results[i:i + batch_size]

            if len(batch) == 1:
                # Single result - use single extraction
                result = batch[0]
                tech = self._extract_single(result, capability_need)
                if tech:
                    technologies.append(tech)
                else:
                    discarded.append(result.url)
            else:
                # Multiple results - use batch extraction
                batch_techs, batch_discarded = self._extract_batch(batch, capability_need)
                technologies.extend(batch_techs)
                discarded.extend(batch_discarded)

        return technologies, discarded

    def _extract_single(
        self,
        result: SearchResult,
        capability_need: CapabilityNeed
    ) -> Optional[ExtractedTechnology]:
        """Extract technology from a single result."""
        prompt = self.EXTRACTION_PROMPT.format(
            functional_need=capability_need.functional_need,
            source_type=result.source_type,
            title=result.title,
            snippet=result.snippet[:500],
            organization=result.organization or "Unknown"
        )

        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.1,
            format="json"
        )

        if not response.success:
            logger.warning(f"LLM extraction failed for: {result.title[:50]}")
            return None

        try:
            data = json.loads(response.content)
        except json.JSONDecodeError:
            data = self.client.extract_json_from_text(response.content)
            if not data:
                return None

        if not data.get("is_technology", False):
            return None

        tech_data = data.get("technology", {})
        if not tech_data or not tech_data.get("name"):
            return None

        return self._build_extracted_technology(result, tech_data)

    def _extract_batch(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """Extract technologies from a batch of results."""
        # Build results text
        results_text = ""
        for i, result in enumerate(results, 1):
            results_text += f"""
Item {i}:
Source: {result.source_type}
Title: {result.title}
Organization: {result.organization or 'Unknown'}
Content: {result.snippet[:300]}
---"""

        prompt = self.BATCH_EXTRACTION_PROMPT.format(
            functional_need=capability_need.functional_need,
            results_text=results_text
        )

        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.1,
            format="json"
        )

        technologies = []
        discarded = []

        if not response.success:
            logger.warning("Batch extraction failed, falling back to individual extraction")
            for result in results:
                tech = self._extract_single(result, capability_need)
                if tech:
                    technologies.append(tech)
                else:
                    discarded.append(result.url)
            return technologies, discarded

        try:
            data = json.loads(response.content)
        except json.JSONDecodeError:
            data = self.client.extract_json_from_text(response.content)
            if not data:
                # Fallback to individual extraction
                for result in results:
                    tech = self._extract_single(result, capability_need)
                    if tech:
                        technologies.append(tech)
                    else:
                        discarded.append(result.url)
                return technologies, discarded

        extractions = data.get("extractions", [])

        for extraction in extractions:
            item_num = extraction.get("item_number", 0)
            if 1 <= item_num <= len(results):
                result = results[item_num - 1]

                if extraction.get("is_technology", False):
                    tech_data = extraction.get("technology", {})
                    if tech_data and tech_data.get("name"):
                        tech = self._build_extracted_technology(result, tech_data)
                        if tech:
                            technologies.append(tech)
                            continue

                discarded.append(result.url)

        return technologies, discarded

    def _build_extracted_technology(
        self,
        result: SearchResult,
        tech_data: dict
    ) -> ExtractedTechnology:
        """Build ExtractedTechnology from extraction data."""
        tech_id = str(uuid.uuid4())[:8]

        # Estimate TRL from indicators
        trl_indicators = tech_data.get("trl_indicators", [])
        trl_estimate = self._estimate_trl(trl_indicators, result)

        return ExtractedTechnology(
            id=tech_id,
            source_result_id=result.url,
            extraction_confidence=0.8 if result.source_type in self.HIGH_TECH_LIKELIHOOD_SOURCES else 0.6,
            name=tech_data.get("name", ""),
            technology_type=tech_data.get("type", "system"),
            description=tech_data.get("description", ""),
            capabilities=tech_data.get("capabilities", []),
            mechanism=tech_data.get("mechanism"),
            developer=tech_data.get("developer") or result.organization,
            developer_type=tech_data.get("developer_type", "unknown"),
            trl_estimate=trl_estimate,
            trl_evidence=trl_indicators,
            source_type=result.source_type,
            source_url=result.url,
            source_title=result.title,
            source_snippet=result.snippet,
        )

    def _estimate_trl(
        self,
        trl_indicators: List[str],
        result: SearchResult
    ) -> Optional[int]:
        """Estimate TRL from indicators and source type."""
        # Use existing TRL if available
        if result.trl_estimate:
            return result.trl_estimate

        # Estimate from indicators
        indicators_lower = " ".join(trl_indicators).lower()

        if any(term in indicators_lower for term in ["deployed", "operational", "fielded"]):
            return 9
        elif any(term in indicators_lower for term in ["production", "qualified"]):
            return 8
        elif any(term in indicators_lower for term in ["phase iii", "demonstration"]):
            return 7
        elif any(term in indicators_lower for term in ["prototype", "phase ii"]):
            return 5
        elif any(term in indicators_lower for term in ["phase i", "laboratory"]):
            return 4
        elif any(term in indicators_lower for term in ["concept", "proof"]):
            return 3
        elif any(term in indicators_lower for term in ["research", "basic"]):
            return 2

        # Estimate from source type
        source_trl_defaults = {
            "sbir": 4,
            "patent": 5,
            "contract": 6,
            "news": 5,
            "web": 5,
        }
        return source_trl_defaults.get(result.source_type, 5)