# TechScout/techscout/capability/parser.py

"""
Capability Parser for TechScout.

Transforms natural language capability descriptions into structured
capability needs, evaluation criteria, and optimized search queries.

This replaces the simpler query decomposition with capability-focused parsing.
"""

import json
import logging
from typing import Optional, List

from ..extraction.llm_client import OllamaClient
from .types import (
    ParsedCapability,
    CapabilityNeed,
    CapabilityCriterion,
    TechnologyIndicators,
)

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class CapabilityParser:
    """
    Parses capability gap descriptions into structured capability needs.

    Output is used for:
    1. Optimized search queries (finding technologies, not topic discussions)
    2. Technology extraction criteria (what makes something a technology)
    3. Capability fit evaluation (does technology X provide capability Y)

    The LLM reply is treated as untrusted input: sections that are missing,
    null, or of the wrong JSON type are replaced with neutral defaults
    instead of raising, and a reply that is not a JSON object at all yields
    the same fallback result as a failed LLM call.
    """

    SYSTEM_PROMPT = """You are a defense technology analyst specializing in capability gap analysis.

Your task is to analyze a natural language description of a capability need and:
1. Extract the core functional requirements
2. Define criteria for evaluating whether a technology addresses the need
3. Generate search queries optimized for finding TECHNOLOGIES (not topic discussions)
4. Identify indicators that distinguish actual technologies from general content

Think about what specific, nameable technologies could solve the problem.
Focus on finding solutions, not understanding problems."""

    CAPABILITY_PARSE_PROMPT = """Analyze this capability need and generate a structured capability assessment.

CAPABILITY NEED:
{query}

Respond with a JSON object containing:

{{
    "understanding": "Your 2-3 sentence technical understanding of what's needed",

    "technical_domains": ["relevant", "technical", "domains"],

    "capability_need": {{
        "functional_need": "The core capability being sought (1-2 sentences)",
        "domain": "Primary domain (e.g., Space Domain Awareness, Cyber, Electronic Warfare)",
        "implied_constraints": ["constraint1", "constraint2"],
        "technology_types_sought": ["sensor", "algorithm", "material", "system", "platform"]
    }},

    "capability_criteria": [
        {{
            "criterion": "What the technology must do to address this need",
            "weight": "must_have",
            "keywords": ["terms", "indicating", "this", "criterion", "is", "met"]
        }},
        {{
            "criterion": "Secondary capability that would be valuable",
            "weight": "should_have",
            "keywords": ["relevant", "keywords"]
        }},
        {{
            "criterion": "Nice-to-have capability",
            "weight": "nice_to_have",
            "keywords": ["keywords"]
        }}
    ],

    "search_queries": [
        "technology-focused query 1 (include terms like: prototype, system, device, demonstrated)",
        "technology-focused query 2",
        "technology-focused query 3",
        "technology-focused query 4",
        "technology-focused query 5"
    ],

    "sbir_queries": [
        "SBIR-optimized query focusing on R&D and development terms",
        "Another SBIR query with different technical angle"
    ],

    "patent_queries": [
        "Patent search with technical terminology and apparatus/method/system",
        "Alternative patent query"
    ],

    "news_queries": [
        "News query for recent technology announcements",
        "Industry/defense news query"
    ],

    "keywords": ["key", "technical", "terms"],

    "exclusions": ["terms", "to", "exclude", "overview", "introduction", "challenge"],

    "technology_indicators": {{
        "positive": ["prototype", "demonstrated", "developed", "patent", "Phase II", "system", "device", "sensor", "algorithm", "tested", "deployed"],
        "negative": ["overview", "challenge", "problem", "review", "survey", "introduction", "future", "potential", "could", "might"]
    }},

    "target_trl": {{
        "min": 4,
        "max": 7,
        "rationale": "Why this TRL range"
    }}
}}

IMPORTANT:
- Generate at least 3 capability_criteria (at least 1 must_have)
- Search queries should find TECHNOLOGIES, not topic discussions
- Include technology indicator terms (prototype, demonstrated, etc.) in queries
- Exclude vague terms (overview, challenge, potential) where possible"""

    # Criterion weights this parser recognizes; any other value coming back
    # from the LLM is normalized to "should_have".
    _VALID_WEIGHTS = ("must_have", "should_have", "nice_to_have")

    # Indicator terms used when the LLM reply omits (or garbles) the
    # "technology_indicators" lists. Stored as tuples so every result gets
    # its own fresh list rather than sharing one mutable default.
    _DEFAULT_POSITIVE_INDICATORS = (
        "prototype", "demonstrated", "developed", "patent",
        "Phase II", "system", "device", "sensor", "tested",
    )
    _DEFAULT_NEGATIVE_INDICATORS = (
        "overview", "challenge", "problem", "review",
        "survey", "introduction", "future", "potential",
    )

    # (min, max) TRL bounds used when "target_trl" has no usable values.
    _DEFAULT_TRL_RANGE = (4, 7)

    def __init__(
        self,
        ollama_client: Optional[OllamaClient] = None,
        model: str = "mistral-nemo:12b"
    ):
        """
        Create a parser bound to an LLM client and model name.

        Args:
            ollama_client: Client used for generation. When omitted (or
                falsy), a new ``OllamaClient()`` is constructed.
            model: Model identifier passed to every ``generate`` call.
        """
        self.client = ollama_client or OllamaClient()
        self.model = model

    def parse(self, query: str) -> ParsedCapability:
        """
        Parse a capability gap description into structured capability needs.

        Args:
            query: Natural language capability gap description

        Returns:
            ParsedCapability with structured needs, criteria, and search
            queries. When the LLM call fails, or its reply cannot be turned
            into a JSON object, the result comes from ``_create_fallback``
            and carries ``success=False``.
        """
        logger.info("Parsing capability: %s...", query[:100])

        prompt = self.CAPABILITY_PARSE_PROMPT.format(query=query)

        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            system=self.SYSTEM_PROMPT,
            temperature=0.2,
            format="json"
        )

        if not response.success:
            logger.error("LLM call failed: %s", response.error)
            return self._create_fallback(query, response.error)

        try:
            data = json.loads(response.content)
        except json.JSONDecodeError:
            # Not clean JSON; let the client try to recover a JSON payload
            # from the raw text (helper semantics live in OllamaClient).
            # An empty/falsy recovery counts as a parse failure.
            data = self.client.extract_json_from_text(response.content) or None
        except TypeError:
            # json.loads rejects None and other non-text content outright.
            data = None

        # Valid JSON that is not an object (list, string, number, ...) cannot
        # be read with .get() below, so it is handled like a parse failure.
        if not isinstance(data, dict):
            logger.error("Failed to parse LLM response as JSON")
            return self._create_fallback(query, "Failed to parse LLM response")

        return self._build_parsed_capability(query, data)

    def _build_parsed_capability(self, query: str, data: dict) -> ParsedCapability:
        """
        Build ParsedCapability from LLM response data.

        Every section is read defensively: a missing, null, or wrongly typed
        value is replaced by an empty string/list/dict (or by the class-level
        defaults for technology indicators and TRL bounds).

        Args:
            query: The original capability description, stored unchanged.
            data: Decoded JSON object returned by the LLM.

        Returns:
            ParsedCapability with ``success=True`` and at least one
            ``must_have`` criterion.
        """
        keywords = self._as_list(data.get("keywords"))

        # Capability need
        need_section = self._as_dict(data.get("capability_need"))
        capability_need = CapabilityNeed(
            functional_need=self._as_text(need_section.get("functional_need")),
            domain=self._as_text(need_section.get("domain")),
            implied_constraints=self._as_list(need_section.get("implied_constraints")),
            technology_types_sought=self._as_list(need_section.get("technology_types_sought")),
        )

        # Capability criteria; entries that are not JSON objects are skipped.
        capability_criteria = []
        for entry in self._as_list(data.get("capability_criteria")):
            if not isinstance(entry, dict):
                continue
            weight = entry.get("weight", "should_have")
            if weight not in self._VALID_WEIGHTS:
                weight = "should_have"
            capability_criteria.append(CapabilityCriterion(
                criterion=self._as_text(entry.get("criterion")),
                weight=weight,
                keywords=self._as_list(entry.get("keywords")),
            ))

        # Guarantee at least one must_have criterion: promote the first
        # criterion if any exist, otherwise synthesize one from the need.
        if not any(c.weight == "must_have" for c in capability_criteria):
            if capability_criteria:
                capability_criteria[0].weight = "must_have"
            else:
                capability_criteria.append(CapabilityCriterion(
                    criterion=capability_need.functional_need or "Address the capability need",
                    weight="must_have",
                    keywords=keywords,
                ))

        # Technology indicators: a present list (even an empty one) is kept;
        # only a missing or non-list value falls back to the defaults.
        indicator_section = self._as_dict(data.get("technology_indicators"))
        positive = indicator_section.get("positive")
        if not isinstance(positive, list):
            positive = list(self._DEFAULT_POSITIVE_INDICATORS)
        negative = indicator_section.get("negative")
        if not isinstance(negative, list):
            negative = list(self._DEFAULT_NEGATIVE_INDICATORS)
        technology_indicators = TechnologyIndicators(
            positive=positive,
            negative=negative,
        )

        # TRL range, coerced to ints and ordered so that min <= max.
        trl_section = self._as_dict(data.get("target_trl"))
        default_min, default_max = self._DEFAULT_TRL_RANGE
        trl_low = self._coerce_trl(trl_section.get("min"), default_min)
        trl_high = self._coerce_trl(trl_section.get("max"), default_max)
        if trl_low > trl_high:
            trl_low, trl_high = trl_high, trl_low
        trl_range = (trl_low, trl_high)

        return ParsedCapability(
            original_query=query,
            understanding=self._as_text(data.get("understanding")),
            technical_domains=self._as_list(data.get("technical_domains")),
            search_queries=self._as_list(data.get("search_queries")),
            sbir_queries=self._as_list(data.get("sbir_queries")),
            patent_queries=self._as_list(data.get("patent_queries")),
            news_queries=self._as_list(data.get("news_queries")),
            keywords=keywords,
            exclusions=self._as_list(data.get("exclusions")),
            target_trl_range=trl_range,
            capability_need=capability_need,
            capability_criteria=capability_criteria,
            technology_indicators=technology_indicators,
            success=True,
        )

    @staticmethod
    def _as_dict(value) -> dict:
        """Return ``value`` if it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _as_list(value) -> list:
        """Return ``value`` if it is a list, otherwise an empty list."""
        return value if isinstance(value, list) else []

    @staticmethod
    def _as_text(value) -> str:
        """Return ``value`` if it is a string, otherwise an empty string."""
        return value if isinstance(value, str) else ""

    @staticmethod
    def _coerce_trl(value, default: int) -> int:
        """
        Convert a TRL bound from the LLM reply to an int.

        Args:
            value: Raw JSON value (may be a number, numeric string, null, ...).
            default: Bound to use when ``value`` cannot be converted.

        Returns:
            ``int(value)`` when that conversion succeeds, else ``default``.
            Booleans are rejected even though ``int(True)`` would succeed.
        """
        if isinstance(value, bool):
            return default
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            # None / containers, non-numeric strings and NaN, infinity.
            return default

    def _create_fallback(self, query: str, error: str) -> ParsedCapability:
        """
        Create a fallback ParsedCapability when LLM fails.

        The fallback derives everything from the raw query text: the query
        itself plus two variants built from its first five whitespace-
        separated words.

        Args:
            query: The original capability description.
            error: Error description stored on the result.

        Returns:
            ParsedCapability with ``success=False`` and ``error`` set.
        """
        words = query.split()
        # Slicing already clamps to the list length, so no min() is needed.
        head = " ".join(words[:5])
        basic_queries = [
            query,
            head + " technology",
            head + " system prototype",
        ]

        return ParsedCapability(
            original_query=query,
            understanding="",
            technical_domains=[],
            search_queries=basic_queries,
            sbir_queries=basic_queries[:2],
            patent_queries=basic_queries[:2],
            news_queries=basic_queries[:2],
            keywords=words[:10],
            exclusions=[],
            capability_need=CapabilityNeed(
                functional_need=query,
                domain="",
                implied_constraints=[],
                technology_types_sought=["system", "technology"],
            ),
            capability_criteria=[
                CapabilityCriterion(
                    criterion="Address the stated capability need",
                    weight="must_have",
                    keywords=words[:5],
                )
            ],
            technology_indicators=TechnologyIndicators(
                positive=["prototype", "demonstrated", "system", "device"],
                negative=["overview", "challenge", "review"],
            ),
            success=False,
            error=error,
        )