# TechScout/techscout/extraction/decomposer.py

"""
Query Decomposition for TechScout.

Takes a natural language capability gap description and decomposes it
into specific technical search queries optimized for different data sources.
"""

import json
import logging
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional

from .llm_client import OllamaClient

logger = logging.getLogger(__name__)


@dataclass
class DecomposedQuery:
    """Outcome of decomposing one capability-gap description.

    Bundles the original text, the LLM's interpretation of it, the query
    lists tailored to each data source, filtering terms, the target TRL
    window, and a success/error status pair.
    """

    original_query: str
    # How the LLM interpreted the problem.
    understanding: str
    # Technical areas the problem touches.
    technical_domains: List[str]
    # Source-agnostic search queries.
    search_queries: List[str]
    # Queries tuned for SBIR search.
    sbir_queries: List[str]
    # Queries tuned for patent search.
    patent_queries: List[str]
    # Queries tuned for news search.
    news_queries: List[str]
    # Key technical terms.
    keywords: List[str]
    # Terms to exclude in order to reduce noise.
    exclusions: List[str]
    # Target TRL window as a (min, max) pair.
    target_trl_range: tuple = (4, 7)
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a plain dict, in declaration order.

        Values are passed through unchanged (lists are not copied) except
        ``target_trl_range``, which is converted from a tuple to a list.
        """
        passthrough = (
            "original_query",
            "understanding",
            "technical_domains",
            "search_queries",
            "sbir_queries",
            "patent_queries",
            "news_queries",
            "keywords",
            "exclusions",
        )
        payload: Dict[str, Any] = {
            name: getattr(self, name) for name in passthrough
        }
        # The TRL window is the only field whose representation changes.
        payload["target_trl_range"] = list(self.target_trl_range)
        payload["success"] = self.success
        payload["error"] = self.error
        return payload


class QueryDecomposer:
    """
    Decomposes capability gap descriptions into technical search queries.

    The gap description is sent to an LLM through the injected client with a
    request for a JSON answer, and that answer is normalized into a
    ``DecomposedQuery``. LLM output is treated as untrusted: missing,
    ``null`` or mistyped fields fall back to safe defaults instead of
    raising, and failures are reported through the result object (or an
    empty list for ``enhance_query``) and logged.
    """

    SYSTEM_PROMPT = """You are a defense technology analyst specializing in translating operational capability gaps into technical search strategies.

Your task is to take a natural language description of a capability need and:
1. Understand the underlying technical problem
2. Identify relevant technical domains and approaches
3. Generate optimized search queries for different data sources
4. Identify key technical terms and exclusions

Be specific and technical. Think about what technologies, methods, and approaches could solve the problem.
Consider both mature solutions and emerging technologies."""

    DECOMPOSITION_PROMPT = """Analyze this capability gap and generate search strategies:

CAPABILITY GAP:
{query}

Respond with a JSON object containing:
{{
    "understanding": "Your technical understanding of what's needed (2-3 sentences)",
    "technical_domains": ["list", "of", "relevant", "technical", "domains"],
    "search_queries": [
        "general technical search query 1",
        "general technical search query 2",
        "general technical search query 3",
        "general technical search query 4",
        "general technical search query 5"
    ],
    "sbir_queries": [
        "SBIR-optimized query focusing on R&D terms",
        "Another SBIR query with different technical angle"
    ],
    "patent_queries": [
        "Patent search query with technical terminology",
        "Alternative patent query with different approach"
    ],
    "news_queries": [
        "News query for recent developments",
        "Industry news angle query"
    ],
    "keywords": ["key", "technical", "terms", "for", "filtering"],
    "exclusions": ["terms", "to", "exclude", "reduce", "noise"],
    "target_trl": {{
        "min": 4,
        "max": 7,
        "rationale": "Why this TRL range is appropriate"
    }}
}}

Generate diverse queries that approach the problem from different angles.
SBIR queries should use R&D terminology.
Patent queries should use precise technical language.
News queries should be more accessible.

Provide at least 5 general search queries covering different technical approaches."""

    # Fallback (min, max) TRL window used when the LLM omits or mangles
    # "target_trl"; same values as the prompt example above.
    _DEFAULT_TRL_RANGE = (4, 7)

    def __init__(
        self,
        ollama_client: Optional[OllamaClient] = None,
        model: str = "mistral-nemo:12b"
    ):
        """
        Args:
            ollama_client: Client used for LLM calls; a default
                ``OllamaClient()`` is created when omitted.
            model: Model name passed to every ``generate`` call.
        """
        # Compare against None so a supplied client is never replaced just
        # because it happens to evaluate as falsy.
        self.client = ollama_client if ollama_client is not None else OllamaClient()
        self.model = model

    def decompose(self, query: str) -> DecomposedQuery:
        """
        Decompose a capability gap into search queries.

        Args:
            query: Natural language capability gap description

        Returns:
            DecomposedQuery with optimized search strategies. On failure
            (blank query, LLM call failed, or response not parseable as a
            JSON object) the result has ``success=False``, ``error`` set and
            all list fields empty.
        """
        # Nothing to analyze: skip the LLM round-trip entirely.
        if not query or not query.strip():
            return self._failure(query or "", "Empty capability gap description")

        prompt = self.DECOMPOSITION_PROMPT.format(query=query)

        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            system=self.SYSTEM_PROMPT,
            temperature=0.2,
            format="json"
        )

        if not response.success:
            logger.warning("Query decomposition LLM call failed: %s", response.error)
            return self._failure(query, response.error)

        data = self._parse_json_object(response.content)
        if data is None:
            logger.warning("Query decomposition response was not a JSON object")
            return self._failure(query, "Failed to parse LLM response")

        understanding = data.get("understanding")

        return DecomposedQuery(
            original_query=query,
            # JSON null / non-string values would violate the declared type.
            understanding=understanding if isinstance(understanding, str) else "",
            technical_domains=self._as_str_list(data.get("technical_domains")),
            search_queries=self._as_str_list(data.get("search_queries")),
            sbir_queries=self._as_str_list(data.get("sbir_queries")),
            patent_queries=self._as_str_list(data.get("patent_queries")),
            news_queries=self._as_str_list(data.get("news_queries")),
            keywords=self._as_str_list(data.get("keywords")),
            exclusions=self._as_str_list(data.get("exclusions")),
            target_trl_range=self._extract_trl_range(data.get("target_trl")),
            success=True
        )

    def enhance_query(self, base_query: str, context: str) -> List[str]:
        """
        Generate additional queries based on initial results context.

        Args:
            base_query: Original capability gap
            context: Summary of initial findings

        Returns:
            List of refined search queries; empty when the LLM call fails or
            its response cannot be parsed as a JSON object.
        """
        prompt = f"""Based on initial search results, generate refined queries.

ORIGINAL CAPABILITY GAP:
{base_query}

INITIAL FINDINGS:
{context}

Generate 3-5 refined search queries that:
1. Drill deeper into promising approaches found
2. Explore adjacent technical areas
3. Target specific companies or technologies identified

Respond with a JSON object:
{{
    "refined_queries": ["query1", "query2", "query3"]
}}"""

        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.3,
            format="json"
        )

        if not response.success:
            logger.warning("Query enhancement LLM call failed: %s", response.error)
            return []

        # Same parsing path as decompose(), including the text-extraction
        # fallback, so both entry points tolerate the same LLM quirks.
        data = self._parse_json_object(response.content)
        if data is None:
            logger.warning("Query enhancement response was not a JSON object")
            return []

        return self._as_str_list(data.get("refined_queries"))

    @staticmethod
    def _failure(query: str, error: Optional[str]) -> DecomposedQuery:
        """Build the empty ``success=False`` result carrying ``error``."""
        return DecomposedQuery(
            original_query=query,
            understanding="",
            technical_domains=[],
            search_queries=[],
            sbir_queries=[],
            patent_queries=[],
            news_queries=[],
            keywords=[],
            exclusions=[],
            success=False,
            error=error
        )

    def _parse_json_object(self, content: Any) -> Optional[Dict[str, Any]]:
        """Parse LLM output into a dict, or return None if that is impossible.

        Tries strict ``json.loads`` first and falls back to the client's
        ``extract_json_from_text`` when the text is not valid JSON.
        """
        try:
            data = json.loads(content)
        except TypeError:
            # content was None or otherwise not text; nothing to salvage.
            return None
        except json.JSONDecodeError:
            # NOTE(review): assumes extract_json_from_text returns a falsy
            # value when it finds nothing -- confirm against llm_client.
            data = self.client.extract_json_from_text(content)
            if not data:
                return None
        # Valid JSON can still be a list/string/number; callers need a dict.
        return data if isinstance(data, dict) else None

    @staticmethod
    def _as_str_list(value: Any) -> List[str]:
        """Normalize an LLM-provided value to a list of non-blank strings.

        A bare string becomes a one-element list; anything else that is not
        a list (including JSON null) becomes ``[]``. Non-string and blank
        items are dropped and surrounding whitespace is stripped.
        """
        if isinstance(value, str):
            value = [value]
        if not isinstance(value, list):
            return []
        return [
            item.strip()
            for item in value
            if isinstance(item, str) and item.strip()
        ]

    @classmethod
    def _extract_trl_range(cls, raw: Any) -> tuple:
        """Read ``(min, max)`` from the "target_trl" value, with defaults.

        Each bound falls back to its default when the container is not a
        dict or the bound is missing, null, or not convertible to ``int``.
        """
        default_min, default_max = cls._DEFAULT_TRL_RANGE
        if not isinstance(raw, dict):
            return (default_min, default_max)
        return (
            cls._coerce_int(raw.get("min"), default_min),
            cls._coerce_int(raw.get("max"), default_max),
        )

    @staticmethod
    def _coerce_int(value: Any, default: int) -> int:
        """Return ``int(value)``, or ``default`` if it cannot be converted."""
        # bool is an int subclass; true/false is never a meaningful TRL.
        if isinstance(value, bool):
            return default
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default