"""
Query Decomposition for TechScout.

Takes a natural language capability gap description and decomposes it
into specific technical search queries optimized for different data sources.
"""

import json
import logging
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional

from .llm_client import OllamaClient

logger = logging.getLogger(__name__)


@dataclass
class DecomposedQuery:
    """Result of query decomposition.

    Only ``original_query`` is required; every other field has an empty
    default so that a failure result can be built without spelling out
    each empty list.
    """
    original_query: str
    understanding: str = ""  # LLM's understanding of the problem
    technical_domains: List[str] = field(default_factory=list)  # Identified technical areas
    search_queries: List[str] = field(default_factory=list)  # General search queries
    sbir_queries: List[str] = field(default_factory=list)  # Optimized for SBIR search
    patent_queries: List[str] = field(default_factory=list)  # Optimized for patent search
    news_queries: List[str] = field(default_factory=list)  # Optimized for news search
    keywords: List[str] = field(default_factory=list)  # Key technical terms
    exclusions: List[str] = field(default_factory=list)  # Terms to exclude (reduce noise)
    target_trl_range: tuple = (4, 7)  # Target TRL range
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view of the result (TRL range as a list)."""
        return {
            "original_query": self.original_query,
            "understanding": self.understanding,
            "technical_domains": self.technical_domains,
            "search_queries": self.search_queries,
            "sbir_queries": self.sbir_queries,
            "patent_queries": self.patent_queries,
            "news_queries": self.news_queries,
            "keywords": self.keywords,
            "exclusions": self.exclusions,
            "target_trl_range": list(self.target_trl_range),
            "success": self.success,
            "error": self.error
        }


class QueryDecomposer:
    """
    Decomposes capability gap descriptions into technical search queries.
    """

    # TRL bounds used when the LLM omits them or returns unusable values.
    DEFAULT_TRL_RANGE = (4, 7)

    SYSTEM_PROMPT = """You are a defense technology analyst specializing in translating operational capability gaps into technical search strategies.

Your task is to take a natural language description of a capability need and:
1. Understand the underlying technical problem
2. Identify relevant technical domains and approaches
3. Generate optimized search queries for different data sources
4. Identify key technical terms and exclusions

Be specific and technical. Think about what technologies, methods, and approaches could solve the problem.
Consider both mature solutions and emerging technologies."""

    DECOMPOSITION_PROMPT = """Analyze this capability gap and generate search strategies:

CAPABILITY GAP:
{query}

Respond with a JSON object containing:
{{
    "understanding": "Your technical understanding of what's needed (2-3 sentences)",
    "technical_domains": ["list", "of", "relevant", "technical", "domains"],
    "search_queries": [
        "general technical search query 1",
        "general technical search query 2",
        "general technical search query 3",
        "general technical search query 4",
        "general technical search query 5"
    ],
    "sbir_queries": [
        "SBIR-optimized query focusing on R&D terms",
        "Another SBIR query with different technical angle"
    ],
    "patent_queries": [
        "Patent search query with technical terminology",
        "Alternative patent query with different approach"
    ],
    "news_queries": [
        "News query for recent developments",
        "Industry news angle query"
    ],
    "keywords": ["key", "technical", "terms", "for", "filtering"],
    "exclusions": ["terms", "to", "exclude", "reduce", "noise"],
    "target_trl": {{
        "min": 4,
        "max": 7,
        "rationale": "Why this TRL range is appropriate"
    }}
}}

Generate diverse queries that approach the problem from different angles.
SBIR queries should use R&D terminology.
Patent queries should use precise technical language.
News queries should be more accessible.
Provide at least 5 general search queries covering different technical approaches."""

    def __init__(
        self,
        ollama_client: Optional[OllamaClient] = None,
        model: str = "mistral-nemo:12b"
    ):
        """
        Args:
            ollama_client: Client used for generation; a default
                ``OllamaClient()`` is created when omitted.
            model: Model name passed to every ``generate`` call.
        """
        self.client = ollama_client or OllamaClient()
        self.model = model

    def decompose(self, query: str) -> DecomposedQuery:
        """
        Decompose a capability gap into search queries.

        Never raises on malformed LLM output: any failure is reported via
        ``success=False`` and ``error`` on the returned object.

        Args:
            query: Natural language capability gap description

        Returns:
            DecomposedQuery with optimized search strategies
        """
        # Skip the LLM round-trip when there is nothing to analyze.
        if not query or not query.strip():
            return self._failure(query or "", "Empty capability gap description")

        response = self.client.generate(
            prompt=self.DECOMPOSITION_PROMPT.format(query=query),
            model=self.model,
            system=self.SYSTEM_PROMPT,
            temperature=0.2,
            format="json"
        )

        if not response.success:
            return self._failure(query, response.error)

        data = self._parse_json_object(response.content)
        if not data:
            return self._failure(query, "Failed to parse LLM response")

        understanding = data.get("understanding")
        return DecomposedQuery(
            original_query=query,
            understanding=understanding if isinstance(understanding, str) else "",
            technical_domains=self._string_list(data.get("technical_domains")),
            search_queries=self._string_list(data.get("search_queries")),
            sbir_queries=self._string_list(data.get("sbir_queries")),
            patent_queries=self._string_list(data.get("patent_queries")),
            news_queries=self._string_list(data.get("news_queries")),
            keywords=self._string_list(data.get("keywords")),
            exclusions=self._string_list(data.get("exclusions")),
            target_trl_range=self._trl_range(data.get("target_trl")),
            success=True
        )

    def enhance_query(self, base_query: str, context: str) -> List[str]:
        """
        Generate additional queries based on initial results context.

        Args:
            base_query: Original capability gap
            context: Summary of initial findings

        Returns:
            List of refined search queries; empty when generation fails or
            the response cannot be parsed.
        """
        prompt = f"""Based on initial search results, generate refined queries.

ORIGINAL CAPABILITY GAP:
{base_query}

INITIAL FINDINGS:
{context}

Generate 3-5 refined search queries that:
1. Drill deeper into promising approaches found
2. Explore adjacent technical areas
3. Target specific companies or technologies identified

Respond with a JSON object:
{{
    "refined_queries": ["query1", "query2", "query3"]
}}"""

        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.3,
            format="json"
        )

        if not response.success:
            logger.warning("Query enhancement failed: %s", response.error)
            return []

        # Same parsing path as decompose(), including the text-extraction
        # fallback, so both entry points tolerate the same LLM output.
        data = self._parse_json_object(response.content)
        if not data:
            logger.warning("Query enhancement returned unparseable output")
            return []

        return self._string_list(data.get("refined_queries"))

    @staticmethod
    def _failure(query: str, error: Optional[str]) -> DecomposedQuery:
        """Log a decomposition failure and build the matching empty result."""
        logger.warning("Query decomposition failed: %s", error)
        return DecomposedQuery(original_query=query, success=False, error=error)

    def _parse_json_object(self, content: Any) -> Optional[Dict[str, Any]]:
        """Parse LLM output into a non-empty JSON object, or return None."""
        try:
            data = json.loads(content)
        except (json.JSONDecodeError, TypeError):
            # TypeError covers content=None. The fallback is only attempted
            # on text; it is assumed to return a parsed object or a falsy
            # value, as the original decompose() relied on.
            data = (
                self.client.extract_json_from_text(content)
                if isinstance(content, str)
                else None
            )
        # Valid JSON that is not an object (list, number, ...) is unusable.
        if isinstance(data, dict) and data:
            return data
        return None

    @staticmethod
    def _string_list(value: Any) -> List[str]:
        """Coerce an LLM-provided value into a list of non-blank strings."""
        if isinstance(value, str):
            value = [value]
        if not isinstance(value, list):
            return []
        return [
            item.strip()
            for item in value
            if isinstance(item, str) and item.strip()
        ]

    @classmethod
    def _trl_range(cls, value: Any) -> tuple:
        """Extract an integer (min, max) TRL pair, defaulting on bad data."""
        default_low, default_high = cls.DEFAULT_TRL_RANGE
        if not isinstance(value, dict):
            return (default_low, default_high)

        def as_int(raw: Any, default: int) -> int:
            try:
                return int(raw)
            except (TypeError, ValueError):
                return default

        low = as_int(value.get("min"), default_low)
        high = as_int(value.get("max"), default_high)
        # A reversed pair is treated as swapped bounds rather than rejected.
        return (min(low, high), max(low, high))