239 lines
7.7 KiB
Python
239 lines
7.7 KiB
Python
"""
|
|
Query Decomposition for TechScout.
|
|
|
|
Takes a natural language capability gap description and decomposes it
|
|
into specific technical search queries optimized for different data sources.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
from typing import List, Dict, Any, Optional
|
|
|
|
from .llm_client import OllamaClient
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class DecomposedQuery:
    """Outcome of decomposing one capability gap description.

    Holds the original request, the per-source query lists derived from
    it, and a ``success``/``error`` pair describing whether the
    decomposition worked.
    """

    # The request exactly as it was handed to the decomposer.
    original_query: str
    # The LLM's understanding of the problem.
    understanding: str
    # Technical areas identified for the problem.
    technical_domains: List[str]
    # General-purpose search queries.
    search_queries: List[str]
    # Queries optimized for SBIR search.
    sbir_queries: List[str]
    # Queries optimized for patent search.
    patent_queries: List[str]
    # Queries optimized for news search.
    news_queries: List[str]
    # Key technical terms.
    keywords: List[str]
    # Terms to exclude in order to reduce noise.
    exclusions: List[str]
    # Target TRL range as a (min, max) pair.
    target_trl_range: tuple = (4, 7)
    # False when decomposition failed; ``error`` then carries the reason.
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the record as a plain dictionary.

        Every field is copied by reference under its own name, except
        ``target_trl_range``, which is converted from a tuple to a list.

        Returns:
            Dictionary keyed by field name, in field declaration order.
        """
        # Fields emitted unchanged, listed in output order.
        passthrough = (
            "original_query",
            "understanding",
            "technical_domains",
            "search_queries",
            "sbir_queries",
            "patent_queries",
            "news_queries",
            "keywords",
            "exclusions",
        )
        payload: Dict[str, Any] = {
            name: getattr(self, name) for name in passthrough
        }
        payload["target_trl_range"] = list(self.target_trl_range)
        payload["success"] = self.success
        payload["error"] = self.error
        return payload
|
|
|
|
|
|
class QueryDecomposer:
    """
    Decomposes capability gap descriptions into technical search queries.

    The decomposer sends a prompt to an LLM client, asks for a JSON reply,
    and turns that reply into a ``DecomposedQuery``. Replies are validated
    before use, because the model is free to return JSON of the wrong
    shape (a list instead of an object, ``null`` fields, numeric strings).

    NOTE(review): the client is assumed to expose
    ``generate(prompt=, model=, system=, temperature=, format=)`` returning
    an object with ``success``, ``content`` and ``error`` attributes, plus
    ``extract_json_from_text(text)``; that is what this class calls, but
    the client itself is defined in another module -- confirm there.
    """

    SYSTEM_PROMPT = """You are a defense technology analyst specializing in translating operational capability gaps into technical search strategies.

Your task is to take a natural language description of a capability need and:
1. Understand the underlying technical problem
2. Identify relevant technical domains and approaches
3. Generate optimized search queries for different data sources
4. Identify key technical terms and exclusions

Be specific and technical. Think about what technologies, methods, and approaches could solve the problem.
Consider both mature solutions and emerging technologies."""

    DECOMPOSITION_PROMPT = """Analyze this capability gap and generate search strategies:

CAPABILITY GAP:
{query}

Respond with a JSON object containing:
{{
"understanding": "Your technical understanding of what's needed (2-3 sentences)",
"technical_domains": ["list", "of", "relevant", "technical", "domains"],
"search_queries": [
"general technical search query 1",
"general technical search query 2",
"general technical search query 3",
"general technical search query 4",
"general technical search query 5"
],
"sbir_queries": [
"SBIR-optimized query focusing on R&D terms",
"Another SBIR query with different technical angle"
],
"patent_queries": [
"Patent search query with technical terminology",
"Alternative patent query with different approach"
],
"news_queries": [
"News query for recent developments",
"Industry news angle query"
],
"keywords": ["key", "technical", "terms", "for", "filtering"],
"exclusions": ["terms", "to", "exclude", "reduce", "noise"],
"target_trl": {{
"min": 4,
"max": 7,
"rationale": "Why this TRL range is appropriate"
}}
}}

Generate diverse queries that approach the problem from different angles.
SBIR queries should use R&D terminology.
Patent queries should use precise technical language.
News queries should be more accessible.

Provide at least 5 general search queries covering different technical approaches."""

    # Fallback (min, max) used when the reply has no usable TRL values.
    _DEFAULT_TRL_RANGE = (4, 7)
    # Technology Readiness Levels run from 1 to 9; values outside are clamped.
    _TRL_BOUNDS = (1, 9)

    def __init__(
        self,
        ollama_client: "Optional[OllamaClient]" = None,
        model: str = "mistral-nemo:12b",
    ):
        """
        Create a decomposer.

        Args:
            ollama_client: Client used for LLM calls; a default
                ``OllamaClient()`` is created when omitted.
            model: Model name passed to the client on every call.
        """
        # Compare with None so a client object that happens to be falsy
        # is not silently replaced by a fresh default client.
        self.client = ollama_client if ollama_client is not None else OllamaClient()
        self.model = model

    def decompose(self, query: str) -> "DecomposedQuery":
        """
        Decompose a capability gap into search queries.

        Args:
            query: Natural language capability gap description

        Returns:
            DecomposedQuery with optimized search strategies. On any
            failure (empty query, client error, unparseable reply) the
            result has ``success=False``, ``error`` set, and empty lists.
        """
        if not query or not query.strip():
            # Nothing to analyze; skip the LLM round trip.
            return self._failure(query, "Empty capability gap description")

        response = self.client.generate(
            prompt=self.DECOMPOSITION_PROMPT.format(query=query),
            model=self.model,
            system=self.SYSTEM_PROMPT,
            temperature=0.2,
            format="json",
        )
        if not response.success:
            return self._failure(query, response.error)

        data = self._load_json_object(response.content)
        if data is None:
            return self._failure(query, "Failed to parse LLM response")

        understanding = data.get("understanding", "")
        return DecomposedQuery(
            original_query=query,
            understanding=understanding if isinstance(understanding, str) else "",
            technical_domains=self._as_str_list(data.get("technical_domains")),
            search_queries=self._as_str_list(data.get("search_queries")),
            sbir_queries=self._as_str_list(data.get("sbir_queries")),
            patent_queries=self._as_str_list(data.get("patent_queries")),
            news_queries=self._as_str_list(data.get("news_queries")),
            keywords=self._as_str_list(data.get("keywords")),
            exclusions=self._as_str_list(data.get("exclusions")),
            target_trl_range=self._parse_trl_range(data.get("target_trl")),
            success=True,
        )

    def enhance_query(self, base_query: str, context: str) -> List[str]:
        """
        Generate additional queries based on initial results context.

        Args:
            base_query: Original capability gap
            context: Summary of initial findings

        Returns:
            List of refined search queries; empty when the client call
            fails or the reply holds no usable ``refined_queries`` list.
        """
        prompt = f"""Based on initial search results, generate refined queries.

ORIGINAL CAPABILITY GAP:
{base_query}

INITIAL FINDINGS:
{context}

Generate 3-5 refined search queries that:
1. Drill deeper into promising approaches found
2. Explore adjacent technical areas
3. Target specific companies or technologies identified

Respond with a JSON object:
{{
"refined_queries": ["query1", "query2", "query3"]
}}"""

        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.3,
            format="json",
        )
        if not response.success:
            logger.warning(
                "Query refinement failed: %s", getattr(response, "error", None)
            )
            return []

        data = self._load_json_object(response.content)
        if data is None:
            return []
        return self._as_str_list(data.get("refined_queries"))

    def _failure(self, query: str, error: Optional[str]) -> "DecomposedQuery":
        """Log ``error`` and build the empty, unsuccessful result for ``query``."""
        logger.warning("Query decomposition failed: %s", error)
        return DecomposedQuery(
            original_query=query,
            understanding="",
            technical_domains=[],
            search_queries=[],
            sbir_queries=[],
            patent_queries=[],
            news_queries=[],
            keywords=[],
            exclusions=[],
            success=False,
            error=error,
        )

    def _load_json_object(self, content: Any) -> Optional[Dict[str, Any]]:
        """Parse ``content`` into a non-empty JSON object, or return None.

        Strict ``json.loads`` is tried first. Only when that raises is the
        client's ``extract_json_from_text`` consulted (presumably it pulls
        a JSON object out of surrounding prose -- verify against the
        client). Anything that is not a non-empty dict counts as failure.
        """
        if not isinstance(content, (str, bytes, bytearray)):
            return None
        try:
            parsed = json.loads(content)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            parsed = self.client.extract_json_from_text(content)
        if isinstance(parsed, dict) and parsed:
            return parsed
        return None

    @staticmethod
    def _as_str_list(value: Any) -> List[str]:
        """Normalize a reply field into a list of non-blank, stripped strings.

        A bare string becomes a one-element list; lists and tuples keep
        only their string entries; every other type yields an empty list.
        """
        if isinstance(value, str):
            value = [value]
        elif not isinstance(value, (list, tuple)):
            return []
        return [
            item.strip()
            for item in value
            if isinstance(item, str) and item.strip()
        ]

    @classmethod
    def _parse_trl_range(cls, raw: Any) -> tuple:
        """Turn the reply's ``target_trl`` object into an ordered (min, max).

        Missing or non-numeric bounds fall back to the defaults, numeric
        strings are accepted, values are clamped to the 1-9 TRL scale,
        and an inverted pair is swapped.
        """
        default_low, default_high = cls._DEFAULT_TRL_RANGE
        if not isinstance(raw, dict):
            return (default_low, default_high)

        floor, ceiling = cls._TRL_BOUNDS

        def coerce(value: Any, fallback: int) -> int:
            try:
                level = int(value)
            except (TypeError, ValueError, OverflowError):
                return fallback
            return min(max(level, floor), ceiling)

        low = coerce(raw.get("min"), default_low)
        high = coerce(raw.get("max"), default_high)
        if low > high:
            low, high = high, low
        return (low, high)
|