# TechScout/techscout/extraction/decomposer.py
#
# Listing metadata: 239 lines, 7.7 KiB, Python

"""
Query Decomposition for TechScout.
Takes a natural language capability gap description and decomposes it
into specific technical search queries optimized for different data sources.
"""
import json
import logging
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional
from .llm_client import OllamaClient
logger = logging.getLogger(__name__)
@dataclass
class DecomposedQuery:
    """Container for the search strategy derived from one capability gap.

    Holds the original description, the per-source query lists, filtering
    terms, the target TRL bounds, and a success/error pair describing
    whether the decomposition worked.
    """

    original_query: str  # The capability gap text exactly as submitted
    understanding: str  # How the LLM summarised the underlying problem
    technical_domains: List[str]  # Technical areas the gap touches
    search_queries: List[str]  # Source-agnostic search queries
    sbir_queries: List[str]  # Queries phrased for SBIR search
    patent_queries: List[str]  # Queries phrased for patent search
    news_queries: List[str]  # Queries phrased for news search
    keywords: List[str]  # Important technical terms
    exclusions: List[str]  # Noise terms that should be filtered out
    target_trl_range: tuple = (4, 7)  # (min, max) TRL of interest
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a plain dict, with the TRL range as a list."""
        passthrough = (
            "original_query",
            "understanding",
            "technical_domains",
            "search_queries",
            "sbir_queries",
            "patent_queries",
            "news_queries",
            "keywords",
            "exclusions",
        )
        payload: Dict[str, Any] = {}
        for attr in passthrough:
            payload[attr] = getattr(self, attr)
        # The range is stored as a tuple; it is exported as a list.
        payload["target_trl_range"] = list(self.target_trl_range)
        payload["success"] = self.success
        payload["error"] = self.error
        return payload
class QueryDecomposer:
    """
    Decomposes capability gap descriptions into technical search queries.

    The gap description is sent to an LLM through the injected client and the
    JSON reply is converted into a DecomposedQuery. The reply is treated as
    untrusted input: a reply that is not a JSON object produces a failure
    result, and missing or wrongly typed fields are replaced by empty values
    or defaults instead of raising.
    """

    SYSTEM_PROMPT = """You are a defense technology analyst specializing in translating operational capability gaps into technical search strategies.
Your task is to take a natural language description of a capability need and:
1. Understand the underlying technical problem
2. Identify relevant technical domains and approaches
3. Generate optimized search queries for different data sources
4. Identify key technical terms and exclusions
Be specific and technical. Think about what technologies, methods, and approaches could solve the problem.
Consider both mature solutions and emerging technologies."""

    DECOMPOSITION_PROMPT = """Analyze this capability gap and generate search strategies:
CAPABILITY GAP:
{query}
Respond with a JSON object containing:
{{
"understanding": "Your technical understanding of what's needed (2-3 sentences)",
"technical_domains": ["list", "of", "relevant", "technical", "domains"],
"search_queries": [
"general technical search query 1",
"general technical search query 2",
"general technical search query 3",
"general technical search query 4",
"general technical search query 5"
],
"sbir_queries": [
"SBIR-optimized query focusing on R&D terms",
"Another SBIR query with different technical angle"
],
"patent_queries": [
"Patent search query with technical terminology",
"Alternative patent query with different approach"
],
"news_queries": [
"News query for recent developments",
"Industry news angle query"
],
"keywords": ["key", "technical", "terms", "for", "filtering"],
"exclusions": ["terms", "to", "exclude", "reduce", "noise"],
"target_trl": {{
"min": 4,
"max": 7,
"rationale": "Why this TRL range is appropriate"
}}
}}
Generate diverse queries that approach the problem from different angles.
SBIR queries should use R&D terminology.
Patent queries should use precise technical language.
News queries should be more accessible.
Provide at least 5 general search queries covering different technical approaches."""

    # Fallback (min, max) TRL bounds used when the reply omits or garbles
    # "target_trl"; same values as the example in DECOMPOSITION_PROMPT.
    DEFAULT_TRL_RANGE = (4, 7)

    def __init__(
        self,
        ollama_client: Optional[OllamaClient] = None,
        model: str = "mistral-nemo:12b"
    ):
        """
        Create a decomposer.

        Args:
            ollama_client: Client used for LLM calls; a new OllamaClient is
                created when omitted.
            model: Model name passed to every generate() call.
        """
        # Compare against None explicitly so an injected client is never
        # discarded just because it happens to evaluate as falsy.
        self.client = ollama_client if ollama_client is not None else OllamaClient()
        self.model = model

    def decompose(self, query: str) -> DecomposedQuery:
        """
        Decompose a capability gap into search queries.

        Args:
            query: Natural language capability gap description

        Returns:
            DecomposedQuery with optimized search strategies. When the LLM
            call fails or its reply is not a non-empty JSON object, the
            result has success=False, an error message, and empty fields.
        """
        prompt = self.DECOMPOSITION_PROMPT.format(query=query)
        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            system=self.SYSTEM_PROMPT,
            temperature=0.2,
            format="json"
        )
        if not response.success:
            return self._failure(query, response.error)

        data = self._parse_json_object(response.content)
        if not data:
            return self._failure(query, "Failed to parse LLM response")

        understanding = data.get("understanding")
        if not isinstance(understanding, str):
            # A null or structured value here would violate the str field.
            understanding = ""

        as_list = self._as_string_list
        return DecomposedQuery(
            original_query=query,
            understanding=understanding,
            technical_domains=as_list(data.get("technical_domains")),
            search_queries=as_list(data.get("search_queries")),
            sbir_queries=as_list(data.get("sbir_queries")),
            patent_queries=as_list(data.get("patent_queries")),
            news_queries=as_list(data.get("news_queries")),
            keywords=as_list(data.get("keywords")),
            exclusions=as_list(data.get("exclusions")),
            target_trl_range=self._extract_trl_range(data.get("target_trl")),
            success=True
        )

    def enhance_query(self, base_query: str, context: str) -> List[str]:
        """
        Generate additional queries based on initial results context.

        Args:
            base_query: Original capability gap
            context: Summary of initial findings

        Returns:
            List of refined search queries; empty when the LLM call fails or
            the reply contains no usable "refined_queries" entries.
        """
        prompt = f"""Based on initial search results, generate refined queries.
ORIGINAL CAPABILITY GAP:
{base_query}
INITIAL FINDINGS:
{context}
Generate 3-5 refined search queries that:
1. Drill deeper into promising approaches found
2. Explore adjacent technical areas
3. Target specific companies or technologies identified
Respond with a JSON object:
{{
"refined_queries": ["query1", "query2", "query3"]
}}"""
        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.3,
            format="json"
        )
        if not response.success:
            return []

        # Same parsing path as decompose(), including the client's fallback
        # extractor, so both methods tolerate the same kinds of replies.
        data = self._parse_json_object(response.content)
        if not data:
            return []
        return self._as_string_list(data.get("refined_queries"))

    def _parse_json_object(self, content: Any) -> Optional[Dict[str, Any]]:
        """Parse an LLM reply into a dict, or return None if it is not one."""
        if not isinstance(content, (str, bytes, bytearray)):
            # json.loads raises TypeError on anything else (e.g. None).
            return None
        try:
            data = json.loads(content)
        except ValueError:
            # JSONDecodeError (and UnicodeDecodeError for bytes) are both
            # ValueError subclasses; hand the raw text to the client's
            # extractor as a second attempt.
            data = self.client.extract_json_from_text(content)
        # Valid JSON can still be a list, string or number; callers need an
        # object they can call .get() on.
        return data if isinstance(data, dict) else None

    @staticmethod
    def _failure(query: str, error: Optional[str]) -> DecomposedQuery:
        """Build the empty, unsuccessful result for the given error."""
        return DecomposedQuery(
            original_query=query,
            understanding="",
            technical_domains=[],
            search_queries=[],
            sbir_queries=[],
            patent_queries=[],
            news_queries=[],
            keywords=[],
            exclusions=[],
            success=False,
            error=error
        )

    @staticmethod
    def _as_string_list(value: Any) -> List[str]:
        """Coerce a reply field into a list of non-empty, stripped strings."""
        if isinstance(value, str):
            # A single bare string is taken as a one-element list.
            value = [value]
        elif not isinstance(value, (list, tuple)):
            # null, numbers and objects carry no usable entries.
            return []
        cleaned: List[str] = []
        for item in value:
            if isinstance(item, (int, float)) and not isinstance(item, bool):
                item = str(item)
            if isinstance(item, str):
                item = item.strip()
                if item:
                    cleaned.append(item)
        return cleaned

    @classmethod
    def _extract_trl_range(cls, trl_data: Any) -> tuple:
        """Return an ordered (min, max) int pair from the "target_trl" field."""
        default_min, default_max = cls.DEFAULT_TRL_RANGE
        if not isinstance(trl_data, dict):
            return (default_min, default_max)
        low = cls._as_int(trl_data.get("min"), default_min)
        high = cls._as_int(trl_data.get("max"), default_max)
        if low > high:
            # A reversed pair would describe an empty range.
            low, high = high, low
        return (low, high)

    @staticmethod
    def _as_int(value: Any, default: int) -> int:
        """Convert value to int, returning default when that is not possible."""
        if isinstance(value, bool):
            # bool is an int subclass, but true/false is not a TRL level.
            return default
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default