"""
Capability Parser for TechScout.

Transforms natural language capability descriptions into structured
capability needs, evaluation criteria, and optimized search queries.

This replaces the simpler query decomposition with capability-focused parsing.
"""

import json
import logging
from typing import Any, Dict, List, Optional, Tuple

from ..extraction.llm_client import OllamaClient
from .types import (
    ParsedCapability,
    CapabilityNeed,
    CapabilityCriterion,
    TechnologyIndicators,
)

# Module-level logger named after this module's import path; used by
# CapabilityParser to report LLM call and response-parsing failures.
logger = logging.getLogger(__name__)


class CapabilityParser:
    """
    Parses capability gap descriptions into structured capability needs.

    Output is used for:
    1. Optimized search queries (finding technologies, not topic discussions)
    2. Technology extraction criteria (what makes something a technology)
    3. Capability fit evaluation (does technology X provide capability Y)

    The LLM reply is treated as untrusted input: every field is type-checked
    and coerced before it is copied into the result, so a malformed reply
    (``null`` sections, string entries where objects are expected, reversed
    TRL bounds, ...) degrades to defaults instead of raising.
    """

    # NOTE: everything between the triple quotes of the two prompts below is
    # sent to the model verbatim, including leading whitespace, which is why
    # the continuation lines are flush-left.
    SYSTEM_PROMPT = """You are a defense technology analyst specializing in capability gap analysis.

Your task is to analyze a natural language description of a capability need and:
1. Extract the core functional requirements
2. Define criteria for evaluating whether a technology addresses the need
3. Generate search queries optimized for finding TECHNOLOGIES (not topic discussions)
4. Identify indicators that distinguish actual technologies from general content

Think about what specific, nameable technologies could solve the problem.
Focus on finding solutions, not understanding problems."""

    # Literal braces of the JSON template are doubled because the text is
    # rendered with str.format(); ``{query}`` is the only placeholder.
    CAPABILITY_PARSE_PROMPT = """Analyze this capability need and generate a structured capability assessment.

CAPABILITY NEED:
{query}

Respond with a JSON object containing:

{{
"understanding": "Your 2-3 sentence technical understanding of what's needed",

"technical_domains": ["relevant", "technical", "domains"],

"capability_need": {{
"functional_need": "The core capability being sought (1-2 sentences)",
"domain": "Primary domain (e.g., Space Domain Awareness, Cyber, Electronic Warfare)",
"implied_constraints": ["constraint1", "constraint2"],
"technology_types_sought": ["sensor", "algorithm", "material", "system", "platform"]
}},

"capability_criteria": [
{{
"criterion": "What the technology must do to address this need",
"weight": "must_have",
"keywords": ["terms", "indicating", "this", "criterion", "is", "met"]
}},
{{
"criterion": "Secondary capability that would be valuable",
"weight": "should_have",
"keywords": ["relevant", "keywords"]
}},
{{
"criterion": "Nice-to-have capability",
"weight": "nice_to_have",
"keywords": ["keywords"]
}}
],

"search_queries": [
"technology-focused query 1 (include terms like: prototype, system, device, demonstrated)",
"technology-focused query 2",
"technology-focused query 3",
"technology-focused query 4",
"technology-focused query 5"
],

"sbir_queries": [
"SBIR-optimized query focusing on R&D and development terms",
"Another SBIR query with different technical angle"
],

"patent_queries": [
"Patent search with technical terminology and apparatus/method/system",
"Alternative patent query"
],

"news_queries": [
"News query for recent technology announcements",
"Industry/defense news query"
],

"keywords": ["key", "technical", "terms"],

"exclusions": ["terms", "to", "exclude", "overview", "introduction", "challenge"],

"technology_indicators": {{
"positive": ["prototype", "demonstrated", "developed", "patent", "Phase II", "system", "device", "sensor", "algorithm", "tested", "deployed"],
"negative": ["overview", "challenge", "problem", "review", "survey", "introduction", "future", "potential", "could", "might"]
}},

"target_trl": {{
"min": 4,
"max": 7,
"rationale": "Why this TRL range"
}}
}}

IMPORTANT:
- Generate at least 3 capability_criteria (at least 1 must_have)
- Search queries should find TECHNOLOGIES, not topic discussions
- Include technology indicator terms (prototype, demonstrated, etc.) in queries
- Exclude vague terms (overview, challenge, potential) where possible"""

    # Criterion weights accepted from the model; anything else is demoted to
    # the default weight.
    _VALID_WEIGHTS = ("must_have", "should_have", "nice_to_have")
    _DEFAULT_WEIGHT = "should_have"

    # Indicator terms used when the model omits (or mangles) the
    # "technology_indicators" section.
    _DEFAULT_POSITIVE_INDICATORS = (
        "prototype", "demonstrated", "developed", "patent",
        "Phase II", "system", "device", "sensor", "tested",
    )
    _DEFAULT_NEGATIVE_INDICATORS = (
        "overview", "challenge", "problem", "review",
        "survey", "introduction", "future", "potential",
    )

    # Default target TRL window and the bounds of the 1-9 TRL scale.
    _DEFAULT_TRL_RANGE = (4, 7)
    _TRL_FLOOR = 1
    _TRL_CEILING = 9

    def __init__(
        self,
        ollama_client: Optional["OllamaClient"] = None,
        model: str = "mistral-nemo:12b",
    ):
        """
        Args:
            ollama_client: Client used to call the LLM. A new ``OllamaClient``
                is created when omitted.
            model: Name of the model passed to every ``generate`` call.
        """
        # Compare against None explicitly so a client object that happens to
        # be falsy is not silently replaced.
        self.client = ollama_client if ollama_client is not None else OllamaClient()
        self.model = model

    def parse(self, query: str) -> "ParsedCapability":
        """
        Parse a capability gap description into structured capability needs.

        Args:
            query: Natural language capability gap description

        Returns:
            ParsedCapability with structured needs, criteria, and search
            queries. When the LLM call fails, or its reply cannot be decoded
            into a non-empty JSON object, a fallback result with
            ``success=False`` is returned instead.
        """
        logger.info("Parsing capability: %s...", query[:100])

        prompt = self.CAPABILITY_PARSE_PROMPT.format(query=query)

        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            system=self.SYSTEM_PROMPT,
            temperature=0.2,
            format="json",
        )

        if not response.success:
            logger.error("LLM call failed: %s", response.error)
            return self._create_fallback(query, response.error)

        data = self._decode_response(response.content)
        if not data:
            logger.error("Failed to parse LLM response as JSON")
            return self._create_fallback(query, "Failed to parse LLM response")

        return self._build_parsed_capability(query, data)

    def _decode_response(self, content: Any) -> Optional[Dict[str, Any]]:
        """Decode the raw LLM reply into a dict, or return None if impossible.

        Strict ``json.loads`` is tried first; if that fails on text, the
        client's ``extract_json_from_text`` is used as a second attempt.
        Replies that decode to something other than a JSON object (a list, a
        number, ...) are rejected, because the builder needs key lookups.
        """
        try:
            data = json.loads(content)
        except json.JSONDecodeError:
            data = self.client.extract_json_from_text(content)
        except TypeError:
            # content is None or another non-text value: nothing to extract.
            data = None
        return data if isinstance(data, dict) else None

    @staticmethod
    def _as_dict(value: Any) -> Dict[str, Any]:
        """Return *value* if it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    @staticmethod
    def _as_text(value: Any) -> str:
        """Return *value* as a string, mapping None to the empty string."""
        if value is None:
            return ""
        return value if isinstance(value, str) else str(value)

    @staticmethod
    def _string_list(value: Any, default: Optional[Tuple[str, ...]] = None) -> List[str]:
        """Coerce *value* into a list of non-blank strings.

        A bare string is treated as a one-element list. When *value* is not a
        string, list or tuple (for example a missing key or JSON ``null``),
        a copy of *default* is returned, or an empty list if no default is
        given. Non-string and blank entries are dropped.
        """
        if isinstance(value, str):
            value = [value]
        if not isinstance(value, (list, tuple)):
            return list(default) if default else []
        return [item for item in value if isinstance(item, str) and item.strip()]

    @classmethod
    def _normalize_weight(cls, value: Any) -> str:
        """Map a model-supplied weight onto one of the accepted weights.

        Case, surrounding whitespace and ``-``/space separators are
        normalized first (so "Must-Have" becomes "must_have"); anything that
        still is not an accepted weight falls back to the default weight.
        """
        if isinstance(value, str):
            candidate = value.strip().lower().replace("-", "_").replace(" ", "_")
            if candidate in cls._VALID_WEIGHTS:
                return candidate
        return cls._DEFAULT_WEIGHT

    @classmethod
    def _coerce_trl_level(cls, value: Any, default: int) -> int:
        """Convert one TRL bound to an int clamped to the 1-9 scale.

        Returns *default* when *value* is missing, a bool, or not convertible
        with ``int()``.
        """
        if isinstance(value, bool):
            return default
        try:
            level = int(value)
        except (TypeError, ValueError, OverflowError):
            return default
        return max(cls._TRL_FLOOR, min(cls._TRL_CEILING, level))

    @classmethod
    def _coerce_trl_range(cls, value: Any) -> Tuple[int, int]:
        """Build an ordered ``(min, max)`` TRL tuple from the "target_trl" section.

        Each bound is validated independently and falls back to the default
        window; the pair is swapped if the model returned it reversed.
        """
        default_low, default_high = cls._DEFAULT_TRL_RANGE
        trl = value if isinstance(value, dict) else {}
        low = cls._coerce_trl_level(trl.get("min"), default_low)
        high = cls._coerce_trl_level(trl.get("max"), default_high)
        return (low, high) if low <= high else (high, low)

    def _build_parsed_capability(self, query: str, data: dict) -> "ParsedCapability":
        """Build ParsedCapability from LLM response data.

        Args:
            query: The original capability description.
            data: Decoded JSON object returned by the model.

        Returns:
            ParsedCapability with ``success=True``. Missing or malformed
            fields are replaced by empty values or the class defaults, and at
            least one ``must_have`` criterion is always present.
        """
        data = self._as_dict(data)

        # Extract capability need
        need_data = self._as_dict(data.get("capability_need"))
        functional_need = self._as_text(need_data.get("functional_need"))
        capability_need = CapabilityNeed(
            functional_need=functional_need,
            domain=self._as_text(need_data.get("domain")),
            implied_constraints=self._string_list(need_data.get("implied_constraints")),
            technology_types_sought=self._string_list(need_data.get("technology_types_sought")),
        )

        # Extract capability criteria; entries that are not JSON objects are
        # skipped because they carry no criterion/weight/keywords structure.
        raw_criteria = data.get("capability_criteria")
        if not isinstance(raw_criteria, list):
            raw_criteria = []
        capability_criteria = [
            CapabilityCriterion(
                criterion=self._as_text(entry.get("criterion")),
                weight=self._normalize_weight(entry.get("weight")),
                keywords=self._string_list(entry.get("keywords")),
            )
            for entry in raw_criteria
            if isinstance(entry, dict)
        ]

        # Ensure at least one must_have criterion: promote the first one, or
        # synthesize one from the functional need when the list is empty.
        if not any(c.weight == "must_have" for c in capability_criteria):
            if capability_criteria:
                capability_criteria[0].weight = "must_have"
            else:
                capability_criteria.append(CapabilityCriterion(
                    criterion=functional_need or "Address the capability need",
                    weight="must_have",
                    keywords=self._string_list(data.get("keywords")),
                ))

        # Extract technology indicators, using the defaults when a list is
        # missing or is not a list at all.
        indicators_data = self._as_dict(data.get("technology_indicators"))
        technology_indicators = TechnologyIndicators(
            positive=self._string_list(
                indicators_data.get("positive"), self._DEFAULT_POSITIVE_INDICATORS
            ),
            negative=self._string_list(
                indicators_data.get("negative"), self._DEFAULT_NEGATIVE_INDICATORS
            ),
        )

        return ParsedCapability(
            original_query=query,
            understanding=self._as_text(data.get("understanding")),
            technical_domains=self._string_list(data.get("technical_domains")),
            search_queries=self._string_list(data.get("search_queries")),
            sbir_queries=self._string_list(data.get("sbir_queries")),
            patent_queries=self._string_list(data.get("patent_queries")),
            news_queries=self._string_list(data.get("news_queries")),
            keywords=self._string_list(data.get("keywords")),
            exclusions=self._string_list(data.get("exclusions")),
            target_trl_range=self._coerce_trl_range(data.get("target_trl")),
            capability_need=capability_need,
            capability_criteria=capability_criteria,
            technology_indicators=technology_indicators,
            success=True,
        )

    def _create_fallback(self, query: str, error: str) -> "ParsedCapability":
        """Create a fallback ParsedCapability when LLM fails.

        Search queries and keywords are derived from the whitespace-split
        words of *query*; the result is marked ``success=False`` and carries
        *error*.
        """
        # Generate basic queries from the input
        words = query.split()
        lead = " ".join(words[:5])
        basic_queries = [
            query,
            lead + " technology",
            lead + " system prototype",
        ]

        return ParsedCapability(
            original_query=query,
            understanding="",
            technical_domains=[],
            search_queries=basic_queries,
            sbir_queries=basic_queries[:2],
            patent_queries=basic_queries[:2],
            news_queries=basic_queries[:2],
            keywords=words[:10],
            exclusions=[],
            capability_need=CapabilityNeed(
                functional_need=query,
                domain="",
                implied_constraints=[],
                technology_types_sought=["system", "technology"],
            ),
            capability_criteria=[
                CapabilityCriterion(
                    criterion="Address the stated capability need",
                    weight="must_have",
                    keywords=words[:5],
                )
            ],
            technology_indicators=TechnologyIndicators(
                positive=["prototype", "demonstrated", "system", "device"],
                negative=["overview", "challenge", "review"],
            ),
            success=False,
            error=error,
        )