457 lines
16 KiB
Python
457 lines
16 KiB
Python
"""
|
|
Technology Extractor for TechScout.
|
|
|
|
Extracts specific, nameable technologies from search results.
|
|
Filters out results that are just topic discussions or generic content.
|
|
|
|
This is Stage 3 of the Capability-Technology Matching pipeline.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import uuid
|
|
from typing import List, Optional, Tuple
|
|
from dataclasses import dataclass
|
|
|
|
from ..extraction.llm_client import OllamaClient
|
|
from ..search.base import SearchResult
|
|
from ..capability.types import CapabilityNeed, TechnologyIndicators
|
|
from .types import ExtractedTechnology
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class ExtractionResult:
    """Outcome of running technology extraction over a set of search results."""

    # Technologies that were successfully extracted.
    technologies: List[ExtractedTechnology]
    # IDs of results that weren't technologies
    discarded_ids: List[str]
    # Summary counters describing the extraction run.
    extraction_stats: dict
|
|
|
|
|
|
class TechnologyExtractor:
    """
    Extracts specific technologies from search results.

    Uses a combination of:
    1. Heuristic pre-filtering (source type, keyword indicators)
    2. LLM-based extraction (for actual technology identification)

    Every result handed to the LLM stage ends up either as an extracted
    technology or in the discarded list; results the LLM leaves out of a
    batch response are retried one at a time rather than being lost.
    """

    EXTRACTION_PROMPT = """Analyze this search result and determine if it describes a SPECIFIC TECHNOLOGY.

CONTEXT - User is looking for:
{functional_need}

SEARCH RESULT:
Source Type: {source_type}
Title: {title}
Content: {snippet}
Organization: {organization}

INSTRUCTIONS:
1. Does this describe a SPECIFIC, NAMEABLE technology?
- YES examples: "SPAD Array Detector", "Quantum dot infrared sensor", "ML-based tracking algorithm"
- NO examples: Topic discussions, overviews, problem descriptions, generic categories like "sensor technology"

2. If YES, extract technology details. If NO, explain why.

Respond with JSON:
{{
"is_technology": true/false,
"reason": "Why this is/isn't a specific technology",
"technology": {{
"name": "Specific technology name",
"type": "sensor|algorithm|material|system|platform|device|method|software|other",
"description": "What it is (1-2 sentences)",
"capabilities": ["capability 1", "capability 2"],
"mechanism": "How it works (if described)",
"developer": "Organization name if mentioned",
"developer_type": "company|university|government|lab|unknown",
"trl_indicators": ["Phase II SBIR", "prototype demonstrated", etc.]
}}
}}

If is_technology is false, omit the technology object."""

    BATCH_EXTRACTION_PROMPT = """Analyze these search results and identify which describe SPECIFIC TECHNOLOGIES.

CONTEXT - User is looking for:
{functional_need}

SEARCH RESULTS:
{results_text}

For each result, determine if it describes a specific, nameable technology (not generic category or topic discussion).

Respond with JSON:
{{
"extractions": [
{{
"item_number": 1,
"is_technology": true/false,
"reason": "brief explanation",
"technology": {{...}} or null
}},
...
]
}}

Technology object schema (when is_technology is true):
{{
"name": "Specific technology name",
"type": "sensor|algorithm|material|system|platform|device|method|software|other",
"description": "What it is",
"capabilities": ["cap1", "cap2"],
"developer": "Organization if mentioned",
"trl_indicators": ["evidence of maturity"]
}}"""

    # Source types more likely to describe actual technologies
    HIGH_TECH_LIKELIHOOD_SOURCES = {"sbir", "patent"}
    MEDIUM_TECH_LIKELIHOOD_SOURCES = {"contract"}
    # Informational only: any source type not listed in the two sets above is
    # treated as low priority, whether or not it appears here.
    LOW_TECH_LIKELIHOOD_SOURCES = {"news", "web", "academic"}

    # Keywords used by the heuristic pre-filter when the caller supplies no
    # TechnologyIndicators.
    _DEFAULT_POSITIVE_TERMS = frozenset({
        "prototype", "demonstrated", "system", "device", "sensor",
        "developed", "patent", "phase ii", "tested", "deployed",
    })

    # (TRL, keywords) pairs checked top to bottom; the first match wins.
    # Order matters because matching is by substring: "phase iii" has to be
    # tested before "phase ii", which has to be tested before "phase i".
    _TRL_KEYWORDS = (
        (9, ("deployed", "operational", "fielded")),
        (8, ("production", "qualified")),
        (7, ("phase iii", "demonstration")),
        (5, ("prototype", "phase ii")),
        (4, ("phase i", "laboratory")),
        (3, ("concept", "proof")),
        (2, ("research", "basic")),
    )

    # Fallback TRL per source type when no keyword evidence is available.
    _SOURCE_TRL_DEFAULTS = {
        "sbir": 4,
        "patent": 5,
        "contract": 6,
        "news": 5,
        "web": 5,
    }

    def __init__(
        self,
        ollama_client: Optional[OllamaClient] = None,
        model: str = "mistral-nemo:12b"
    ):
        """
        Args:
            ollama_client: LLM client to use; a default OllamaClient is
                created when omitted.
            model: Model name passed to every ``generate`` call.
        """
        self.client = ollama_client if ollama_client is not None else OllamaClient()
        self.model = model

    def extract_all(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed,
        technology_indicators: Optional[TechnologyIndicators] = None,
        batch_size: int = 5
    ) -> ExtractionResult:
        """
        Extract technologies from all search results.

        Results are split by source type. High- and medium-priority sources
        are always sent to the LLM; everything else must first pass the
        keyword pre-filter.

        Args:
            results: List of search results to process
            capability_need: Structured capability need for context
            technology_indicators: Positive/negative indicators
            batch_size: How many results to process per LLM call (>= 1)

        Returns:
            ExtractionResult with extracted technologies and stats

        Raises:
            ValueError: If ``batch_size`` is less than 1.
        """
        # A zero step would crash inside range() and a negative one would
        # silently process nothing, so reject both up front.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        logger.info(f"Extracting technologies from {len(results)} results...")

        technologies = []
        discarded_ids = []

        # Separate by likelihood of being a technology
        high_priority = []
        medium_priority = []
        low_priority = []
        for result in results:
            if result.source_type in self.HIGH_TECH_LIKELIHOOD_SOURCES:
                high_priority.append(result)
            elif result.source_type in self.MEDIUM_TECH_LIKELIHOOD_SOURCES:
                medium_priority.append(result)
            else:
                low_priority.append(result)

        # High and medium priority tiers are processed in full.
        always_processed = (
            ("high-priority results (SBIR/patents)", high_priority),
            ("medium-priority results (contracts)", medium_priority),
        )
        for label, tier in always_processed:
            if not tier:
                continue
            logger.info(f"Processing {len(tier)} {label}...")
            techs, discarded = self._process_batch_results(
                tier, capability_need, batch_size
            )
            technologies.extend(techs)
            discarded_ids.extend(discarded)

        # Low priority results only reach the LLM if the heuristic lets them.
        filtered_low = []
        if low_priority:
            filtered_low = self._heuristic_filter(low_priority, technology_indicators)
            logger.info(f"Processing {len(filtered_low)}/{len(low_priority)} low-priority results (passed heuristic filter)...")

            if filtered_low:
                techs, discarded = self._process_batch_results(
                    filtered_low, capability_need, batch_size
                )
                technologies.extend(techs)
                discarded_ids.extend(discarded)

            # Results rejected by the heuristic count as discarded. Identity
            # lookup avoids a quadratic equality scan over the kept list.
            kept = {id(r) for r in filtered_low}
            discarded_ids.extend(r.url for r in low_priority if id(r) not in kept)

        logger.info(f"Extracted {len(technologies)} technologies, discarded {len(discarded_ids)} results")

        return ExtractionResult(
            technologies=technologies,
            discarded_ids=discarded_ids,
            extraction_stats={
                "total_results": len(results),
                "high_priority_processed": len(high_priority),
                "medium_priority_processed": len(medium_priority),
                # Only results that passed the pre-filter were actually processed.
                "low_priority_processed": len(filtered_low),
                "low_priority_filtered_out": len(low_priority) - len(filtered_low),
                "technologies_extracted": len(technologies),
                "results_discarded": len(discarded_ids),
            }
        )

    def _heuristic_filter(
        self,
        results: List[SearchResult],
        indicators: Optional[TechnologyIndicators]
    ) -> List[SearchResult]:
        """
        Apply heuristic filtering to identify likely technology results.

        A result is kept when its title or snippet contains at least one
        positive term (case-insensitive substring match). Input order is
        preserved.

        NOTE: negative indicators do not influence the decision. The previous
        implementation counted them, but both of its branches admitted every
        result with a positive hit, so the effective rule has always been
        "any positive term found".

        Args:
            results: Candidate results to screen.
            indicators: Caller-supplied indicators; built-in default terms
                are used when this is missing or falsy.

        Returns:
            The subset of ``results`` that passed the filter.
        """
        if indicators:
            positive = {term.lower() for term in indicators.positive}
        else:
            positive = self._DEFAULT_POSITIVE_TERMS

        filtered = []
        for result in results:
            text = f"{result.title} {result.snippet}".lower()
            if any(term in text for term in positive):
                filtered.append(result)
        return filtered

    def _process_batch_results(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed,
        batch_size: int
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """
        Process results in batches using LLM.

        Returns:
            ``(technologies, discarded_urls)`` accumulated over all batches.
        """
        technologies = []
        discarded = []

        for start in range(0, len(results), batch_size):
            batch = results[start:start + batch_size]

            if len(batch) == 1:
                # A lone result gets the richer single-item prompt.
                techs, dropped = self._extract_individually(batch, capability_need)
            else:
                techs, dropped = self._extract_batch(batch, capability_need)

            technologies.extend(techs)
            discarded.extend(dropped)

        return technologies, discarded

    def _extract_individually(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """Run single-item extraction on each result; failures are discarded."""
        technologies = []
        discarded = []
        for result in results:
            tech = self._extract_single(result, capability_need)
            if tech:
                technologies.append(tech)
            else:
                discarded.append(result.url)
        return technologies, discarded

    def _parse_json_object(self, content) -> Optional[dict]:
        """
        Parse an LLM response into a non-empty JSON object.

        Falls back to the client's ``extract_json_from_text`` when the
        content is not valid JSON. Returns None for anything that is not a
        non-empty dict, so callers can use ``.get`` safely.
        """
        try:
            data = json.loads(content)
        except json.JSONDecodeError:
            data = self.client.extract_json_from_text(content)
        if isinstance(data, dict) and data:
            return data
        return None

    @staticmethod
    def _as_str_list(value) -> List[str]:
        """Coerce an LLM-supplied value (None, string, or sequence) to a list of strings."""
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value.strip() else []
        if isinstance(value, (list, tuple, set)):
            return [str(item) for item in value if item is not None]
        return [str(value)]

    @staticmethod
    def _item_index(item_number, count: int) -> Optional[int]:
        """
        Translate a 1-based ``item_number`` from the LLM into a 0-based index.

        Accepts ints and numeric strings; returns None for anything that is
        not a number or falls outside ``1..count``.
        """
        if isinstance(item_number, bool):
            return None
        try:
            number = int(item_number)
        except (TypeError, ValueError):
            return None
        if 1 <= number <= count:
            return number - 1
        return None

    def _extract_single(
        self,
        result: SearchResult,
        capability_need: CapabilityNeed
    ) -> Optional[ExtractedTechnology]:
        """
        Extract technology from a single result.

        Returns:
            The extracted technology, or None when the LLM call fails, the
            response cannot be parsed into an object, the result is judged
            not to be a technology, or no technology name was returned.
        """
        prompt = self.EXTRACTION_PROMPT.format(
            functional_need=capability_need.functional_need,
            source_type=result.source_type,
            title=result.title,
            # Tolerate a missing snippet instead of failing on the slice.
            snippet=(result.snippet or "")[:500],
            organization=result.organization or "Unknown"
        )

        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.1,
            format="json"
        )

        if not response.success:
            logger.warning(f"LLM extraction failed for: {str(result.title)[:50]}")
            return None

        data = self._parse_json_object(response.content)
        if data is None or not data.get("is_technology", False):
            return None

        tech_data = data.get("technology")
        if not isinstance(tech_data, dict) or not tech_data.get("name"):
            return None

        return self._build_extracted_technology(result, tech_data)

    def _extract_batch(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """
        Extract technologies from a batch of results.

        One LLM call covers the whole batch. If the call fails or the
        response is unusable, every result is retried individually. Results
        the response does not mention (missing or invalid ``item_number``)
        are also retried individually, so each input ends up either
        extracted or discarded.

        Returns:
            ``(technologies, discarded_urls)`` for this batch.
        """
        # Build results text
        results_text = "".join(
            f"\nItem {number}:\n"
            f"Source: {result.source_type}\n"
            f"Title: {result.title}\n"
            f"Organization: {result.organization or 'Unknown'}\n"
            f"Content: {(result.snippet or '')[:300]}\n"
            "---"
            for number, result in enumerate(results, 1)
        )

        prompt = self.BATCH_EXTRACTION_PROMPT.format(
            functional_need=capability_need.functional_need,
            results_text=results_text
        )

        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.1,
            format="json"
        )

        if not response.success:
            logger.warning("Batch extraction failed, falling back to individual extraction")
            return self._extract_individually(results, capability_need)

        data = self._parse_json_object(response.content)
        if data is None:
            # Fallback to individual extraction
            return self._extract_individually(results, capability_need)

        extractions = data.get("extractions")
        if not isinstance(extractions, list):
            extractions = []

        technologies = []
        discarded = []
        handled = set()  # indices of results the response has accounted for

        for extraction in extractions:
            if not isinstance(extraction, dict):
                continue
            index = self._item_index(extraction.get("item_number"), len(results))
            # Ignore unusable item numbers and repeated entries (first one wins).
            if index is None or index in handled:
                continue
            handled.add(index)
            result = results[index]

            tech_data = extraction.get("technology")
            if (
                extraction.get("is_technology", False)
                and isinstance(tech_data, dict)
                and tech_data.get("name")
            ):
                tech = self._build_extracted_technology(result, tech_data)
                if tech:
                    technologies.append(tech)
                    continue

            discarded.append(result.url)

        # Anything the LLM skipped would otherwise vanish from both lists.
        missed = [r for i, r in enumerate(results) if i not in handled]
        if missed:
            logger.warning(
                f"Batch response skipped {len(missed)} of {len(results)} results, retrying individually"
            )
            techs, dropped = self._extract_individually(missed, capability_need)
            technologies.extend(techs)
            discarded.extend(dropped)

        return technologies, discarded

    def _build_extracted_technology(
        self,
        result: SearchResult,
        tech_data: dict
    ) -> ExtractedTechnology:
        """
        Build ExtractedTechnology from extraction data.

        LLM-supplied fields are normalised: list-valued fields are coerced to
        lists of strings and null type/description/developer_type values fall
        back to their defaults. Confidence is 0.8 for high-likelihood source
        types and 0.6 otherwise.
        """
        tech_id = str(uuid.uuid4())[:8]

        # Estimate TRL from indicators
        trl_indicators = self._as_str_list(tech_data.get("trl_indicators"))
        trl_estimate = self._estimate_trl(trl_indicators, result)

        is_high_likelihood = result.source_type in self.HIGH_TECH_LIKELIHOOD_SOURCES

        return ExtractedTechnology(
            id=tech_id,
            source_result_id=result.url,
            extraction_confidence=0.8 if is_high_likelihood else 0.6,
            name=tech_data.get("name", ""),
            technology_type=tech_data.get("type") or "system",
            description=tech_data.get("description") or "",
            capabilities=self._as_str_list(tech_data.get("capabilities")),
            mechanism=tech_data.get("mechanism"),
            developer=tech_data.get("developer") or result.organization,
            developer_type=tech_data.get("developer_type") or "unknown",
            trl_estimate=trl_estimate,
            trl_evidence=trl_indicators,
            source_type=result.source_type,
            source_url=result.url,
            source_title=result.title,
            source_snippet=result.snippet,
        )

    def _estimate_trl(
        self,
        trl_indicators: List[str],
        result: SearchResult
    ) -> Optional[int]:
        """
        Estimate TRL from indicators and source type.

        Precedence: an existing ``result.trl_estimate``, then the first
        keyword match in the indicator text, then a per-source-type default
        (5 for unknown source types).

        Args:
            trl_indicators: Maturity evidence strings from the LLM; None or a
                bare string is tolerated.
            result: The search result the technology came from.
        """
        # Use existing TRL if available
        if result.trl_estimate:
            return result.trl_estimate

        # Estimate from indicators
        evidence = " ".join(self._as_str_list(trl_indicators)).lower()
        for level, keywords in self._TRL_KEYWORDS:
            if any(keyword in evidence for keyword in keywords):
                return level

        # Estimate from source type
        return self._SOURCE_TRL_DEFAULTS.get(result.source_type, 5)
|