TechScout/techscout/technology/extractor.py

457 lines
16 KiB
Python

"""
Technology Extractor for TechScout.
Extracts specific, nameable technologies from search results.
Filters out results that are just topic discussions or generic content.
This is Stage 3 of the Capability-Technology Matching pipeline.
"""
import json
import logging
import uuid
from typing import List, Optional, Tuple
from dataclasses import dataclass
from ..extraction.llm_client import OllamaClient
from ..search.base import SearchResult
from ..capability.types import CapabilityNeed, TechnologyIndicators
from .types import ExtractedTechnology
logger = logging.getLogger(__name__)
@dataclass
class ExtractionResult:
    """Result of technology extraction from search results.

    Attributes:
        technologies: Technologies that were successfully extracted.
        discarded_ids: Identifiers of results that did not yield a
            technology. TechnologyExtractor in this module fills this
            list with the ``url`` of each rejected search result.
        extraction_stats: Summary counters describing the run, e.g.
            ``total_results``, ``technologies_extracted`` and
            ``results_discarded`` as produced by
            ``TechnologyExtractor.extract_all``.
    """
    technologies: List[ExtractedTechnology]
    discarded_ids: List[str]  # IDs of results that weren't technologies
    extraction_stats: dict
class TechnologyExtractor:
    """
    Extracts specific technologies from search results.

    Uses a combination of:
    1. Heuristic pre-filtering (source type, keyword indicators)
    2. LLM-based extraction (for actual technology identification)

    Results from high/medium-likelihood sources are always sent to the
    LLM; every other source type is first passed through a cheap keyword
    filter so that obviously generic content never costs an LLM call.
    """

    EXTRACTION_PROMPT = """Analyze this search result and determine if it describes a SPECIFIC TECHNOLOGY.
CONTEXT - User is looking for:
{functional_need}
SEARCH RESULT:
Source Type: {source_type}
Title: {title}
Content: {snippet}
Organization: {organization}
INSTRUCTIONS:
1. Does this describe a SPECIFIC, NAMEABLE technology?
- YES examples: "SPAD Array Detector", "Quantum dot infrared sensor", "ML-based tracking algorithm"
- NO examples: Topic discussions, overviews, problem descriptions, generic categories like "sensor technology"
2. If YES, extract technology details. If NO, explain why.
Respond with JSON:
{{
"is_technology": true/false,
"reason": "Why this is/isn't a specific technology",
"technology": {{
"name": "Specific technology name",
"type": "sensor|algorithm|material|system|platform|device|method|software|other",
"description": "What it is (1-2 sentences)",
"capabilities": ["capability 1", "capability 2"],
"mechanism": "How it works (if described)",
"developer": "Organization name if mentioned",
"developer_type": "company|university|government|lab|unknown",
"trl_indicators": ["Phase II SBIR", "prototype demonstrated", etc.]
}}
}}
If is_technology is false, omit the technology object."""

    BATCH_EXTRACTION_PROMPT = """Analyze these search results and identify which describe SPECIFIC TECHNOLOGIES.
CONTEXT - User is looking for:
{functional_need}
SEARCH RESULTS:
{results_text}
For each result, determine if it describes a specific, nameable technology (not generic category or topic discussion).
Respond with JSON:
{{
"extractions": [
{{
"item_number": 1,
"is_technology": true/false,
"reason": "brief explanation",
"technology": {{...}} or null
}},
...
]
}}
Technology object schema (when is_technology is true):
{{
"name": "Specific technology name",
"type": "sensor|algorithm|material|system|platform|device|method|software|other",
"description": "What it is",
"capabilities": ["cap1", "cap2"],
"developer": "Organization if mentioned",
"trl_indicators": ["evidence of maturity"]
}}"""

    # Source types more likely to describe actual technologies
    HIGH_TECH_LIKELIHOOD_SOURCES = {"sbir", "patent"}
    MEDIUM_TECH_LIKELIHOOD_SOURCES = {"contract"}
    LOW_TECH_LIKELIHOOD_SOURCES = {"news", "web", "academic"}

    # Keywords used by the heuristic pre-filter when the caller supplies
    # no TechnologyIndicators.
    _DEFAULT_POSITIVE_INDICATORS = frozenset({
        "prototype", "demonstrated", "system", "device", "sensor",
        "developed", "patent", "phase ii", "tested", "deployed",
    })

    # (TRL, keywords) pairs checked top to bottom; the first hit wins.
    # Order matters: "phase iii" must be tested before "phase ii", which
    # must be tested before "phase i", because each is a substring of the
    # previous one.
    _TRL_KEYWORD_LEVELS = (
        (9, ("deployed", "operational", "fielded")),
        (8, ("production", "qualified")),
        (7, ("phase iii", "demonstration")),
        (5, ("prototype", "phase ii")),
        (4, ("phase i", "laboratory")),
        (3, ("concept", "proof")),
        (2, ("research", "basic")),
    )

    # TRL assumed when no indicator keyword matches, keyed by source type.
    _SOURCE_TRL_DEFAULTS = {
        "sbir": 4,
        "patent": 5,
        "contract": 6,
        "news": 5,
        "web": 5,
    }
    _FALLBACK_TRL = 5

    def __init__(
        self,
        ollama_client: Optional[OllamaClient] = None,
        model: str = "mistral-nemo:12b"
    ):
        """
        Args:
            ollama_client: LLM client to use; a default OllamaClient is
                created when omitted.
            model: Name of the model passed to every generate() call.
        """
        self.client = ollama_client or OllamaClient()
        self.model = model

    def extract_all(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed,
        technology_indicators: Optional[TechnologyIndicators] = None,
        batch_size: int = 5
    ) -> ExtractionResult:
        """
        Extract technologies from all search results.

        Args:
            results: List of search results to process
            capability_need: Structured capability need for context
            technology_indicators: Positive/negative indicators
            batch_size: How many results to process per LLM call
                (values below 1 are treated as 1)

        Returns:
            ExtractionResult with extracted technologies and stats
        """
        logger.info("Extracting technologies from %d results...", len(results))

        technologies: List[ExtractedTechnology] = []
        discarded_ids: List[str] = []

        # Separate by likelihood of being a technology
        high_priority: List[SearchResult] = []
        medium_priority: List[SearchResult] = []
        low_priority: List[SearchResult] = []
        for result in results:
            if result.source_type in self.HIGH_TECH_LIKELIHOOD_SOURCES:
                high_priority.append(result)
            elif result.source_type in self.MEDIUM_TECH_LIKELIHOOD_SOURCES:
                medium_priority.append(result)
            else:
                low_priority.append(result)

        # Process high priority (all)
        if high_priority:
            logger.info(
                "Processing %d high-priority results (SBIR/patents)...",
                len(high_priority),
            )
            techs, discarded = self._process_batch_results(
                high_priority, capability_need, batch_size
            )
            technologies.extend(techs)
            discarded_ids.extend(discarded)

        # Process medium priority (all)
        if medium_priority:
            logger.info(
                "Processing %d medium-priority results (contracts)...",
                len(medium_priority),
            )
            techs, discarded = self._process_batch_results(
                medium_priority, capability_need, batch_size
            )
            technologies.extend(techs)
            discarded_ids.extend(discarded)

        # Process low priority with pre-filtering
        filtered_low: List[SearchResult] = []
        if low_priority:
            # Apply heuristic filter first
            filtered_low = self._heuristic_filter(low_priority, technology_indicators)
            logger.info(
                "Processing %d/%d low-priority results (passed heuristic filter)...",
                len(filtered_low),
                len(low_priority),
            )
            if filtered_low:
                techs, discarded = self._process_batch_results(
                    filtered_low, capability_need, batch_size
                )
                technologies.extend(techs)
                discarded_ids.extend(discarded)

            # Add filtered-out results to discarded. Identity lookup keeps
            # this linear instead of scanning filtered_low per result.
            kept_identities = {id(kept) for kept in filtered_low}
            discarded_ids.extend(
                r.url for r in low_priority if id(r) not in kept_identities
            )

        logger.info(
            "Extracted %d technologies, discarded %d results",
            len(technologies),
            len(discarded_ids),
        )

        return ExtractionResult(
            technologies=technologies,
            discarded_ids=discarded_ids,
            extraction_stats={
                "total_results": len(results),
                "high_priority_processed": len(high_priority),
                "medium_priority_processed": len(medium_priority),
                # Kept for backward compatibility: counts ALL low-priority
                # results, including those the heuristic filter rejected.
                "low_priority_processed": len(low_priority),
                # Low-priority results that actually reached the LLM.
                "low_priority_passed_filter": len(filtered_low),
                "technologies_extracted": len(technologies),
                "results_discarded": len(discarded_ids),
            }
        )

    def _heuristic_filter(
        self,
        results: List[SearchResult],
        indicators: Optional[TechnologyIndicators]
    ) -> List[SearchResult]:
        """Apply heuristic filtering to identify likely technology results.

        A result is kept when its title or snippet contains at least one
        positive indicator (case-insensitive substring match). Negative
        indicators do not influence the decision: the previous code
        counted them but kept every result with a positive hit anyway, so
        that effective behaviour is preserved here without the dead code.

        Args:
            results: Candidate results to screen.
            indicators: Caller-supplied indicators; built-in defaults are
                used when this is missing.

        Returns:
            The subset of ``results`` (same objects, same order) that
            contain a positive indicator.
        """
        if not indicators:
            positive = self._DEFAULT_POSITIVE_INDICATORS
        else:
            # Empty terms are skipped: "" is a substring of everything.
            positive = {term.lower() for term in indicators.positive if term}

        kept = []
        for result in results:
            text = f"{result.title} {result.snippet}".lower()
            if any(term in text for term in positive):
                kept.append(result)
        return kept

    def _process_batch_results(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed,
        batch_size: int
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """Process results in batches using LLM.

        Args:
            results: Results to send to the LLM.
            capability_need: Capability context inserted into the prompt.
            batch_size: Results per LLM call; clamped to at least 1 so a
                zero or negative value cannot break the chunking loop.

        Returns:
            ``(technologies, discarded_urls)``.
        """
        technologies: List[ExtractedTechnology] = []
        discarded: List[str] = []

        step = max(1, batch_size)
        for start in range(0, len(results), step):
            batch = results[start:start + step]
            if len(batch) == 1:
                # Single result - the richer single-item prompt is used
                techs, rejected = self._extract_individually(batch, capability_need)
            else:
                # Multiple results - use batch extraction
                techs, rejected = self._extract_batch(batch, capability_need)
            technologies.extend(techs)
            discarded.extend(rejected)

        return technologies, discarded

    def _extract_individually(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """Run single-result extraction on each result, one LLM call apiece."""
        technologies: List[ExtractedTechnology] = []
        discarded: List[str] = []
        for result in results:
            tech = self._extract_single(result, capability_need)
            if tech:
                technologies.append(tech)
            else:
                discarded.append(result.url)
        return technologies, discarded

    def _parse_json_response(self, content: object) -> object:
        """Decode an LLM response, falling back to the client's lenient parser.

        Returns whatever JSON value was decoded (not necessarily a dict),
        or None for empty content.
        """
        if not content:
            return None
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            # The model sometimes wraps JSON in prose; let the client dig it out.
            return self.client.extract_json_from_text(content)

    def _extract_single(
        self,
        result: SearchResult,
        capability_need: CapabilityNeed
    ) -> Optional[ExtractedTechnology]:
        """Extract technology from a single result.

        Returns:
            The extracted technology, or None when the LLM call fails,
            the response is not a usable JSON object, or the result is
            judged not to describe a named technology.
        """
        prompt = self.EXTRACTION_PROMPT.format(
            functional_need=capability_need.functional_need,
            source_type=result.source_type,
            title=result.title,
            snippet=(result.snippet or "")[:500],
            organization=result.organization or "Unknown"
        )
        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.1,
            format="json"
        )
        if not response.success:
            logger.warning("LLM extraction failed for: %s", (result.title or "")[:50])
            return None

        data = self._parse_json_response(response.content)
        # The model may return a list or scalar; only an object is usable.
        if not isinstance(data, dict) or not data.get("is_technology", False):
            return None

        tech_data = data.get("technology")
        if not isinstance(tech_data, dict) or not tech_data.get("name"):
            return None

        return self._build_extracted_technology(result, tech_data)

    @staticmethod
    def _coerce_item_index(raw: object, count: int) -> Optional[int]:
        """Turn an LLM-reported 1-based item number into a 0-based index.

        Accepts ints and numeric strings; returns None for anything that
        is not a number in ``1..count``.
        """
        if isinstance(raw, bool):
            return None
        try:
            number = int(raw)
        except (TypeError, ValueError, OverflowError):
            return None
        if 1 <= number <= count:
            return number - 1
        return None

    def _extract_batch(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """Extract technologies from a batch of results.

        Every input result ends up in exactly one of the two returned
        lists. Results the batch response does not cover (failed call,
        unparseable/ill-shaped JSON, or items the model simply omitted)
        are re-tried one by one via ``_extract_single``.

        Returns:
            ``(technologies, discarded_urls)``.
        """
        # Build results text
        item_blocks = []
        for number, result in enumerate(results, 1):
            organization = result.organization or "Unknown"
            snippet = (result.snippet or "")[:300]
            item_blocks.append(
                f"\nItem {number}:\n"
                f"Source: {result.source_type}\n"
                f"Title: {result.title}\n"
                f"Organization: {organization}\n"
                f"Content: {snippet}\n"
                "---"
            )
        results_text = "".join(item_blocks)

        prompt = self.BATCH_EXTRACTION_PROMPT.format(
            functional_need=capability_need.functional_need,
            results_text=results_text
        )
        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.1,
            format="json"
        )

        if not response.success:
            logger.warning("Batch extraction failed, falling back to individual extraction")
            return self._extract_individually(results, capability_need)

        data = self._parse_json_response(response.content)
        extractions = data.get("extractions") if isinstance(data, dict) else None
        if not isinstance(extractions, list):
            # Fallback to individual extraction
            return self._extract_individually(results, capability_need)

        technologies: List[ExtractedTechnology] = []
        discarded: List[str] = []
        handled = set()

        for extraction in extractions:
            if not isinstance(extraction, dict):
                continue
            index = self._coerce_item_index(extraction.get("item_number"), len(results))
            # Ignore unknown item numbers and repeats of an item already seen.
            if index is None or index in handled:
                continue
            handled.add(index)

            result = results[index]
            tech_data = extraction.get("technology")
            if (
                extraction.get("is_technology", False)
                and isinstance(tech_data, dict)
                and tech_data.get("name")
            ):
                technologies.append(self._build_extracted_technology(result, tech_data))
            else:
                discarded.append(result.url)

        # Items the model left out of its answer would otherwise vanish
        # from both lists; give each of them an individual attempt.
        missed = [r for position, r in enumerate(results) if position not in handled]
        if missed:
            techs, rejected = self._extract_individually(missed, capability_need)
            technologies.extend(techs)
            discarded.extend(rejected)

        return technologies, discarded

    @staticmethod
    def _normalize_str_list(value: object) -> List[str]:
        """Coerce an LLM-supplied field into a list of strings.

        ``None`` becomes ``[]``, a bare string becomes a one-element list
        (or ``[]`` when blank), and ``None`` entries inside a sequence are
        dropped.
        """
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value.strip() else []
        if isinstance(value, (list, tuple, set)):
            return [str(item) for item in value if item is not None]
        return [str(value)]

    def _build_extracted_technology(
        self,
        result: SearchResult,
        tech_data: dict
    ) -> ExtractedTechnology:
        """Build ExtractedTechnology from extraction data.

        Args:
            result: The search result the technology was found in.
            tech_data: The "technology" object returned by the LLM.
                Missing or null fields fall back to defaults.
        """
        tech_id = str(uuid.uuid4())[:8]

        # Estimate TRL from indicators
        trl_indicators = self._normalize_str_list(tech_data.get("trl_indicators"))
        trl_estimate = self._estimate_trl(trl_indicators, result)

        is_high_likelihood = result.source_type in self.HIGH_TECH_LIKELIHOOD_SOURCES

        return ExtractedTechnology(
            id=tech_id,
            source_result_id=result.url,
            extraction_confidence=0.8 if is_high_likelihood else 0.6,
            name=tech_data.get("name", ""),
            # "or" rather than a .get default so explicit nulls also fall back.
            technology_type=tech_data.get("type") or "system",
            description=tech_data.get("description") or "",
            capabilities=self._normalize_str_list(tech_data.get("capabilities")),
            mechanism=tech_data.get("mechanism"),
            developer=tech_data.get("developer") or result.organization,
            developer_type=tech_data.get("developer_type") or "unknown",
            trl_estimate=trl_estimate,
            trl_evidence=trl_indicators,
            source_type=result.source_type,
            source_url=result.url,
            source_title=result.title,
            source_snippet=result.snippet,
        )

    def _estimate_trl(
        self,
        trl_indicators: List[str],
        result: SearchResult
    ) -> Optional[int]:
        """Estimate TRL from indicators and source type.

        Precedence: a TRL already present on the result, then the first
        keyword match in the indicators (highest TRL checked first), then
        a per-source-type default.

        Args:
            trl_indicators: Maturity evidence phrases from the LLM. A
                bare string, ``None`` or non-string entries are tolerated.
            result: The originating search result.
        """
        # Use existing TRL if available
        if result.trl_estimate:
            return result.trl_estimate

        # Estimate from indicators
        indicators_lower = " ".join(self._normalize_str_list(trl_indicators)).lower()
        for level, keywords in self._TRL_KEYWORD_LEVELS:
            if any(keyword in indicators_lower for keyword in keywords):
                return level

        # Estimate from source type
        return self._SOURCE_TRL_DEFAULTS.get(result.source_type, self._FALLBACK_TRL)