TechScout/techscout/technology/extractor.py

457 lines
16 KiB
Python
Raw Normal View History

2026-01-22 13:02:09 -05:00
"""
Technology Extractor for TechScout.
Extracts specific, nameable technologies from search results.
Filters out results that are just topic discussions or generic content.
This is Stage 3 of the Capability-Technology Matching pipeline.
"""
import json
import logging
import uuid
from typing import List, Optional, Tuple
from dataclasses import dataclass
from ..extraction.llm_client import OllamaClient
from ..search.base import SearchResult
from ..capability.types import CapabilityNeed, TechnologyIndicators
from .types import ExtractedTechnology
logger = logging.getLogger(__name__)
@dataclass
class ExtractionResult:
    """Result of technology extraction from search results."""
    # Technologies the LLM judged to be specific, nameable technologies.
    technologies: List[ExtractedTechnology]
    # IDs (result URLs) of search results that weren't technologies —
    # either rejected by the LLM or filtered out heuristically.
    discarded_ids: List[str]
    # Summary counters (totals per priority tier, extracted/discarded counts)
    # as assembled by TechnologyExtractor.extract_all.
    extraction_stats: dict
class TechnologyExtractor:
    """
    Extracts specific technologies from search results.

    Uses a combination of:
    1. Heuristic pre-filtering (source type, keyword indicators)
    2. LLM-based extraction (for actual technology identification)
    """

    EXTRACTION_PROMPT = """Analyze this search result and determine if it describes a SPECIFIC TECHNOLOGY.
CONTEXT - User is looking for:
{functional_need}
SEARCH RESULT:
Source Type: {source_type}
Title: {title}
Content: {snippet}
Organization: {organization}
INSTRUCTIONS:
1. Does this describe a SPECIFIC, NAMEABLE technology?
- YES examples: "SPAD Array Detector", "Quantum dot infrared sensor", "ML-based tracking algorithm"
- NO examples: Topic discussions, overviews, problem descriptions, generic categories like "sensor technology"
2. If YES, extract technology details. If NO, explain why.
Respond with JSON:
{{
"is_technology": true/false,
"reason": "Why this is/isn't a specific technology",
"technology": {{
"name": "Specific technology name",
"type": "sensor|algorithm|material|system|platform|device|method|software|other",
"description": "What it is (1-2 sentences)",
"capabilities": ["capability 1", "capability 2"],
"mechanism": "How it works (if described)",
"developer": "Organization name if mentioned",
"developer_type": "company|university|government|lab|unknown",
"trl_indicators": ["Phase II SBIR", "prototype demonstrated", etc.]
}}
}}
If is_technology is false, omit the technology object."""

    BATCH_EXTRACTION_PROMPT = """Analyze these search results and identify which describe SPECIFIC TECHNOLOGIES.
CONTEXT - User is looking for:
{functional_need}
SEARCH RESULTS:
{results_text}
For each result, determine if it describes a specific, nameable technology (not generic category or topic discussion).
Respond with JSON:
{{
"extractions": [
{{
"item_number": 1,
"is_technology": true/false,
"reason": "brief explanation",
"technology": {{...}} or null
}},
...
]
}}
Technology object schema (when is_technology is true):
{{
"name": "Specific technology name",
"type": "sensor|algorithm|material|system|platform|device|method|software|other",
"description": "What it is",
"capabilities": ["cap1", "cap2"],
"developer": "Organization if mentioned",
"trl_indicators": ["evidence of maturity"]
}}"""

    # Source types more likely to describe actual technologies.
    HIGH_TECH_LIKELIHOOD_SOURCES = {"sbir", "patent"}
    MEDIUM_TECH_LIKELIHOOD_SOURCES = {"contract"}
    LOW_TECH_LIKELIHOOD_SOURCES = {"news", "web", "academic"}

    def __init__(
        self,
        ollama_client: Optional[OllamaClient] = None,
        model: str = "mistral-nemo:12b"
    ):
        """
        Args:
            ollama_client: LLM client; a default OllamaClient is created if omitted.
            model: Ollama model name used for all extraction calls.
        """
        self.client = ollama_client or OllamaClient()
        self.model = model

    def extract_all(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed,
        technology_indicators: Optional[TechnologyIndicators] = None,
        batch_size: int = 5
    ) -> ExtractionResult:
        """
        Extract technologies from all search results.

        Results are triaged by source type: high (SBIR/patent) and medium
        (contract) likelihood sources all go to the LLM; low-likelihood
        sources are heuristically pre-filtered first, and results that fail
        the pre-filter are discarded without an LLM call.

        Args:
            results: List of search results to process
            capability_need: Structured capability need for context
            technology_indicators: Positive/negative indicators
            batch_size: How many results to process per LLM call
        Returns:
            ExtractionResult with extracted technologies and stats
        """
        logger.info("Extracting technologies from %d results...", len(results))
        technologies: List[ExtractedTechnology] = []
        discarded_ids: List[str] = []

        # Separate by likelihood of being a technology.
        high_priority: List[SearchResult] = []
        medium_priority: List[SearchResult] = []
        low_priority: List[SearchResult] = []
        for result in results:
            if result.source_type in self.HIGH_TECH_LIKELIHOOD_SOURCES:
                high_priority.append(result)
            elif result.source_type in self.MEDIUM_TECH_LIKELIHOOD_SOURCES:
                medium_priority.append(result)
            else:
                low_priority.append(result)

        # Process high priority (all).
        if high_priority:
            logger.info("Processing %d high-priority results (SBIR/patents)...", len(high_priority))
            techs, discarded = self._process_batch_results(
                high_priority, capability_need, batch_size
            )
            technologies.extend(techs)
            discarded_ids.extend(discarded)

        # Process medium priority (all).
        if medium_priority:
            logger.info("Processing %d medium-priority results (contracts)...", len(medium_priority))
            techs, discarded = self._process_batch_results(
                medium_priority, capability_need, batch_size
            )
            technologies.extend(techs)
            discarded_ids.extend(discarded)

        # Process low priority with pre-filtering.
        if low_priority:
            filtered_low = self._heuristic_filter(low_priority, technology_indicators)
            logger.info(
                "Processing %d/%d low-priority results (passed heuristic filter)...",
                len(filtered_low), len(low_priority)
            )
            if filtered_low:
                techs, discarded = self._process_batch_results(
                    filtered_low, capability_need, batch_size
                )
                technologies.extend(techs)
                discarded_ids.extend(discarded)
            # Add filtered-out results to discarded. Identity set gives O(1)
            # membership (the old `r not in filtered_low` was an O(n^2) scan
            # that also depended on SearchResult equality semantics).
            kept = {id(r) for r in filtered_low}
            discarded_ids.extend(r.url for r in low_priority if id(r) not in kept)

        logger.info(
            "Extracted %d technologies, discarded %d results",
            len(technologies), len(discarded_ids)
        )
        return ExtractionResult(
            technologies=technologies,
            discarded_ids=discarded_ids,
            extraction_stats={
                "total_results": len(results),
                "high_priority_processed": len(high_priority),
                "medium_priority_processed": len(medium_priority),
                "low_priority_processed": len(low_priority),
                "technologies_extracted": len(technologies),
                "results_discarded": len(discarded_ids),
            }
        )

    def _heuristic_filter(
        self,
        results: List[SearchResult],
        indicators: Optional[TechnologyIndicators]
    ) -> List[SearchResult]:
        """Apply heuristic filtering to identify likely technology results.

        A result passes when its title+snippet contains at least one
        positive indicator term (case-insensitive substring match).
        """
        if not indicators:
            # Default indicators.
            positive = {"prototype", "demonstrated", "system", "device", "sensor",
                        "developed", "patent", "phase ii", "tested", "deployed"}
            negative = {"overview", "challenge", "review", "introduction", "survey",
                        "future", "potential", "could", "might", "problems"}
        else:
            positive = {w.lower() for w in indicators.positive}
            negative = {w.lower() for w in indicators.negative}

        # NOTE(review): the original condition ("pos > 0 and pos >= neg" OR
        # "pos > 0") reduces to "any positive indicator present", so the
        # negative indicators above never exclude a result. Behavior is
        # preserved here; confirm whether negatives were meant to veto
        # (e.g. pos_count > neg_count) before changing.
        filtered = []
        for result in results:
            text = f"{result.title} {result.snippet}".lower()
            pos_count = sum(1 for term in positive if term in text)
            if pos_count > 0:
                filtered.append(result)
        return filtered

    def _process_batch_results(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed,
        batch_size: int
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """Process results in batches using LLM.

        Chunks `results` into groups of `batch_size`; a chunk of one uses the
        single-result prompt, larger chunks use the batch prompt.
        """
        technologies: List[ExtractedTechnology] = []
        discarded: List[str] = []
        for i in range(0, len(results), batch_size):
            batch = results[i:i + batch_size]
            if len(batch) == 1:
                # Single result - use single extraction.
                result = batch[0]
                tech = self._extract_single(result, capability_need)
                if tech:
                    technologies.append(tech)
                else:
                    discarded.append(result.url)
            else:
                # Multiple results - use batch extraction.
                batch_techs, batch_discarded = self._extract_batch(batch, capability_need)
                technologies.extend(batch_techs)
                discarded.extend(batch_discarded)
        return technologies, discarded

    def _extract_individually(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """Fallback path: run single extraction over each result in turn.

        Used when a batched LLM call fails or returns unparseable output.
        """
        technologies: List[ExtractedTechnology] = []
        discarded: List[str] = []
        for result in results:
            tech = self._extract_single(result, capability_need)
            if tech:
                technologies.append(tech)
            else:
                discarded.append(result.url)
        return technologies, discarded

    def _extract_single(
        self,
        result: SearchResult,
        capability_need: CapabilityNeed
    ) -> Optional[ExtractedTechnology]:
        """Extract technology from a single result.

        Returns None when the LLM call fails, the response isn't parseable
        JSON, the result isn't a technology, or no technology name was given.
        """
        prompt = self.EXTRACTION_PROMPT.format(
            functional_need=capability_need.functional_need,
            source_type=result.source_type,
            title=result.title,
            snippet=result.snippet[:500],  # cap prompt size
            organization=result.organization or "Unknown"
        )
        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.1,
            format="json"
        )
        if not response.success:
            logger.warning(f"LLM extraction failed for: {result.title[:50]}")
            return None
        try:
            data = json.loads(response.content)
        except json.JSONDecodeError:
            # Model sometimes wraps JSON in prose; try a lenient extraction.
            data = self.client.extract_json_from_text(response.content)
            if not data:
                return None
        if not data.get("is_technology", False):
            return None
        tech_data = data.get("technology", {})
        if not tech_data or not tech_data.get("name"):
            return None
        return self._build_extracted_technology(result, tech_data)

    def _extract_batch(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """Extract technologies from a batch of results.

        Falls back to per-result extraction when the batched call fails.
        Results the LLM does not mention are discarded (previously they were
        silently dropped: neither extracted nor discarded).
        """
        # Build numbered results text for the batch prompt.
        parts = []
        for i, result in enumerate(results, 1):
            parts.append(f"""
Item {i}:
Source: {result.source_type}
Title: {result.title}
Organization: {result.organization or 'Unknown'}
Content: {result.snippet[:300]}
---""")
        results_text = "".join(parts)

        prompt = self.BATCH_EXTRACTION_PROMPT.format(
            functional_need=capability_need.functional_need,
            results_text=results_text
        )
        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.1,
            format="json"
        )
        if not response.success:
            logger.warning("Batch extraction failed, falling back to individual extraction")
            return self._extract_individually(results, capability_need)

        try:
            data = json.loads(response.content)
        except json.JSONDecodeError:
            data = self.client.extract_json_from_text(response.content)
        if not data:
            # Unparseable batch response - fall back to individual extraction.
            return self._extract_individually(results, capability_need)

        technologies: List[ExtractedTechnology] = []
        discarded: List[str] = []
        handled: set = set()
        for extraction in data.get("extractions", []):
            item_num = extraction.get("item_number", 0)
            if not 1 <= item_num <= len(results):
                # Hallucinated or missing item number - nothing to map it to.
                continue
            handled.add(item_num)
            result = results[item_num - 1]
            if extraction.get("is_technology", False):
                # "technology" may be JSON null per the prompt schema.
                tech_data = extraction.get("technology") or {}
                if tech_data.get("name"):
                    tech = self._build_extracted_technology(result, tech_data)
                    if tech:
                        technologies.append(tech)
                        continue
            discarded.append(result.url)
        # Any result the LLM never mentioned is treated as not-a-technology
        # so every input is accounted for in the output.
        for i, result in enumerate(results, 1):
            if i not in handled:
                discarded.append(result.url)
        return technologies, discarded

    def _build_extracted_technology(
        self,
        result: SearchResult,
        tech_data: dict
    ) -> ExtractedTechnology:
        """Build ExtractedTechnology from extraction data."""
        tech_id = str(uuid.uuid4())[:8]
        # Estimate TRL from indicators.
        trl_indicators = tech_data.get("trl_indicators", [])
        trl_estimate = self._estimate_trl(trl_indicators, result)
        return ExtractedTechnology(
            id=tech_id,
            source_result_id=result.url,
            # Curated sources (SBIR/patent) earn higher extraction confidence.
            extraction_confidence=0.8 if result.source_type in self.HIGH_TECH_LIKELIHOOD_SOURCES else 0.6,
            name=tech_data.get("name", ""),
            technology_type=tech_data.get("type", "system"),
            description=tech_data.get("description", ""),
            capabilities=tech_data.get("capabilities", []),
            mechanism=tech_data.get("mechanism"),
            developer=tech_data.get("developer") or result.organization,
            developer_type=tech_data.get("developer_type", "unknown"),
            trl_estimate=trl_estimate,
            trl_evidence=trl_indicators,
            source_type=result.source_type,
            source_url=result.url,
            source_title=result.title,
            source_snippet=result.snippet,
        )

    def _estimate_trl(
        self,
        trl_indicators: List[str],
        result: SearchResult
    ) -> Optional[int]:
        """Estimate TRL from indicators and source type.

        Precedence: a TRL already on the result wins; otherwise the first
        matching indicator rule; otherwise a per-source-type default (5 for
        unknown source types).
        """
        # Use existing TRL if available.
        if result.trl_estimate:
            return result.trl_estimate
        indicators_lower = " ".join(trl_indicators).lower()
        # Ordered most-mature-first. Order also resolves substring
        # collisions: "phase iii" must be tested before "phase ii",
        # and "phase ii" before "phase i".
        trl_rules = (
            (("deployed", "operational", "fielded"), 9),
            (("production", "qualified"), 8),
            (("phase iii", "demonstration"), 7),
            (("prototype", "phase ii"), 5),
            (("phase i", "laboratory"), 4),
            (("concept", "proof"), 3),
            (("research", "basic"), 2),
        )
        for terms, trl in trl_rules:
            if any(term in indicators_lower for term in terms):
                return trl
        # Estimate from source type.
        source_trl_defaults = {
            "sbir": 4,
            "patent": 5,
            "contract": 6,
            "news": 5,
            "web": 5,
        }
        return source_trl_defaults.get(result.source_type, 5)