"""
Technology Extractor for TechScout.

Extracts specific, nameable technologies from search results.
Filters out results that are just topic discussions or generic content.

This is Stage 3 of the Capability-Technology Matching pipeline.
"""

import json
import logging
import uuid
from typing import List, Optional, Tuple
from dataclasses import dataclass

from ..extraction.llm_client import OllamaClient
from ..search.base import SearchResult
from ..capability.types import CapabilityNeed, TechnologyIndicators
from .types import ExtractedTechnology

logger = logging.getLogger(__name__)


@dataclass
class ExtractionResult:
    """Result of technology extraction from search results."""

    # Technologies successfully extracted by the LLM.
    technologies: List[ExtractedTechnology]
    # Identifiers (the result URLs) of results that weren't technologies,
    # either rejected by the LLM or dropped by the heuristic pre-filter.
    discarded_ids: List[str]
    # Counters describing the run; see TechnologyExtractor.extract_all.
    extraction_stats: dict


class TechnologyExtractor:
    """
    Extracts specific technologies from search results.

    Uses a combination of:
    1. Heuristic pre-filtering (source type, keyword indicators)
    2. LLM-based extraction (for actual technology identification)
    """

    EXTRACTION_PROMPT = """Analyze this search result and determine if it describes a SPECIFIC TECHNOLOGY.

CONTEXT - User is looking for:
{functional_need}

SEARCH RESULT:
Source Type: {source_type}
Title: {title}
Content: {snippet}
Organization: {organization}

INSTRUCTIONS:
1. Does this describe a SPECIFIC, NAMEABLE technology?
   - YES examples: "SPAD Array Detector", "Quantum dot infrared sensor", "ML-based tracking algorithm"
   - NO examples: Topic discussions, overviews, problem descriptions, generic categories like "sensor technology"

2. If YES, extract technology details. If NO, explain why.

Respond with JSON:
{{
    "is_technology": true/false,
    "reason": "Why this is/isn't a specific technology",
    "technology": {{
        "name": "Specific technology name",
        "type": "sensor|algorithm|material|system|platform|device|method|software|other",
        "description": "What it is (1-2 sentences)",
        "capabilities": ["capability 1", "capability 2"],
        "mechanism": "How it works (if described)",
        "developer": "Organization name if mentioned",
        "developer_type": "company|university|government|lab|unknown",
        "trl_indicators": ["Phase II SBIR", "prototype demonstrated", etc.]
    }}
}}

If is_technology is false, omit the technology object."""

    BATCH_EXTRACTION_PROMPT = """Analyze these search results and identify which describe SPECIFIC TECHNOLOGIES.

CONTEXT - User is looking for:
{functional_need}

SEARCH RESULTS:
{results_text}

For each result, determine if it describes a specific, nameable technology (not generic category or topic discussion).

Respond with JSON:
{{
    "extractions": [
        {{
            "item_number": 1,
            "is_technology": true/false,
            "reason": "brief explanation",
            "technology": {{...}} or null
        }},
        ...
    ]
}}

Technology object schema (when is_technology is true):
{{
    "name": "Specific technology name",
    "type": "sensor|algorithm|material|system|platform|device|method|software|other",
    "description": "What it is",
    "capabilities": ["cap1", "cap2"],
    "developer": "Organization if mentioned",
    "trl_indicators": ["evidence of maturity"]
}}"""

    # Source types more likely to describe actual technologies
    HIGH_TECH_LIKELIHOOD_SOURCES = {"sbir", "patent"}
    MEDIUM_TECH_LIKELIHOOD_SOURCES = {"contract"}
    LOW_TECH_LIKELIHOOD_SOURCES = {"news", "web", "academic"}

    # Keywords used by the heuristic pre-filter when the caller supplies no
    # TechnologyIndicators.
    _DEFAULT_POSITIVE_INDICATORS = frozenset({
        "prototype", "demonstrated", "system", "device", "sensor",
        "developed", "patent", "phase ii", "tested", "deployed",
    })

    # Ordered (keywords, TRL) rules; the first rule with a matching keyword
    # wins. Order matters: "phase iii" must be tested before "phase ii", which
    # must be tested before "phase i", because each is a substring of the
    # previous one.
    _TRL_KEYWORD_RULES = (
        (("deployed", "operational", "fielded"), 9),
        (("production", "qualified"), 8),
        (("phase iii", "demonstration"), 7),
        (("prototype", "phase ii"), 5),
        (("phase i", "laboratory"), 4),
        (("concept", "proof"), 3),
        (("research", "basic"), 2),
    )

    # Fallback TRL per source type when no keyword rule matches.
    _SOURCE_TRL_DEFAULTS = {
        "sbir": 4,
        "patent": 5,
        "contract": 6,
        "news": 5,
        "web": 5,
    }
    _FALLBACK_TRL = 5

    def __init__(
        self,
        ollama_client: Optional[OllamaClient] = None,
        model: str = "mistral-nemo:12b"
    ):
        """
        Args:
            ollama_client: LLM client to use; a default OllamaClient is
                created when omitted.
            model: Name of the model passed to every generate() call.
        """
        self.client = ollama_client if ollama_client is not None else OllamaClient()
        self.model = model

    def extract_all(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed,
        technology_indicators: Optional[TechnologyIndicators] = None,
        batch_size: int = 5
    ) -> ExtractionResult:
        """
        Extract technologies from all search results.

        High-likelihood (SBIR/patent) and medium-likelihood (contract) results
        are always sent to the LLM. Every other source type is first passed
        through the keyword heuristic and only the survivors reach the LLM.

        Args:
            results: List of search results to process
            capability_need: Structured capability need for context
            technology_indicators: Positive/negative indicators
            batch_size: How many results to process per LLM call

        Returns:
            ExtractionResult with extracted technologies and stats. In the
            stats, ``low_priority_processed`` is the number of low-priority
            results considered (before the heuristic filter) and
            ``low_priority_passed_filter`` is how many of those were actually
            sent to the LLM.
        """
        logger.info("Extracting technologies from %d results...", len(results))

        technologies: List[ExtractedTechnology] = []
        discarded_ids: List[str] = []

        # Separate by likelihood of being a technology
        high_priority: List[SearchResult] = []
        medium_priority: List[SearchResult] = []
        low_priority: List[SearchResult] = []
        for result in results:
            if result.source_type in self.HIGH_TECH_LIKELIHOOD_SOURCES:
                high_priority.append(result)
            elif result.source_type in self.MEDIUM_TECH_LIKELIHOOD_SOURCES:
                medium_priority.append(result)
            else:
                low_priority.append(result)

        # Process high priority (all)
        if high_priority:
            logger.info(
                "Processing %d high-priority results (SBIR/patents)...",
                len(high_priority),
            )
            techs, discarded = self._process_batch_results(
                high_priority, capability_need, batch_size
            )
            technologies.extend(techs)
            discarded_ids.extend(discarded)

        # Process medium priority (all)
        if medium_priority:
            logger.info(
                "Processing %d medium-priority results (contracts)...",
                len(medium_priority),
            )
            techs, discarded = self._process_batch_results(
                medium_priority, capability_need, batch_size
            )
            technologies.extend(techs)
            discarded_ids.extend(discarded)

        # Process low priority with pre-filtering
        filtered_low: List[SearchResult] = []
        if low_priority:
            # Apply heuristic filter first
            filtered_low = self._heuristic_filter(low_priority, technology_indicators)
            logger.info(
                "Processing %d/%d low-priority results (passed heuristic filter)...",
                len(filtered_low),
                len(low_priority),
            )

            if filtered_low:
                techs, discarded = self._process_batch_results(
                    filtered_low, capability_need, batch_size
                )
                technologies.extend(techs)
                discarded_ids.extend(discarded)

            # Add filtered-out results to discarded. Identity lookup keeps this
            # linear and independent of how SearchResult defines equality.
            kept = {id(r) for r in filtered_low}
            discarded_ids.extend(r.url for r in low_priority if id(r) not in kept)

        logger.info(
            "Extracted %d technologies, discarded %d results",
            len(technologies),
            len(discarded_ids),
        )

        return ExtractionResult(
            technologies=technologies,
            discarded_ids=discarded_ids,
            extraction_stats={
                "total_results": len(results),
                "high_priority_processed": len(high_priority),
                "medium_priority_processed": len(medium_priority),
                "low_priority_processed": len(low_priority),
                "low_priority_passed_filter": len(filtered_low),
                "technologies_extracted": len(technologies),
                "results_discarded": len(discarded_ids),
            }
        )

    def _heuristic_filter(
        self,
        results: List[SearchResult],
        indicators: Optional[TechnologyIndicators]
    ) -> List[SearchResult]:
        """
        Apply heuristic filtering to identify likely technology results.

        A result passes when its title or snippet contains at least one
        positive indicator (case-insensitive substring match). Negative
        indicators do not currently exclude a result: the rule has always been
        "more positive than negative, OR any positive found", which reduces to
        "any positive found".

        Args:
            results: Candidate results to screen.
            indicators: Caller-supplied indicators; built-in defaults are used
                when this is missing or falsy.

        Returns:
            The subset of ``results`` that passed, in their original order.
        """
        if not indicators:
            positive = self._DEFAULT_POSITIVE_INDICATORS
        else:
            # Skip empty terms: "" is a substring of every text and would let
            # everything through.
            positive = {w.lower() for w in indicators.positive if w}

        filtered = []
        for result in results:
            text = f"{result.title} {result.snippet}".lower()
            if any(term in text for term in positive):
                filtered.append(result)
        return filtered

    def _process_batch_results(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed,
        batch_size: int
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """
        Process results in batches using LLM.

        Args:
            results: Results to send to the LLM.
            capability_need: Capability context inserted into the prompts.
            batch_size: Results per LLM call; values below 1 are treated as 1.

        Returns:
            ``(technologies, discarded_urls)``.
        """
        technologies: List[ExtractedTechnology] = []
        discarded: List[str] = []

        # range() rejects a zero step and yields nothing for a negative one,
        # which would either crash or silently drop every result.
        step = max(1, batch_size)

        for start in range(0, len(results), step):
            batch = results[start:start + step]
            if len(batch) == 1:
                # Single result - use single extraction
                techs, rejected = self._extract_individually(batch, capability_need)
            else:
                # Multiple results - use batch extraction
                techs, rejected = self._extract_batch(batch, capability_need)
            technologies.extend(techs)
            discarded.extend(rejected)

        return technologies, discarded

    def _extract_individually(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """Run single-result extraction on each result; return (techs, discarded_urls)."""
        technologies: List[ExtractedTechnology] = []
        discarded: List[str] = []
        for result in results:
            tech = self._extract_single(result, capability_need)
            if tech is not None:
                technologies.append(tech)
            else:
                discarded.append(result.url)
        return technologies, discarded

    def _parse_json_response(self, content) -> Optional[dict]:
        """
        Parse an LLM response body into a dict.

        Tries strict JSON first and then the client's lenient extractor.
        Returns None when nothing usable is found or when the top-level JSON
        value is not an object.
        """
        if not content:
            return None
        try:
            data = json.loads(content)
        except json.JSONDecodeError:
            data = self.client.extract_json_from_text(content)
        return data if isinstance(data, dict) else None

    def _extract_single(
        self,
        result: SearchResult,
        capability_need: CapabilityNeed
    ) -> Optional[ExtractedTechnology]:
        """
        Extract technology from a single result.

        Returns:
            The extracted technology, or None when the LLM call fails, the
            response cannot be parsed, or the result is not judged to be a
            specific technology with a name.
        """
        prompt = self.EXTRACTION_PROMPT.format(
            functional_need=capability_need.functional_need,
            source_type=result.source_type,
            title=result.title,
            snippet=(result.snippet or "")[:500],
            organization=result.organization or "Unknown"
        )

        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.1,
            format="json"
        )

        if not response.success:
            logger.warning("LLM extraction failed for: %s", (result.title or "")[:50])
            return None

        data = self._parse_json_response(response.content)
        if not data:
            return None

        if not data.get("is_technology", False):
            return None

        tech_data = data.get("technology")
        if not isinstance(tech_data, dict) or not tech_data.get("name"):
            return None

        return self._build_extracted_technology(result, tech_data)

    def _extract_batch(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """
        Extract technologies from a batch of results.

        Every input result ends up either in the returned technologies or in
        the returned discarded list. If the LLM call fails, the response is
        unparseable, or the response skips some items, the affected results
        are re-run through single-result extraction.

        Returns:
            ``(technologies, discarded_urls)``.
        """
        # Build results text
        results_text = "".join(
            f"\nItem {number}:\n"
            f"Source: {result.source_type}\n"
            f"Title: {result.title}\n"
            f"Organization: {result.organization or 'Unknown'}\n"
            f"Content: {(result.snippet or '')[:300]}\n"
            "---"
            for number, result in enumerate(results, 1)
        )

        prompt = self.BATCH_EXTRACTION_PROMPT.format(
            functional_need=capability_need.functional_need,
            results_text=results_text
        )

        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.1,
            format="json"
        )

        if not response.success:
            logger.warning("Batch extraction failed, falling back to individual extraction")
            return self._extract_individually(results, capability_need)

        data = self._parse_json_response(response.content)
        if not data:
            # Fallback to individual extraction
            return self._extract_individually(results, capability_need)

        extractions = data.get("extractions", [])
        if not isinstance(extractions, list):
            extractions = []

        technologies: List[ExtractedTechnology] = []
        discarded: List[str] = []
        handled = set()  # zero-based indices of results the LLM reported on

        for extraction in extractions:
            if not isinstance(extraction, dict):
                continue
            # Models sometimes emit the item number as a string ("1").
            try:
                index = int(extraction.get("item_number", 0)) - 1
            except (TypeError, ValueError):
                continue
            # Ignore out-of-range numbers and repeated verdicts for one item.
            if not 0 <= index < len(results) or index in handled:
                continue
            handled.add(index)

            result = results[index]
            tech_data = extraction.get("technology")
            if (
                extraction.get("is_technology", False)
                and isinstance(tech_data, dict)
                and tech_data.get("name")
            ):
                technologies.append(self._build_extracted_technology(result, tech_data))
            else:
                discarded.append(result.url)

        # Results the model did not report on must not be lost silently.
        missing = [r for i, r in enumerate(results) if i not in handled]
        if missing:
            logger.warning(
                "Batch response omitted %d of %d items, extracting them individually",
                len(missing),
                len(results),
            )
            techs, rejected = self._extract_individually(missing, capability_need)
            technologies.extend(techs)
            discarded.extend(rejected)

        return technologies, discarded

    @staticmethod
    def _as_str_list(value) -> List[str]:
        """Normalise an LLM-provided field to a list of strings (null -> [])."""
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return [str(item) for item in value if item is not None]
        # A bare scalar such as "prototype demonstrated" becomes a 1-item list.
        return [str(value)]

    def _build_extracted_technology(
        self,
        result: SearchResult,
        tech_data: dict
    ) -> ExtractedTechnology:
        """
        Build ExtractedTechnology from extraction data.

        Args:
            result: The search result the technology was extracted from.
            tech_data: The "technology" object returned by the LLM. Missing or
                null fields fall back to neutral defaults.
        """
        tech_id = str(uuid.uuid4())[:8]

        # Estimate TRL from indicators
        trl_indicators = self._as_str_list(tech_data.get("trl_indicators"))
        trl_estimate = self._estimate_trl(trl_indicators, result)

        is_high_likelihood = result.source_type in self.HIGH_TECH_LIKELIHOOD_SOURCES

        return ExtractedTechnology(
            id=tech_id,
            source_result_id=result.url,
            extraction_confidence=0.8 if is_high_likelihood else 0.6,
            name=tech_data.get("name") or "",
            technology_type=tech_data.get("type") or "system",
            description=tech_data.get("description") or "",
            capabilities=self._as_str_list(tech_data.get("capabilities")),
            mechanism=tech_data.get("mechanism"),
            developer=tech_data.get("developer") or result.organization,
            developer_type=tech_data.get("developer_type") or "unknown",
            trl_estimate=trl_estimate,
            trl_evidence=trl_indicators,
            source_type=result.source_type,
            source_url=result.url,
            source_title=result.title,
            source_snippet=result.snippet,
        )

    def _estimate_trl(
        self,
        trl_indicators: List[str],
        result: SearchResult
    ) -> Optional[int]:
        """
        Estimate TRL from indicators and source type.

        Precedence: a truthy ``result.trl_estimate`` wins; otherwise the first
        matching keyword rule; otherwise a per-source-type default.

        Args:
            trl_indicators: Maturity evidence strings from the LLM. None, a
                bare string, or non-string items are tolerated.
            result: The originating search result.
        """
        # Use existing TRL if available
        if result.trl_estimate:
            return result.trl_estimate

        # Estimate from indicators
        indicators_lower = " ".join(self._as_str_list(trl_indicators)).lower()
        for terms, trl in self._TRL_KEYWORD_RULES:
            if any(term in indicators_lower for term in terms):
                return trl

        # Estimate from source type
        return self._SOURCE_TRL_DEFAULTS.get(result.source_type, self._FALLBACK_TRL)