TechScout/techscout/technology/extractor.py

457 lines
16 KiB
Python
Raw Normal View History

2026-01-22 13:02:09 -05:00
"""
Technology Extractor for TechScout.
Extracts specific, nameable technologies from search results.
Filters out results that are just topic discussions or generic content.
This is Stage 3 of the Capability-Technology Matching pipeline.
"""
import json
import logging
import uuid
from typing import List, Optional, Tuple
from dataclasses import dataclass
from ..extraction.llm_client import OllamaClient
from ..search.base import SearchResult
from ..capability.types import CapabilityNeed, TechnologyIndicators
from .types import ExtractedTechnology
logger = logging.getLogger(__name__)
@dataclass
class ExtractionResult:
    """Result of technology extraction from search results."""
    # Technologies the LLM judged to be specific, nameable technologies.
    technologies: List[ExtractedTechnology]
    # IDs (result URLs) of search results that weren't technologies —
    # either rejected by the LLM or filtered out heuristically.
    discarded_ids: List[str]
    # Summary counters (totals per priority tier, extracted/discarded counts)
    # as assembled by TechnologyExtractor.extract_all.
    extraction_stats: dict
class TechnologyExtractor:
    """
    Extracts specific technologies from search results.

    Uses a combination of:
    1. Heuristic pre-filtering (source type, keyword indicators)
    2. LLM-based extraction (for actual technology identification)
    """

    EXTRACTION_PROMPT = """Analyze this search result and determine if it describes a SPECIFIC TECHNOLOGY.
CONTEXT - User is looking for:
{functional_need}
SEARCH RESULT:
Source Type: {source_type}
Title: {title}
Content: {snippet}
Organization: {organization}
INSTRUCTIONS:
1. Does this describe a SPECIFIC, NAMEABLE technology?
- YES examples: "SPAD Array Detector", "Quantum dot infrared sensor", "ML-based tracking algorithm"
- NO examples: Topic discussions, overviews, problem descriptions, generic categories like "sensor technology"
2. If YES, extract technology details. If NO, explain why.
Respond with JSON:
{{
"is_technology": true/false,
"reason": "Why this is/isn't a specific technology",
"technology": {{
"name": "Specific technology name",
"type": "sensor|algorithm|material|system|platform|device|method|software|other",
"description": "What it is (1-2 sentences)",
"capabilities": ["capability 1", "capability 2"],
"mechanism": "How it works (if described)",
"developer": "Organization name if mentioned",
"developer_type": "company|university|government|lab|unknown",
"trl_indicators": ["Phase II SBIR", "prototype demonstrated", etc.]
}}
}}
If is_technology is false, omit the technology object."""

    BATCH_EXTRACTION_PROMPT = """Analyze these search results and identify which describe SPECIFIC TECHNOLOGIES.
CONTEXT - User is looking for:
{functional_need}
SEARCH RESULTS:
{results_text}
For each result, determine if it describes a specific, nameable technology (not generic category or topic discussion).
Respond with JSON:
{{
"extractions": [
{{
"item_number": 1,
"is_technology": true/false,
"reason": "brief explanation",
"technology": {{...}} or null
}},
...
]
}}
Technology object schema (when is_technology is true):
{{
"name": "Specific technology name",
"type": "sensor|algorithm|material|system|platform|device|method|software|other",
"description": "What it is",
"capabilities": ["cap1", "cap2"],
"developer": "Organization if mentioned",
"trl_indicators": ["evidence of maturity"]
}}"""

    # Source types more likely to describe actual technologies.
    HIGH_TECH_LIKELIHOOD_SOURCES = {"sbir", "patent"}
    MEDIUM_TECH_LIKELIHOOD_SOURCES = {"contract"}
    LOW_TECH_LIKELIHOOD_SOURCES = {"news", "web", "academic"}

    def __init__(
        self,
        ollama_client: Optional[OllamaClient] = None,
        model: str = "mistral-nemo:12b"
    ):
        """
        Args:
            ollama_client: LLM client; a default OllamaClient is created if omitted.
            model: Ollama model name used for all extraction calls.
        """
        self.client = ollama_client or OllamaClient()
        self.model = model

    def extract_all(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed,
        technology_indicators: Optional[TechnologyIndicators] = None,
        batch_size: int = 5
    ) -> ExtractionResult:
        """
        Extract technologies from all search results.

        Results are triaged by source type: high (SBIR/patent) and medium
        (contract) likelihood sources all go to the LLM; low-likelihood
        sources are heuristically pre-filtered first, and results that fail
        the pre-filter are discarded without an LLM call.

        Args:
            results: List of search results to process
            capability_need: Structured capability need for context
            technology_indicators: Positive/negative indicators
            batch_size: How many results to process per LLM call
        Returns:
            ExtractionResult with extracted technologies and stats
        """
        logger.info("Extracting technologies from %d results...", len(results))
        technologies: List[ExtractedTechnology] = []
        discarded_ids: List[str] = []

        # Separate by likelihood of being a technology.
        high_priority: List[SearchResult] = []
        medium_priority: List[SearchResult] = []
        low_priority: List[SearchResult] = []
        for result in results:
            if result.source_type in self.HIGH_TECH_LIKELIHOOD_SOURCES:
                high_priority.append(result)
            elif result.source_type in self.MEDIUM_TECH_LIKELIHOOD_SOURCES:
                medium_priority.append(result)
            else:
                low_priority.append(result)

        # Process high priority (all).
        if high_priority:
            logger.info("Processing %d high-priority results (SBIR/patents)...", len(high_priority))
            techs, discarded = self._process_batch_results(
                high_priority, capability_need, batch_size
            )
            technologies.extend(techs)
            discarded_ids.extend(discarded)

        # Process medium priority (all).
        if medium_priority:
            logger.info("Processing %d medium-priority results (contracts)...", len(medium_priority))
            techs, discarded = self._process_batch_results(
                medium_priority, capability_need, batch_size
            )
            technologies.extend(techs)
            discarded_ids.extend(discarded)

        # Process low priority with pre-filtering.
        if low_priority:
            filtered_low = self._heuristic_filter(low_priority, technology_indicators)
            logger.info(
                "Processing %d/%d low-priority results (passed heuristic filter)...",
                len(filtered_low), len(low_priority)
            )
            if filtered_low:
                techs, discarded = self._process_batch_results(
                    filtered_low, capability_need, batch_size
                )
                technologies.extend(techs)
                discarded_ids.extend(discarded)
            # Add filtered-out results to discarded. Identity set gives O(1)
            # membership (the old `r not in filtered_low` was an O(n^2) scan
            # that also depended on SearchResult equality semantics).
            kept = {id(r) for r in filtered_low}
            discarded_ids.extend(r.url for r in low_priority if id(r) not in kept)

        logger.info(
            "Extracted %d technologies, discarded %d results",
            len(technologies), len(discarded_ids)
        )
        return ExtractionResult(
            technologies=technologies,
            discarded_ids=discarded_ids,
            extraction_stats={
                "total_results": len(results),
                "high_priority_processed": len(high_priority),
                "medium_priority_processed": len(medium_priority),
                "low_priority_processed": len(low_priority),
                "technologies_extracted": len(technologies),
                "results_discarded": len(discarded_ids),
            }
        )

    def _heuristic_filter(
        self,
        results: List[SearchResult],
        indicators: Optional[TechnologyIndicators]
    ) -> List[SearchResult]:
        """Apply heuristic filtering to identify likely technology results.

        A result passes when its title+snippet contains at least one
        positive indicator term (case-insensitive substring match).
        """
        if not indicators:
            # Default indicators.
            positive = {"prototype", "demonstrated", "system", "device", "sensor",
                        "developed", "patent", "phase ii", "tested", "deployed"}
            negative = {"overview", "challenge", "review", "introduction", "survey",
                        "future", "potential", "could", "might", "problems"}
        else:
            positive = {w.lower() for w in indicators.positive}
            negative = {w.lower() for w in indicators.negative}

        # NOTE(review): the original condition ("pos > 0 and pos >= neg" OR
        # "pos > 0") reduces to "any positive indicator present", so the
        # negative indicators above never exclude a result. Behavior is
        # preserved here; confirm whether negatives were meant to veto
        # (e.g. pos_count > neg_count) before changing.
        filtered = []
        for result in results:
            text = f"{result.title} {result.snippet}".lower()
            pos_count = sum(1 for term in positive if term in text)
            if pos_count > 0:
                filtered.append(result)
        return filtered

    def _process_batch_results(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed,
        batch_size: int
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """Process results in batches using LLM.

        Chunks `results` into groups of `batch_size`; a chunk of one uses the
        single-result prompt, larger chunks use the batch prompt.
        """
        technologies: List[ExtractedTechnology] = []
        discarded: List[str] = []
        for i in range(0, len(results), batch_size):
            batch = results[i:i + batch_size]
            if len(batch) == 1:
                # Single result - use single extraction.
                result = batch[0]
                tech = self._extract_single(result, capability_need)
                if tech:
                    technologies.append(tech)
                else:
                    discarded.append(result.url)
            else:
                # Multiple results - use batch extraction.
                batch_techs, batch_discarded = self._extract_batch(batch, capability_need)
                technologies.extend(batch_techs)
                discarded.extend(batch_discarded)
        return technologies, discarded

    def _extract_individually(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """Fallback path: run single extraction over each result in turn.

        Used when a batched LLM call fails or returns unparseable output.
        """
        technologies: List[ExtractedTechnology] = []
        discarded: List[str] = []
        for result in results:
            tech = self._extract_single(result, capability_need)
            if tech:
                technologies.append(tech)
            else:
                discarded.append(result.url)
        return technologies, discarded

    def _extract_single(
        self,
        result: SearchResult,
        capability_need: CapabilityNeed
    ) -> Optional[ExtractedTechnology]:
        """Extract technology from a single result.

        Returns None when the LLM call fails, the response isn't parseable
        JSON, the result isn't a technology, or no technology name was given.
        """
        prompt = self.EXTRACTION_PROMPT.format(
            functional_need=capability_need.functional_need,
            source_type=result.source_type,
            title=result.title,
            snippet=result.snippet[:500],  # cap prompt size
            organization=result.organization or "Unknown"
        )
        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.1,
            format="json"
        )
        if not response.success:
            logger.warning(f"LLM extraction failed for: {result.title[:50]}")
            return None
        try:
            data = json.loads(response.content)
        except json.JSONDecodeError:
            # Model sometimes wraps JSON in prose; try a lenient extraction.
            data = self.client.extract_json_from_text(response.content)
            if not data:
                return None
        if not data.get("is_technology", False):
            return None
        tech_data = data.get("technology", {})
        if not tech_data or not tech_data.get("name"):
            return None
        return self._build_extracted_technology(result, tech_data)

    def _extract_batch(
        self,
        results: List[SearchResult],
        capability_need: CapabilityNeed
    ) -> Tuple[List[ExtractedTechnology], List[str]]:
        """Extract technologies from a batch of results.

        Falls back to per-result extraction when the batched call fails.
        Results the LLM does not mention are discarded (previously they were
        silently dropped: neither extracted nor discarded).
        """
        # Build numbered results text for the batch prompt.
        parts = []
        for i, result in enumerate(results, 1):
            parts.append(f"""
Item {i}:
Source: {result.source_type}
Title: {result.title}
Organization: {result.organization or 'Unknown'}
Content: {result.snippet[:300]}
---""")
        results_text = "".join(parts)

        prompt = self.BATCH_EXTRACTION_PROMPT.format(
            functional_need=capability_need.functional_need,
            results_text=results_text
        )
        response = self.client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.1,
            format="json"
        )
        if not response.success:
            logger.warning("Batch extraction failed, falling back to individual extraction")
            return self._extract_individually(results, capability_need)

        try:
            data = json.loads(response.content)
        except json.JSONDecodeError:
            data = self.client.extract_json_from_text(response.content)
        if not data:
            # Unparseable batch response - fall back to individual extraction.
            return self._extract_individually(results, capability_need)

        technologies: List[ExtractedTechnology] = []
        discarded: List[str] = []
        handled: set = set()
        for extraction in data.get("extractions", []):
            item_num = extraction.get("item_number", 0)
            if not 1 <= item_num <= len(results):
                # Hallucinated or missing item number - nothing to map it to.
                continue
            handled.add(item_num)
            result = results[item_num - 1]
            if extraction.get("is_technology", False):
                # "technology" may be JSON null per the prompt schema.
                tech_data = extraction.get("technology") or {}
                if tech_data.get("name"):
                    tech = self._build_extracted_technology(result, tech_data)
                    if tech:
                        technologies.append(tech)
                        continue
            discarded.append(result.url)
        # Any result the LLM never mentioned is treated as not-a-technology
        # so every input is accounted for in the output.
        for i, result in enumerate(results, 1):
            if i not in handled:
                discarded.append(result.url)
        return technologies, discarded

    def _build_extracted_technology(
        self,
        result: SearchResult,
        tech_data: dict
    ) -> ExtractedTechnology:
        """Build ExtractedTechnology from extraction data."""
        tech_id = str(uuid.uuid4())[:8]
        # Estimate TRL from indicators.
        trl_indicators = tech_data.get("trl_indicators", [])
        trl_estimate = self._estimate_trl(trl_indicators, result)
        return ExtractedTechnology(
            id=tech_id,
            source_result_id=result.url,
            # Curated sources (SBIR/patent) earn higher extraction confidence.
            extraction_confidence=0.8 if result.source_type in self.HIGH_TECH_LIKELIHOOD_SOURCES else 0.6,
            name=tech_data.get("name", ""),
            technology_type=tech_data.get("type", "system"),
            description=tech_data.get("description", ""),
            capabilities=tech_data.get("capabilities", []),
            mechanism=tech_data.get("mechanism"),
            developer=tech_data.get("developer") or result.organization,
            developer_type=tech_data.get("developer_type", "unknown"),
            trl_estimate=trl_estimate,
            trl_evidence=trl_indicators,
            source_type=result.source_type,
            source_url=result.url,
            source_title=result.title,
            source_snippet=result.snippet,
        )

    def _estimate_trl(
        self,
        trl_indicators: List[str],
        result: SearchResult
    ) -> Optional[int]:
        """Estimate TRL from indicators and source type.

        Precedence: a TRL already on the result wins; otherwise the first
        matching indicator rule; otherwise a per-source-type default (5 for
        unknown source types).
        """
        # Use existing TRL if available.
        if result.trl_estimate:
            return result.trl_estimate
        indicators_lower = " ".join(trl_indicators).lower()
        # Ordered most-mature-first. Order also resolves substring
        # collisions: "phase iii" must be tested before "phase ii",
        # and "phase ii" before "phase i".
        trl_rules = (
            (("deployed", "operational", "fielded"), 9),
            (("production", "qualified"), 8),
            (("phase iii", "demonstration"), 7),
            (("prototype", "phase ii"), 5),
            (("phase i", "laboratory"), 4),
            (("concept", "proof"), 3),
            (("research", "basic"), 2),
        )
        for terms, trl in trl_rules:
            if any(term in indicators_lower for term in terms):
                return trl
        # Estimate from source type.
        source_trl_defaults = {
            "sbir": 4,
            "patent": 5,
            "contract": 6,
            "news": 5,
            "web": 5,
        }
        return source_trl_defaults.get(result.source_type, 5)