# TechScout/techscout/extraction/org_extractor.py

"""
Organization Extractor for TechScout.

Hybrid approach: Fast regex patterns first, LLM fallback for ambiguous cases.
Extracts company/organization names from unstructured text like news articles and web pages.
"""

import re
import logging
from typing import Optional, List, Tuple
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class ExtractionResult:
    """Result of organization extraction.

    Produced by OrganizationExtractor.extract(); one instance per
    (title, snippet) pair.
    """
    # Extracted organization name, or None when no strategy produced one.
    organization: Optional[str]
    # 0.0 to 1.0; extract() assigns a fixed value per strategy (0.0 when nothing was found).
    confidence: float  # 0.0 to 1.0
    # Strategy that produced the result. extract() emits one of:
    # "known", "ticker", "news_wire", "suffix", "institution", "llm", "none".
    method: str


class OrganizationExtractor:
    """
    Extracts organization names from text using hybrid regex + LLM approach.

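    Strategy (tried in this order by extract(); the first hit wins):
    1. Look for a known defense/space company name
    2. Try stock ticker extraction (NYSE:XXX, NASDAQ:XXX)
    3. Try news source patterns (BUSINESS WIRE, PR Newswire headers)
    4. Try fast regex patterns for common company suffixes
    5. Try university/research institution patterns
    6. Fall back to LLM for ambiguous cases (only when an LLM client is configured)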
    """

    def __init__(self, llm_client=None, model: str = "mistral-nemo:12b"):
        self.llm_client = llm_client
        self.model = model

        # Common company suffixes - ordered by specificity
        self.company_suffixes = [
            r"Inc\.",
            r"Inc",
            r"LLC",
            r"L\.L\.C\.",
            r"Corp\.",
            r"Corp",
            r"Corporation",
            r"Ltd\.",
            r"Ltd",
            r"Limited",
            r"Co\.",
            r"Company",
            r"Technologies",
            r"Technology",
            r"Systems",
            r"Solutions",
            r"Industries",
            r"Aerospace",
            r"Defense",
            r"Dynamics",
            r"Sciences",
            r"Labs",
            r"Laboratories",
            r"Group",
            r"Holdings",
            r"Partners",
            r"Ventures",
            r"Analytics",
            r"Robotics",
            r"Space",
            r"Aviation",
        ]

        # Build regex pattern for company names
        suffix_pattern = "|".join(self.company_suffixes)
        # Match: 1-4 capitalized words followed by a suffix
        self.company_pattern = re.compile(
            rf'\b([A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*)?(?:\s+[A-Z][a-zA-Z]*)?(?:\s+[A-Z][a-zA-Z]*)?)\s+({suffix_pattern})\b',
            re.IGNORECASE
        )

        # Stock ticker patterns
        self.ticker_patterns = [
            re.compile(r'\((?:NYSE|NASDAQ|AMEX|OTC):\s*([A-Z]{1,5})\)', re.IGNORECASE),
            re.compile(r'(?:NYSE|NASDAQ|AMEX|OTC):\s*([A-Z]{1,5})\b', re.IGNORECASE),
            re.compile(r'\(([A-Z]{2,5})\)', re.IGNORECASE),  # Just ticker in parens like (BKSY)
        ]

        # News wire header patterns - company often appears right after location
        self.news_wire_pattern = re.compile(
            r'(?:HERNDON|SAN FRANCISCO|NEW YORK|LOS ANGELES|WASHINGTON|DENVER|SEATTLE|BOSTON|AUSTIN|ARLINGTON|'
            r'BOULDER|HUNTSVILLE|COLORADO SPRINGS|MOUNTAIN VIEW|PALO ALTO|REDMOND|SAN DIEGO|PHOENIX|DALLAS|'
            r'HOUSTON|CHICAGO|ATLANTA|MIAMI|TAMPA|ORLANDO|RESTON|MCLEAN|CHANTILLY|STERLING),?\s+'
            r'(?:[A-Za-z\.]+,?\s*)?--?\s*'  # State abbreviation
            r'(?:\([^)]+\)--?\s*)?'  # Optional source like (BUSINESS WIRE)
            r'([A-Z][A-Za-z]+(?:\s+[A-Z][a-zA-Z]+)*(?:\s+(?:Inc|LLC|Corp|Ltd|Technologies|Technology|Systems|Solutions|'
            r'Aerospace|Defense|Dynamics|Sciences|Space|Aviation)\.?)?)',
            re.IGNORECASE
        )

        # University/research institution patterns
        self.institution_patterns = [
            re.compile(r'\b((?:University of [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?|'
                      r'[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?\s+University|'
                      r'MIT|Caltech|Stanford|Harvard|Carnegie Mellon|Georgia Tech|'
                      r'[A-Z][a-z]+\s+Institute of Technology))\b'),
            re.compile(r'\b((?:[A-Z][a-z]+\s+)?(?:National Laboratory|Research Center|Research Institute))\b'),
        ]

        # Known defense/space companies for quick matching
        self.known_companies = {
            "blacksky": "BlackSky Technology Inc.",
            "spacex": "SpaceX",
            "northrop": "Northrop Grumman",
            "lockheed": "Lockheed Martin",
            "raytheon": "Raytheon",
            "boeing": "Boeing",
            "general dynamics": "General Dynamics",
            "bae systems": "BAE Systems",
            "l3harris": "L3Harris Technologies",
            "leidos": "Leidos",
            "saic": "SAIC",
            "maxar": "Maxar Technologies",
            "planet labs": "Planet Labs",
            "capella space": "Capella Space",
            "rocket lab": "Rocket Lab",
            "aerojet": "Aerojet Rocketdyne",
            "ball aerospace": "Ball Aerospace",
            "sierra nevada": "Sierra Nevada Corporation",
            "relativity space": "Relativity Space",
            "anduril": "Anduril Industries",
            "palantir": "Palantir Technologies",
            "shield ai": "Shield AI",
            "heo robotics": "HEO Robotics",
            "mda space": "MDA Space",
            "mda": "MDA Space",
            "thales": "Thales Group",
            "airbus": "Airbus Defence and Space",
            "blue origin": "Blue Origin",
            "virgin orbit": "Virgin Orbit",
            "astra": "Astra Space",
            "spire global": "Spire Global",
            "hawkeye 360": "HawkEye 360",
            "iceye": "ICEYE",
            "umbra": "Umbra Space",
            "terran orbital": "Terran Orbital",
            "momentus": "Momentus Space",
            "ast spacemobile": "AST SpaceMobile",
            "viasat": "Viasat",
            "iridium": "Iridium Communications",
            "ses": "SES S.A.",
            "intelsat": "Intelsat",
            "eutelsat": "Eutelsat",
            "telesat": "Telesat",
            "oneweb": "OneWeb",
            "starlink": "SpaceX Starlink",
            "amazon kuiper": "Amazon Kuiper",
        }

    def extract(self, title: str, snippet: str) -> ExtractionResult:
        """
        Extract organization from title and snippet.

        Args:
            title: Article/result title
            snippet: Description/body text

        Returns:
            ExtractionResult with organization name, confidence, and method
        """
        combined_text = f"{title} {snippet}"

        # Try methods in order of speed and confidence

        # 1. Check known companies first (fastest, highest confidence)
        result = self._extract_known_company(combined_text)
        if result:
            return ExtractionResult(organization=result, confidence=0.95, method="known")

        # 2. Try stock ticker extraction (very reliable when present)
        result = self._extract_from_ticker(combined_text)
        if result:
            return ExtractionResult(organization=result, confidence=0.9, method="ticker")

        # 3. Try news wire header pattern
        result = self._extract_from_news_wire(snippet)
        if result:
            return ExtractionResult(organization=result, confidence=0.85, method="news_wire")

        # 4. Try company suffix pattern
        result = self._extract_from_suffix(combined_text)
        if result:
            return ExtractionResult(organization=result, confidence=0.8, method="suffix")

        # 5. Try institution patterns
        result = self._extract_institution(combined_text)
        if result:
            return ExtractionResult(organization=result, confidence=0.75, method="institution")

        # 6. Fall back to LLM if available
        if self.llm_client:
            result = self._extract_with_llm(title, snippet)
            if result:
                return ExtractionResult(organization=result, confidence=0.7, method="llm")

        # No extraction possible
        return ExtractionResult(organization=None, confidence=0.0, method="none")

    def _extract_known_company(self, text: str) -> Optional[str]:
        """Check for known defense/space companies."""
        text_lower = text.lower()
        for keyword, full_name in self.known_companies.items():
            if keyword in text_lower:
                return full_name
        return None

    def _extract_from_ticker(self, text: str) -> Optional[str]:
        """Extract company name associated with stock ticker."""
        for pattern in self.ticker_patterns:
            match = pattern.search(text)
            if match:
                ticker = match.group(1).upper()
                # Try to find the company name near the ticker
                # Look for pattern like "Company Name (NYSE:XXX)" or "Company Name (XXX)"
                before_ticker = text[:match.start()]

                # Find company name pattern before ticker
                company_match = self.company_pattern.search(before_ticker[-100:])
                if company_match:
                    name = f"{company_match.group(1)} {company_match.group(2)}"
                    return self._clean_company_name(name)

                # Try to get the last few capitalized words before ticker
                words_before = before_ticker.strip().split()[-5:]
                if words_before:
                    # Look for capitalized sequence
                    company_words = []
                    for word in reversed(words_before):
                        clean_word = word.strip('(),-:')
                        if clean_word and clean_word[0].isupper():
                            company_words.insert(0, clean_word)
                        elif company_words:
                            break
                    if company_words:
                        return " ".join(company_words)
        return None

    def _extract_from_news_wire(self, text: str) -> Optional[str]:
        """Extract from news wire format (CITY -- Company Name announced...)."""
        match = self.news_wire_pattern.search(text)
        if match:
            return self._clean_company_name(match.group(1))
        return None

    def _extract_from_suffix(self, text: str) -> Optional[str]:
        """Extract using company suffix patterns."""
        matches = list(self.company_pattern.finditer(text))
        if matches:
            # Prefer matches from title (earlier in combined text)
            # Take the first substantial match
            for match in matches:
                name = f"{match.group(1)} {match.group(2)}"
                cleaned = self._clean_company_name(name)
                if cleaned and len(cleaned) > 3:
                    return cleaned
        return None

    def _extract_institution(self, text: str) -> Optional[str]:
        """Extract university or research institution names."""
        for pattern in self.institution_patterns:
            match = pattern.search(text)
            if match:
                return match.group(1)
        return None

    def _extract_with_llm(self, title: str, snippet: str) -> Optional[str]:
        """Use LLM to extract organization name as fallback."""
        if not self.llm_client:
            return None

        prompt = f"""Extract the primary company or organization name from this text.

Title: {title}
Description: {snippet}

Rules:
- Return ONLY the organization name, nothing else
- If it's a company, include the suffix (Inc, LLC, Corp, etc.) if known
- If no clear organization can be identified, return "NONE"
- Do not make up or guess organization names
- Prefer companies over universities/agencies if both are mentioned

Organization name:"""

        try:
            response = self.llm_client.generate(
                prompt=prompt,
                model=self.model,
                temperature=0.0,  # Deterministic
                max_tokens=50
            )

            if response.success:
                result = response.content.strip()
                # Validate result
                if result and result.upper() != "NONE" and len(result) > 2:
                    # Basic validation - should look like a company name
                    if not any(x in result.lower() for x in ["i don't", "cannot", "no ", "none", "unknown"]):
                        return self._clean_company_name(result)
        except Exception as e:
            logger.warning(f"LLM extraction failed: {e}")

        return None

    def _clean_company_name(self, name: str) -> str:
        """Clean and normalize company name."""
        if not name:
            return ""

        # Remove common prefixes
        prefixes_to_remove = [
            "the ", "a ", "an ",
            "by ", "from ", "at ",
            "-- ", "- ",
        ]
        name_lower = name.lower()
        for prefix in prefixes_to_remove:
            if name_lower.startswith(prefix):
                name = name[len(prefix):]
                name_lower = name.lower()

        # Remove trailing punctuation
        name = name.rstrip('.,;:')

        # Normalize whitespace
        name = " ".join(name.split())

        return name.strip()

    def extract_batch(
        self,
        items: List[Tuple[str, str]],
        use_llm_fallback: bool = True
    ) -> List[ExtractionResult]:
        """
        Extract organizations from multiple items.

        Args:
            items: List of (title, snippet) tuples
            use_llm_fallback: Whether to use LLM for items that regex can't handle

        Returns:
            List of ExtractionResult objects
        """
        results = []

        for title, snippet in items:
            # extract() already includes LLM fallback
            result = self.extract(title, snippet)

            # Log extraction for debugging
            if result.organization:
                logger.info(f"Extracted '{result.organization}' from '{title[:50]}...' (method: {result.method})")
            else:
                logger.debug(f"No organization found in '{title[:50]}...'")

            results.append(result)

        return results