"""
Organization Extractor for TechScout.

Hybrid approach: fast regex patterns first, LLM fallback for ambiguous cases.
Extracts company/organization names from unstructured text like news
articles and web pages.
"""

import re
import logging
from typing import Optional, List, Tuple
from dataclasses import dataclass

logger = logging.getLogger(__name__)


@dataclass
class ExtractionResult:
    """Result of organization extraction."""

    organization: Optional[str]
    confidence: float  # 0.0 to 1.0
    # One of "known", "ticker", "news_wire", "suffix", "institution",
    # "llm", or "none" (the values emitted by OrganizationExtractor.extract).
    method: str


class OrganizationExtractor:
    """
    Extracts organization names from text using hybrid regex + LLM approach.

    Strategy (in the order applied by ``extract``):
    1. Look for a known defense/space company keyword
    2. Try stock ticker extraction (NYSE:XXX, NASDAQ:XXX)
    3. Try news wire header patterns (CITY, State -- Company ...)
    4. Try fast regex patterns for common company suffixes
    5. Try university / research institution patterns
    6. Fall back to LLM for ambiguous cases
    """

    def __init__(self, llm_client=None, model: str = "mistral-nemo:12b"):
        """
        Build the regex tables used by the extractor.

        Args:
            llm_client: Optional client exposing
                ``generate(prompt=, model=, temperature=, max_tokens=)`` and
                returning an object with ``success`` and ``content``.
                When ``None`` the LLM fallback is skipped.
            model: Model name forwarded to ``llm_client.generate``.
        """
        self.llm_client = llm_client
        self.model = model

        # Common company suffixes - ordered by specificity
        self.company_suffixes = [
            r"Inc\.", r"Inc", r"LLC", r"L\.L\.C\.",
            r"Corp\.", r"Corp", r"Corporation",
            r"Ltd\.", r"Ltd", r"Limited",
            r"Co\.", r"Company",
            r"Technologies", r"Technology",
            r"Systems", r"Solutions",
            r"Industries", r"Aerospace",
            r"Defense", r"Dynamics",
            r"Sciences", r"Labs", r"Laboratories",
            r"Group", r"Holdings", r"Partners",
            r"Ventures", r"Analytics", r"Robotics",
            r"Space", r"Aviation",
        ]

        # Build regex pattern for company names
        suffix_pattern = "|".join(self.company_suffixes)

        # Match: 1-4 capitalized words followed by a suffix.
        # Deliberately case-sensitive: with IGNORECASE "[A-Z]" also matches
        # lowercase, so ordinary prose ("the company", "new technology")
        # would be reported as an organization.
        # "(?!\w)" is used instead of a trailing "\b" so that dotted suffixes
        # such as "Co." / "Inc." can match when followed by a space.
        self.company_pattern = re.compile(
            rf'\b([A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*){{0,3}})\s+({suffix_pattern})(?!\w)'
        )

        # Stock ticker patterns. The exchange prefix is case-insensitive
        # (e.g. "Nasdaq:"), but the ticker itself must be uppercase so that
        # parenthesised lowercase words like "(beta)" are not taken as tickers.
        self.ticker_patterns = [
            re.compile(r'\((?i:NYSE|NASDAQ|AMEX|OTC):\s*([A-Z]{1,5})\)'),
            re.compile(r'(?i:NYSE|NASDAQ|AMEX|OTC):\s*([A-Z]{1,5})\b'),
            re.compile(r'\(([A-Z]{2,5})\)'),  # Just ticker in parens like (BKSY)
        ]

        # News wire header patterns - company often appears right after location.
        # Only the city list is case-insensitive; the company capture must
        # stay case-sensitive or it swallows the rest of the sentence.
        self.news_wire_pattern = re.compile(
            r'\b(?i:HERNDON|SAN FRANCISCO|NEW YORK|LOS ANGELES|WASHINGTON|DENVER|SEATTLE|BOSTON|AUSTIN|ARLINGTON|'
            r'BOULDER|HUNTSVILLE|COLORADO SPRINGS|MOUNTAIN VIEW|PALO ALTO|REDMOND|SAN DIEGO|PHOENIX|DALLAS|'
            r'HOUSTON|CHICAGO|ATLANTA|MIAMI|TAMPA|ORLANDO|RESTON|MCLEAN|CHANTILLY|STERLING),?\s+'
            r'(?:[A-Za-z\.]+,?\s*)?--?\s*'  # State abbreviation
            r'(?:\([^)]+\)--?\s*)?'  # Optional source like (BUSINESS WIRE)
            r'([A-Z][A-Za-z]+(?:\s+[A-Z][a-zA-Z]+)*(?:\s+(?:Inc|LLC|Corp|Ltd|Technologies|Technology|Systems|Solutions|'
            r'Aerospace|Defense|Dynamics|Sciences|Space|Aviation)\.?)?)'
        )

        # University/research institution patterns
        self.institution_patterns = [
            re.compile(r'\b((?:University of [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?|'
                       r'[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?\s+University|'
                       r'MIT|Caltech|Stanford|Harvard|Carnegie Mellon|Georgia Tech|'
                       r'[A-Z][a-z]+\s+Institute of Technology))\b'),
            re.compile(r'\b((?:[A-Z][a-z]+\s+)?(?:National Laboratory|Research Center|Research Institute))\b'),
        ]

        # Known defense/space companies for quick matching.
        # Keys are lowercase keywords; values are the canonical names returned.
        self.known_companies = {
            "blacksky": "BlackSky Technology Inc.",
            "spacex": "SpaceX",
            "northrop": "Northrop Grumman",
            "lockheed": "Lockheed Martin",
            "raytheon": "Raytheon",
            "boeing": "Boeing",
            "general dynamics": "General Dynamics",
            "bae systems": "BAE Systems",
            "l3harris": "L3Harris Technologies",
            "leidos": "Leidos",
            "saic": "SAIC",
            "maxar": "Maxar Technologies",
            "planet labs": "Planet Labs",
            "capella space": "Capella Space",
            "rocket lab": "Rocket Lab",
            "aerojet": "Aerojet Rocketdyne",
            "ball aerospace": "Ball Aerospace",
            "sierra nevada": "Sierra Nevada Corporation",
            "relativity space": "Relativity Space",
            "anduril": "Anduril Industries",
            "palantir": "Palantir Technologies",
            "shield ai": "Shield AI",
            "heo robotics": "HEO Robotics",
            "mda space": "MDA Space",
            "mda": "MDA Space",
            "thales": "Thales Group",
            "airbus": "Airbus Defence and Space",
            "blue origin": "Blue Origin",
            "virgin orbit": "Virgin Orbit",
            "astra": "Astra Space",
            "spire global": "Spire Global",
            "hawkeye 360": "HawkEye 360",
            "iceye": "ICEYE",
            "umbra": "Umbra Space",
            "terran orbital": "Terran Orbital",
            "momentus": "Momentus Space",
            "ast spacemobile": "AST SpaceMobile",
            "viasat": "Viasat",
            "iridium": "Iridium Communications",
            "ses": "SES S.A.",
            "intelsat": "Intelsat",
            "eutelsat": "Eutelsat",
            "telesat": "Telesat",
            "oneweb": "OneWeb",
            "starlink": "SpaceX Starlink",
            "amazon kuiper": "Amazon Kuiper",
        }

    def extract(self, title: str, snippet: str, *, use_llm: bool = True) -> ExtractionResult:
        """
        Extract organization from title and snippet.

        Args:
            title: Article/result title (``None`` is treated as empty)
            snippet: Description/body text (``None`` is treated as empty)
            use_llm: When False, skip the LLM fallback even if a client
                is configured.

        Returns:
            ExtractionResult with organization name, confidence, and method.
            ``method`` is "none" and ``organization`` is None when nothing
            could be extracted.
        """
        # Guard against None so it is not formatted into the text as "None".
        title = title or ""
        snippet = snippet or ""
        combined_text = f"{title} {snippet}"

        # Try methods in order of speed and confidence

        # 1. Check known companies first (fastest, highest confidence)
        result = self._extract_known_company(combined_text)
        if result:
            return ExtractionResult(organization=result, confidence=0.95, method="known")

        # 2. Try stock ticker extraction (very reliable when present)
        result = self._extract_from_ticker(combined_text)
        if result:
            return ExtractionResult(organization=result, confidence=0.9, method="ticker")

        # 3. Try news wire header pattern (datelines live in the body text)
        result = self._extract_from_news_wire(snippet)
        if result:
            return ExtractionResult(organization=result, confidence=0.85, method="news_wire")

        # 4. Try company suffix pattern
        result = self._extract_from_suffix(combined_text)
        if result:
            return ExtractionResult(organization=result, confidence=0.8, method="suffix")

        # 5. Try institution patterns
        result = self._extract_institution(combined_text)
        if result:
            return ExtractionResult(organization=result, confidence=0.75, method="institution")

        # 6. Fall back to LLM if available and allowed
        if use_llm and self.llm_client:
            result = self._extract_with_llm(title, snippet)
            if result:
                return ExtractionResult(organization=result, confidence=0.7, method="llm")

        # No extraction possible
        return ExtractionResult(organization=None, confidence=0.0, method="none")

    @staticmethod
    def _contains_whole_term(text: str, term: str) -> bool:
        """Return True if ``term`` occurs in ``text`` not embedded in a longer alphanumeric run."""
        start = text.find(term)
        while start != -1:
            end = start + len(term)
            before_ok = start == 0 or not text[start - 1].isalnum()
            after_ok = end == len(text) or not text[end].isalnum()
            if before_ok and after_ok:
                return True
            start = text.find(term, start + 1)
        return False

    def _extract_known_company(self, text: str) -> Optional[str]:
        """
        Check for known defense/space companies.

        Keywords must match as whole terms; a plain substring test would
        report "SES S.A." for any text containing "businesses".
        Keywords are tried in ``known_companies`` insertion order.
        """
        text_lower = text.lower()
        for keyword, full_name in self.known_companies.items():
            if self._contains_whole_term(text_lower, keyword):
                return full_name
        return None

    def _extract_from_ticker(self, text: str) -> Optional[str]:
        """
        Extract company name associated with stock ticker.

        Handles text like "Company Name (NYSE:XXX)" or "Company Name (XXX)".
        Returns None when no ticker is found or no name can be tied to it.
        """
        for pattern in self.ticker_patterns:
            match = pattern.search(text)
            if not match:
                continue

            before_ticker = text[:match.start()]

            # Prefer a suffix-style company name in the 100 characters before
            # the ticker. Take the LAST match: it is the one closest to the
            # ticker, whereas the first may be an unrelated company mentioned
            # earlier in the sentence.
            suffix_matches = list(self.company_pattern.finditer(before_ticker[-100:]))
            if suffix_matches:
                nearest = suffix_matches[-1]
                return self._clean_company_name(f"{nearest.group(1)} {nearest.group(2)}")

            # Otherwise collect the run of capitalized words nearest the
            # ticker, looking at most five words back.
            company_words = []
            for word in reversed(before_ticker.split()[-5:]):
                clean_word = word.strip('(),-:')
                if clean_word and clean_word[0].isupper():
                    company_words.insert(0, clean_word)
                elif company_words:
                    # The capitalized run has ended.
                    break

            if company_words:
                cleaned = self._clean_company_name(" ".join(company_words))
                if cleaned:
                    return cleaned

        return None

    def _extract_from_news_wire(self, text: str) -> Optional[str]:
        """Extract from news wire format (CITY -- Company Name announced...)."""
        match = self.news_wire_pattern.search(text)
        if match:
            return self._clean_company_name(match.group(1))
        return None

    def _extract_from_suffix(self, text: str) -> Optional[str]:
        """
        Extract using company suffix patterns.

        Matches are visited in text order, so names from the title (which
        comes first in the combined text) win. The first cleaned name longer
        than three characters is returned.
        """
        for match in self.company_pattern.finditer(text):
            cleaned = self._clean_company_name(f"{match.group(1)} {match.group(2)}")
            if cleaned and len(cleaned) > 3:
                return cleaned
        return None

    def _extract_institution(self, text: str) -> Optional[str]:
        """Extract university or research institution names."""
        for pattern in self.institution_patterns:
            match = pattern.search(text)
            if match:
                return match.group(1)
        return None

    def _extract_with_llm(self, title: str, snippet: str) -> Optional[str]:
        """
        Use LLM to extract organization name as fallback.

        Returns the cleaned name, or None when no client is configured, the
        call fails, or the reply looks like a refusal / "NONE".
        """
        if not self.llm_client:
            return None

        prompt = f"""Extract the primary company or organization name from this text.

Title: {title}
Description: {snippet}

Rules:
- Return ONLY the organization name, nothing else
- If it's a company, include the suffix (Inc, LLC, Corp, etc.) if known
- If no clear organization can be identified, return "NONE"
- Do not make up or guess organization names
- Prefer companies over universities/agencies if both are mentioned

Organization name:"""

        try:
            response = self.llm_client.generate(
                prompt=prompt,
                model=self.model,
                temperature=0.0,  # Deterministic
                max_tokens=50
            )

            if response.success:
                content = (response.content or "").strip()
                # Only the first line is the answer; models sometimes append
                # an explanation on following lines.
                result = content.splitlines()[0].strip().strip('"\'') if content else ""

                # Validate result
                if result and result.upper() != "NONE" and len(result) > 2:
                    # Reject refusals. Word boundaries matter: a substring
                    # test for "no " / "none" would discard legitimate names
                    # such as "Volcano Systems".
                    if not re.search(r"\b(?:no|none|unknown|cannot)\b|i don't", result.lower()):
                        return self._clean_company_name(result) or None

        except Exception as e:
            # Best-effort fallback: the client type is unknown here, so any
            # failure is logged and treated as "no extraction".
            logger.warning("LLM extraction failed: %s", e)

        return None

    def _clean_company_name(self, name: str) -> str:
        """
        Clean and normalize company name.

        Collapses whitespace, strips leading articles/prepositions/dashes
        (repeatedly, so "by the Acme" becomes "Acme") and trailing
        punctuation. Returns "" for empty input.
        """
        if not name:
            return ""

        # Normalize whitespace first so prefix checks see the real first word.
        name = " ".join(name.split())

        # Remove common prefixes
        prefixes_to_remove = (
            "the ", "a ", "an ",
            "by ", "from ", "at ",
            "-- ", "- ",
        )

        # Repeat until stable: a single pass misses stacked prefixes whose
        # order differs from the tuple order (e.g. "by the ...").
        stripped = True
        while stripped:
            stripped = False
            name_lower = name.lower()
            for prefix in prefixes_to_remove:
                if name_lower.startswith(prefix):
                    name = name[len(prefix):]
                    stripped = True
                    break

        # Remove trailing punctuation
        name = name.rstrip('.,;:')

        return name.strip()

    def extract_batch(
        self,
        items: List[Tuple[str, str]],
        use_llm_fallback: bool = True
    ) -> List[ExtractionResult]:
        """
        Extract organizations from multiple items.

        Args:
            items: List of (title, snippet) tuples
            use_llm_fallback: Whether to use LLM for items that regex can't handle

        Returns:
            List of ExtractionResult objects, one per input item, in order
        """
        results = []

        for title, snippet in items:
            result = self.extract(title, snippet, use_llm=use_llm_fallback)

            # Log extraction for debugging
            shown_title = (title or "")[:50]
            if result.organization:
                logger.info(
                    "Extracted '%s' from '%s...' (method: %s)",
                    result.organization, shown_title, result.method,
                )
            else:
                logger.debug("No organization found in '%s...'", shown_title)

            results.append(result)

        return results