# Source: TechScout/techscout/extraction/org_extractor.py (370 lines, 14 KiB, Python)

"""
Organization Extractor for TechScout.
Hybrid approach: Fast regex patterns first, LLM fallback for ambiguous cases.
Extracts company/organization names from unstructured text like news articles and web pages.
"""
import re
import logging
from typing import Optional, List, Tuple
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class ExtractionResult:
    """Result of organization extraction."""

    # Extracted organization name, or None when no strategy produced one.
    organization: Optional[str]
    # Heuristic confidence in the extraction, 0.0 to 1.0.
    confidence: float
    # Label of the strategy that produced the result. OrganizationExtractor
    # emits "known", "ticker", "news_wire", "suffix", "institution", "llm",
    # or "none" (nothing found).
    method: str
class OrganizationExtractor:
    """
    Extracts organization names from text using a hybrid regex + LLM approach.

    Strategies are tried in order of speed and confidence; the first hit wins:
    1. Known defense/space company keywords (whole-word match)
    2. Stock ticker context (NYSE:XXX, NASDAQ:XXX, or a bare "(XXX)")
    3. News wire dateline ("CITY, St. -- Company Name announced ...")
    4. Capitalized words followed by a common company suffix
    5. University / research institution patterns
    6. LLM fallback, only when a client is configured
    """

    # Refusal / non-answer markers in an LLM reply. Matched as whole words so
    # legitimate names such as "Volcano Systems" or "Juno Aerospace" survive.
    _LLM_REFUSAL_PATTERN = re.compile(r"\b(?:i don't|cannot|no|none|unknown)\b")

    def __init__(self, llm_client=None, model: str = "mistral-nemo:12b"):
        """
        Build the regex tables used by the extraction strategies.

        Args:
            llm_client: Optional client exposing
                ``generate(prompt=..., model=..., temperature=..., max_tokens=...)``
                whose return value has ``success`` and ``content`` attributes.
                When None, the LLM fallback is skipped.
            model: Model name forwarded to ``llm_client.generate``.
        """
        self.llm_client = llm_client
        self.model = model

        # Common company suffixes - dotted variants come before bare ones so
        # the alternation prefers the longer form.
        self.company_suffixes = [
            r"Inc\.",
            r"Inc",
            r"LLC",
            r"L\.L\.C\.",
            r"Corp\.",
            r"Corp",
            r"Corporation",
            r"Ltd\.",
            r"Ltd",
            r"Limited",
            r"Co\.",
            r"Company",
            r"Technologies",
            r"Technology",
            r"Systems",
            r"Solutions",
            r"Industries",
            r"Aerospace",
            r"Defense",
            r"Dynamics",
            r"Sciences",
            r"Labs",
            r"Laboratories",
            r"Group",
            r"Holdings",
            r"Partners",
            r"Ventures",
            r"Analytics",
            r"Robotics",
            r"Space",
            r"Aviation",
        ]
        suffix_pattern = "|".join(self.company_suffixes)

        # Match: 1-4 capitalized words followed by a suffix.
        # Deliberately case-sensitive: with IGNORECASE "[A-Z]" also matches
        # lowercase letters and any run of ordinary words would qualify.
        # "(?!\w)" replaces a trailing "\b" because "\b" can never match after
        # the "." of "Co." / "Inc." when whitespace or end-of-text follows.
        name_words = r"[A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*){0,3}"
        self.company_pattern = re.compile(
            rf"\b({name_words})\s+({suffix_pattern})(?!\w)"
        )

        # Stock ticker patterns
        self.ticker_patterns = [
            re.compile(r'\((?:NYSE|NASDAQ|AMEX|OTC):\s*([A-Z]{1,5})\)', re.IGNORECASE),
            re.compile(r'(?:NYSE|NASDAQ|AMEX|OTC):\s*([A-Z]{1,5})\b', re.IGNORECASE),
            # Just a ticker in parens like (BKSY). Case-sensitive, otherwise
            # every short parenthesised word such as "(see)" looks like one.
            re.compile(r'\(([A-Z]{2,5})\)'),
        ]

        # News wire header patterns - company often appears right after location.
        # Only the city list is case-insensitive (scoped inline flag); the
        # captured name must be capitalized words so it stops at the first
        # lowercase word instead of swallowing "announced today ...".
        self.news_wire_pattern = re.compile(
            r'(?i:HERNDON|SAN FRANCISCO|NEW YORK|LOS ANGELES|WASHINGTON|DENVER|SEATTLE|BOSTON|AUSTIN|ARLINGTON|'
            r'BOULDER|HUNTSVILLE|COLORADO SPRINGS|MOUNTAIN VIEW|PALO ALTO|REDMOND|SAN DIEGO|PHOENIX|DALLAS|'
            r'HOUSTON|CHICAGO|ATLANTA|MIAMI|TAMPA|ORLANDO|RESTON|MCLEAN|CHANTILLY|STERLING),?\s+'
            r'(?:[A-Za-z\.]+,?\s*)?--?\s*'  # State abbreviation
            r'(?:\([^)]+\)--?\s*)?'  # Optional source like (BUSINESS WIRE)
            r'([A-Z][A-Za-z]+(?:\s+[A-Z][a-zA-Z]+)*(?:\s+(?:Inc|LLC|Corp|Ltd|Technologies|Technology|Systems|Solutions|'
            r'Aerospace|Defense|Dynamics|Sciences|Space|Aviation)\.?)?)'
        )

        # University/research institution patterns
        self.institution_patterns = [
            re.compile(r'\b((?:University of [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?|'
                       r'[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?\s+University|'
                       r'MIT|Caltech|Stanford|Harvard|Carnegie Mellon|Georgia Tech|'
                       r'[A-Z][a-z]+\s+Institute of Technology))\b'),
            re.compile(r'\b((?:[A-Z][a-z]+\s+)?(?:National Laboratory|Research Center|Research Institute))\b'),
        ]

        # Known defense/space companies for quick matching.
        # Keys are lowercase keywords; insertion order is the match priority.
        self.known_companies = {
            "blacksky": "BlackSky Technology Inc.",
            "spacex": "SpaceX",
            "northrop": "Northrop Grumman",
            "lockheed": "Lockheed Martin",
            "raytheon": "Raytheon",
            "boeing": "Boeing",
            "general dynamics": "General Dynamics",
            "bae systems": "BAE Systems",
            "l3harris": "L3Harris Technologies",
            "leidos": "Leidos",
            "saic": "SAIC",
            "maxar": "Maxar Technologies",
            "planet labs": "Planet Labs",
            "capella space": "Capella Space",
            "rocket lab": "Rocket Lab",
            "aerojet": "Aerojet Rocketdyne",
            "ball aerospace": "Ball Aerospace",
            "sierra nevada": "Sierra Nevada Corporation",
            "relativity space": "Relativity Space",
            "anduril": "Anduril Industries",
            "palantir": "Palantir Technologies",
            "shield ai": "Shield AI",
            "heo robotics": "HEO Robotics",
            "mda space": "MDA Space",
            "mda": "MDA Space",
            "thales": "Thales Group",
            "airbus": "Airbus Defence and Space",
            "blue origin": "Blue Origin",
            "virgin orbit": "Virgin Orbit",
            "astra": "Astra Space",
            "spire global": "Spire Global",
            "hawkeye 360": "HawkEye 360",
            "iceye": "ICEYE",
            "umbra": "Umbra Space",
            "terran orbital": "Terran Orbital",
            "momentus": "Momentus Space",
            "ast spacemobile": "AST SpaceMobile",
            "viasat": "Viasat",
            "iridium": "Iridium Communications",
            "ses": "SES S.A.",
            "intelsat": "Intelsat",
            "eutelsat": "Eutelsat",
            "telesat": "Telesat",
            "oneweb": "OneWeb",
            "starlink": "SpaceX Starlink",
            "amazon kuiper": "Amazon Kuiper",
        }

    def extract(self, title: str, snippet: str, use_llm: bool = True) -> "ExtractionResult":
        """
        Extract organization from title and snippet.

        Args:
            title: Article/result title (None is treated as empty)
            snippet: Description/body text (None is treated as empty)
            use_llm: When False, skip the LLM fallback even if a client is
                configured. Defaults to True.

        Returns:
            ExtractionResult with organization name, confidence, and method
        """
        organization, confidence, method = self._locate_organization(title, snippet, use_llm)
        return ExtractionResult(organization=organization, confidence=confidence, method=method)

    def _locate_organization(
        self,
        title: Optional[str],
        snippet: Optional[str],
        use_llm: bool = True,
    ) -> Tuple[Optional[str], float, str]:
        """Run the strategies in priority order; return (organization, confidence, method)."""
        title = title or ""
        snippet = snippet or ""
        combined_text = f"{title} {snippet}"

        # (method label, confidence, extractor, input text) - fastest and most
        # reliable first. The news-wire dateline only appears in body text, so
        # that stage sees the snippet alone.
        stages = (
            ("known", 0.95, self._extract_known_company, combined_text),
            ("ticker", 0.9, self._extract_from_ticker, combined_text),
            ("news_wire", 0.85, self._extract_from_news_wire, snippet),
            ("suffix", 0.8, self._extract_from_suffix, combined_text),
            ("institution", 0.75, self._extract_institution, combined_text),
        )
        for method, confidence, extractor, text in stages:
            organization = extractor(text)
            if organization:
                return organization, confidence, method

        # Fall back to LLM if allowed and available
        if use_llm and self.llm_client:
            organization = self._extract_with_llm(title, snippet)
            if organization:
                return organization, 0.7, "llm"

        # No extraction possible
        return None, 0.0, "none"

    def _extract_known_company(self, text: str) -> Optional[str]:
        """
        Check for known defense/space companies.

        Keywords must appear as whole words: a plain substring test lets short
        keys such as "ses" or "astra" fire inside "processes" or "AstraZeneca".
        Keywords are tried in ``known_companies`` order; the first one present
        anywhere in the text wins.
        """
        text_lower = text.lower()
        for keyword, full_name in self.known_companies.items():
            # Cheap substring test first; the regex only confirms boundaries.
            if keyword in text_lower and re.search(
                rf"(?<!\w){re.escape(keyword)}(?!\w)", text_lower
            ):
                return full_name
        return None

    def _extract_from_ticker(self, text: str) -> Optional[str]:
        """
        Extract company name associated with stock ticker.

        Looks for text like "Company Name (NYSE:XXX)" or "Company Name (XXX)"
        and returns the company name found just before the ticker, or None.
        """
        for pattern in self.ticker_patterns:
            match = pattern.search(text)
            if not match:
                continue

            before_ticker = text[:match.start()]

            # Suffix-style names in the 100 characters before the ticker;
            # prefer the one closest to the ticker itself.
            suffix_hits = list(self.company_pattern.finditer(before_ticker[-100:]))
            if suffix_hits:
                nearest = suffix_hits[-1]
                return self._clean_company_name(f"{nearest.group(1)} {nearest.group(2)}")

            # Otherwise take the run of capitalized words that ends nearest
            # to the ticker, looking at most five words back.
            company_words = []
            for word in reversed(before_ticker.split()[-5:]):
                clean_word = word.strip('(),-:')
                if clean_word and clean_word[0].isupper():
                    company_words.insert(0, clean_word)
                elif company_words:
                    break
            if company_words:
                return " ".join(company_words)
        return None

    def _extract_from_news_wire(self, text: str) -> Optional[str]:
        """Extract from news wire format (CITY -- Company Name announced...)."""
        match = self.news_wire_pattern.search(text)
        if match:
            return self._clean_company_name(match.group(1))
        return None

    def _extract_from_suffix(self, text: str) -> Optional[str]:
        """
        Extract using company suffix patterns.

        Returns the first match (so title text wins over snippet text in the
        combined string) whose cleaned name is longer than three characters.
        """
        for match in self.company_pattern.finditer(text):
            cleaned = self._clean_company_name(f"{match.group(1)} {match.group(2)}")
            if cleaned and len(cleaned) > 3:
                return cleaned
        return None

    def _extract_institution(self, text: str) -> Optional[str]:
        """Extract university or research institution names."""
        for pattern in self.institution_patterns:
            match = pattern.search(text)
            if match:
                return match.group(1)
        return None

    def _extract_with_llm(self, title: str, snippet: str) -> Optional[str]:
        """
        Use LLM to extract organization name as fallback.

        Returns the cleaned name, or None when no client is configured, the
        call fails, or the reply looks like a refusal / non-answer.
        """
        if not self.llm_client:
            return None

        prompt = f"""Extract the primary company or organization name from this text.
Title: {title}
Description: {snippet}
Rules:
- Return ONLY the organization name, nothing else
- If it's a company, include the suffix (Inc, LLC, Corp, etc.) if known
- If no clear organization can be identified, return "NONE"
- Do not make up or guess organization names
- Prefer companies over universities/agencies if both are mentioned
Organization name:"""

        try:
            response = self.llm_client.generate(
                prompt=prompt,
                model=self.model,
                temperature=0.0,  # Deterministic
                max_tokens=50,
            )
            if not response.success:
                return None
            answer = (response.content or "").strip()
        except Exception as e:
            # Best-effort fallback: a failing LLM must not break extraction.
            logger.warning("LLM extraction failed: %s", e)
            return None

        if not answer:
            return None

        # Models sometimes quote the name or append an explanation on later
        # lines; keep only the first line, without surrounding quotes.
        candidate = answer.splitlines()[0].strip().strip('"\'`').strip()
        if not candidate or candidate.upper() == "NONE" or len(candidate) <= 2:
            return None
        if self._LLM_REFUSAL_PATTERN.search(candidate.lower()):
            return None
        return self._clean_company_name(candidate)

    def _clean_company_name(self, name: str) -> str:
        """
        Clean and normalize company name.

        Collapses whitespace, drops leading articles/prepositions/dashes and
        trailing punctuation. Returns "" for empty input.
        """
        if not name:
            return ""

        # Normalize whitespace first so the prefix and trailing-punctuation
        # checks are not defeated by stray leading/trailing spaces.
        name = " ".join(name.split())

        # Each prefix is checked once, in this order.
        prefixes_to_remove = (
            "the ", "a ", "an ",
            "by ", "from ", "at ",
            "-- ", "- ",
        )
        for prefix in prefixes_to_remove:
            if name.lower().startswith(prefix):
                name = name[len(prefix):]

        return name.rstrip('.,;:').strip()

    def extract_batch(
        self,
        items: List[Tuple[str, str]],
        use_llm_fallback: bool = True
    ) -> List["ExtractionResult"]:
        """
        Extract organizations from multiple items.

        Args:
            items: List of (title, snippet) tuples
            use_llm_fallback: Whether to use LLM for items that regex can't handle

        Returns:
            List of ExtractionResult objects, in the same order as ``items``
        """
        results = []
        for title, snippet in items:
            result = self.extract(title, snippet, use_llm=use_llm_fallback)

            # Log extraction for debugging
            preview = (title or "")[:50]
            if result.organization:
                logger.info(
                    "Extracted '%s' from '%s...' (method: %s)",
                    result.organization, preview, result.method,
                )
            else:
                logger.debug("No organization found in '%s...'", preview)
            results.append(result)
        return results