"""
Organization Extractor for TechScout.

Hybrid approach: Fast regex patterns first, LLM fallback for ambiguous cases.
Extracts company/organization names from unstructured text like news articles and web pages.
"""
|
|
|
|
import re
|
|
import logging
|
|
from typing import Optional, List, Tuple
|
|
from dataclasses import dataclass
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class ExtractionResult:
    """Result of organization extraction."""

    # Extracted organization name, or None when nothing could be extracted
    organization: Optional[str]
    # Confidence score, 0.0 to 1.0
    confidence: float
    # Extraction step that produced the result: "known", "ticker", "news_wire",
    # "suffix", "institution", "llm", or "none"
    method: str
|
|
|
|
|
|
class OrganizationExtractor:
    """
    Extracts organization names from text using hybrid regex + LLM approach.

    Strategy (steps run in this order, the first hit wins):
        1. Look up a table of known defense/space companies
        2. Try stock ticker extraction (NYSE:XXX, NASDAQ:XXX)
        3. Try news source patterns (BUSINESS WIRE, PR Newswire headers)
        4. Try fast regex patterns for common company suffixes
        5. Try university / research institution patterns
        6. Fall back to LLM for ambiguous cases
    """

    # Whole-word markers of an LLM answer that is a refusal rather than a name.
    # Word boundaries matter: a plain substring test for "no " would also
    # reject legitimate names such as "Lenovo Group".
    _LLM_REFUSAL_PATTERN = re.compile(
        r"\b(?:i don't|cannot|no|none|unknown)\b", re.IGNORECASE
    )

    def __init__(self, llm_client=None, model: str = "mistral-nemo:12b"):
        """
        Build the lookup tables and compile the regex patterns.

        Args:
            llm_client: Optional client for the LLM fallback. It is called as
                ``generate(prompt=..., model=..., temperature=..., max_tokens=...)``
                and the response is read through its ``success`` and ``content``
                attributes. When falsy, the LLM step is skipped.
            model: Model name passed to ``llm_client.generate``.
        """
        self.llm_client = llm_client
        self.model = model

        # Common company suffixes - ordered by specificity
        self.company_suffixes = [
            r"Inc\.",
            r"Inc",
            r"LLC",
            r"L\.L\.C\.",
            r"Corp\.",
            r"Corp",
            r"Corporation",
            r"Ltd\.",
            r"Ltd",
            r"Limited",
            r"Co\.",
            r"Company",
            r"Technologies",
            r"Technology",
            r"Systems",
            r"Solutions",
            r"Industries",
            r"Aerospace",
            r"Defense",
            r"Dynamics",
            r"Sciences",
            r"Labs",
            r"Laboratories",
            r"Group",
            r"Holdings",
            r"Partners",
            r"Ventures",
            r"Analytics",
            r"Robotics",
            r"Space",
            r"Aviation",
        ]

        # Build regex pattern for company names
        suffix_pattern = "|".join(self.company_suffixes)
        # Match: 1-4 capitalized words followed by a suffix.
        # Group 1 is the name words, group 2 the suffix.
        # - Only the suffix is case-insensitive ((?i:...)). A pattern-wide
        #   re.IGNORECASE would let [A-Z] match lowercase letters and turn
        #   ordinary prose ("the company") into a company name.
        # - The suffix ends with (?!\w) rather than \b, because \b can never
        #   match between "." and whitespace, which made the dotted suffixes
        #   ("Co.", "L.L.C.") unmatchable.
        self.company_pattern = re.compile(
            r'\b([A-Z][a-zA-Z]*(?:\s+[A-Z][a-zA-Z]*){0,3})'
            rf'\s+((?i:{suffix_pattern}))(?!\w)'
        )

        # Stock ticker patterns
        self.ticker_patterns = [
            re.compile(r'\((?:NYSE|NASDAQ|AMEX|OTC):\s*([A-Z]{1,5})\)', re.IGNORECASE),
            re.compile(r'(?:NYSE|NASDAQ|AMEX|OTC):\s*([A-Z]{1,5})\b', re.IGNORECASE),
            # Just ticker in parens like (BKSY). Kept case-sensitive so that an
            # ordinary parenthesized word such as "(new)" is not taken for a ticker.
            re.compile(r'\(([A-Z]{2,5})\)'),
        ]

        # News wire header patterns - company often appears right after location.
        # Only the city list is case-insensitive; the company-name group must stay
        # case-sensitive, otherwise it swallows the lowercase words after the name.
        self.news_wire_pattern = re.compile(
            r'(?i:HERNDON|SAN FRANCISCO|NEW YORK|LOS ANGELES|WASHINGTON|DENVER|SEATTLE|BOSTON|AUSTIN|ARLINGTON|'
            r'BOULDER|HUNTSVILLE|COLORADO SPRINGS|MOUNTAIN VIEW|PALO ALTO|REDMOND|SAN DIEGO|PHOENIX|DALLAS|'
            r'HOUSTON|CHICAGO|ATLANTA|MIAMI|TAMPA|ORLANDO|RESTON|MCLEAN|CHANTILLY|STERLING),?\s+'
            r'(?:[A-Za-z\.]+,?\s*)?--?\s*'  # State abbreviation
            r'(?:\([^)]+\)--?\s*)?'  # Optional source like (BUSINESS WIRE)
            r'([A-Z][A-Za-z]+(?:\s+[A-Z][a-zA-Z]+)*(?:\s+(?:Inc|LLC|Corp|Ltd|Technologies|Technology|Systems|Solutions|'
            r'Aerospace|Defense|Dynamics|Sciences|Space|Aviation)\.?)?)'
        )

        # University/research institution patterns
        self.institution_patterns = [
            re.compile(r'\b((?:University of [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?|'
                       r'[A-Z][a-z]+(?:\s+[A-Z][a-z]+)?\s+University|'
                       r'MIT|Caltech|Stanford|Harvard|Carnegie Mellon|Georgia Tech|'
                       r'[A-Z][a-z]+\s+Institute of Technology))\b'),
            re.compile(r'\b((?:[A-Z][a-z]+\s+)?(?:National Laboratory|Research Center|Research Institute))\b'),
        ]

        # Known defense/space companies for quick matching.
        # Keys are lowercase keywords; values are the canonical names returned.
        self.known_companies = {
            "blacksky": "BlackSky Technology Inc.",
            "spacex": "SpaceX",
            "northrop": "Northrop Grumman",
            "lockheed": "Lockheed Martin",
            "raytheon": "Raytheon",
            "boeing": "Boeing",
            "general dynamics": "General Dynamics",
            "bae systems": "BAE Systems",
            "l3harris": "L3Harris Technologies",
            "leidos": "Leidos",
            "saic": "SAIC",
            "maxar": "Maxar Technologies",
            "planet labs": "Planet Labs",
            "capella space": "Capella Space",
            "rocket lab": "Rocket Lab",
            "aerojet": "Aerojet Rocketdyne",
            "ball aerospace": "Ball Aerospace",
            "sierra nevada": "Sierra Nevada Corporation",
            "relativity space": "Relativity Space",
            "anduril": "Anduril Industries",
            "palantir": "Palantir Technologies",
            "shield ai": "Shield AI",
            "heo robotics": "HEO Robotics",
            "mda space": "MDA Space",
            "mda": "MDA Space",
            "thales": "Thales Group",
            "airbus": "Airbus Defence and Space",
            "blue origin": "Blue Origin",
            "virgin orbit": "Virgin Orbit",
            "astra": "Astra Space",
            "spire global": "Spire Global",
            "hawkeye 360": "HawkEye 360",
            "iceye": "ICEYE",
            "umbra": "Umbra Space",
            "terran orbital": "Terran Orbital",
            "momentus": "Momentus Space",
            "ast spacemobile": "AST SpaceMobile",
            "viasat": "Viasat",
            "iridium": "Iridium Communications",
            "ses": "SES S.A.",
            "intelsat": "Intelsat",
            "eutelsat": "Eutelsat",
            "telesat": "Telesat",
            "oneweb": "OneWeb",
            "starlink": "SpaceX Starlink",
            "amazon kuiper": "Amazon Kuiper",
        }

    def extract(self, title: str, snippet: str, *, use_llm: bool = True) -> "ExtractionResult":
        """
        Extract organization from title and snippet.

        Args:
            title: Article/result title (``None`` is treated as empty)
            snippet: Description/body text (``None`` is treated as empty)
            use_llm: When False, the LLM fallback is skipped even if an
                ``llm_client`` is configured

        Returns:
            ExtractionResult with organization name, confidence, and method.
            ``method`` is one of "known", "ticker", "news_wire", "suffix",
            "institution", "llm", or "none".
        """
        title = title or ""
        snippet = snippet or ""
        combined_text = f"{title} {snippet}"

        # Try methods in order of speed and confidence.
        # Each entry: (method label, confidence, extractor, text it runs on).
        # The news wire header is only looked for in the snippet.
        steps = (
            ("known", 0.95, self._extract_known_company, combined_text),
            ("ticker", 0.9, self._extract_from_ticker, combined_text),
            ("news_wire", 0.85, self._extract_from_news_wire, snippet),
            ("suffix", 0.8, self._extract_from_suffix, combined_text),
            ("institution", 0.75, self._extract_institution, combined_text),
        )
        for method, confidence, extractor, text in steps:
            organization = extractor(text)
            if organization:
                return ExtractionResult(organization=organization, confidence=confidence, method=method)

        # Fall back to LLM if available and allowed
        if use_llm and self.llm_client:
            organization = self._extract_with_llm(title, snippet)
            if organization:
                return ExtractionResult(organization=organization, confidence=0.7, method="llm")

        # No extraction possible
        return ExtractionResult(organization=None, confidence=0.0, method="none")

    def _extract_known_company(self, text: str) -> Optional[str]:
        """
        Check for known defense/space companies.

        Keywords are tried in ``known_companies`` order and must appear as whole
        words, so "mosaic" does not hit "saic" and "businesses" does not hit "ses".

        Returns:
            The canonical name of the first keyword found, or None.
        """
        text_lower = text.lower()
        for keyword, full_name in self.known_companies.items():
            # Cheap substring test first; the regex only confirms word boundaries.
            if keyword in text_lower and re.search(
                rf'(?<!\w){re.escape(keyword)}(?!\w)', text_lower
            ):
                return full_name
        return None

    def _extract_from_ticker(self, text: str) -> Optional[str]:
        """
        Extract company name associated with stock ticker.

        Handles text like "Company Name (NYSE:XXX)" or "Company Name (XXX)".

        Returns:
            The cleaned company name found just before a ticker, or None.
        """
        for pattern in self.ticker_patterns:
            match = pattern.search(text)
            if not match:
                continue

            before_ticker = text[:match.start()]

            # Prefer a suffix-style name in the 100 characters before the ticker.
            # Take the LAST match: it is the one adjacent to the ticker, whereas
            # the first may be a different company mentioned earlier.
            suffix_matches = list(self.company_pattern.finditer(before_ticker[-100:]))
            if suffix_matches:
                nearest = suffix_matches[-1]
                name = self._clean_company_name(f"{nearest.group(1)} {nearest.group(2)}")
                if name:
                    return name

            # Otherwise walk backwards over the last few words and collect the
            # capitalized run closest to the ticker.
            company_words: List[str] = []
            for word in reversed(before_ticker.split()[-5:]):
                clean_word = word.strip('(),-:')
                if clean_word and clean_word[0].isupper():
                    company_words.insert(0, clean_word)
                elif company_words:
                    # The capitalized run has ended
                    break
            if company_words:
                name = self._clean_company_name(" ".join(company_words))
                if name:
                    return name
        return None

    def _extract_from_news_wire(self, text: str) -> Optional[str]:
        """Extract from news wire format (CITY -- Company Name announced...)."""
        match = self.news_wire_pattern.search(text)
        if match:
            return self._clean_company_name(match.group(1))
        return None

    def _extract_from_suffix(self, text: str) -> Optional[str]:
        """
        Extract using company suffix patterns.

        Matches are scanned left to right, so names in the title (earlier in the
        combined text) are preferred.

        Returns:
            The first cleaned match longer than 3 characters, or None.
        """
        for match in self.company_pattern.finditer(text):
            cleaned = self._clean_company_name(f"{match.group(1)} {match.group(2)}")
            if cleaned and len(cleaned) > 3:
                return cleaned
        return None

    def _extract_institution(self, text: str) -> Optional[str]:
        """Extract university or research institution names."""
        for pattern in self.institution_patterns:
            match = pattern.search(text)
            if match:
                return match.group(1)
        return None

    def _extract_with_llm(self, title: str, snippet: str) -> Optional[str]:
        """
        Use LLM to extract organization name as fallback.

        Best effort: any exception raised by the client is logged as a warning
        and results in None.

        Returns:
            The cleaned name from the LLM answer, or None when no client is
            configured, the call fails, or the answer looks like a refusal.
        """
        if not self.llm_client:
            return None

        prompt = (
            "Extract the primary company or organization name from this text.\n"
            "\n"
            f"Title: {title}\n"
            f"Description: {snippet}\n"
            "\n"
            "Rules:\n"
            "- Return ONLY the organization name, nothing else\n"
            "- If it's a company, include the suffix (Inc, LLC, Corp, etc.) if known\n"
            '- If no clear organization can be identified, return "NONE"\n'
            "- Do not make up or guess organization names\n"
            "- Prefer companies over universities/agencies if both are mentioned\n"
            "\n"
            "Organization name:"
        )

        try:
            response = self.llm_client.generate(
                prompt=prompt,
                model=self.model,
                temperature=0.0,  # Deterministic
                max_tokens=50,
            )

            if response.success:
                result = response.content.strip()
                # Validate result: non-trivial and not a refusal/"NONE" answer
                if (
                    result
                    and result.upper() != "NONE"
                    and len(result) > 2
                    and not self._LLM_REFUSAL_PATTERN.search(result)
                ):
                    return self._clean_company_name(result)
        except Exception as e:
            # Deliberate catch-all around the external client call
            logger.warning("LLM extraction failed: %s", e)

        return None

    def _clean_company_name(self, name: str) -> str:
        """
        Clean and normalize company name.

        Collapses whitespace, strips leading articles/prepositions/dashes
        (repeatedly, so "by the Acme" becomes "Acme") and removes trailing
        punctuation.

        Returns:
            The cleaned name, or "" for empty input.
        """
        if not name:
            return ""

        # Normalize whitespace first so the prefix and punctuation checks below
        # see the real start and end of the name.
        name = " ".join(name.split())

        # Remove common prefixes
        prefixes_to_remove = (
            "the ", "a ", "an ",
            "by ", "from ", "at ",
            "-- ", "- ",
        )
        stripped = True
        while stripped:
            stripped = False
            name_lower = name.lower()
            for prefix in prefixes_to_remove:
                if name_lower.startswith(prefix):
                    name = name[len(prefix):].lstrip()
                    stripped = True
                    break

        # Remove trailing punctuation (and any whitespace left in front of it)
        return name.rstrip('.,;: ')

    def extract_batch(
        self,
        items: List[Tuple[str, str]],
        use_llm_fallback: bool = True
    ) -> List["ExtractionResult"]:
        """
        Extract organizations from multiple items.

        Args:
            items: List of (title, snippet) tuples
            use_llm_fallback: Whether to use LLM for items that regex can't handle

        Returns:
            List of ExtractionResult objects, one per item, in input order
        """
        results = []

        for title, snippet in items:
            result = self.extract(title, snippet, use_llm=use_llm_fallback)

            # Log extraction for debugging
            title_preview = (title or "")[:50]
            if result.organization:
                logger.info(
                    "Extracted '%s' from '%s...' (method: %s)",
                    result.organization, title_preview, result.method,
                )
            else:
                logger.debug("No organization found in '%s...'", title_preview)

            results.append(result)

        return results
|