"""
|
||
|
|
Technology Grouper for TechScout.
|
||
|
|
|
||
|
|
Groups duplicate technologies across sources into single entities.
|
||
|
|
Uses conservative heuristic matching to avoid false merges.
|
||
|
|
|
||
|
|
This is Stage 4 of the Capability-Technology Matching pipeline.
|
||
|
|
"""
|
||
|
|
|
||
|
|
import logging
|
||
|
|
import uuid
|
||
|
|
from typing import List, Dict, Set, Tuple, Optional
|
||
|
|
from dataclasses import dataclass
|
||
|
|
from collections import defaultdict
|
||
|
|
|
||
|
|
from .types import ExtractedTechnology, GroupedTechnology, SourceEvidence, Developer
|
||
|
|
|
||
|
|
logger = logging.getLogger(__name__)
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class SimilarityScore:
    """Pairwise similarity metrics between two extracted technologies.

    Produced by TechnologyGrouper._calculate_similarity and consumed by
    TechnologyGrouper._should_merge when deciding whether two extracted
    records describe the same underlying technology.
    """
    name_similarity: float  # 0-1, token-based string similarity of the names
    developer_match: bool  # True when developers are identical or near-identical
    capability_overlap: float  # 0-1, Jaccard similarity of capability keywords
    description_similarity: float  # 0-1, keyword overlap of the descriptions
    overall: float  # 0-1, weighted combination of the scores above
|
||
|
|
|
||
|
|
|
||
|
|
class TechnologyGrouper:
    """
    Groups extracted technologies that describe the same underlying technology.

    Uses conservative matching:
    - Same developer + similar name → definite merge
    - Very similar name + similar capabilities → likely merge
    - Otherwise → keep separate (avoid false merges)
    """

    # Thresholds for grouping decisions
    HIGH_CONFIDENCE_NAME_THRESHOLD = 0.85
    MEDIUM_CONFIDENCE_NAME_THRESHOLD = 0.70
    CAPABILITY_OVERLAP_THRESHOLD = 0.40
    SAME_DEVELOPER_NAME_THRESHOLD = 0.65

    def group(self, technologies: List[ExtractedTechnology]) -> List[GroupedTechnology]:
        """
        Group extracted technologies into unique technologies.

        Args:
            technologies: List of extracted technologies (may have duplicates)

        Returns:
            List of grouped technologies (deduplicated)
        """
        if not technologies:
            return []

        logger.info(f"Grouping {len(technologies)} extracted technologies...")

        # Greedy agglomeration: each tech either joins the best-matching
        # existing group or founds a new group keyed by its own id.
        groups: Dict[str, List[ExtractedTechnology]] = {}
        tech_to_group: Dict[str, str] = {}

        for tech in technologies:
            merge_group_id = self._find_merge_candidate(tech, groups, tech_to_group)
            if merge_group_id:
                groups[merge_group_id].append(tech)
                tech_to_group[tech.id] = merge_group_id
            else:
                groups[tech.id] = [tech]
                tech_to_group[tech.id] = tech.id

        grouped_technologies = [self._merge_group(members) for members in groups.values()]

        logger.info(f"Grouped into {len(grouped_technologies)} unique technologies")

        return grouped_technologies

    def _find_merge_candidate(
        self,
        tech: ExtractedTechnology,
        groups: Dict[str, List[ExtractedTechnology]],
        tech_to_group: Dict[str, str]
    ) -> Optional[str]:
        """Find an existing group to merge this technology into.

        Compares against each group's first (canonical) member and returns
        the id of the highest-confidence mergeable group, or None when no
        group passes the merge thresholds.

        NOTE: ``tech_to_group`` is currently unread; it is kept to preserve
        the call signature used by group().
        """
        best_match_group = None
        best_match_score = 0.0

        for group_id, group_techs in groups.items():
            # Compare against the first (canonical) tech in the group
            representative = group_techs[0]

            similarity = self._calculate_similarity(tech, representative)
            should_merge, confidence = self._should_merge(similarity)

            if should_merge and confidence > best_match_score:
                best_match_group = group_id
                best_match_score = confidence

        return best_match_group

    def _calculate_similarity(
        self,
        tech1: ExtractedTechnology,
        tech2: ExtractedTechnology
    ) -> SimilarityScore:
        """Calculate similarity between two technologies.

        Combines name similarity, developer identity, capability overlap and
        description overlap into a single SimilarityScore. Name similarity is
        weighted more heavily when the developers do not match, since the
        name is then the main evidence.
        """
        # Name similarity (fuzzy string match)
        name_sim = self._string_similarity(tech1.name, tech2.name)

        # Developer match: exact, fuzzy (>0.8), or substring containment
        dev1 = (tech1.developer or "").lower().strip()
        dev2 = (tech2.developer or "").lower().strip()
        developer_match = False
        if dev1 and dev2:
            developer_match = (
                dev1 == dev2 or
                self._string_similarity(dev1, dev2) > 0.8 or
                dev1 in dev2 or
                dev2 in dev1
            )

        # Capability overlap (Jaccard similarity of extracted keywords)
        caps1 = set(self._extract_keywords(tech1.capabilities))
        caps2 = set(self._extract_keywords(tech2.capabilities))
        cap_overlap = self._jaccard_similarity(caps1, caps2)

        # Description similarity (keyword overlap)
        desc1_words = set(self._extract_keywords([tech1.description]))
        desc2_words = set(self._extract_keywords([tech2.description]))
        desc_sim = self._jaccard_similarity(desc1_words, desc2_words)

        # Overall score: weights sum to 1.0 in both branches
        if developer_match:
            overall = 0.5 * name_sim + 0.3 * cap_overlap + 0.2 * desc_sim
        else:
            overall = 0.6 * name_sim + 0.25 * cap_overlap + 0.15 * desc_sim

        return SimilarityScore(
            name_similarity=name_sim,
            developer_match=developer_match,
            capability_overlap=cap_overlap,
            description_similarity=desc_sim,
            overall=overall
        )

    def _should_merge(self, similarity: SimilarityScore) -> Tuple[bool, float]:
        """
        Decide whether to merge based on similarity scores.

        Rules are checked from strongest to weakest evidence; the first one
        that fires determines the confidence.

        Returns (should_merge, confidence)
        """
        # High confidence: Same developer + reasonably similar name
        if similarity.developer_match and similarity.name_similarity > self.SAME_DEVELOPER_NAME_THRESHOLD:
            return True, 0.9

        # High confidence: Very similar names
        if similarity.name_similarity > self.HIGH_CONFIDENCE_NAME_THRESHOLD:
            return True, 0.85

        # Medium confidence: Similar name + good capability overlap
        if (similarity.name_similarity > self.MEDIUM_CONFIDENCE_NAME_THRESHOLD and
                similarity.capability_overlap > self.CAPABILITY_OVERLAP_THRESHOLD):
            return True, 0.7

        # Same developer but different names - might be related but keep
        # separate unless capabilities strongly overlap
        if similarity.developer_match and similarity.name_similarity > 0.5:
            if similarity.capability_overlap > 0.6:
                return True, 0.6

        return False, 0.0

    def _merge_group(self, technologies: List[ExtractedTechnology]) -> GroupedTechnology:
        """Merge multiple extracted technologies into one grouped technology."""
        if len(technologies) == 1:
            return self._single_to_grouped(technologies[0])

        # Choose canonical name (most specific/complete)
        canonical_name = self._choose_canonical_name([t.name for t in technologies])

        # Collect alternate names; dict.fromkeys dedupes while preserving
        # encounter order (list(set(...)) would be nondeterministic)
        alternate_names = list(dict.fromkeys(
            t.name for t in technologies if t.name != canonical_name
        ))

        # Most common technology type wins; ties resolve to first seen
        type_counts = defaultdict(int)
        for t in technologies:
            type_counts[t.technology_type] += 1
        technology_type = max(type_counts, key=type_counts.get) if type_counts else "system"

        # Merge descriptions (choose longest/most detailed)
        descriptions = [t.description for t in technologies if t.description]
        description = max(descriptions, key=len) if descriptions else ""

        # Merge capabilities, deduplicated case-insensitively but keeping
        # the first-seen original casing
        all_capabilities = []
        seen_caps = set()
        for t in technologies:
            for cap in t.capabilities:
                cap_lower = cap.lower()
                if cap_lower not in seen_caps:
                    seen_caps.add(cap_lower)
                    all_capabilities.append(cap)

        # Merge mechanisms (choose longest/most detailed)
        mechanisms = [t.mechanism for t in technologies if t.mechanism]
        mechanism = max(mechanisms, key=len) if mechanisms else None

        developers = self._collect_developers(technologies)
        trl_estimate, trl_confidence, trl_evidence = self._best_trl(technologies)

        # One evidence record per contributing source
        sources = [self._to_source_evidence(t) for t in technologies]

        return GroupedTechnology(
            id=str(uuid.uuid4())[:8],
            canonical_name=canonical_name,
            alternate_names=alternate_names,
            technology_type=technology_type,
            description=description,
            capabilities=all_capabilities,
            mechanism=mechanism,
            developers=developers,
            trl_estimate=trl_estimate,
            trl_confidence=trl_confidence,
            trl_evidence=trl_evidence,
            sources=sources,
            source_count=len(sources),
            # Multi-source merges carry some false-merge risk, hence < 1.0
            # (single-item groups take the _single_to_grouped path above)
            grouping_confidence=0.8,
            grouped_from=[t.id for t in technologies],
        )

    def _collect_developers(self, technologies: List[ExtractedTechnology]) -> List[Developer]:
        """Collect unique developers across the group (case-insensitive on name)."""
        developers = []
        seen_devs = set()
        for t in technologies:
            if t.developer and t.developer.lower() not in seen_devs:
                seen_devs.add(t.developer.lower())
                developers.append(Developer(
                    name=t.developer,
                    type=t.developer_type or "unknown"
                ))
        return developers

    def _best_trl(self, technologies: List[ExtractedTechnology]):
        """Pick the highest-confidence TRL estimate and pool the evidence.

        SBIR/patent sources are treated as more reliable (0.9) than other
        source types (0.7). Evidence from every tech that has a TRL estimate
        is pooled and deduplicated preserving encounter order.

        Returns (trl_estimate, trl_confidence, trl_evidence).
        """
        trl_estimate = None
        trl_confidence = 0.0
        trl_evidence = []
        for t in technologies:
            if t.trl_estimate:
                conf = 0.9 if t.source_type in ("sbir", "patent") else 0.7
                if conf > trl_confidence:
                    trl_estimate = t.trl_estimate
                    trl_confidence = conf
                trl_evidence.extend(t.trl_evidence)
        return trl_estimate, trl_confidence, list(dict.fromkeys(trl_evidence))

    def _single_to_grouped(self, tech: ExtractedTechnology) -> GroupedTechnology:
        """Convert a single extracted technology to grouped format."""
        developers = []
        if tech.developer:
            developers.append(Developer(
                name=tech.developer,
                type=tech.developer_type or "unknown"
            ))

        return GroupedTechnology(
            id=tech.id,
            canonical_name=tech.name,
            alternate_names=[],
            technology_type=tech.technology_type,
            description=tech.description,
            capabilities=tech.capabilities,
            mechanism=tech.mechanism,
            developers=developers,
            trl_estimate=tech.trl_estimate,
            trl_confidence=0.7,  # single, uncorroborated source
            trl_evidence=tech.trl_evidence,
            sources=[self._to_source_evidence(tech)],
            source_count=1,
            grouping_confidence=1.0,  # nothing was merged
            grouped_from=[tech.id],
        )

    def _to_source_evidence(self, tech: ExtractedTechnology) -> SourceEvidence:
        """Build the SourceEvidence record describing where *tech* was found."""
        return SourceEvidence(
            source_type=tech.source_type,
            source_name=tech.source_type.upper(),
            title=tech.source_title,
            url=tech.source_url,
            snippet=tech.source_snippet,
            contribution=self._determine_contribution(tech),
        )

    def _choose_canonical_name(self, names: List[str]) -> str:
        """Choose the most specific/complete name as canonical."""
        if not names:
            return "Unknown Technology"

        # Prefer longer, more specific names, but penalize overly long ones
        # (likely full document titles rather than technology names)
        def name_score(name: str) -> float:
            length = len(name)
            if length < 5:
                return 0.3
            elif length > 100:
                return 0.4
            elif length > 50:
                return 0.6
            else:
                return 0.8 + (length / 100)

        return max(names, key=name_score)

    def _determine_contribution(self, tech: ExtractedTechnology) -> str:
        """Determine what this source contributes to the technology profile.

        Maps the source type to a human-readable list of contribution tags,
        plus "developer info" whenever the record names a developer.
        """
        contributions = []

        if tech.source_type == "sbir":
            contributions.append("R&D funding")
            contributions.append("development status")
        elif tech.source_type == "patent":
            contributions.append("technical claims")
            contributions.append("innovation details")
        elif tech.source_type == "contract":
            contributions.append("government interest")
            contributions.append("deployment status")
        elif tech.source_type == "news":
            contributions.append("recent developments")
        else:
            contributions.append("general information")

        if tech.developer:
            contributions.append("developer info")

        return ", ".join(contributions)

    def _string_similarity(self, s1: str, s2: str) -> float:
        """Calculate string similarity using token-based comparison.

        Normalizes case/whitespace, drops a small stopword set, then takes
        the Jaccard similarity of the remaining tokens. Returns 1.0 for
        normalized-equal strings and 0.0 when either input is empty.
        """
        if not s1 or not s2:
            return 0.0

        # Normalize
        s1 = s1.lower().strip()
        s2 = s2.lower().strip()

        if s1 == s2:
            return 1.0

        # Token-based comparison
        tokens1 = set(s1.split())
        tokens2 = set(s2.split())

        # Remove common words so they don't inflate similarity
        stopwords = {'the', 'a', 'an', 'for', 'of', 'and', 'in', 'to', 'with'}
        tokens1 -= stopwords
        tokens2 -= stopwords

        if not tokens1 or not tokens2:
            return 0.0

        # Jaccard similarity of tokens
        intersection = len(tokens1 & tokens2)
        union = len(tokens1 | tokens2)

        return intersection / union if union > 0 else 0.0

    def _jaccard_similarity(self, set1: Set[str], set2: Set[str]) -> float:
        """Calculate Jaccard similarity between two sets (0.0 if either empty)."""
        if not set1 or not set2:
            return 0.0

        intersection = len(set1 & set2)
        union = len(set1 | set2)

        return intersection / union if union > 0 else 0.0

    def _extract_keywords(self, texts: List[str]) -> List[str]:
        """Extract keywords from a list of texts.

        Lowercases, strips surrounding punctuation, and drops stopwords and
        words of <= 2 characters. None/empty entries are skipped. Duplicates
        are kept; callers that need uniqueness wrap the result in set().
        """
        stopwords = {
            'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
            'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
            'would', 'could', 'should', 'may', 'might', 'must', 'shall',
            'can', 'need', 'for', 'of', 'to', 'in', 'on', 'at', 'by',
            'from', 'with', 'and', 'or', 'not', 'that', 'this', 'it',
            'as', 'its', 'also', 'than', 'such', 'into', 'which'
        }

        keywords = []
        for text in texts:
            if not text:
                continue
            words = text.lower().split()
            for word in words:
                word = word.strip('.,;:!?()[]{}"\'-')
                if word and len(word) > 2 and word not in stopwords:
                    keywords.append(word)

        return keywords