"""
Technology Grouper for TechScout.

Groups duplicate technologies across sources into single entities.
Uses conservative heuristic matching to avoid false merges.

This is Stage 4 of the Capability-Technology Matching pipeline.
"""

import logging
import uuid
from typing import List, Dict, Set, Tuple, Optional
from dataclasses import dataclass
from collections import defaultdict

from .types import ExtractedTechnology, GroupedTechnology, SourceEvidence, Developer

logger = logging.getLogger(__name__)


@dataclass
class SimilarityScore:
    """Pairwise similarity signals computed for two extracted technologies.

    Built by ``TechnologyGrouper._calculate_similarity`` and consumed by
    ``TechnologyGrouper._should_merge`` to decide whether two records
    describe the same technology.
    """
    name_similarity: float        # 0-1, token-set Jaccard similarity of the two names
    developer_match: bool         # True when both developers are set and are equal, similar, or one contains the other
    capability_overlap: float     # 0-1, Jaccard similarity of capability keywords
    description_similarity: float # 0-1, Jaccard similarity of description keywords
    overall: float                # Weighted blend of the three numeric scores above


class TechnologyGrouper:
    """
    Groups extracted technologies that describe the same underlying technology.

    Uses conservative matching:
    - Same developer + similar name → definite merge
    - Very similar name + similar capabilities → likely merge
    - Otherwise → keep separate (avoid false merges)
    """

    # Thresholds for grouping decisions. _should_merge compares each of them
    # with a strict ">" so a score exactly at a threshold does not qualify.
    # Name similarity above which two technologies merge on name alone.
    HIGH_CONFIDENCE_NAME_THRESHOLD = 0.85
    # Name similarity that permits a merge when capability overlap is also high enough.
    MEDIUM_CONFIDENCE_NAME_THRESHOLD = 0.70
    # Capability overlap required alongside the medium name threshold.
    CAPABILITY_OVERLAP_THRESHOLD = 0.40
    # Name similarity that is sufficient when the developers also match.
    SAME_DEVELOPER_NAME_THRESHOLD = 0.65

    def __init__(self):
        pass

    def group(self, technologies: List[ExtractedTechnology]) -> List[GroupedTechnology]:
        """
        Group extracted technologies into unique technologies.

        Args:
            technologies: List of extracted technologies (may have duplicates)

        Returns:
            List of grouped technologies (deduplicated)
        """
        if not technologies:
            return []

        logger.info(f"Grouping {len(technologies)} extracted technologies...")

        # Build groups using union-find-like approach
        groups: Dict[str, List[ExtractedTechnology]] = {}
        tech_to_group: Dict[str, str] = {}

        for tech in technologies:
            # Find if this tech should merge with an existing group
            merge_group_id = self._find_merge_candidate(tech, groups, tech_to_group)

            if merge_group_id:
                # Merge into existing group
                groups[merge_group_id].append(tech)
                tech_to_group[tech.id] = merge_group_id
            else:
                # Create new group
                group_id = tech.id
                groups[group_id] = [tech]
                tech_to_group[tech.id] = group_id

        # Convert groups to GroupedTechnology objects
        grouped_technologies = []
        for group_id, tech_list in groups.items():
            grouped = self._merge_group(tech_list)
            grouped_technologies.append(grouped)

        logger.info(f"Grouped into {len(grouped_technologies)} unique technologies")

        return grouped_technologies

    def _find_merge_candidate(
        self,
        tech: ExtractedTechnology,
        groups: Dict[str, List[ExtractedTechnology]],
        tech_to_group: Dict[str, str]
    ) -> Optional[str]:
        """Find an existing group to merge this technology into."""
        best_match_group = None
        best_match_score = 0.0

        for group_id, group_techs in groups.items():
            # Compare against the first (canonical) tech in the group
            representative = group_techs[0]

            similarity = self._calculate_similarity(tech, representative)
            should_merge, confidence = self._should_merge(similarity)

            if should_merge and confidence > best_match_score:
                best_match_group = group_id
                best_match_score = confidence

        return best_match_group

    def _calculate_similarity(
        self,
        tech1: ExtractedTechnology,
        tech2: ExtractedTechnology
    ) -> SimilarityScore:
        """Calculate similarity between two technologies."""
        # Name similarity (fuzzy string match)
        name_sim = self._string_similarity(tech1.name, tech2.name)

        # Developer match
        dev1 = (tech1.developer or "").lower().strip()
        dev2 = (tech2.developer or "").lower().strip()
        developer_match = False
        if dev1 and dev2:
            developer_match = (
                dev1 == dev2 or
                self._string_similarity(dev1, dev2) > 0.8 or
                dev1 in dev2 or
                dev2 in dev1
            )

        # Capability overlap (Jaccard similarity)
        caps1 = set(self._extract_keywords(tech1.capabilities))
        caps2 = set(self._extract_keywords(tech2.capabilities))
        cap_overlap = self._jaccard_similarity(caps1, caps2)

        # Description similarity (keyword overlap)
        desc1_words = set(self._extract_keywords([tech1.description]))
        desc2_words = set(self._extract_keywords([tech2.description]))
        desc_sim = self._jaccard_similarity(desc1_words, desc2_words)

        # Overall score
        if developer_match:
            overall = 0.5 * name_sim + 0.3 * cap_overlap + 0.2 * desc_sim
        else:
            overall = 0.6 * name_sim + 0.25 * cap_overlap + 0.15 * desc_sim

        return SimilarityScore(
            name_similarity=name_sim,
            developer_match=developer_match,
            capability_overlap=cap_overlap,
            description_similarity=desc_sim,
            overall=overall
        )

    def _should_merge(self, similarity: SimilarityScore) -> Tuple[bool, float]:
        """
        Decide whether to merge based on similarity scores.

        Returns (should_merge, confidence)
        """
        # High confidence: Same developer + reasonably similar name
        if similarity.developer_match and similarity.name_similarity > self.SAME_DEVELOPER_NAME_THRESHOLD:
            return True, 0.9

        # High confidence: Very similar names
        if similarity.name_similarity > self.HIGH_CONFIDENCE_NAME_THRESHOLD:
            return True, 0.85

        # Medium confidence: Similar name + good capability overlap
        if (similarity.name_similarity > self.MEDIUM_CONFIDENCE_NAME_THRESHOLD and
            similarity.capability_overlap > self.CAPABILITY_OVERLAP_THRESHOLD):
            return True, 0.7

        # Same developer but different names - might be related but keep separate
        if similarity.developer_match and similarity.name_similarity > 0.5:
            # Only merge if capabilities strongly overlap
            if similarity.capability_overlap > 0.6:
                return True, 0.6

        return False, 0.0

    def _merge_group(self, technologies: List[ExtractedTechnology]) -> GroupedTechnology:
        """Merge multiple extracted technologies into one grouped technology."""
        if len(technologies) == 1:
            tech = technologies[0]
            return self._single_to_grouped(tech)

        # Choose canonical name (most specific/complete)
        canonical_name = self._choose_canonical_name([t.name for t in technologies])

        # Collect alternate names
        alternate_names = list(set(
            t.name for t in technologies if t.name != canonical_name
        ))

        # Choose best technology type
        type_counts = defaultdict(int)
        for t in technologies:
            type_counts[t.technology_type] += 1
        technology_type = max(type_counts, key=type_counts.get) if type_counts else "system"

        # Merge descriptions (choose longest/most detailed)
        descriptions = [t.description for t in technologies if t.description]
        description = max(descriptions, key=len) if descriptions else ""

        # Merge capabilities (deduplicate)
        all_capabilities = []
        seen_caps = set()
        for t in technologies:
            for cap in t.capabilities:
                cap_lower = cap.lower()
                if cap_lower not in seen_caps:
                    seen_caps.add(cap_lower)
                    all_capabilities.append(cap)

        # Merge mechanisms
        mechanisms = [t.mechanism for t in technologies if t.mechanism]
        mechanism = max(mechanisms, key=len) if mechanisms else None

        # Collect developers
        developers = []
        seen_devs = set()
        for t in technologies:
            if t.developer and t.developer.lower() not in seen_devs:
                seen_devs.add(t.developer.lower())
                developers.append(Developer(
                    name=t.developer,
                    type=t.developer_type or "unknown"
                ))

        # Best TRL estimate (highest confidence based on source type)
        trl_estimate = None
        trl_confidence = 0.0
        trl_evidence = []
        for t in technologies:
            if t.trl_estimate:
                # SBIR/patent sources have higher confidence
                conf = 0.9 if t.source_type in ("sbir", "patent") else 0.7
                if conf > trl_confidence:
                    trl_estimate = t.trl_estimate
                    trl_confidence = conf
            trl_evidence.extend(t.trl_evidence)
        trl_evidence = list(set(trl_evidence))

        # Build source evidence
        sources = []
        for t in technologies:
            sources.append(SourceEvidence(
                source_type=t.source_type,
                source_name=t.source_type.upper(),
                title=t.source_title,
                url=t.source_url,
                snippet=t.source_snippet,
                contribution=self._determine_contribution(t),
            ))

        return GroupedTechnology(
            id=str(uuid.uuid4())[:8],
            canonical_name=canonical_name,
            alternate_names=alternate_names,
            technology_type=technology_type,
            description=description,
            capabilities=all_capabilities,
            mechanism=mechanism,
            developers=developers,
            trl_estimate=trl_estimate,
            trl_confidence=trl_confidence,
            trl_evidence=trl_evidence,
            sources=sources,
            source_count=len(sources),
            grouping_confidence=0.8 if len(technologies) > 1 else 1.0,
            grouped_from=[t.id for t in technologies],
        )

    def _single_to_grouped(self, tech: ExtractedTechnology) -> GroupedTechnology:
        """Convert a single extracted technology to grouped format."""
        developers = []
        if tech.developer:
            developers.append(Developer(
                name=tech.developer,
                type=tech.developer_type or "unknown"
            ))

        source = SourceEvidence(
            source_type=tech.source_type,
            source_name=tech.source_type.upper(),
            title=tech.source_title,
            url=tech.source_url,
            snippet=tech.source_snippet,
            contribution=self._determine_contribution(tech),
        )

        return GroupedTechnology(
            id=tech.id,
            canonical_name=tech.name,
            alternate_names=[],
            technology_type=tech.technology_type,
            description=tech.description,
            capabilities=tech.capabilities,
            mechanism=tech.mechanism,
            developers=developers,
            trl_estimate=tech.trl_estimate,
            trl_confidence=0.7,
            trl_evidence=tech.trl_evidence,
            sources=[source],
            source_count=1,
            grouping_confidence=1.0,
            grouped_from=[tech.id],
        )

    def _choose_canonical_name(self, names: List[str]) -> str:
        """Choose the most specific/complete name as canonical."""
        if not names:
            return "Unknown Technology"

        # Prefer longer, more specific names
        # But penalize overly long names (likely full titles, not tech names)
        def name_score(name: str) -> float:
            length = len(name)
            if length < 5:
                return 0.3
            elif length > 100:
                return 0.4
            elif length > 50:
                return 0.6
            else:
                return 0.8 + (length / 100)

        return max(names, key=name_score)

    def _determine_contribution(self, tech: ExtractedTechnology) -> str:
        """Determine what this source contributes to the technology profile."""
        contributions = []

        if tech.source_type == "sbir":
            contributions.append("R&D funding")
            contributions.append("development status")
        elif tech.source_type == "patent":
            contributions.append("technical claims")
            contributions.append("innovation details")
        elif tech.source_type == "contract":
            contributions.append("government interest")
            contributions.append("deployment status")
        elif tech.source_type == "news":
            contributions.append("recent developments")
        else:
            contributions.append("general information")

        if tech.developer:
            contributions.append("developer info")

        return ", ".join(contributions)

    def _string_similarity(self, s1: str, s2: str) -> float:
        """Calculate string similarity using token-based comparison."""
        if not s1 or not s2:
            return 0.0

        # Normalize
        s1 = s1.lower().strip()
        s2 = s2.lower().strip()

        if s1 == s2:
            return 1.0

        # Token-based comparison
        tokens1 = set(s1.split())
        tokens2 = set(s2.split())

        # Remove common words
        stopwords = {'the', 'a', 'an', 'for', 'of', 'and', 'in', 'to', 'with'}
        tokens1 -= stopwords
        tokens2 -= stopwords

        if not tokens1 or not tokens2:
            return 0.0

        # Jaccard similarity of tokens
        intersection = len(tokens1 & tokens2)
        union = len(tokens1 | tokens2)

        return intersection / union if union > 0 else 0.0

    def _jaccard_similarity(self, set1: Set[str], set2: Set[str]) -> float:
        """Calculate Jaccard similarity between two sets."""
        if not set1 or not set2:
            return 0.0

        intersection = len(set1 & set2)
        union = len(set1 | set2)

        return intersection / union if union > 0 else 0.0

    def _extract_keywords(self, texts: List[str]) -> List[str]:
        """Extract keywords from a list of texts."""
        stopwords = {
            'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
            'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
            'would', 'could', 'should', 'may', 'might', 'must', 'shall',
            'can', 'need', 'for', 'of', 'to', 'in', 'on', 'at', 'by',
            'from', 'with', 'and', 'or', 'not', 'that', 'this', 'it',
            'as', 'its', 'also', 'than', 'such', 'into', 'which'
        }

        keywords = []
        for text in texts:
            if not text:
                continue
            words = text.lower().split()
            for word in words:
                word = word.strip('.,;:!?()[]{}"\'-')
                if word and len(word) > 2 and word not in stopwords:
                    keywords.append(word)

        return keywords