# TechScout/techscout/pipeline/deep_dive.py

"""
Phase 2: Deep Dive Pipeline

Takes selected technology candidates and performs comprehensive
company and technology analysis.
"""

import json
import logging
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional
import uuid

from ..config import TechScoutConfig, config as default_config
from ..extraction.llm_client import OllamaClient
from ..search.web import WebSearcher
from ..search.base import SearchResult
from ..sources.sbir import SBIRSearcher
from ..sources.patents import PatentSearcher
from ..sources.contracts import ContractSearcher

logger = logging.getLogger(__name__)


@dataclass
class CompanyProfile:
    """Background facts collected about one company.

    ``name`` and ``description`` are mandatory. The remaining fields default
    to ``None``, except ``leadership`` which defaults to a fresh empty list.
    """
    name: str
    description: str
    website: Optional[str] = None
    headquarters: Optional[str] = None
    employee_count: Optional[str] = None
    founded: Optional[str] = None
    leadership: List[Dict[str, str]] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the profile as a plain dict keyed by field name.

        Values are passed through as-is (no copying), in declaration order.
        """
        exported = (
            "name",
            "description",
            "website",
            "headquarters",
            "employee_count",
            "founded",
            "leadership",
        )
        return {key: getattr(self, key) for key in exported}


@dataclass
class TechnologyProfile:
    """Technical description of a single technology offering.

    All fields except ``related_patents`` are required; ``related_patents``
    defaults to a fresh empty list.
    """
    name: str
    description: str
    technical_approach: str
    trl_assessment: int
    key_capabilities: List[str]
    limitations: List[str]
    competitive_advantage: str
    related_patents: List[Dict[str, Any]] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the profile as a plain dict keyed by field name."""
        # Narrative fields first, then the rating, then the list-like details;
        # this matches the declaration order of the fields.
        summary: Dict[str, Any] = {
            "name": self.name,
            "description": self.description,
            "technical_approach": self.technical_approach,
        }
        summary["trl_assessment"] = self.trl_assessment
        summary.update(
            key_capabilities=self.key_capabilities,
            limitations=self.limitations,
            competitive_advantage=self.competitive_advantage,
            related_patents=self.related_patents,
        )
        return summary


@dataclass
class ContractHistory:
    """Summary of a company's federal contracts and SBIR awards."""
    total_contracts: int
    total_value: float
    primary_agencies: List[str]
    recent_contracts: List[Dict[str, Any]]
    sbir_awards: List[Dict[str, Any]]

    def to_dict(self) -> Dict[str, Any]:
        """Return the history as a plain dict keyed by field name."""
        return dict(
            total_contracts=self.total_contracts,
            total_value=self.total_value,
            primary_agencies=self.primary_agencies,
            recent_contracts=self.recent_contracts,
            sbir_awards=self.sbir_awards,
        )


@dataclass
class DeepDiveResult:
    """Everything produced by one Phase 2 deep dive.

    Holds the three structured profiles, the raw offering and news hits,
    the LLM-written assessment, and bookkeeping (ids, timestamp, duration,
    success flag and optional error text).
    """
    id: str
    candidate_id: str
    organization: str
    timestamp: str
    company_profile: CompanyProfile
    technology_profile: TechnologyProfile
    contract_history: ContractHistory
    other_offerings: List[Dict[str, Any]]
    news_mentions: List[Dict[str, Any]]
    assessment: str  # LLM-generated overall assessment
    risk_factors: List[str]
    recommendation: str
    duration_seconds: float
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return a dict view of the result.

        The three nested profiles are expanded through their own
        ``to_dict`` methods; every other field is passed through unchanged.
        """
        payload: Dict[str, Any] = {
            key: getattr(self, key)
            for key in ("id", "candidate_id", "organization", "timestamp")
        }

        # Nested profiles know how to serialize themselves.
        payload["company_profile"] = self.company_profile.to_dict()
        payload["technology_profile"] = self.technology_profile.to_dict()
        payload["contract_history"] = self.contract_history.to_dict()

        trailing = (
            "other_offerings",
            "news_mentions",
            "assessment",
            "risk_factors",
            "recommendation",
            "duration_seconds",
            "success",
            "error",
        )
        for key in trailing:
            payload[key] = getattr(self, key)
        return payload

    def save(self, path: Path):
        """Write the result to ``path`` as JSON indented by two spaces."""
        with open(path, "w") as handle:
            json.dump(self.to_dict(), handle, indent=2)


class DeepDivePipeline:
    """
    Phase 2 Deep Dive Pipeline.

    1. Research company background
    2. Analyze technology capabilities (including related patents)
    3. Pull contract history and SBIR awards
    4. Find other company offerings
    5. Search for news/press
    6. Generate comprehensive assessment
    """

    def __init__(
        self,
        config: Optional[TechScoutConfig] = None,
        model: str = "mistral-nemo:12b"
    ):
        self.config = config or default_config
        self.model = model

        # Initialize components
        self.llm_client = OllamaClient(
            base_url=self.config.ollama.base_url,
            default_model=model
        )

        # Initialize searchers
        self.web_searcher = WebSearcher()
        self.sbir_searcher = SBIRSearcher()
        self.patent_searcher = PatentSearcher()
        self.contract_searcher = ContractSearcher()

    def deep_dive(
        self,
        organization: str,
        technology_context: str,
        capability_gap: str,
        candidate_id: str = ""
    ) -> DeepDiveResult:
        """
        Run Phase 2 deep dive on a company/technology.

        Args:
            organization: Company name
            technology_context: Description of the technology
            capability_gap: Original capability gap
            candidate_id: ID from Phase 1 candidate

        Returns:
            DeepDiveResult with comprehensive analysis
        """
        start_time = datetime.now()
        result_id = str(uuid.uuid4())[:8]

        logger.info(f"Starting deep dive on: {organization}")

        # 1. Research company profile
        logger.info("Researching company profile...")
        company_profile = self._research_company(organization)

        # 2. Analyze technology
        logger.info("Analyzing technology...")
        tech_profile = self._analyze_technology(
            organization, technology_context, capability_gap
        )

        # 3. Get contract history
        logger.info("Fetching contract history...")
        contract_history = self._get_contract_history(organization)

        # 4. Find other offerings
        logger.info("Finding other company offerings...")
        other_offerings = self._find_other_offerings(organization)

        # 5. Search news
        logger.info("Searching news mentions...")
        news_mentions = self._search_news(organization)

        # 6. Generate assessment
        logger.info("Generating assessment...")
        assessment, risk_factors, recommendation = self._generate_assessment(
            organization,
            technology_context,
            capability_gap,
            company_profile,
            tech_profile,
            contract_history
        )

        duration = (datetime.now() - start_time).total_seconds()

        result = DeepDiveResult(
            id=result_id,
            candidate_id=candidate_id,
            organization=organization,
            timestamp=datetime.now().isoformat(),
            company_profile=company_profile,
            technology_profile=tech_profile,
            contract_history=contract_history,
            other_offerings=other_offerings,
            news_mentions=news_mentions,
            assessment=assessment,
            risk_factors=risk_factors,
            recommendation=recommendation,
            duration_seconds=duration,
            success=True
        )

        # Save result
        save_path = self.config.analyses_dir / f"deepdive_{result_id}.json"
        result.save(save_path)
        logger.info(f"Saved deep dive result to {save_path}")

        return result

    def _research_company(self, organization: str) -> CompanyProfile:
        """Research company background."""
        # Search web for company info
        results = self.web_searcher.search(
            f'"{organization}" company about founded headquarters',
            max_results=5
        )

        # Build context from results
        context = "\n".join([f"- {r.title}: {r.snippet}" for r in results])

        # Use LLM to extract structured info
        prompt = f"""Extract company information from these search results.

Company: {organization}

Search Results:
{context}

Respond with JSON:
{{
    "description": "Brief company description",
    "website": "company website if found",
    "headquarters": "location if found",
    "employee_count": "approximate if found",
    "founded": "year if found",
    "leadership": [
        {{"name": "CEO Name", "title": "CEO"}},
        {{"name": "CTO Name", "title": "CTO"}}
    ]
}}

Only include information explicitly found. Use null for unknown fields."""

        response = self.llm_client.generate(
            prompt=prompt,
            temperature=0.1,
            format="json"
        )

        if response.success:
            try:
                data = json.loads(response.content)
                return CompanyProfile(
                    name=organization,
                    description=data.get("description", ""),
                    website=data.get("website"),
                    headquarters=data.get("headquarters"),
                    employee_count=data.get("employee_count"),
                    founded=data.get("founded"),
                    leadership=data.get("leadership", [])
                )
            except json.JSONDecodeError:
                pass

        return CompanyProfile(name=organization, description="Information not available")

    def _analyze_technology(
        self,
        organization: str,
        technology_context: str,
        capability_gap: str
    ) -> TechnologyProfile:
        """Analyze the technology in depth."""
        # Search for patents by this organization
        patents = self.patent_searcher.search(
            capability_gap.split()[0],  # First keyword
            max_results=5,
            assignee=organization
        )

        patent_context = "\n".join([
            f"- {p.title}" for p in patents
        ]) if patents else "No patents found"

        prompt = f"""Analyze this technology's capabilities.

Company: {organization}
Technology Context: {technology_context}
Capability Gap: {capability_gap}
Related Patents: {patent_context}

Provide a technical assessment as JSON:
{{
    "name": "Technology name",
    "description": "Detailed technical description",
    "technical_approach": "How the technology works",
    "trl_assessment": 1-9,
    "key_capabilities": ["capability 1", "capability 2"],
    "limitations": ["limitation 1", "limitation 2"],
    "competitive_advantage": "What makes this approach unique"
}}"""

        response = self.llm_client.generate(
            prompt=prompt,
            temperature=0.2,
            format="json"
        )

        related_patents = [
            {"title": p.title, "number": p.patent_number, "url": p.url}
            for p in patents
        ]

        if response.success:
            try:
                data = json.loads(response.content)
                return TechnologyProfile(
                    name=data.get("name", "Unknown"),
                    description=data.get("description", ""),
                    technical_approach=data.get("technical_approach", ""),
                    trl_assessment=data.get("trl_assessment", 5),
                    key_capabilities=data.get("key_capabilities", []),
                    limitations=data.get("limitations", []),
                    competitive_advantage=data.get("competitive_advantage", ""),
                    related_patents=related_patents
                )
            except json.JSONDecodeError:
                pass

        return TechnologyProfile(
            name="Unknown",
            description=technology_context,
            technical_approach="",
            trl_assessment=5,
            key_capabilities=[],
            limitations=[],
            competitive_advantage="",
            related_patents=related_patents
        )

    def _get_contract_history(self, organization: str) -> ContractHistory:
        """Get federal contract history."""
        # Get contracts
        contracts = self.contract_searcher.get_company_contracts(organization, max_results=20)

        # Get SBIR awards
        sbir_results = self.sbir_searcher.search(f'"{organization}"', max_results=10)

        # Calculate totals
        total_value = sum(c.award_amount or 0 for c in contracts)

        # Get agency breakdown
        agencies = {}
        for c in contracts:
            agency = c.raw_data.get("Awarding Agency", "Unknown")
            agencies[agency] = agencies.get(agency, 0) + 1

        primary_agencies = sorted(agencies.keys(), key=lambda x: agencies[x], reverse=True)[:5]

        return ContractHistory(
            total_contracts=len(contracts),
            total_value=total_value,
            primary_agencies=primary_agencies,
            recent_contracts=[
                {
                    "title": c.title,
                    "amount": c.award_amount,
                    "date": c.published_date,
                    "agency": c.raw_data.get("Awarding Agency"),
                    "url": c.url
                }
                for c in contracts[:10]
            ],
            sbir_awards=[
                {
                    "title": s.title,
                    "amount": s.award_amount,
                    "phase": s.raw_data.get("phase"),
                    "year": s.published_date,
                    "url": s.url
                }
                for s in sbir_results
            ]
        )

    def _find_other_offerings(self, organization: str) -> List[Dict[str, Any]]:
        """Find other products/services from this company."""
        results = self.web_searcher.search(
            f'"{organization}" products services solutions offerings',
            max_results=10
        )

        offerings = []
        for r in results:
            offerings.append({
                "title": r.title,
                "description": r.snippet,
                "url": r.url
            })

        return offerings

    def _search_news(self, organization: str) -> List[Dict[str, Any]]:
        """Search recent news about the company."""
        results = self.web_searcher.search(
            f'"{organization}"',
            max_results=10,
            news_only=True,
            time_filter="y"  # Last year
        )

        news = []
        for r in results:
            news.append({
                "title": r.title,
                "snippet": r.snippet,
                "date": r.published_date,
                "url": r.url
            })

        return news

    def _generate_assessment(
        self,
        organization: str,
        technology_context: str,
        capability_gap: str,
        company: CompanyProfile,
        tech: TechnologyProfile,
        contracts: ContractHistory
    ) -> tuple:
        """Generate overall assessment using LLM."""
        prompt = f"""Generate an investment/partnership assessment.

CAPABILITY GAP:
{capability_gap}

COMPANY: {organization}
{company.description}
Headquarters: {company.headquarters or 'Unknown'}
Founded: {company.founded or 'Unknown'}

TECHNOLOGY:
{tech.description}
TRL: {tech.trl_assessment}
Approach: {tech.technical_approach}

CONTRACT HISTORY:
Total Contracts: {contracts.total_contracts}
Total Value: ${contracts.total_value:,.0f}
Primary Agencies: {', '.join(contracts.primary_agencies[:3])}
SBIR Awards: {len(contracts.sbir_awards)}

Provide assessment as JSON:
{{
    "assessment": "2-3 paragraph overall assessment of fit for the capability gap",
    "risk_factors": ["risk 1", "risk 2", "risk 3"],
    "recommendation": "STRONGLY RECOMMEND | RECOMMEND | CONSIDER | DO NOT RECOMMEND",
    "recommendation_rationale": "Brief explanation"
}}"""

        response = self.llm_client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.3,
            format="json"
        )

        if response.success:
            try:
                data = json.loads(response.content)
                return (
                    data.get("assessment", "Assessment not available"),
                    data.get("risk_factors", []),
                    f"{data.get('recommendation', 'CONSIDER')}: {data.get('recommendation_rationale', '')}"
                )
            except json.JSONDecodeError:
                pass

        return ("Assessment generation failed", [], "CONSIDER: Insufficient data")