"""
Phase 2: Deep Dive Pipeline

Takes selected technology candidates and performs comprehensive
company and technology analysis.
"""

import json
import logging
import uuid
from collections import Counter
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

from ..config import TechScoutConfig, config as default_config
from ..extraction.llm_client import OllamaClient
from ..search.web import WebSearcher
from ..search.base import SearchResult
from ..sources.sbir import SBIRSearcher
from ..sources.patents import PatentSearcher
from ..sources.contracts import ContractSearcher

logger = logging.getLogger(__name__)

# TRL used whenever the LLM gives no usable value.
_DEFAULT_TRL = 5
_MIN_TRL = 1
_MAX_TRL = 9


def _parse_json_object(content: Any, purpose: str) -> Optional[Dict[str, Any]]:
    """Parse *content* as a JSON object, returning ``None`` on any failure.

    Args:
        content: Raw text returned by the LLM.
        purpose: Short label used in the warning log message.

    Returns:
        The decoded dict, or ``None`` if the text is not valid JSON or the
        top-level JSON value is not an object.
    """
    try:
        data = json.loads(content)
    except (json.JSONDecodeError, TypeError) as exc:
        # TypeError covers a missing (None) response body.
        logger.warning("Could not parse LLM JSON for %s: %s", purpose, exc)
        return None
    if not isinstance(data, dict):
        logger.warning(
            "LLM JSON for %s was a %s, expected an object",
            purpose,
            type(data).__name__,
        )
        return None
    return data


def _as_list(value: Any) -> List[Any]:
    """Return *value* if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _coerce_trl(value: Any, default: int = _DEFAULT_TRL) -> int:
    """Convert an LLM-supplied TRL to an int clamped to the 1-9 range.

    Args:
        value: Whatever the model returned for ``trl_assessment``.
        default: Value used when *value* cannot be read as a number.

    Returns:
        An integer between 1 and 9 inclusive.
    """
    if isinstance(value, bool):
        # bool is an int subclass; True/False is not a meaningful TRL.
        return default
    try:
        trl = int(float(value))
    except (TypeError, ValueError, OverflowError):
        return default
    return max(_MIN_TRL, min(_MAX_TRL, trl))


@dataclass
class CompanyProfile:
    """Comprehensive company profile."""

    name: str
    description: str
    website: Optional[str] = None
    headquarters: Optional[str] = None
    employee_count: Optional[str] = None
    founded: Optional[str] = None
    # Entries are dicts of the form {"name": ..., "title": ...}.
    leadership: List[Dict[str, str]] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the profile as a plain dict keyed by field name."""
        return {
            "name": self.name,
            "description": self.description,
            "website": self.website,
            "headquarters": self.headquarters,
            "employee_count": self.employee_count,
            "founded": self.founded,
            "leadership": self.leadership,
        }


@dataclass
class TechnologyProfile:
    """Detailed technology profile."""

    name: str
    description: str
    technical_approach: str
    trl_assessment: int
    key_capabilities: List[str]
    limitations: List[str]
    competitive_advantage: str
    # Entries are dicts with "title", "number" and "url" keys.
    related_patents: List[Dict[str, Any]] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the profile as a plain dict keyed by field name."""
        return {
            "name": self.name,
            "description": self.description,
            "technical_approach": self.technical_approach,
            "trl_assessment": self.trl_assessment,
            "key_capabilities": self.key_capabilities,
            "limitations": self.limitations,
            "competitive_advantage": self.competitive_advantage,
            "related_patents": self.related_patents,
        }


@dataclass
class ContractHistory:
    """Company's federal contract history."""

    total_contracts: int
    total_value: float
    primary_agencies: List[str]
    recent_contracts: List[Dict[str, Any]]
    sbir_awards: List[Dict[str, Any]]

    def to_dict(self) -> Dict[str, Any]:
        """Return the history as a plain dict keyed by field name."""
        return {
            "total_contracts": self.total_contracts,
            "total_value": self.total_value,
            "primary_agencies": self.primary_agencies,
            "recent_contracts": self.recent_contracts,
            "sbir_awards": self.sbir_awards,
        }


@dataclass
class DeepDiveResult:
    """Result of Phase 2 deep dive."""

    id: str
    candidate_id: str
    organization: str
    timestamp: str
    company_profile: CompanyProfile
    technology_profile: TechnologyProfile
    contract_history: ContractHistory
    other_offerings: List[Dict[str, Any]]
    news_mentions: List[Dict[str, Any]]
    assessment: str  # LLM-generated overall assessment
    risk_factors: List[str]
    recommendation: str
    duration_seconds: float
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a dict, with nested profiles converted too."""
        return {
            "id": self.id,
            "candidate_id": self.candidate_id,
            "organization": self.organization,
            "timestamp": self.timestamp,
            "company_profile": self.company_profile.to_dict(),
            "technology_profile": self.technology_profile.to_dict(),
            "contract_history": self.contract_history.to_dict(),
            "other_offerings": self.other_offerings,
            "news_mentions": self.news_mentions,
            "assessment": self.assessment,
            "risk_factors": self.risk_factors,
            "recommendation": self.recommendation,
            "duration_seconds": self.duration_seconds,
            "success": self.success,
            "error": self.error,
        }

    def save(self, path: Path):
        """Save result to JSON file.

        Missing parent directories are created so a fresh output location
        does not make the write fail.

        Args:
            path: Destination file path.
        """
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        with target.open("w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2)


class DeepDivePipeline:
    """
    Phase 2 Deep Dive Pipeline.

    1. Research company background
    2. Analyze technology capabilities (including related patents)
    3. Pull contract history
    4. Find other company offerings
    5. Search for news/press
    6. Generate comprehensive assessment
    """

    def __init__(
        self,
        config: Optional[TechScoutConfig] = None,
        model: str = "mistral-nemo:12b"
    ):
        """Create the LLM client and the searchers used by the pipeline.

        Args:
            config: Pipeline configuration; the module default is used when
                omitted.
            model: Model name handed to the LLM client as its default.
        """
        self.config = config or default_config
        self.model = model

        # Initialize components
        self.llm_client = OllamaClient(
            base_url=self.config.ollama.base_url,
            default_model=model
        )

        # Initialize searchers
        self.web_searcher = WebSearcher()
        self.sbir_searcher = SBIRSearcher()
        self.patent_searcher = PatentSearcher()
        self.contract_searcher = ContractSearcher()

    def deep_dive(
        self,
        organization: str,
        technology_context: str,
        capability_gap: str,
        candidate_id: str = ""
    ) -> DeepDiveResult:
        """
        Run Phase 2 deep dive on a company/technology.

        The result is also written as JSON to
        ``<config.analyses_dir>/deepdive_<id>.json``. Exceptions raised by
        the searchers, the LLM client or the file write are not caught here.

        Args:
            organization: Company name
            technology_context: Description of the technology
            capability_gap: Original capability gap
            candidate_id: ID from Phase 1 candidate

        Returns:
            DeepDiveResult with comprehensive analysis
        """
        start_time = datetime.now()
        result_id = str(uuid.uuid4())[:8]

        logger.info("Starting deep dive on: %s", organization)

        # 1. Research company profile
        logger.info("Researching company profile...")
        company_profile = self._research_company(organization)

        # 2. Analyze technology
        logger.info("Analyzing technology...")
        tech_profile = self._analyze_technology(
            organization, technology_context, capability_gap
        )

        # 3. Get contract history
        logger.info("Fetching contract history...")
        contract_history = self._get_contract_history(organization)

        # 4. Find other offerings
        logger.info("Finding other company offerings...")
        other_offerings = self._find_other_offerings(organization)

        # 5. Search news
        logger.info("Searching news mentions...")
        news_mentions = self._search_news(organization)

        # 6. Generate assessment
        logger.info("Generating assessment...")
        assessment, risk_factors, recommendation = self._generate_assessment(
            organization,
            technology_context,
            capability_gap,
            company_profile,
            tech_profile,
            contract_history
        )

        duration = (datetime.now() - start_time).total_seconds()

        result = DeepDiveResult(
            id=result_id,
            candidate_id=candidate_id,
            organization=organization,
            timestamp=datetime.now().isoformat(),
            company_profile=company_profile,
            technology_profile=tech_profile,
            contract_history=contract_history,
            other_offerings=other_offerings,
            news_mentions=news_mentions,
            assessment=assessment,
            risk_factors=risk_factors,
            recommendation=recommendation,
            duration_seconds=duration,
            success=True
        )

        # Save result
        save_path = Path(self.config.analyses_dir) / f"deepdive_{result_id}.json"
        result.save(save_path)
        logger.info("Saved deep dive result to %s", save_path)

        return result

    def _research_company(self, organization: str) -> CompanyProfile:
        """Research company background.

        Runs a web search for the company and asks the LLM to extract
        structured fields from the result titles and snippets.

        Args:
            organization: Company name.

        Returns:
            A CompanyProfile. If the LLM call fails or its reply is not a
            JSON object, a placeholder profile with the description
            "Information not available" is returned.
        """
        # Search web for company info
        results = self.web_searcher.search(
            f'"{organization}" company about founded headquarters',
            max_results=5
        )

        # Build context from results
        context = "\n".join(f"- {r.title}: {r.snippet}" for r in results)

        # Use LLM to extract structured info
        prompt = f"""Extract company information from these search results.

Company: {organization}

Search Results:
{context}

Respond with JSON:
{{
    "description": "Brief company description",
    "website": "company website if found",
    "headquarters": "location if found",
    "employee_count": "approximate if found",
    "founded": "year if found",
    "leadership": [
        {{"name": "CEO Name", "title": "CEO"}},
        {{"name": "CTO Name", "title": "CTO"}}
    ]
}}

Only include information explicitly found. Use null for unknown fields."""

        response = self.llm_client.generate(
            prompt=prompt,
            temperature=0.1,
            format="json"
        )

        if response.success:
            data = _parse_json_object(response.content, "company profile")
            if data is not None:
                return CompanyProfile(
                    name=organization,
                    # The prompt asks for null on unknown fields, so a
                    # present-but-null value must still fall back.
                    description=data.get("description") or "",
                    website=data.get("website"),
                    headquarters=data.get("headquarters"),
                    employee_count=data.get("employee_count"),
                    founded=data.get("founded"),
                    leadership=_as_list(data.get("leadership"))
                )

        return CompanyProfile(name=organization, description="Information not available")

    def _analyze_technology(
        self,
        organization: str,
        technology_context: str,
        capability_gap: str
    ) -> TechnologyProfile:
        """Analyze the technology in depth.

        Searches patents assigned to the organization, then asks the LLM
        for a structured technical assessment.

        Args:
            organization: Company name, used as the patent assignee filter.
            technology_context: Description of the technology.
            capability_gap: Original capability gap; its first word is the
                patent search keyword.

        Returns:
            A TechnologyProfile. If the LLM call fails or its reply is not a
            JSON object, a fallback profile is returned whose description is
            *technology_context* and whose TRL is 5. The patents found are
            attached in both cases.
        """
        # First keyword of the gap drives the patent query. Fall back to the
        # technology description, then the company name, so an empty gap
        # string does not raise IndexError.
        keywords = capability_gap.split() or technology_context.split()
        patent_query = keywords[0] if keywords else organization

        # Search for patents by this organization
        patents = self.patent_searcher.search(
            patent_query,
            max_results=5,
            assignee=organization
        )

        if patents:
            patent_context = "\n".join(f"- {p.title}" for p in patents)
        else:
            patent_context = "No patents found"

        prompt = f"""Analyze this technology's capabilities.

Company: {organization}
Technology Context: {technology_context}
Capability Gap: {capability_gap}

Related Patents:
{patent_context}

Provide a technical assessment as JSON:
{{
    "name": "Technology name",
    "description": "Detailed technical description",
    "technical_approach": "How the technology works",
    "trl_assessment": 1-9,
    "key_capabilities": ["capability 1", "capability 2"],
    "limitations": ["limitation 1", "limitation 2"],
    "competitive_advantage": "What makes this approach unique"
}}"""

        response = self.llm_client.generate(
            prompt=prompt,
            temperature=0.2,
            format="json"
        )

        related_patents = [
            {"title": p.title, "number": p.patent_number, "url": p.url}
            for p in patents
        ]

        if response.success:
            data = _parse_json_object(response.content, "technology profile")
            if data is not None:
                return TechnologyProfile(
                    name=data.get("name") or "Unknown",
                    description=data.get("description") or "",
                    technical_approach=data.get("technical_approach") or "",
                    # The prompt shows "1-9" as a placeholder, so the model
                    # may reply with a string or an out-of-range number.
                    trl_assessment=_coerce_trl(data.get("trl_assessment")),
                    key_capabilities=_as_list(data.get("key_capabilities")),
                    limitations=_as_list(data.get("limitations")),
                    competitive_advantage=data.get("competitive_advantage") or "",
                    related_patents=related_patents
                )

        return TechnologyProfile(
            name="Unknown",
            description=technology_context,
            technical_approach="",
            trl_assessment=_DEFAULT_TRL,
            key_capabilities=[],
            limitations=[],
            competitive_advantage="",
            related_patents=related_patents
        )

    def _get_contract_history(self, organization: str) -> ContractHistory:
        """Get federal contract history.

        Args:
            organization: Company name.

        Returns:
            A ContractHistory holding the contract count, the summed award
            amounts, up to five agencies ordered by number of contracts, the
            first ten contracts and all SBIR results found.
        """
        # Get contracts
        contracts = self.contract_searcher.get_company_contracts(organization, max_results=20)

        # Get SBIR awards
        sbir_results = self.sbir_searcher.search(f'"{organization}"', max_results=10)

        # Calculate totals (a missing amount counts as zero)
        total_value = sum(c.award_amount or 0 for c in contracts)

        # Get agency breakdown. A missing or null agency is tallied as
        # "Unknown" so primary_agencies only ever contains strings.
        # NOTE(review): `raw_data or {}` assumes raw_data may be None on
        # some results - confirm against the searcher implementations.
        agency_counts = Counter(
            (c.raw_data or {}).get("Awarding Agency") or "Unknown"
            for c in contracts
        )
        primary_agencies = [agency for agency, _ in agency_counts.most_common(5)]

        recent_contracts = [
            {
                "title": c.title,
                "amount": c.award_amount,
                "date": c.published_date,
                "agency": (c.raw_data or {}).get("Awarding Agency"),
                "url": c.url
            }
            for c in contracts[:10]
        ]

        sbir_awards = [
            {
                "title": s.title,
                "amount": s.award_amount,
                "phase": (s.raw_data or {}).get("phase"),
                "year": s.published_date,
                "url": s.url
            }
            for s in sbir_results
        ]

        return ContractHistory(
            total_contracts=len(contracts),
            total_value=total_value,
            primary_agencies=primary_agencies,
            recent_contracts=recent_contracts,
            sbir_awards=sbir_awards
        )

    def _find_other_offerings(self, organization: str) -> List[Dict[str, Any]]:
        """Find other products/services from this company.

        Args:
            organization: Company name.

        Returns:
            One dict per web result with "title", "description" and "url".
        """
        results = self.web_searcher.search(
            f'"{organization}" products services solutions offerings',
            max_results=10
        )

        return [
            {"title": r.title, "description": r.snippet, "url": r.url}
            for r in results
        ]

    def _search_news(self, organization: str) -> List[Dict[str, Any]]:
        """Search recent news about the company.

        Args:
            organization: Company name.

        Returns:
            One dict per result with "title", "snippet", "date" and "url".
        """
        results = self.web_searcher.search(
            f'"{organization}"',
            max_results=10,
            news_only=True,
            time_filter="y"  # Last year
        )

        return [
            {
                "title": r.title,
                "snippet": r.snippet,
                "date": r.published_date,
                "url": r.url
            }
            for r in results
        ]

    def _generate_assessment(
        self,
        organization: str,
        technology_context: str,
        capability_gap: str,
        company: CompanyProfile,
        tech: TechnologyProfile,
        contracts: ContractHistory
    ) -> Tuple[str, List[str], str]:
        """Generate overall assessment using LLM.

        Args:
            organization: Company name.
            technology_context: Description of the technology (accepted for
                interface symmetry; the prompt uses ``tech.description``).
            capability_gap: Original capability gap.
            company: Company profile from the research step.
            tech: Technology profile from the analysis step.
            contracts: Contract history from the contract step.

        Returns:
            A ``(assessment, risk_factors, recommendation)`` tuple, where
            recommendation is formatted as ``"<VERDICT>: <rationale>"``.
            If the LLM call fails or its reply is not a JSON object, the
            tuple is ``("Assessment generation failed", [],
            "CONSIDER: Insufficient data")``.
        """
        # str() guards the join against non-string agency entries in a
        # caller-constructed ContractHistory.
        agency_summary = ", ".join(str(a) for a in contracts.primary_agencies[:3])

        prompt = f"""Generate an investment/partnership assessment.

CAPABILITY GAP: {capability_gap}

COMPANY: {organization}
{company.description}
Headquarters: {company.headquarters or 'Unknown'}
Founded: {company.founded or 'Unknown'}

TECHNOLOGY:
{tech.description}
TRL: {tech.trl_assessment}
Approach: {tech.technical_approach}

CONTRACT HISTORY:
Total Contracts: {contracts.total_contracts}
Total Value: ${contracts.total_value:,.0f}
Primary Agencies: {agency_summary}
SBIR Awards: {len(contracts.sbir_awards)}

Provide assessment as JSON:
{{
    "assessment": "2-3 paragraph overall assessment of fit for the capability gap",
    "risk_factors": ["risk 1", "risk 2", "risk 3"],
    "recommendation": "STRONGLY RECOMMEND | RECOMMEND | CONSIDER | DO NOT RECOMMEND",
    "recommendation_rationale": "Brief explanation"
}}"""

        response = self.llm_client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.3,
            format="json"
        )

        if response.success:
            data = _parse_json_object(response.content, "assessment")
            if data is not None:
                # `or` rather than a .get default: a null value must fall
                # back too, otherwise the text would read "None: None".
                verdict = data.get("recommendation") or "CONSIDER"
                rationale = data.get("recommendation_rationale") or ""
                return (
                    data.get("assessment") or "Assessment not available",
                    _as_list(data.get("risk_factors")),
                    f"{verdict}: {rationale}"
                )

        return ("Assessment generation failed", [], "CONSIDER: Insufficient data")