# TechScout/techscout/pipeline/deep_dive.py
#
# 524 lines
# 16 KiB
# Python

"""
Phase 2: Deep Dive Pipeline
Takes selected technology candidates and performs comprehensive
company and technology analysis.
"""
import json
import logging
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional
import uuid
from ..config import TechScoutConfig, config as default_config
from ..extraction.llm_client import OllamaClient
from ..search.web import WebSearcher
from ..search.base import SearchResult
from ..sources.sbir import SBIRSearcher
from ..sources.patents import PatentSearcher
from ..sources.contracts import ContractSearcher
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
@dataclass
class CompanyProfile:
    """Background facts about one company.

    Only ``name`` and ``description`` are required; every other field
    defaults to ``None`` (or an empty list for ``leadership``).
    """

    name: str
    description: str
    website: Optional[str] = None
    headquarters: Optional[str] = None
    employee_count: Optional[str] = None
    founded: Optional[str] = None
    # One dict per person.
    leadership: List[Dict[str, str]] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the profile as a plain dict keyed by field name.

        Values are passed through as-is (no copying), so the returned
        ``leadership`` entry is the same list object held by the profile.
        """
        exported = (
            "name",
            "description",
            "website",
            "headquarters",
            "employee_count",
            "founded",
            "leadership",
        )
        return {key: getattr(self, key) for key in exported}
@dataclass
class TechnologyProfile:
    """Technical description of a single technology offering."""

    name: str
    description: str
    technical_approach: str
    trl_assessment: int
    key_capabilities: List[str]
    limitations: List[str]
    competitive_advantage: str
    # One dict per patent; empty when none were supplied.
    related_patents: List[Dict[str, Any]] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return the profile as a plain dict keyed by field name.

        Values are passed through without copying.
        """
        keys = (
            "name",
            "description",
            "technical_approach",
            "trl_assessment",
            "key_capabilities",
            "limitations",
            "competitive_advantage",
            "related_patents",
        )
        values = (
            self.name,
            self.description,
            self.technical_approach,
            self.trl_assessment,
            self.key_capabilities,
            self.limitations,
            self.competitive_advantage,
            self.related_patents,
        )
        return dict(zip(keys, values))
@dataclass
class ContractHistory:
    """Summary of a company's federal contracts and SBIR awards."""

    total_contracts: int
    total_value: float
    primary_agencies: List[str]
    recent_contracts: List[Dict[str, Any]]
    sbir_awards: List[Dict[str, Any]]

    def to_dict(self) -> Dict[str, Any]:
        """Return the history as a plain dict keyed by field name.

        Values are passed through without copying.
        """
        return dict(
            total_contracts=self.total_contracts,
            total_value=self.total_value,
            primary_agencies=self.primary_agencies,
            recent_contracts=self.recent_contracts,
            sbir_awards=self.sbir_awards,
        )
@dataclass
class DeepDiveResult:
    """Result of Phase 2 deep dive.

    Bundles the three profile objects with the supporting search hits and
    the LLM-generated assessment. ``success`` and ``error`` default to a
    successful run.
    """

    id: str
    candidate_id: str
    organization: str
    timestamp: str
    company_profile: "CompanyProfile"
    technology_profile: "TechnologyProfile"
    contract_history: "ContractHistory"
    other_offerings: List[Dict[str, Any]]
    news_mentions: List[Dict[str, Any]]
    assessment: str  # LLM-generated overall assessment
    risk_factors: List[str]
    recommendation: str
    duration_seconds: float
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict of the result.

        The three nested profile objects are converted through their own
        ``to_dict()``; all other values are passed through unchanged.
        """
        return {
            "id": self.id,
            "candidate_id": self.candidate_id,
            "organization": self.organization,
            "timestamp": self.timestamp,
            "company_profile": self.company_profile.to_dict(),
            "technology_profile": self.technology_profile.to_dict(),
            "contract_history": self.contract_history.to_dict(),
            "other_offerings": self.other_offerings,
            "news_mentions": self.news_mentions,
            "assessment": self.assessment,
            "risk_factors": self.risk_factors,
            "recommendation": self.recommendation,
            "duration_seconds": self.duration_seconds,
            "success": self.success,
            "error": self.error,
        }

    def save(self, path: Path):
        """Save result to JSON file.

        Args:
            path: Destination file (``Path`` or ``str``). Missing parent
                directories are created so a finished analysis is not lost
                just because the output folder does not exist yet.

        Raises:
            OSError: If the directory or file cannot be created or written.
            TypeError: If ``to_dict()`` contains a value ``json`` cannot
                serialize.
        """
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding keeps the file identical across platforms
        # instead of depending on the locale default.
        with open(target, "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2)
class DeepDivePipeline:
    """
    Phase 2 Deep Dive Pipeline.

    Steps, in the order ``deep_dive`` runs them:
    1. Research company background
    2. Analyze technology capabilities (includes the patent lookup)
    3. Pull contract history and SBIR awards
    4. Find other company offerings
    5. Search for news/press
    6. Generate comprehensive assessment
    """

    def __init__(
        self,
        config: Optional["TechScoutConfig"] = None,
        model: str = "mistral-nemo:12b",
    ):
        """Build the LLM client and the searchers used by the pipeline.

        Args:
            config: Pipeline configuration; the package default is used
                when omitted.
            model: Model name handed to the LLM client as its default and
                passed explicitly when generating the final assessment.
        """
        self.config = config or default_config
        self.model = model
        # Initialize components
        self.llm_client = OllamaClient(
            base_url=self.config.ollama.base_url,
            default_model=model,
        )
        # Initialize searchers
        self.web_searcher = WebSearcher()
        self.sbir_searcher = SBIRSearcher()
        self.patent_searcher = PatentSearcher()
        self.contract_searcher = ContractSearcher()

    @staticmethod
    def _parse_json_object(content: Any, purpose: str) -> Optional[Dict[str, Any]]:
        """Parse an LLM reply that is expected to be a JSON object.

        Returns the decoded dict, or ``None`` (after logging a warning) when
        the text is not valid JSON or decodes to something other than an
        object, so callers can fall back instead of crashing on ``.get``.
        """
        try:
            data = json.loads(content)
        except (json.JSONDecodeError, TypeError) as exc:
            logger.warning("Could not parse %s response as JSON: %s", purpose, exc)
            return None
        if not isinstance(data, dict):
            logger.warning(
                "Expected a JSON object for %s, got %s", purpose, type(data).__name__
            )
            return None
        return data

    @staticmethod
    def _as_list(value: Any) -> List[Any]:
        """Normalize a decoded JSON value to a list (``None`` -> ``[]``)."""
        if value is None:
            return []
        if isinstance(value, list):
            return value
        return [value]

    @staticmethod
    def _coerce_trl(value: Any, default: int = 5) -> int:
        """Convert a model-supplied TRL to an int clamped to the 1-9 scale.

        Anything that cannot be converted (``None``, ``"1-9"``, NaN, ...)
        yields ``default``.
        """
        try:
            level = int(value)
        except (TypeError, ValueError, OverflowError):
            return default
        return max(1, min(9, level))

    def deep_dive(
        self,
        organization: str,
        technology_context: str,
        capability_gap: str,
        candidate_id: str = "",
    ) -> "DeepDiveResult":
        """
        Run Phase 2 deep dive on a company/technology.

        Args:
            organization: Company name
            technology_context: Description of the technology
            capability_gap: Original capability gap
            candidate_id: ID from Phase 1 candidate

        Returns:
            DeepDiveResult with comprehensive analysis. The result is also
            written to ``deepdive_<id>.json`` under ``config.analyses_dir``.

        Note:
            Exceptions raised by the searchers, the LLM client or the save
            step are not caught here and propagate to the caller.
        """
        start_time = datetime.now()
        result_id = str(uuid.uuid4())[:8]
        logger.info("Starting deep dive on: %s", organization)

        # 1. Research company profile
        logger.info("Researching company profile...")
        company_profile = self._research_company(organization)

        # 2. Analyze technology
        logger.info("Analyzing technology...")
        tech_profile = self._analyze_technology(
            organization, technology_context, capability_gap
        )

        # 3. Get contract history
        logger.info("Fetching contract history...")
        contract_history = self._get_contract_history(organization)

        # 4. Find other offerings
        logger.info("Finding other company offerings...")
        other_offerings = self._find_other_offerings(organization)

        # 5. Search news
        logger.info("Searching news mentions...")
        news_mentions = self._search_news(organization)

        # 6. Generate assessment
        logger.info("Generating assessment...")
        assessment, risk_factors, recommendation = self._generate_assessment(
            organization,
            technology_context,
            capability_gap,
            company_profile,
            tech_profile,
            contract_history,
        )

        duration = (datetime.now() - start_time).total_seconds()
        result = DeepDiveResult(
            id=result_id,
            candidate_id=candidate_id,
            organization=organization,
            timestamp=datetime.now().isoformat(),
            company_profile=company_profile,
            technology_profile=tech_profile,
            contract_history=contract_history,
            other_offerings=other_offerings,
            news_mentions=news_mentions,
            assessment=assessment,
            risk_factors=risk_factors,
            recommendation=recommendation,
            duration_seconds=duration,
            success=True,
        )

        # Save result
        save_path = self.config.analyses_dir / f"deepdive_{result_id}.json"
        result.save(save_path)
        logger.info("Saved deep dive result to %s", save_path)
        return result

    def _research_company(self, organization: str) -> "CompanyProfile":
        """Research company background.

        Runs one web search, then asks the LLM to extract structured fields
        from the result titles and snippets. Falls back to a placeholder
        profile when the LLM call fails or its reply is not a JSON object.
        """
        # Search web for company info
        results = self.web_searcher.search(
            f'"{organization}" company about founded headquarters',
            max_results=5,
        )
        # Build context from results
        context = "\n".join(f"- {r.title}: {r.snippet}" for r in results)

        # Use LLM to extract structured info
        prompt = f"""Extract company information from these search results.
Company: {organization}
Search Results:
{context}
Respond with JSON:
{{
"description": "Brief company description",
"website": "company website if found",
"headquarters": "location if found",
"employee_count": "approximate if found",
"founded": "year if found",
"leadership": [
{{"name": "CEO Name", "title": "CEO"}},
{{"name": "CTO Name", "title": "CTO"}}
]
}}
Only include information explicitly found. Use null for unknown fields."""
        response = self.llm_client.generate(
            prompt=prompt,
            temperature=0.1,
            format="json",
        )

        data = None
        if response.success:
            data = self._parse_json_object(response.content, "company profile")
        if data is None:
            return CompanyProfile(
                name=organization, description="Information not available"
            )

        # The prompt asks for null on unknown fields, so a key can be present
        # with a None value; `or` keeps the required fields well-typed.
        return CompanyProfile(
            name=organization,
            description=data.get("description") or "",
            website=data.get("website"),
            headquarters=data.get("headquarters"),
            employee_count=data.get("employee_count"),
            founded=data.get("founded"),
            leadership=self._as_list(data.get("leadership")),
        )

    def _analyze_technology(
        self,
        organization: str,
        technology_context: str,
        capability_gap: str,
    ) -> "TechnologyProfile":
        """Analyze the technology in depth.

        Looks up patents assigned to the organization, then asks the LLM for
        a structured technical assessment. The patents found are attached to
        the returned profile whether or not the LLM step succeeds.
        """
        # The patent query is the first keyword of the capability gap. An
        # empty or whitespace-only gap has no first keyword, so fall back to
        # the technology context and finally the organization name.
        keywords = (
            capability_gap.split() or technology_context.split() or [organization]
        )
        patents = self.patent_searcher.search(
            keywords[0],
            max_results=5,
            assignee=organization,
        )
        patent_context = (
            "\n".join(f"- {p.title}" for p in patents)
            if patents
            else "No patents found"
        )

        prompt = f"""Analyze this technology's capabilities.
Company: {organization}
Technology Context: {technology_context}
Capability Gap: {capability_gap}
Related Patents: {patent_context}
Provide a technical assessment as JSON:
{{
"name": "Technology name",
"description": "Detailed technical description",
"technical_approach": "How the technology works",
"trl_assessment": 1-9,
"key_capabilities": ["capability 1", "capability 2"],
"limitations": ["limitation 1", "limitation 2"],
"competitive_advantage": "What makes this approach unique"
}}"""
        response = self.llm_client.generate(
            prompt=prompt,
            temperature=0.2,
            format="json",
        )

        related_patents = [
            {"title": p.title, "number": p.patent_number, "url": p.url}
            for p in patents
        ]

        data = None
        if response.success:
            data = self._parse_json_object(response.content, "technology analysis")
        if data is None:
            return TechnologyProfile(
                name="Unknown",
                description=technology_context,
                technical_approach="",
                trl_assessment=5,
                key_capabilities=[],
                limitations=[],
                competitive_advantage="",
                related_patents=related_patents,
            )

        return TechnologyProfile(
            name=data.get("name") or "Unknown",
            description=data.get("description") or "",
            technical_approach=data.get("technical_approach") or "",
            # The model may answer with a string or an out-of-range number.
            trl_assessment=self._coerce_trl(data.get("trl_assessment")),
            key_capabilities=self._as_list(data.get("key_capabilities")),
            limitations=self._as_list(data.get("limitations")),
            competitive_advantage=data.get("competitive_advantage") or "",
            related_patents=related_patents,
        )

    def _get_contract_history(self, organization: str) -> "ContractHistory":
        """Get federal contract history.

        Combines up to 20 contracts from the contract searcher with up to 10
        SBIR search hits. Agencies are ranked by number of contracts and the
        top five are reported; the first ten contracts are listed in detail.
        """
        # Get contracts
        contracts = self.contract_searcher.get_company_contracts(
            organization, max_results=20
        )
        # Get SBIR awards
        sbir_results = self.sbir_searcher.search(f'"{organization}"', max_results=10)

        # Calculate totals (a missing amount counts as zero)
        total_value = sum(c.award_amount or 0 for c in contracts)

        # Get agency breakdown. `or "Unknown"` also covers a key that is
        # present with an empty/None value, which would otherwise break the
        # string join used when the agencies are put into the LLM prompt.
        agencies: Dict[str, int] = {}
        for c in contracts:
            agency = c.raw_data.get("Awarding Agency") or "Unknown"
            agencies[agency] = agencies.get(agency, 0) + 1
        primary_agencies = sorted(agencies, key=agencies.get, reverse=True)[:5]

        return ContractHistory(
            total_contracts=len(contracts),
            total_value=total_value,
            primary_agencies=primary_agencies,
            recent_contracts=[
                {
                    "title": c.title,
                    "amount": c.award_amount,
                    "date": c.published_date,
                    "agency": c.raw_data.get("Awarding Agency"),
                    "url": c.url,
                }
                for c in contracts[:10]
            ],
            sbir_awards=[
                {
                    "title": s.title,
                    "amount": s.award_amount,
                    "phase": s.raw_data.get("phase"),
                    "year": s.published_date,
                    "url": s.url,
                }
                for s in sbir_results
            ],
        )

    def _find_other_offerings(self, organization: str) -> List[Dict[str, Any]]:
        """Find other products/services from this company.

        Returns one ``{"title", "description", "url"}`` dict per web hit.
        """
        results = self.web_searcher.search(
            f'"{organization}" products services solutions offerings',
            max_results=10,
        )
        return [
            {"title": r.title, "description": r.snippet, "url": r.url}
            for r in results
        ]

    def _search_news(self, organization: str) -> List[Dict[str, Any]]:
        """Search recent news about the company.

        Returns one ``{"title", "snippet", "date", "url"}`` dict per hit.
        """
        results = self.web_searcher.search(
            f'"{organization}"',
            max_results=10,
            news_only=True,
            time_filter="y",  # Last year
        )
        return [
            {
                "title": r.title,
                "snippet": r.snippet,
                "date": r.published_date,
                "url": r.url,
            }
            for r in results
        ]

    def _generate_assessment(
        self,
        organization: str,
        technology_context: str,
        capability_gap: str,
        company: "CompanyProfile",
        tech: "TechnologyProfile",
        contracts: "ContractHistory",
    ) -> tuple:
        """Generate overall assessment using LLM.

        Returns:
            ``(assessment, risk_factors, recommendation)`` where
            ``recommendation`` is formatted as ``"<VERDICT>: <rationale>"``.
            A fixed fallback tuple is returned when the LLM call fails or its
            reply is not a JSON object.
        """
        prompt = f"""Generate an investment/partnership assessment.
CAPABILITY GAP:
{capability_gap}
COMPANY: {organization}
{company.description}
Headquarters: {company.headquarters or 'Unknown'}
Founded: {company.founded or 'Unknown'}
TECHNOLOGY:
{tech.description}
TRL: {tech.trl_assessment}
Approach: {tech.technical_approach}
CONTRACT HISTORY:
Total Contracts: {contracts.total_contracts}
Total Value: ${contracts.total_value:,.0f}
Primary Agencies: {', '.join(contracts.primary_agencies[:3])}
SBIR Awards: {len(contracts.sbir_awards)}
Provide assessment as JSON:
{{
"assessment": "2-3 paragraph overall assessment of fit for the capability gap",
"risk_factors": ["risk 1", "risk 2", "risk 3"],
"recommendation": "STRONGLY RECOMMEND | RECOMMEND | CONSIDER | DO NOT RECOMMEND",
"recommendation_rationale": "Brief explanation"
}}"""
        response = self.llm_client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.3,
            format="json",
        )

        data = None
        if response.success:
            data = self._parse_json_object(response.content, "assessment")
        if data is None:
            return ("Assessment generation failed", [], "CONSIDER: Insufficient data")

        # A key present with a null value must still get its default;
        # otherwise the recommendation would read "None: None".
        verdict = data.get("recommendation") or "CONSIDER"
        rationale = data.get("recommendation_rationale") or ""
        return (
            data.get("assessment") or "Assessment not available",
            self._as_list(data.get("risk_factors")),
            f"{verdict}: {rationale}",
        )