"""
Phase 2: Deep Dive Pipeline

Takes selected technology candidates and performs comprehensive
company and technology analysis.
"""
|
|
|
|
import json
|
|
import logging
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import List, Dict, Any, Optional
|
|
import uuid
|
|
|
|
from ..config import TechScoutConfig, config as default_config
|
|
from ..extraction.llm_client import OllamaClient
|
|
from ..search.web import WebSearcher
|
|
from ..search.base import SearchResult
|
|
from ..sources.sbir import SBIRSearcher
|
|
from ..sources.patents import PatentSearcher
|
|
from ..sources.contracts import ContractSearcher
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class CompanyProfile:
    """Comprehensive company profile.

    Optional fields stay None (leadership stays empty) when the
    corresponding information could not be found.
    """
    name: str
    description: str
    website: Optional[str] = None
    headquarters: Optional[str] = None
    employee_count: Optional[str] = None
    founded: Optional[str] = None
    leadership: List[Dict[str, str]] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the profile to a plain dictionary."""
        return dict(
            name=self.name,
            description=self.description,
            website=self.website,
            headquarters=self.headquarters,
            employee_count=self.employee_count,
            founded=self.founded,
            leadership=self.leadership,
        )
|
|
|
|
|
|
@dataclass
class TechnologyProfile:
    """Detailed technology profile.

    Captures the LLM's technical assessment plus any patents that were
    attributed to the organization.
    """
    name: str
    description: str
    technical_approach: str
    trl_assessment: int
    key_capabilities: List[str]
    limitations: List[str]
    competitive_advantage: str
    related_patents: List[Dict[str, Any]] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the profile to a plain dictionary."""
        attrs = (
            "name",
            "description",
            "technical_approach",
            "trl_assessment",
            "key_capabilities",
            "limitations",
            "competitive_advantage",
            "related_patents",
        )
        return {attr: getattr(self, attr) for attr in attrs}
|
|
|
|
|
|
@dataclass
class ContractHistory:
    """Company's federal contract history."""
    total_contracts: int
    total_value: float
    primary_agencies: List[str]
    recent_contracts: List[Dict[str, Any]]
    sbir_awards: List[Dict[str, Any]]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the history to a plain dictionary."""
        return {
            key: getattr(self, key)
            for key in (
                "total_contracts",
                "total_value",
                "primary_agencies",
                "recent_contracts",
                "sbir_awards",
            )
        }
|
|
|
|
|
|
@dataclass
|
|
class DeepDiveResult:
|
|
"""Result of Phase 2 deep dive."""
|
|
id: str
|
|
candidate_id: str
|
|
organization: str
|
|
timestamp: str
|
|
company_profile: CompanyProfile
|
|
technology_profile: TechnologyProfile
|
|
contract_history: ContractHistory
|
|
other_offerings: List[Dict[str, Any]]
|
|
news_mentions: List[Dict[str, Any]]
|
|
assessment: str # LLM-generated overall assessment
|
|
risk_factors: List[str]
|
|
recommendation: str
|
|
duration_seconds: float
|
|
success: bool = True
|
|
error: Optional[str] = None
|
|
|
|
def to_dict(self) -> Dict[str, Any]:
|
|
return {
|
|
"id": self.id,
|
|
"candidate_id": self.candidate_id,
|
|
"organization": self.organization,
|
|
"timestamp": self.timestamp,
|
|
"company_profile": self.company_profile.to_dict(),
|
|
"technology_profile": self.technology_profile.to_dict(),
|
|
"contract_history": self.contract_history.to_dict(),
|
|
"other_offerings": self.other_offerings,
|
|
"news_mentions": self.news_mentions,
|
|
"assessment": self.assessment,
|
|
"risk_factors": self.risk_factors,
|
|
"recommendation": self.recommendation,
|
|
"duration_seconds": self.duration_seconds,
|
|
"success": self.success,
|
|
"error": self.error,
|
|
}
|
|
|
|
def save(self, path: Path):
|
|
"""Save result to JSON file."""
|
|
with open(path, "w") as f:
|
|
json.dump(self.to_dict(), f, indent=2)
|
|
|
|
|
|
class DeepDivePipeline:
    """
    Phase 2 Deep Dive Pipeline.

    1. Research company background
    2. Analyze technology capabilities
    3. Pull contract history
    4. Find related patents
    5. Search for news/press
    6. Generate comprehensive assessment
    """

    def __init__(
        self,
        config: Optional[TechScoutConfig] = None,
        model: str = "mistral-nemo:12b"
    ):
        """
        Args:
            config: TechScout configuration; defaults to the package-level config.
            model: Ollama model name used for all LLM calls.
        """
        self.config = config or default_config
        self.model = model

        # LLM client used for structured (JSON) extraction and assessment
        self.llm_client = OllamaClient(
            base_url=self.config.ollama.base_url,
            default_model=model
        )

        # External data-source searchers
        self.web_searcher = WebSearcher()
        self.sbir_searcher = SBIRSearcher()
        self.patent_searcher = PatentSearcher()
        self.contract_searcher = ContractSearcher()

    def deep_dive(
        self,
        organization: str,
        technology_context: str,
        capability_gap: str,
        candidate_id: str = ""
    ) -> DeepDiveResult:
        """
        Run Phase 2 deep dive on a company/technology.

        Args:
            organization: Company name
            technology_context: Description of the technology
            capability_gap: Original capability gap
            candidate_id: ID from Phase 1 candidate

        Returns:
            DeepDiveResult with comprehensive analysis
        """
        start_time = datetime.now()
        result_id = str(uuid.uuid4())[:8]

        logger.info(f"Starting deep dive on: {organization}")

        # 1. Research company profile
        logger.info("Researching company profile...")
        company_profile = self._research_company(organization)

        # 2. Analyze technology
        logger.info("Analyzing technology...")
        tech_profile = self._analyze_technology(
            organization, technology_context, capability_gap
        )

        # 3. Get contract history
        logger.info("Fetching contract history...")
        contract_history = self._get_contract_history(organization)

        # 4. Find other offerings
        logger.info("Finding other company offerings...")
        other_offerings = self._find_other_offerings(organization)

        # 5. Search news
        logger.info("Searching news mentions...")
        news_mentions = self._search_news(organization)

        # 6. Generate assessment
        logger.info("Generating assessment...")
        assessment, risk_factors, recommendation = self._generate_assessment(
            organization,
            technology_context,
            capability_gap,
            company_profile,
            tech_profile,
            contract_history
        )

        duration = (datetime.now() - start_time).total_seconds()

        result = DeepDiveResult(
            id=result_id,
            candidate_id=candidate_id,
            organization=organization,
            timestamp=datetime.now().isoformat(),
            company_profile=company_profile,
            technology_profile=tech_profile,
            contract_history=contract_history,
            other_offerings=other_offerings,
            news_mentions=news_mentions,
            assessment=assessment,
            risk_factors=risk_factors,
            recommendation=recommendation,
            duration_seconds=duration,
            success=True
        )

        # Persist alongside other analyses for later review
        save_path = self.config.analyses_dir / f"deepdive_{result_id}.json"
        result.save(save_path)
        logger.info(f"Saved deep dive result to {save_path}")

        return result

    def _parse_json_response(self, response) -> Optional[Dict[str, Any]]:
        """Extract a JSON object from an LLM response.

        Returns None when the call failed, the content is not valid JSON,
        or the top-level value is not an object (a top-level array/scalar
        would otherwise crash the callers' .get() lookups).
        """
        if not response.success:
            return None
        try:
            data = json.loads(response.content)
        except json.JSONDecodeError:
            return None
        return data if isinstance(data, dict) else None

    def _research_company(self, organization: str) -> CompanyProfile:
        """Research company background via web search + LLM extraction."""
        # Search web for company info
        results = self.web_searcher.search(
            f'"{organization}" company about founded headquarters',
            max_results=5
        )

        # Build context from results
        context = "\n".join([f"- {r.title}: {r.snippet}" for r in results])

        # Use LLM to extract structured info
        prompt = f"""Extract company information from these search results.

Company: {organization}

Search Results:
{context}

Respond with JSON:
{{
    "description": "Brief company description",
    "website": "company website if found",
    "headquarters": "location if found",
    "employee_count": "approximate if found",
    "founded": "year if found",
    "leadership": [
        {{"name": "CEO Name", "title": "CEO"}},
        {{"name": "CTO Name", "title": "CTO"}}
    ]
}}

Only include information explicitly found. Use null for unknown fields."""

        response = self.llm_client.generate(
            prompt=prompt,
            temperature=0.1,
            format="json"
        )

        data = self._parse_json_response(response)
        if data is not None:
            return CompanyProfile(
                name=organization,
                description=data.get("description", ""),
                website=data.get("website"),
                headquarters=data.get("headquarters"),
                employee_count=data.get("employee_count"),
                founded=data.get("founded"),
                leadership=data.get("leadership", [])
            )

        # Fallback profile when the LLM response could not be parsed
        return CompanyProfile(name=organization, description="Information not available")

    def _analyze_technology(
        self,
        organization: str,
        technology_context: str,
        capability_gap: str
    ) -> TechnologyProfile:
        """Analyze the technology in depth."""
        # Search for patents by this organization, keyed on the first word of
        # the capability gap. Guard against an empty gap string, which would
        # previously raise IndexError on split()[0]; fall back to the
        # organization name as the query in that case.
        gap_keywords = capability_gap.split()
        patents = self.patent_searcher.search(
            gap_keywords[0] if gap_keywords else organization,
            max_results=5,
            assignee=organization
        )

        patent_context = "\n".join([
            f"- {p.title}" for p in patents
        ]) if patents else "No patents found"

        prompt = f"""Analyze this technology's capabilities.

Company: {organization}
Technology Context: {technology_context}
Capability Gap: {capability_gap}
Related Patents: {patent_context}

Provide a technical assessment as JSON:
{{
    "name": "Technology name",
    "description": "Detailed technical description",
    "technical_approach": "How the technology works",
    "trl_assessment": 1-9,
    "key_capabilities": ["capability 1", "capability 2"],
    "limitations": ["limitation 1", "limitation 2"],
    "competitive_advantage": "What makes this approach unique"
}}"""

        response = self.llm_client.generate(
            prompt=prompt,
            temperature=0.2,
            format="json"
        )

        related_patents = [
            {"title": p.title, "number": p.patent_number, "url": p.url}
            for p in patents
        ]

        data = self._parse_json_response(response)
        if data is not None:
            return TechnologyProfile(
                name=data.get("name", "Unknown"),
                description=data.get("description", ""),
                technical_approach=data.get("technical_approach", ""),
                trl_assessment=data.get("trl_assessment", 5),
                key_capabilities=data.get("key_capabilities", []),
                limitations=data.get("limitations", []),
                competitive_advantage=data.get("competitive_advantage", ""),
                related_patents=related_patents
            )

        # Fallback profile: keep the raw context and any patents found
        return TechnologyProfile(
            name="Unknown",
            description=technology_context,
            technical_approach="",
            trl_assessment=5,
            key_capabilities=[],
            limitations=[],
            competitive_advantage="",
            related_patents=related_patents
        )

    def _get_contract_history(self, organization: str) -> ContractHistory:
        """Get federal contract history (USASpending contracts + SBIR awards)."""
        # Get contracts
        contracts = self.contract_searcher.get_company_contracts(organization, max_results=20)

        # Get SBIR awards
        sbir_results = self.sbir_searcher.search(f'"{organization}"', max_results=10)

        # Calculate totals (treat missing award amounts as zero)
        total_value = sum(c.award_amount or 0 for c in contracts)

        # Count contracts per awarding agency
        agencies: Dict[str, int] = {}
        for c in contracts:
            agency = c.raw_data.get("Awarding Agency", "Unknown")
            agencies[agency] = agencies.get(agency, 0) + 1

        # Top five agencies by contract count
        primary_agencies = [
            name for name, _count in
            sorted(agencies.items(), key=lambda kv: kv[1], reverse=True)[:5]
        ]

        return ContractHistory(
            total_contracts=len(contracts),
            total_value=total_value,
            primary_agencies=primary_agencies,
            recent_contracts=[
                {
                    "title": c.title,
                    "amount": c.award_amount,
                    "date": c.published_date,
                    "agency": c.raw_data.get("Awarding Agency"),
                    "url": c.url
                }
                for c in contracts[:10]
            ],
            sbir_awards=[
                {
                    "title": s.title,
                    "amount": s.award_amount,
                    "phase": s.raw_data.get("phase"),
                    "year": s.published_date,
                    "url": s.url
                }
                for s in sbir_results
            ]
        )

    def _find_other_offerings(self, organization: str) -> List[Dict[str, Any]]:
        """Find other products/services from this company."""
        results = self.web_searcher.search(
            f'"{organization}" products services solutions offerings',
            max_results=10
        )

        return [
            {
                "title": r.title,
                "description": r.snippet,
                "url": r.url
            }
            for r in results
        ]

    def _search_news(self, organization: str) -> List[Dict[str, Any]]:
        """Search recent news about the company (last year only)."""
        results = self.web_searcher.search(
            f'"{organization}"',
            max_results=10,
            news_only=True,
            time_filter="y"  # Last year
        )

        return [
            {
                "title": r.title,
                "snippet": r.snippet,
                "date": r.published_date,
                "url": r.url
            }
            for r in results
        ]

    def _generate_assessment(
        self,
        organization: str,
        technology_context: str,
        capability_gap: str,
        company: CompanyProfile,
        tech: TechnologyProfile,
        contracts: ContractHistory
    ) -> tuple:
        """Generate overall assessment using LLM.

        Returns:
            (assessment text, risk factor list, "RECOMMENDATION: rationale")
        """
        prompt = f"""Generate an investment/partnership assessment.

CAPABILITY GAP:
{capability_gap}

COMPANY: {organization}
{company.description}
Headquarters: {company.headquarters or 'Unknown'}
Founded: {company.founded or 'Unknown'}

TECHNOLOGY:
{tech.description}
TRL: {tech.trl_assessment}
Approach: {tech.technical_approach}

CONTRACT HISTORY:
Total Contracts: {contracts.total_contracts}
Total Value: ${contracts.total_value:,.0f}
Primary Agencies: {', '.join(contracts.primary_agencies[:3])}
SBIR Awards: {len(contracts.sbir_awards)}

Provide assessment as JSON:
{{
    "assessment": "2-3 paragraph overall assessment of fit for the capability gap",
    "risk_factors": ["risk 1", "risk 2", "risk 3"],
    "recommendation": "STRONGLY RECOMMEND | RECOMMEND | CONSIDER | DO NOT RECOMMEND",
    "recommendation_rationale": "Brief explanation"
}}"""

        response = self.llm_client.generate(
            prompt=prompt,
            model=self.model,
            temperature=0.3,
            format="json"
        )

        data = self._parse_json_response(response)
        if data is not None:
            return (
                data.get("assessment", "Assessment not available"),
                data.get("risk_factors", []),
                f"{data.get('recommendation', 'CONSIDER')}: {data.get('recommendation_rationale', '')}"
            )

        # Conservative fallback when the LLM response could not be parsed
        return ("Assessment generation failed", [], "CONSIDER: Insufficient data")
|