112 lines
3.2 KiB
Python
112 lines
3.2 KiB
Python
"""Base search classes for TechScout."""
|
|
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass, field
|
|
from typing import List, Optional, Dict, Any
|
|
from datetime import datetime
|
|
|
|
|
|
@dataclass
class SearchResult:
    """One search hit, in a common shape shared by every source."""

    title: str
    url: str
    snippet: str
    source: str  # Which searcher found this hit
    source_type: str  # sbir, patent, contract, news, web
    rank: int = 0  # Position in the results

    # Descriptive metadata (all optional)
    published_date: Optional[str] = None
    organization: Optional[str] = None  # Company/institution name
    award_amount: Optional[float] = None  # For contracts/grants
    trl_estimate: Optional[int] = None  # Technology readiness level

    # External identifiers
    award_id: Optional[str] = None  # SBIR award ID, contract number
    patent_number: Optional[str] = None

    # Raw payload kept around for later processing
    raw_data: Dict[str, Any] = field(default_factory=dict)

    # Scores (filled in later)
    relevance_score: float = 0.0
    final_score: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dictionary.

        Every field is exported under its own name except ``raw_data``,
        which is not part of the returned mapping.
        """
        # Order here is the key order of the resulting dict.
        exported_fields = (
            "title",
            "url",
            "snippet",
            "source",
            "source_type",
            "rank",
            "published_date",
            "organization",
            "award_amount",
            "trl_estimate",
            "award_id",
            "patent_number",
            "relevance_score",
            "final_score",
        )
        return {key: getattr(self, key) for key in exported_fields}
|
|
|
|
|
|
class BaseSearcher(ABC):
    """Abstract base class for all searchers.

    Concrete searchers provide ``name``, ``source_type`` and ``search``.
    ``search_multiple`` is implemented here on top of ``search``.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the name of this searcher."""

    @property
    @abstractmethod
    def source_type(self) -> str:
        """Return the type of source (sbir, patent, contract, news, web)."""

    @abstractmethod
    def search(
        self, query: str, max_results: int = 20, **kwargs
    ) -> List[SearchResult]:
        """
        Execute a search.

        Args:
            query: Search query
            max_results: Maximum results to return
            **kwargs: Extra options; their meaning is left to the
                concrete searcher.

        Returns:
            List of SearchResult objects
        """

    def search_multiple(
        self,
        queries: List[str],
        max_results_per_query: int = 10,
        **kwargs: Any,
    ) -> List[SearchResult]:
        """
        Execute multiple searches and deduplicate by URL.

        Queries run in the given order; the first result seen for a URL
        is kept and later results with the same URL are dropped.

        Args:
            queries: List of search queries
            max_results_per_query: Max results per query
            **kwargs: Forwarded unchanged to every ``search`` call. Do not
                pass ``max_results`` here; it is already supplied from
                ``max_results_per_query``.

        Returns:
            Deduplicated list of results, in first-seen order
        """
        all_results = []
        seen_urls = set()

        for query in queries:
            results = self.search(
                query, max_results=max_results_per_query, **kwargs
            )
            for result in results:
                url = result.url
                if not url:
                    # Without a URL there is nothing to deduplicate on;
                    # keep the result rather than collapsing all URL-less
                    # results into the first one.
                    all_results.append(result)
                    continue
                if url in seen_urls:
                    continue
                seen_urls.add(url)
                all_results.append(result)

        return all_results
|