TechScout/techscout/search/base.py

112 lines
3.2 KiB
Python

"""
Base search classes for TechScout.
"""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any
from datetime import datetime
@dataclass
class SearchResult:
    """One hit returned by a searcher, normalised to a common shape.

    The first five fields are required; everything else has a default so
    a searcher only fills in what its source actually provides.
    """

    title: str
    url: str
    snippet: str
    source: str  # name of the searcher that produced this hit
    source_type: str  # one of: sbir, patent, contract, news, web
    rank: int = 0  # position within the result list

    # --- descriptive metadata ---
    published_date: Optional[str] = None
    organization: Optional[str] = None  # company / institution name
    award_amount: Optional[float] = None  # for contracts and grants
    trl_estimate: Optional[int] = None  # technology readiness level

    # --- identifiers ---
    award_id: Optional[str] = None  # SBIR award ID or contract number
    patent_number: Optional[str] = None

    # --- untouched source payload, kept for later processing ---
    raw_data: Dict[str, Any] = field(default_factory=dict)

    # --- scores, filled in by later stages ---
    relevance_score: float = 0.0
    final_score: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a plain dict.

        Every field is included except ``raw_data``. Keys appear in the
        order listed below.
        """
        exported = (
            "title",
            "url",
            "snippet",
            "source",
            "source_type",
            "rank",
            "published_date",
            "organization",
            "award_amount",
            "trl_estimate",
            "award_id",
            "patent_number",
            "relevance_score",
            "final_score",
        )
        return {key: getattr(self, key) for key in exported}
class BaseSearcher(ABC):
    """Abstract base class for all searchers.

    Subclasses must provide ``name``, ``source_type`` and ``search``.
    ``search_multiple`` is implemented here on top of ``search``.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the name of this searcher."""

    @property
    @abstractmethod
    def source_type(self) -> str:
        """Return the type of source (sbir, patent, contract, news, web)."""

    @abstractmethod
    def search(
        self, query: str, max_results: int = 20, **kwargs
    ) -> "List[SearchResult]":
        """
        Execute a search.

        Args:
            query: Search query
            max_results: Maximum results to return
            **kwargs: Searcher-specific options

        Returns:
            List of SearchResult objects
        """

    @staticmethod
    def _dedup_key(result: "SearchResult") -> Optional[tuple]:
        """Return a hashable identity for ``result``, or None if it has none.

        The URL is preferred. When it is empty or missing, the award ID and
        then the patent number are tried. The attribute name is part of the
        key so a URL can never collide with an identifier of the same text.
        """
        for attr in ("url", "award_id", "patent_number"):
            value = getattr(result, attr, None)
            if value:
                return (attr, value)
        return None

    def search_multiple(
        self,
        queries: List[str],
        max_results_per_query: int = 10,
        **kwargs,
    ) -> "List[SearchResult]":
        """
        Execute multiple searches and deduplicate.

        Results are returned in the order they were first seen. Two results
        are duplicates when they share a URL or, if the URL is empty, the
        same award ID or patent number. A result with none of these is
        always kept, because there is nothing to compare it by.

        Args:
            queries: List of search queries
            max_results_per_query: Max results per query
            **kwargs: Extra options forwarded unchanged to every
                ``search`` call

        Returns:
            Deduplicated list of results
        """
        all_results = []
        seen_keys = set()
        for query in queries:
            results = self.search(
                query, max_results=max_results_per_query, **kwargs
            )
            for result in results:
                key = self._dedup_key(result)
                if key is not None:
                    if key in seen_keys:
                        continue
                    seen_keys.add(key)
                # Results without any identity fall through and are kept;
                # keying them on an empty URL would drop all but the first.
                all_results.append(result)
        return all_results