""" Web Search using DuckDuckGo. General web search for technology scouting. """ import logging from typing import List, Optional from urllib.parse import urlparse from datetime import datetime from .base import BaseSearcher, SearchResult logger = logging.getLogger(__name__) class WebSearcher(BaseSearcher): """ General web search using DuckDuckGo. Used for defense news, tech publications, and general discovery. """ def __init__(self, delay_between_searches: float = 1.0): self.delay = delay_between_searches self._last_search_time: Optional[datetime] = None @property def name(self) -> str: return "DuckDuckGo" @property def source_type(self) -> str: return "web" def _extract_domain(self, url: str) -> str: try: parsed = urlparse(url) return parsed.netloc except Exception: return "" def _rate_limit(self): if self._last_search_time is not None: import time elapsed = (datetime.now() - self._last_search_time).total_seconds() if elapsed < self.delay: time.sleep(self.delay - elapsed) self._last_search_time = datetime.now() def search( self, query: str, max_results: int = 20, news_only: bool = False, time_filter: Optional[str] = None ) -> List[SearchResult]: """ Execute DuckDuckGo search. Args: query: Search query max_results: Maximum results news_only: Search news instead of web time_filter: d=day, w=week, m=month, y=year """ try: from ddgs import DDGS except ImportError: raise ImportError("ddgs not installed. Run: pip install ddgs") self._rate_limit() results = [] try: with DDGS() as ddgs: if news_only: search_results = list(ddgs.news( query, timelimit=time_filter, max_results=max_results )) for rank, result in enumerate(search_results, 1): results.append(SearchResult( title=result.get("title", ""), url=result.get("url", ""), snippet=result.get("body", ""), source=self.name, source_type="news", rank=rank, published_date=result.get("date"), raw_data=result )) else: search_results = list(ddgs.text( query, timelimit=time_filter, max_results=max_results )) for rank, result in enumerate(search_results, 1): # Detect source type based on domain url = result.get("href", "") domain = self._extract_domain(url) source_type = self._classify_domain(domain) results.append(SearchResult( title=result.get("title", ""), url=url, snippet=result.get("body", ""), source=self.name, source_type=source_type, rank=rank, raw_data=result )) logger.info(f"Web search for '{query}' returned {len(results)} results") except Exception as e: logger.error(f"Web search failed: {e}") raise return results def _classify_domain(self, domain: str) -> str: """Classify domain into source type.""" domain = domain.lower() # Defense/government sources if any(x in domain for x in ['.gov', '.mil']): return "government" # News sources if any(x in domain for x in [ 'news', 'reuters', 'bloomberg', 'defensenews', 'spacenews', 'aviationweek', 'janes', 'breakingdefense' ]): return "news" # Academic if any(x in domain for x in ['.edu', 'arxiv', 'ieee', 'springer', 'nature']): return "academic" # Patent databases if any(x in domain for x in ['patent', 'uspto', 'espacenet']): return "patent" return "web" def search_defense_news(self, query: str, max_results: int = 15) -> List[SearchResult]: """Search specifically for defense news.""" defense_query = f"{query} site:defensenews.com OR site:breakingdefense.com OR site:janes.com OR site:aviationweek.com" return self.search(defense_query, max_results=max_results) def search_space_tech(self, query: str, max_results: int = 15) -> List[SearchResult]: """Search specifically for space technology news.""" space_query = f"{query} site:spacenews.com OR site:spaceflightnow.com OR site:nasaspaceflight.com" return self.search(space_query, max_results=max_results)