158 lines
5.2 KiB
Python
158 lines
5.2 KiB
Python
"""
|
|
Web Search using DuckDuckGo.
|
|
General web search for technology scouting.
|
|
"""
|
|
|
|
import logging
|
|
from typing import List, Optional
|
|
from urllib.parse import urlparse
|
|
from datetime import datetime
|
|
|
|
from .base import BaseSearcher, SearchResult
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class WebSearcher(BaseSearcher):
    """
    General web search using DuckDuckGo.

    Used for defense news, tech publications, and general discovery.
    Consecutive searches issued through one instance are spaced at least
    ``delay`` seconds apart (see ``_rate_limit``).
    """

    # Keywords matched as substrings of the host name. This is a deliberate
    # heuristic: "news" is meant to catch hosts such as "spacenews.com".
    _NEWS_KEYWORDS = (
        'news', 'reuters', 'bloomberg', 'defensenews',
        'spacenews', 'aviationweek', 'janes', 'breakingdefense',
    )
    _ACADEMIC_KEYWORDS = ('arxiv', 'ieee', 'springer', 'nature')
    _PATENT_KEYWORDS = ('patent', 'uspto', 'espacenet')

    # Domain labels that mark a whole registry as government / academic.
    _GOVERNMENT_LABELS = ('gov', 'mil')
    _ACADEMIC_LABELS = ('edu',)

    # Sites used by the topic-specific convenience searches.
    _DEFENSE_SITES = (
        'defensenews.com', 'breakingdefense.com', 'janes.com', 'aviationweek.com',
    )
    _SPACE_SITES = ('spacenews.com', 'spaceflightnow.com', 'nasaspaceflight.com')

    def __init__(self, delay_between_searches: float = 1.0):
        """
        Args:
            delay_between_searches: Minimum number of seconds to leave
                between two consecutive searches on this instance.
        """
        self.delay = delay_between_searches
        # time.monotonic() reading taken at the previous search;
        # None until the first search has been issued.
        self._last_search_time: Optional[float] = None

    @property
    def name(self) -> str:
        """Human-readable name of the search backend."""
        return "DuckDuckGo"

    @property
    def source_type(self) -> str:
        """Default source type reported by this searcher."""
        return "web"

    def _extract_domain(self, url: str) -> str:
        """Return the network location (netloc) of *url*, or "" if it cannot be parsed."""
        try:
            return urlparse(url).netloc
        except Exception:
            # Best effort: an unparsable URL simply has no domain.
            return ""

    def _rate_limit(self) -> None:
        """
        Sleep as needed so that searches are at least ``self.delay`` seconds apart.

        A monotonic clock is used on purpose: wall-clock time can jump
        backwards (DST change, NTP correction), which would otherwise turn
        into an arbitrarily long sleep.
        """
        import time

        if self._last_search_time is not None:
            remaining = self.delay - (time.monotonic() - self._last_search_time)
            if remaining > 0:
                time.sleep(remaining)
        self._last_search_time = time.monotonic()

    def search(
        self,
        query: str,
        max_results: int = 20,
        news_only: bool = False,
        time_filter: Optional[str] = None
    ) -> List[SearchResult]:
        """
        Execute DuckDuckGo search.

        Args:
            query: Search query
            max_results: Maximum results
            news_only: Search news instead of web
            time_filter: d=day, w=week, m=month, y=year

        Returns:
            One SearchResult per hit, ranked from 1 in the order the
            backend returned them. News hits are always tagged "news";
            web hits are tagged by ``_classify_domain``.

        Raises:
            ImportError: If the ``ddgs`` package is not installed.
            Exception: Any error raised by the backend is logged and re-raised.
        """
        try:
            from ddgs import DDGS
        except ImportError as err:
            raise ImportError("ddgs not installed. Run: pip install ddgs") from err

        self._rate_limit()
        results: List[SearchResult] = []

        try:
            with DDGS() as ddgs:
                if news_only:
                    hits = ddgs.news(
                        query,
                        timelimit=time_filter,
                        max_results=max_results
                    )
                    for rank, hit in enumerate(hits, 1):
                        results.append(SearchResult(
                            title=hit.get("title", ""),
                            url=hit.get("url", ""),
                            snippet=hit.get("body", ""),
                            source=self.name,
                            source_type="news",
                            rank=rank,
                            published_date=hit.get("date"),
                            raw_data=hit
                        ))
                else:
                    hits = ddgs.text(
                        query,
                        timelimit=time_filter,
                        max_results=max_results
                    )
                    for rank, hit in enumerate(hits, 1):
                        # Detect source type based on domain. A missing or
                        # None "href" is treated as an empty URL.
                        url = hit.get("href") or ""
                        source_type = self._classify_domain(self._extract_domain(url))

                        results.append(SearchResult(
                            title=hit.get("title", ""),
                            url=url,
                            snippet=hit.get("body", ""),
                            source=self.name,
                            source_type=source_type,
                            rank=rank,
                            raw_data=hit
                        ))

            logger.info("Web search for '%s' returned %d results", query, len(results))

        except Exception as e:
            logger.error("Web search failed: %s", e)
            raise

        return results

    @staticmethod
    def _has_registry_label(labels: List[str], wanted) -> bool:
        """
        Tell whether a host belongs to one of the *wanted* registries.

        True when the top-level label is in *wanted* ("nasa.gov"), or when
        the second-level label is in *wanted* under a two-letter country
        code ("service.gov.uk"). Hosts such as "miller.com" or "gov.com"
        do not qualify.
        """
        if len(labels) < 2:
            return False
        tld = labels[-1]
        if tld in wanted:
            return True
        return len(tld) == 2 and labels[-2] in wanted

    def _classify_domain(self, domain: str) -> str:
        """
        Classify domain into source type.

        Args:
            domain: Host part of a URL; userinfo ("user@") and port
                (":443") are ignored if present.

        Returns:
            "government", "news", "academic", "patent" or "web"; the first
            matching category in that order wins.
        """
        # Reduce a netloc to the bare lower-case host name.
        host = domain.lower().rsplit("@", 1)[-1].split(":", 1)[0].strip(".")
        labels = host.split(".")

        # Defense/government sources
        if self._has_registry_label(labels, self._GOVERNMENT_LABELS):
            return "government"

        # News sources
        if any(word in host for word in self._NEWS_KEYWORDS):
            return "news"

        # Academic
        if self._has_registry_label(labels, self._ACADEMIC_LABELS) or any(
            word in host for word in self._ACADEMIC_KEYWORDS
        ):
            return "academic"

        # Patent databases
        if any(word in host for word in self._PATENT_KEYWORDS):
            return "patent"

        return "web"

    @staticmethod
    def _site_restricted_query(query: str, sites) -> str:
        """Append a ``site:a OR site:b ...`` filter for *sites* to *query*."""
        site_filter = " OR ".join(f"site:{site}" for site in sites)
        return f"{query} {site_filter}"

    def search_defense_news(self, query: str, max_results: int = 15) -> List[SearchResult]:
        """Search specifically for defense news."""
        defense_query = self._site_restricted_query(query, self._DEFENSE_SITES)
        return self.search(defense_query, max_results=max_results)

    def search_space_tech(self, query: str, max_results: int = 15) -> List[SearchResult]:
        """Search specifically for space technology news."""
        space_query = self._site_restricted_query(query, self._SPACE_SITES)
        return self.search(space_query, max_results=max_results)
|