# TechScout/techscout/search/web.py
"""
Web Search using DuckDuckGo.
General web search for technology scouting.
"""
import logging
import time
from datetime import datetime
from typing import List, Optional
from urllib.parse import urlparse

from .base import BaseSearcher, SearchResult
logger = logging.getLogger(__name__)
class WebSearcher(BaseSearcher):
    """
    General web search using DuckDuckGo.

    Used for defense news, tech publications, and general discovery.
    """

    def __init__(self, delay_between_searches: float = 1.0):
        """
        Args:
            delay_between_searches: Minimum number of seconds to wait
                between consecutive searches (client-side rate limiting).
        """
        self.delay = delay_between_searches
        # Timestamp of the most recent search; None until the first call.
        self._last_search_time: Optional[datetime] = None

    @property
    def name(self) -> str:
        """Human-readable name of this search backend."""
        return "DuckDuckGo"

    @property
    def source_type(self) -> str:
        """Default source type for results from this searcher."""
        return "web"

    def _extract_domain(self, url: str) -> str:
        """Return the network location (domain) of ``url``, or "" if unparsable."""
        try:
            return urlparse(url).netloc
        except Exception:
            # urlparse can raise ValueError on malformed input (e.g. bad ports);
            # an empty domain simply classifies as generic "web" downstream.
            return ""

    def _rate_limit(self) -> None:
        """Sleep just long enough to honor the configured inter-search delay."""
        if self._last_search_time is not None:
            elapsed = (datetime.now() - self._last_search_time).total_seconds()
            if elapsed < self.delay:
                time.sleep(self.delay - elapsed)
        self._last_search_time = datetime.now()

    def search(
        self,
        query: str,
        max_results: int = 20,
        news_only: bool = False,
        time_filter: Optional[str] = None,
    ) -> List[SearchResult]:
        """
        Execute a DuckDuckGo search.

        Args:
            query: Search query string.
            max_results: Maximum number of results to return.
            news_only: If True, search the news vertical instead of the web.
            time_filter: Recency filter: d=day, w=week, m=month, y=year.

        Returns:
            List of SearchResult objects, ranked starting at 1.

        Raises:
            ImportError: If the ``ddgs`` package is not installed.
            Exception: Any failure from the underlying ddgs client is
                logged and re-raised.
        """
        try:
            from ddgs import DDGS
        except ImportError:
            # Suppress the original traceback: the actionable message is ours.
            raise ImportError("ddgs not installed. Run: pip install ddgs") from None

        self._rate_limit()
        results: List[SearchResult] = []
        try:
            with DDGS() as ddgs:
                if news_only:
                    search_results = list(ddgs.news(
                        query,
                        timelimit=time_filter,
                        max_results=max_results,
                    ))
                    for rank, result in enumerate(search_results, 1):
                        results.append(SearchResult(
                            title=result.get("title", ""),
                            url=result.get("url", ""),
                            snippet=result.get("body", ""),
                            source=self.name,
                            source_type="news",
                            rank=rank,
                            published_date=result.get("date"),
                            raw_data=result,
                        ))
                else:
                    search_results = list(ddgs.text(
                        query,
                        timelimit=time_filter,
                        max_results=max_results,
                    ))
                    for rank, result in enumerate(search_results, 1):
                        # Web results carry the URL under "href" (news uses "url");
                        # classify by domain so .gov/.edu/etc. are tagged correctly.
                        url = result.get("href", "")
                        domain = self._extract_domain(url)
                        results.append(SearchResult(
                            title=result.get("title", ""),
                            url=url,
                            snippet=result.get("body", ""),
                            source=self.name,
                            source_type=self._classify_domain(domain),
                            rank=rank,
                            raw_data=result,
                        ))
            # Lazy %-style args: formatting is skipped if INFO is disabled.
            logger.info("Web search for '%s' returned %d results", query, len(results))
        except Exception as e:
            logger.error("Web search failed: %s", e)
            raise
        return results

    def _classify_domain(self, domain: str) -> str:
        """Classify a domain into a coarse source type, checked in priority order."""
        domain = domain.lower()
        # Defense/government sources
        if any(x in domain for x in ('.gov', '.mil')):
            return "government"
        # News sources
        if any(x in domain for x in (
            'news', 'reuters', 'bloomberg', 'defensenews',
            'spacenews', 'aviationweek', 'janes', 'breakingdefense',
        )):
            return "news"
        # Academic
        if any(x in domain for x in ('.edu', 'arxiv', 'ieee', 'springer', 'nature')):
            return "academic"
        # Patent databases
        if any(x in domain for x in ('patent', 'uspto', 'espacenet')):
            return "patent"
        return "web"

    def search_defense_news(self, query: str, max_results: int = 15) -> List[SearchResult]:
        """Search specifically for defense news via site-restricted query."""
        defense_query = f"{query} site:defensenews.com OR site:breakingdefense.com OR site:janes.com OR site:aviationweek.com"
        return self.search(defense_query, max_results=max_results)

    def search_space_tech(self, query: str, max_results: int = 15) -> List[SearchResult]:
        """Search specifically for space technology news via site-restricted query."""
        space_query = f"{query} site:spacenews.com OR site:spaceflightnow.com OR site:nasaspaceflight.com"
        return self.search(space_query, max_results=max_results)