TechScout/techscout/extraction/scorer.py

322 lines
9.4 KiB
Python

"""
Result Scoring for TechScout.
Scores search results based on problem-solution fit using LLM analysis
and heuristic factors.
"""
import json
import logging
import math
from dataclasses import dataclass
from datetime import datetime
from typing import List, Dict, Any, Optional
from .llm_client import OllamaClient
from ..search.base import SearchResult
logger = logging.getLogger(__name__)
@dataclass
class ScoringWeights:
    """Weights for different scoring factors.

    The five factors are combined as a weighted sum in
    ResultScorer.score_results; the defaults sum to 1.0 so the final
    score stays in [0, 1] when every component score is in [0, 1].
    """
    technical_relevance: float = 0.35  # problem-solution fit (LLM or keyword heuristic)
    trl_match: float = 0.20            # closeness to the requested TRL range
    recency: float = 0.15              # decay based on published date
    source_authority: float = 0.15     # trust prior for the source type
    funding_indicator: float = 0.15    # award amount as a validation proxy
class ResultScorer:
    """
    Scores and ranks search results based on problem-solution fit.

    Combines an LLM-based (or keyword-heuristic fallback) relevance score
    with TRL-match, recency, source-authority, and funding heuristics into
    a single weighted final_score per result.
    """
    # Trust prior per result.source_type, used as the source_authority
    # component; source types missing from this map fall back to 0.5
    # in score_results.
    SOURCE_AUTHORITY = {
        "sbir": 0.9,
        "patent": 0.85,
        "contract": 0.8,
        "government": 0.85,
        "academic": 0.75,
        "news": 0.6,
        "web": 0.5,
    }
def __init__(
self,
ollama_client: Optional[OllamaClient] = None,
model: str = "mistral-nemo:12b",
weights: Optional[ScoringWeights] = None
):
self.client = ollama_client or OllamaClient()
self.model = model
self.weights = weights or ScoringWeights()
def score_results(
self,
results: List[SearchResult],
capability_gap: str,
target_trl: tuple = (4, 7),
use_llm: bool = True
) -> List[SearchResult]:
"""
Score all results and return sorted by final score.
Args:
results: List of search results to score
capability_gap: Original capability gap description
target_trl: Target TRL range (min, max)
use_llm: Whether to use LLM for relevance scoring
Returns:
Sorted list of results with scores populated
"""
if not results:
return []
# Score each result
for result in results:
# LLM-based relevance score
if use_llm:
result.relevance_score = self._score_relevance_llm(
result, capability_gap
)
else:
result.relevance_score = self._score_relevance_heuristic(
result, capability_gap
)
# Calculate component scores
trl_score = self._score_trl(result.trl_estimate, target_trl)
recency_score = self._score_recency(result.published_date)
authority_score = self.SOURCE_AUTHORITY.get(result.source_type, 0.5)
funding_score = self._score_funding(result.award_amount)
# Calculate final weighted score
result.final_score = (
self.weights.technical_relevance * result.relevance_score +
self.weights.trl_match * trl_score +
self.weights.recency * recency_score +
self.weights.source_authority * authority_score +
self.weights.funding_indicator * funding_score
)
# Sort by final score descending
results.sort(key=lambda x: x.final_score, reverse=True)
return results
def _score_relevance_llm(
self,
result: SearchResult,
capability_gap: str
) -> float:
"""Score relevance using LLM."""
prompt = f"""Rate how well this technology/solution addresses the capability gap.
CAPABILITY GAP:
{capability_gap}
POTENTIAL SOLUTION:
Title: {result.title}
Description: {result.snippet}
Organization: {result.organization or 'Unknown'}
Source: {result.source_type}
Rate the relevance from 0.0 to 1.0 where:
- 0.0-0.2: Not relevant
- 0.2-0.4: Tangentially related
- 0.4-0.6: Somewhat relevant
- 0.6-0.8: Highly relevant
- 0.8-1.0: Direct solution
Respond with JSON:
{{
"score": 0.0-1.0,
"rationale": "Brief explanation"
}}"""
response = self.client.generate(
prompt=prompt,
model=self.model,
temperature=0.1,
format="json"
)
if not response.success:
return self._score_relevance_heuristic(result, capability_gap)
try:
data = json.loads(response.content)
return min(1.0, max(0.0, float(data.get("score", 0.5))))
except (json.JSONDecodeError, ValueError, TypeError):
return self._score_relevance_heuristic(result, capability_gap)
def _score_relevance_heuristic(
self,
result: SearchResult,
capability_gap: str
) -> float:
"""Score relevance using keyword matching heuristics."""
# Simple keyword overlap scoring
gap_words = set(capability_gap.lower().split())
result_text = f"{result.title} {result.snippet}".lower()
result_words = set(result_text.split())
# Remove common words
stopwords = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been',
'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
'would', 'could', 'should', 'may', 'might', 'must', 'shall',
'can', 'need', 'for', 'of', 'to', 'in', 'on', 'at', 'by',
'from', 'with', 'and', 'or', 'not', 'that', 'this', 'it'}
gap_words -= stopwords
result_words -= stopwords
if not gap_words:
return 0.5
overlap = len(gap_words & result_words)
score = overlap / len(gap_words)
return min(1.0, score)
def _score_trl(self, trl: Optional[int], target_range: tuple) -> float:
"""Score based on TRL match."""
if trl is None:
return 0.5 # Unknown TRL
min_trl, max_trl = target_range
if min_trl <= trl <= max_trl:
return 1.0
elif trl < min_trl:
# Too early stage
distance = min_trl - trl
return max(0.2, 1.0 - (distance * 0.2))
else:
# Too mature (might be less innovative)
distance = trl - max_trl
return max(0.4, 1.0 - (distance * 0.15))
def _score_recency(self, date_str: Optional[str]) -> float:
"""Score based on how recent the result is."""
if not date_str:
return 0.5
try:
# Try to parse various date formats
for fmt in ["%Y-%m-%d", "%Y-%m", "%Y", "%m/%d/%Y"]:
try:
date = datetime.strptime(date_str[:len(fmt.replace('%', '0'))], fmt)
break
except ValueError:
continue
else:
# Try to extract year
import re
year_match = re.search(r'20\d{2}', date_str)
if year_match:
date = datetime(int(year_match.group()), 6, 1)
else:
return 0.5
# Calculate age in years
age_days = (datetime.now() - date).days
age_years = age_days / 365.25
# Exponential decay with 2-year half-life
score = math.exp(-0.347 * age_years) # ln(2)/2 ≈ 0.347
return max(0.1, min(1.0, score))
except Exception:
return 0.5
def _score_funding(self, amount: Optional[float]) -> float:
"""Score based on funding amount (higher = more validated)."""
if amount is None:
return 0.5
# Log scale scoring
if amount <= 0:
return 0.3
elif amount < 100000: # < $100K
return 0.5
elif amount < 500000: # $100K - $500K
return 0.65
elif amount < 1000000: # $500K - $1M
return 0.75
elif amount < 5000000: # $1M - $5M
return 0.85
else: # > $5M
return 0.95
def batch_score_relevance(
self,
results: List[SearchResult],
capability_gap: str,
batch_size: int = 5
) -> List[float]:
"""
Score multiple results in batches for efficiency.
Args:
results: Results to score
capability_gap: Capability gap description
batch_size: How many to score per LLM call
Returns:
List of relevance scores
"""
scores = []
for i in range(0, len(results), batch_size):
batch = results[i:i + batch_size]
# Build batch prompt
items = []
for j, result in enumerate(batch):
items.append(f"""
Item {j + 1}:
Title: {result.title}
Description: {result.snippet[:200]}
Organization: {result.organization or 'Unknown'}""")
prompt = f"""Rate how well each item addresses this capability gap.
CAPABILITY GAP:
{capability_gap}
ITEMS TO SCORE:
{''.join(items)}
Respond with JSON:
{{
"scores": [
{{"item": 1, "score": 0.0-1.0}},
{{"item": 2, "score": 0.0-1.0}},
...
]
}}"""
response = self.client.generate(
prompt=prompt,
model=self.model,
temperature=0.1,
format="json"
)
if response.success:
try:
data = json.loads(response.content)
batch_scores = {s["item"]: s["score"] for s in data.get("scores", [])}
for j in range(len(batch)):
scores.append(batch_scores.get(j + 1, 0.5))
except (json.JSONDecodeError, KeyError):
scores.extend([0.5] * len(batch))
else:
scores.extend([0.5] * len(batch))
return scores