"""
Relevance Metric: Evaluates how well a response addresses the user's query.
"""
import re
from typing import List, Optional

from reward_kit import reward_function, EvaluateResult, MetricResult, Message


@reward_function
def evaluate(
    messages: List[Message],
    original_messages: Optional[List[Message]] = None,
    **kwargs,
) -> EvaluateResult:
"""
Evaluate the relevance of a response to the user's query.
Args:
messages: List of conversation messages
original_messages: Original messages (context)
**kwargs: Additional parameters
Returns:
EvaluateResult with score and metrics
"""
    # Validate input
    if not messages or len(messages) < 2:
        return EvaluateResult(
            score=0.0,
            reason="Insufficient messages",
            metrics={},
            is_score_valid=False,
        )

    # Find the user query (most recent user message)
    user_query = None
    for msg in reversed(messages[:-1]):
        if msg.role == "user":
            user_query = msg.content
            break

    if not user_query:
        return EvaluateResult(
            score=0.0,
            reason="No user query found",
            metrics={},
            is_score_valid=False,
        )

    # The last message must be the assistant's response; check the role
    # before reading its content
    if messages[-1].role != "assistant":
        return EvaluateResult(
            score=0.0,
            reason="Last message is not from assistant",
            metrics={},
            is_score_valid=False,
        )
    response = messages[-1].content or ""

    # Calculate keyword overlap; tokenize on word characters so trailing
    # punctuation (e.g. "France?") cannot block a match with "france"
    user_keywords = set(re.findall(r"[a-z0-9']+", user_query.lower()))
    response_keywords = set(re.findall(r"[a-z0-9']+", response.lower()))

    # Remove common stop words
    stop_words = {
        "a", "an", "the", "and", "or", "but", "is", "are",
        "on", "in", "at", "to", "for", "with", "by", "of",
    }
    user_keywords -= stop_words
    response_keywords -= stop_words

    if not user_keywords:
        return EvaluateResult(
            score=0.5,
            reason="No significant keywords in user query",
            metrics={},
            is_score_valid=True,
        )

    # Calculate overlap; the ratio is already in [0, 1] because the common
    # keywords are a subset of the user's keywords
    common_keywords = user_keywords.intersection(response_keywords)
    overlap_ratio = len(common_keywords) / len(user_keywords)
    relevance_score = overlap_ratio
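    # Worked example: for the query "what is the capital of France" and the
    # response "the capital of France is Paris", the post-stop-word keyword
    # sets are {"what", "capital", "france"} and {"capital", "france", "paris"},
    # giving overlap_ratio = 2 / 3 ≈ 0.67.
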
    # Check for direct answer patterns
    direct_answer_patterns = [
        "the answer is",
        "to answer your question",
        "in response to your question",
        "regarding your question",
        "to address your query",
    ]
    has_direct_answer = any(
        pattern in response.lower() for pattern in direct_answer_patterns
    )
    if has_direct_answer:
        # Boost for explicitly addressing the query, capped at 1.0
        relevance_score = min(relevance_score + 0.2, 1.0)

    # Create metrics dictionary
    metrics = {
        "keyword_overlap": MetricResult(
            score=overlap_ratio,
            reason=(
                f"Keyword overlap ratio: {overlap_ratio:.2f} "
                f"({len(common_keywords)}/{len(user_keywords)} keywords)"
            ),
            is_score_valid=True,
        ),
        "direct_answer": MetricResult(
            score=1.0 if has_direct_answer else 0.0,
            reason=(
                "Response explicitly addresses the query"
                if has_direct_answer
                else "No explicit answer indicators"
            ),
            is_score_valid=True,
        ),
    }

    return EvaluateResult(
        score=relevance_score,
        reason=f"Relevance score: {relevance_score:.2f}",
        metrics=metrics,
        is_score_valid=True,
    )
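

# Minimal usage sketch. Assumptions (not confirmed by this file): reward_kit's
# Message model accepts role/content keyword arguments, and the
# @reward_function decorator keeps the function directly callable with a list
# of Message objects; adjust to match your installed reward_kit version.
if __name__ == "__main__":
    sample = [
        Message(role="user", content="What is the capital of France?"),
        Message(role="assistant", content="The answer is Paris, the capital of France."),
    ]
    result = evaluate(messages=sample)
    print(f"score={result.score:.2f} reason={result.reason}")
    for name, metric in result.metrics.items():
        print(f"  {name}: score={metric.score:.2f} ({metric.reason})")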