pip install e2b-code-interpreter
export E2B_API_KEY="your_api_key_here"
from reward_kit import reward_function
from e2b_code_interpreter import CodeInterpreter
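
# Optional sanity check -- a minimal sketch assuming E2B_API_KEY is set in the
# environment: run a trivial cell and confirm its stdout comes back.
with CodeInterpreter() as sandbox:
    execution = sandbox.notebook.exec_cell("print(1 + 1)")
    print(execution.logs.stdout)  # expected: ['2\n']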

@reward_function
def e2b_code_execution_reward(response: str, expected_output: str, **kwargs) -> float:
    """
    Executes code using E2B and compares its output to the expected result.
    """
    try:
        with CodeInterpreter() as sandbox:
            # Execute the generated code
            execution = sandbox.notebook.exec_cell(response)
            if execution.error:
                return 0.0

            # Collect the output. print() output is captured in
            # execution.logs.stdout; rich results (e.g. the value of the
            # cell's last expression) are in execution.results.
            output = "".join(execution.logs.stdout)
            for result in execution.results:
                if hasattr(result, 'text') and result.text:
                    output += result.text
                elif hasattr(result, 'data'):
                    output += str(result.data)

            # Exact match earns full credit
            if output.strip() == expected_output.strip():
                return 1.0
            # Otherwise, partial credit based on similarity
            return calculate_output_similarity(output, expected_output)
    except Exception as e:
        print(f"E2B execution error: {e}")
        return 0.0
def calculate_output_similarity(actual: str, expected: str) -> float:
    """Calculate word-overlap similarity between outputs."""
    actual = actual.strip()
    expected = expected.strip()

    if not expected:
        return 1.0 if not actual else 0.0

    # Simple word-based similarity: fraction of expected words present
    actual_words = set(actual.lower().split())
    expected_words = set(expected.lower().split())
    if not expected_words:
        return 1.0

    intersection = actual_words & expected_words
    return len(intersection) / len(expected_words)
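
# Usage sketch with hypothetical inputs. The decorated function is called
# positionally here, matching the advanced_code_evaluation example below.
score = e2b_code_execution_reward("print(3 + 5)", "8")
print(f"Exact-match score: {score}")  # 1.0 if the sandbox prints '8'

# The similarity fallback scores the fraction of expected words produced, e.g.
# calculate_output_similarity("hello world", "hello there world") == 2/3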

@reward_function
def advanced_code_evaluation(response: str, test_cases: list, **kwargs) -> float:
    """
    Evaluates code against multiple test cases using E2B.
    """
    if not test_cases:
        return 0.0

    passed_tests = 0
    total_tests = len(test_cases)

    try:
        with CodeInterpreter() as sandbox:
            # First, execute the response code to define functions/variables
            setup_execution = sandbox.notebook.exec_cell(response)
            if setup_execution.error:
                return 0.0

            # Run each test case in the same sandbox session
            for test_case in test_cases:
                test_code = test_case.get('code', '')
                expected_output = test_case.get('expected', '')

                test_execution = sandbox.notebook.exec_cell(test_code)
                if test_execution.error:
                    continue

                # Check the output (print() output lands in logs.stdout)
                actual_output = "".join(test_execution.logs.stdout)
                for result in test_execution.results:
                    if hasattr(result, 'text') and result.text:
                        actual_output += result.text

                if actual_output.strip() == expected_output.strip():
                    passed_tests += 1
    except Exception as e:
        print(f"E2B execution error: {e}")
        return 0.0

    return passed_tests / total_tests

@reward_function
def file_based_code_reward(response: str, file_operations: list, **kwargs) -> float:
    """
    Evaluates code that performs file operations.
    """
    try:
        with CodeInterpreter() as sandbox:
            # Execute the code
            execution = sandbox.notebook.exec_cell(response)
            if execution.error:
                return 0.0

            score = 0.0
            total_operations = len(file_operations)

            # Check each expected file operation
            for operation in file_operations:
                file_path = operation.get('path', '')
                expected_content = operation.get('content', '')
                operation_type = operation.get('type', 'create')

                if operation_type == 'create':
                    # Check if the file was created with the correct content
                    try:
                        # Cell code is built flush-left so it parses as valid Python
                        read_code = (
                            f"with open('{file_path}', 'r') as f:\n"
                            "    content = f.read()\n"
                            "print(content)"
                        )
                        read_execution = sandbox.notebook.exec_cell(read_code)
                        if not read_execution.error:
                            actual_content = "".join(read_execution.logs.stdout)
                            for result in read_execution.results:
                                if hasattr(result, 'text') and result.text:
                                    actual_content += result.text
                            if actual_content.strip() == expected_content.strip():
                                score += 1.0
                    except Exception:
                        pass
                elif operation_type == 'exists':
                    # Check if the file exists
                    check_code = f"import os\nprint(os.path.exists('{file_path}'))"
                    check_execution = sandbox.notebook.exec_cell(check_code)
                    if not check_execution.error:
                        check_output = "".join(check_execution.logs.stdout)
                        for result in check_execution.results:
                            if hasattr(result, 'text') and result.text:
                                check_output += result.text
                        if 'True' in check_output:
                            score += 1.0

            return score / total_operations if total_operations > 0 else 0.0
    except Exception as e:
        print(f"E2B execution error: {e}")
        return 0.0
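
# Usage sketch (hypothetical payload): reward code that writes a greeting file.
file_ops = [
    {'type': 'create', 'path': 'greeting.txt', 'content': 'hello'},
    {'type': 'exists', 'path': 'greeting.txt'},
]
file_code = "with open('greeting.txt', 'w') as f:\n    f.write('hello')"
print(file_based_code_reward(file_code, file_ops))  # expected: 1.0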

@reward_function
def data_analysis_reward(response: str, dataset_path: str, expected_insights: list, **kwargs) -> float:
    """
    Evaluates data analysis code by checking insights and outputs.
    """
    try:
        with CodeInterpreter() as sandbox:
            # Upload the dataset to the sandbox
            with open(dataset_path, 'rb') as f:
                sandbox.files.write('dataset.csv', f.read())

            # Execute the analysis code
            execution = sandbox.notebook.exec_cell(response)
            if execution.error:
                return 0.0

            score = 0.0

            # Gather all text output (stdout plus rich results)
            output_text = "".join(execution.logs.stdout).lower()
            for result in execution.results:
                if hasattr(result, 'text') and result.text:
                    output_text += result.text.lower()

            # Credit each expected insight mentioned in the output
            for insight in expected_insights:
                if insight.lower() in output_text:
                    score += 1.0

            # Bonus for plots/visualizations (rich results carry a png payload)
            has_plot = any(
                getattr(result, 'png', None) for result in execution.results
            )
            if has_plot:
                score += 0.5

            return min(score / len(expected_insights), 1.0) if expected_insights else 0.0
    except Exception as e:
        print(f"E2B execution error: {e}")
        return 0.0
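
# Usage sketch (hypothetical path, column, and insight phrases): score an
# analysis script against a local CSV and the text its output should mention.
analysis_code = (
    "import pandas as pd\n"
    "df = pd.read_csv('dataset.csv')\n"
    "print('mean age:', df['age'].mean())"
)
print(data_analysis_reward(analysis_code, "local_data.csv", ["mean age"]))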

# Test a simple calculation. Each test must print its result so the output
# can be captured and compared against 'expected'.
test_cases = [
    {
        'code': 'print(add_numbers(2, 3))',
        'expected': '5'
    },
    {
        'code': 'print(add_numbers(-1, 1))',
        'expected': '0'
    }
]

code_response = """
def add_numbers(a, b):
    return a + b
"""

score = advanced_code_evaluation(code_response, test_cases)
print(f"Code evaluation score: {score}")  # 1.0 if both tests pass

from reward_kit.evaluation import evaluate_dataset

# Evaluate a dataset of coding problems
results = evaluate_dataset(
    reward_function=e2b_code_execution_reward,
    dataset_path="coding_problems.jsonl",
    additional_fields=['test_cases', 'expected_output']
)