s21mind committed on
Commit
d98b6f9
·
verified ·
1 Parent(s): 22ac1c8

Upload benchmark.py

Browse files
Files changed (1) hide show
  1. benchmark.py +325 -0
benchmark.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HexaMind Hallucination Benchmark - Evaluation Framework
3
+ ========================================================
4
+
5
+ This module provides the evaluation infrastructure for the HexaMind
6
+ Hallucination Benchmark. It does NOT include the HexaMind detector itself,
7
+ which is available under commercial license.
8
+
9
+ Usage:
10
+ from benchmark import HexaMindBenchmark
11
+
12
+ benchmark = HexaMindBenchmark()
13
+ results = benchmark.evaluate(your_detector_function)
14
+ """
15
+
16
+ import json
17
+ import os
18
+ import time
19
+ from dataclasses import dataclass
20
+ from typing import Callable, Dict, List, Optional
21
+
22
+
23
@dataclass
class BenchmarkResults:
    """Accuracy and latency figures collected from one benchmark run."""

    # Accuracies are rendered with a trailing "%" by __repr__; the *_samples
    # fields hold the per-split and combined sample counts.
    pattern_accuracy: float
    knowledge_accuracy: float
    overall_accuracy: float
    pattern_samples: int
    knowledge_samples: int
    total_samples: int
    avg_latency_ms: float

    def to_dict(self) -> Dict:
        """Return the metrics as a plain dict.

        Float metrics are rounded to two decimal places; sample counts are
        passed through unchanged. Key order is fixed.
        """
        summary = {}
        summary["pattern_detectable_accuracy"] = round(self.pattern_accuracy, 2)
        summary["knowledge_required_accuracy"] = round(self.knowledge_accuracy, 2)
        summary["overall_accuracy"] = round(self.overall_accuracy, 2)
        summary["pattern_samples"] = self.pattern_samples
        summary["knowledge_samples"] = self.knowledge_samples
        summary["total_samples"] = self.total_samples
        summary["avg_latency_ms"] = round(self.avg_latency_ms, 2)
        return summary

    def __repr__(self):
        """Render the metrics as a multi-line, human-readable banner."""
        heavy_rule = "══════════════════════════════════════════════════════════════"
        light_rule = "──────────────────────────────────────────────────────────"
        # The empty first and last entries give the banner its leading and
        # trailing newline once the rows are joined.
        rows = [
            "",
            heavy_rule,
            "HEXAMIND BENCHMARK RESULTS",
            heavy_rule,
            f"Pattern-Detectable: {self.pattern_accuracy:5.1f}% (n={self.pattern_samples})",
            f"Knowledge-Required: {self.knowledge_accuracy:5.1f}% (n={self.knowledge_samples})",
            light_rule,
            f"Overall: {self.overall_accuracy:5.1f}% (n={self.total_samples})",
            f"Avg Latency: {self.avg_latency_ms:5.2f} ms",
            heavy_rule,
            "",
        ]
        return "\n".join(rows)
57
+
58
+
59
class HexaMindBenchmark:
    """
    Evaluation framework for the HexaMind Hallucination Benchmark.

    The benchmark splits TruthfulQA into:
    - Pattern-Detectable: Questions with linguistic markers
    - Knowledge-Required: Questions needing factual verification

    Each split is read from a JSON file inside ``data_dir`` the first time it
    is accessed and is cached on the instance afterwards.

    Example:
        benchmark = HexaMindBenchmark()

        def my_detector(question, answer):
            # Return True if trustworthy, False if hallucination
            return some_logic(question, answer)

        results = benchmark.evaluate(my_detector)
        print(results)
    """

    def __init__(self, data_dir: str = "data"):
        """
        Initialize benchmark with data directory.

        No file is read here; the splits are loaded on first access.

        Args:
            data_dir: Path to directory containing JSON split files
        """
        self.data_dir = data_dir
        self._pattern_data: Optional[List[Dict]] = None
        self._knowledge_data: Optional[List[Dict]] = None

    @property
    def pattern_detectable(self) -> List[Dict]:
        """Load pattern-detectable split lazily (cached after first read)"""
        if self._pattern_data is None:
            self._pattern_data = self._load_json("pattern_detectable.json")
        return self._pattern_data

    @property
    def knowledge_required(self) -> List[Dict]:
        """Load knowledge-required split lazily (cached after first read)"""
        if self._knowledge_data is None:
            self._knowledge_data = self._load_json("knowledge_required.json")
        return self._knowledge_data

    def _load_json(self, filename: str) -> List[Dict]:
        """
        Load a JSON file from data directory.

        Args:
            filename: Name of the file, relative to ``data_dir``

        Returns:
            The decoded JSON content

        Raises:
            FileNotFoundError: If the file does not exist; the message
                includes the full path that was tried
        """
        path = os.path.join(self.data_dir, filename)
        # Open directly instead of checking os.path.exists() first: one
        # filesystem access, and no window between the check and the open.
        try:
            with open(path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            raise FileNotFoundError(
                f"Data file not found: {path}\n"
                f"Please ensure you have downloaded the benchmark data."
            ) from None

    def _score_split(
        self,
        samples: List[Dict],
        detector: Callable[[str, str], bool],
        label: str,
        progress_every: int,
        verbose: bool
    ) -> tuple:
        """
        Run the detector over one split.

        Args:
            samples: Items with "question", "answer" and "ground_truth" keys
            detector: Function(question, answer) -> bool
            label: Split name used in the progress messages
            progress_every: Print a progress line every this many samples
            verbose: Print progress during evaluation

        Returns:
            (correct, latencies): number of predictions that matched the
            ground truth, and the per-call detector latency in milliseconds
        """
        total = len(samples)
        if samples and verbose:
            print(f"Evaluating {label} ({total} samples)...")

        correct = 0
        latencies = []
        for done, sample in enumerate(samples, start=1):
            # Only the detector call itself is timed.
            start = time.perf_counter()
            prediction = detector(sample["question"], sample["answer"])
            latencies.append((time.perf_counter() - start) * 1000)

            # ground_truth == 1 means the detector is expected to return True.
            expected = sample["ground_truth"] == 1
            if prediction == expected:
                correct += 1

            if verbose and done % progress_every == 0:
                print(f" Progress: {done}/{total}")

        return correct, latencies

    @staticmethod
    def _percentage(correct: int, total: int) -> float:
        """Return correct/total as a percentage, or 0.0 when total is zero."""
        return (correct / total * 100) if total > 0 else 0.0

    def evaluate(
        self,
        detector: Callable[[str, str], bool],
        split: str = "all",
        verbose: bool = True
    ) -> "BenchmarkResults":
        """
        Evaluate a hallucination detector on the benchmark.

        Args:
            detector: Function(question, answer) -> bool
                Returns True if answer is trustworthy
                Returns False if answer is a hallucination
            split: Which split to evaluate
                "all" - both splits
                "pattern" (or "pattern_detectable") - pattern-detectable only
                "knowledge" (or "knowledge_required") - knowledge-required only
            verbose: Print progress during evaluation

        Returns:
            BenchmarkResults with accuracy metrics. A split that was not
            evaluated is reported with 0 samples and 0.0 accuracy.

        Raises:
            ValueError: If ``split`` is not one of the names above
        """
        # Select data based on split
        if split == "all":
            pattern_data = self.pattern_detectable
            knowledge_data = self.knowledge_required
        elif split in ("pattern", "pattern_detectable"):
            pattern_data = self.pattern_detectable
            knowledge_data = []
        elif split in ("knowledge", "knowledge_required"):
            pattern_data = []
            knowledge_data = self.knowledge_required
        else:
            raise ValueError(f"Unknown split: {split}")

        # Both splits go through the same scoring loop; only the label and
        # the progress interval differ.
        pattern_correct, pattern_latencies = self._score_split(
            pattern_data, detector, "pattern-detectable", 25, verbose
        )
        knowledge_correct, knowledge_latencies = self._score_split(
            knowledge_data, detector, "knowledge-required", 200, verbose
        )
        latencies = pattern_latencies + knowledge_latencies

        # Compute metrics
        pattern_n = len(pattern_data)
        knowledge_n = len(knowledge_data)
        total_n = pattern_n + knowledge_n

        avg_latency = sum(latencies) / len(latencies) if latencies else 0.0

        results = BenchmarkResults(
            pattern_accuracy=self._percentage(pattern_correct, pattern_n),
            knowledge_accuracy=self._percentage(knowledge_correct, knowledge_n),
            overall_accuracy=self._percentage(
                pattern_correct + knowledge_correct, total_n
            ),
            pattern_samples=pattern_n,
            knowledge_samples=knowledge_n,
            total_samples=total_n,
            avg_latency_ms=avg_latency
        )

        if verbose:
            print(results)

        return results

    def create_submission(
        self,
        results: "BenchmarkResults",
        model_name: str,
        model_type: str,
        parameters: str,
        contact: str = "",
        paper_url: str = "",
        cost_per_1k: str = "Unknown"
    ) -> Dict:
        """
        Create a submission JSON for the leaderboard.

        Accuracy and latency values are copied from ``results`` unrounded;
        the submission date is today's local date.

        Args:
            results: BenchmarkResults from evaluate()
            model_name: Name of your model
            model_type: Category (LLM-as-Judge, Classifier, Zero-Parameter, etc.)
            parameters: Parameter count (e.g., "7B", "0", "70B")
            contact: Email for questions
            paper_url: Link to paper/preprint (optional)
            cost_per_1k: API cost per 1000 evaluations (optional)

        Returns:
            Dict ready to save as JSON submission
        """
        from datetime import datetime

        return {
            "model_name": model_name,
            "model_type": model_type,
            "parameters": parameters,
            "pattern_detectable_accuracy": results.pattern_accuracy,
            "knowledge_required_accuracy": results.knowledge_accuracy,
            "overall_accuracy": results.overall_accuracy,
            "latency_ms": results.avg_latency_ms,
            "cost_per_1k": cost_per_1k,
            "submission_date": datetime.now().strftime("%Y-%m-%d"),
            "contact": contact,
            "paper_url": paper_url
        }
250
+
251
+
252
+ # ═══════════════════════════════════════════════════════════════════════════════
253
+ # EXAMPLE BASELINES (for reference)
254
+ # ═══════════════════════════════════════════════════════════════════════════════
255
+
256
def random_baseline(question: str, answer: str) -> bool:
    """Coin-flip detector: ignores both inputs; True when a uniform draw exceeds 0.5"""
    from random import random as draw
    return 0.5 < draw()
260
+
261
+
262
def always_trust_baseline(question: str, answer: str) -> bool:
    """Constant detector that ignores both inputs and trusts every answer.

    Its accuracy therefore equals the share of truthful samples.
    """
    verdict = True
    return verdict
265
+
266
+
267
def always_reject_baseline(question: str, answer: str) -> bool:
    """Constant detector that ignores both inputs and rejects every answer.

    Its accuracy therefore equals the share of hallucination samples.
    """
    verdict = False
    return verdict
270
+
271
+
272
+ # ═══════════════════════════════════════════════════════════════════════════════
273
+ # CLI
274
+ # ═══════════════════════════════════════════════════════════════════════════════
275
+
276
if __name__ == "__main__":
    # Command-line entry point: run one of the reference baselines over the
    # chosen split and write its leaderboard submission to the working
    # directory.
    import argparse

    parser = argparse.ArgumentParser(
        description="HexaMind Hallucination Benchmark Evaluation"
    )
    parser.add_argument(
        "--baseline",
        choices=["random", "always_trust", "always_reject"],
        default="random",
        help="Baseline to evaluate"
    )
    parser.add_argument(
        "--split",
        choices=["all", "pattern", "knowledge"],
        default="all",
        help="Which split to evaluate"
    )
    parser.add_argument(
        "--data-dir",
        default="data",
        help="Path to data directory"
    )

    args = parser.parse_args()

    # Select baseline
    baselines = {
        "random": random_baseline,
        "always_trust": always_trust_baseline,
        "always_reject": always_reject_baseline
    }
    detector = baselines[args.baseline]

    # Run evaluation
    benchmark = HexaMindBenchmark(data_dir=args.data_dir)
    results = benchmark.evaluate(detector, split=args.split)

    # Save results
    submission = benchmark.create_submission(
        results,
        model_name=f"{args.baseline}_baseline",
        model_type="Statistical Baseline",
        parameters="0"
    )

    output_file = f"submission_{args.baseline}.json"
    # Explicit encoding so the output does not depend on the platform's
    # default locale encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(submission, f, indent=2)
    print(f"\nSubmission saved to {output_file}")