MagicBench/magicbench.py at main · twweeb/MagicBench · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
"""
MagicBench v1.0: A Deception-Sensitive Cognitive Benchmark for LLMs
====================================================================
Tests adversarial reasoning, metacognition, theory of mind, and causal
inference through magic-trick understanding — aligned with DeepMind's
AGI cognitive framework (perception, attention, reasoning, metacognition,
social cognition, executive functions, learning, memory, problem solving,
generation).

Usage:
    python magicbench.py --model gpt-4o --provider openai --api-key $KEY
    python magicbench.py --dry-run          # print prompts only
    python magicbench.py --human            # interactive human baseline mode

Outputs:
    results/<model>_<timestamp>.json   — per-item scores
    results/<model>_<timestamp>_profile.json — cognitive profile (5 dimensions)
"""

import json, os, re, time, argparse, random, importlib
from dataclasses import dataclass, asdict
from typing import List, Dict
from enum import Enum
from datetime import datetime
from collections import defaultdict

# ════════════════════════════════════════════════════════════════
# §1  ENUMERATIONS & DATA STRUCTURES
# ════════════════════════════════════════════════════════════════

class EffectType(str, Enum):
    VANISH = "vanish"
    APPEARANCE = "appearance"
    CONTROL = "control"
    TRANSPOSITION = "transposition"
    TRANSFORMATION = "transformation"
    LEVITATION = "levitation"
    PENETRATION = "penetration"
    PREDICTION = "prediction"
    MENTALISM = "mentalism"
    RESTORATION = "restoration"

class ViolationType(str, Enum):
    OBJECT_PERMANENCE = "object_permanence"
    SPATIOTEMPORAL = "spatiotemporal_continuity"
    SUPPORT_GRAVITY = "support_gravity"
    CAUSAL_CHAIN = "causal_chain"
    INFO_ACCESS = "information_access"
    FREE_WILL = "free_will"
    MATERIAL_INTEGRITY = "material_integrity"

class MethodFamily(str, Enum):
    CONCEALMENT = "concealment"
    SUBSTITUTION = "substitution"
    FORCING = "forcing"
    MISDIRECTION_ATT = "attention_misdirection"
    MISDIRECTION_MEM = "memory_misdirection"
    GIMMICK = "gimmick"
    MATHEMATICAL = "mathematical"
    PSYCHOLOGICAL = "psychological"
    DUAL_REALITY = "dual_reality"
    PRE_SHOW = "pre_show"
    MULTIPLE_OUTS = "multiple_outs"

# DeepMind 10 cognitive faculties
class CognitiveFaculty(str, Enum):
    PERCEPTION = "perception"
    ATTENTION = "attention"
    MEMORY = "memory"
    REASONING = "reasoning"
    METACOGNITION = "metacognition"
    EXECUTIVE = "executive_functions"
    SOCIAL = "social_cognition"
    LEARNING = "learning"
    PROBLEM_SOLVING = "problem_solving"
    GENERATION = "generation"

@dataclass
class BeliefState:
    """What the audience rationally believes at a given step."""
    step: int
    observable_event: str
    audience_belief: str
    actual_reality: str

    @classmethod
    def from_dict(cls, data: dict) -> "BeliefState":
        return cls(**data)

@dataclass
class CounterfactualQ:
    condition: str          # what changes
    question: str
    correct_answer: str     # "yes" / "no" / short phrase
    explanation: str

    @classmethod
    def from_dict(cls, data: dict) -> "CounterfactualQ":
        return cls(**data)

@dataclass
class MagicScenario:
    id: str
    title: str
    effect_type: EffectType
    description: str                        # audience-perspective narrative
    key_moments: List[str]                  # critical observable events
    violation_types: List[ViolationType]
    method_families: List[MethodFamily]     # gold (may be >1)
    method_abstract: str                    # abstract explanation, no secrets
    belief_trace: List[BeliefState]
    counterfactuals: List[CounterfactualQ]
    difficulty: Dict[str, int]              # axis name → 1-5
    primary_faculties: List[CognitiveFaculty]

    @classmethod
    def from_dict(cls, data: dict) -> "MagicScenario":
        return cls(
            id=data["id"],
            title=data["title"],
            effect_type=EffectType(data["effect_type"]),
            description=data["description"],
            key_moments=data["key_moments"],
            violation_types=[ViolationType(v) for v in data["violation_types"]],
            method_families=[MethodFamily(m) for m in data["method_families"]],
            method_abstract=data["method_abstract"],
            belief_trace=[BeliefState.from_dict(b) for b in data["belief_trace"]],
            counterfactuals=[CounterfactualQ.from_dict(c) for c in data["counterfactuals"]],
            difficulty=data["difficulty"],
            primary_faculties=[CognitiveFaculty(f) for f in data["primary_faculties"]],
        )

    def to_dict(self) -> dict:
        return {
            "id": self.id,
            "title": self.title,
            "effect_type": self.effect_type.value,
            "description": self.description,
            "key_moments": self.key_moments,
            "violation_types": [v.value for v in self.violation_types],
            "method_families": [m.value for m in self.method_families],
            "method_abstract": self.method_abstract,
            "belief_trace": [asdict(b) for b in self.belief_trace],
            "counterfactuals": [asdict(c) for c in self.counterfactuals],
            "difficulty": self.difficulty,
            "primary_faculties": [f.value for f in self.primary_faculties],
        }


HF_DATASET_NAME = "hsiung/MagicBench"
HF_DATASET_SPLIT = "test"


def load_scenarios(dataset_name: str = HF_DATASET_NAME,
                   split: str = HF_DATASET_SPLIT) -> List[MagicScenario]:
    """Load benchmark scenarios from the Hugging Face dataset."""
    try:
        datasets_module = importlib.import_module("datasets")
    except ImportError as e:
        raise RuntimeError(
            "Loading scenarios requires the `datasets` package. "
            "Install it with `pip install datasets`."
        ) from e

    data = datasets_module.load_dataset(dataset_name, split=split)
    return [MagicScenario.from_dict(dict(item)) for item in data]

# ════════════════════════════════════════════════════════════════
# §2  DATASET
# ════════════════════════════════════════════════════════════════

SCENARIOS: List[MagicScenario] = load_scenarios()

# ════════════════════════════════════════════════════════════════
# §3  TASK GENERATORS — 6 task types per scenario
# ════════════════════════════════════════════════════════════════

class TaskType(str, Enum):
    EFFECT_RECOGNITION = "effect_recognition"
    VIOLATION_ID = "violation_identification"
    BEST_EXPLANATION = "best_explanation"
    BELIEF_TRACE = "belief_trace"
    CALIBRATION = "metacognitive_calibration"
    COUNTERFACTUAL = "counterfactual_reasoning"


def _merge_faculties(task_base: List[str], scenario: MagicScenario) -> List[str]:
    """Combine task-type intrinsic faculties with scenario-specific faculties.

    Task-type faculties capture WHAT cognitive operation is performed.
    Scenario faculties capture WHICH domain knowledge is required.
    The union (deduplicated, ordered) gives a richer per-item profile.
    """
    scenario_facs = [f.value for f in scenario.primary_faculties]
    merged = list(dict.fromkeys(task_base + scenario_facs))  # dedup, preserve order
    return merged


def generate_effect_recognition(sc: MagicScenario) -> dict:
    """Task 1: What effect type did the audience experience?"""
    options = list({sc.effect_type.value} | set(random.sample(
        [e.value for e in EffectType if e != sc.effect_type],
        min(5, len(EffectType) - 1)
    )))
    random.shuffle(options)
    task_base = [CognitiveFaculty.PERCEPTION.value, CognitiveFaculty.MEMORY.value]
    return {
        "task_type": TaskType.EFFECT_RECOGNITION.value,
        "scenario_id": sc.id,
        "prompt": (
            f"Read the following magic trick description:\n\n"
            f'"{sc.description}"\n\n'
            f"From the audience's perspective, which category of magic effect "
            f"best describes what they witnessed?\n\n"
            f"Options:\n" +
            "\n".join(f"  ({chr(65+i)}) {o}" for i, o in enumerate(options)) +
            "\n\nRespond with ONLY the letter of your answer (e.g., 'A')."
        ),
        "gold": chr(65 + options.index(sc.effect_type.value)),
        "gold_value": sc.effect_type.value,
        "options": options,
        "faculties": _merge_faculties(task_base, sc),
    }


def generate_violation_id(sc: MagicScenario) -> dict:
    """Task 2: Which types of violation does this trick exploit?"""
    all_types = [v.value for v in ViolationType]
    gold_set = set(v.value for v in sc.violation_types)
    task_base = [CognitiveFaculty.REASONING.value, CognitiveFaculty.MEMORY.value]
    return {
        "task_type": TaskType.VIOLATION_ID.value,
        "scenario_id": sc.id,
        "prompt": (
            f"Read the following magic trick description:\n\n"
            f'"{sc.description}"\n\n'
            f"Which of the following expectations or rules does this trick "
            f"appear to violate? Select ALL that apply.\n\n" +
            "\n".join(f"  ({chr(65+i)}) {v}" for i, v in enumerate(all_types)) +
            "\n\nRespond with the letters of ALL correct answers separated "
            "by commas (e.g., 'A, C, E'). Select only those that directly apply."
        ),
        "gold": ", ".join(sorted(
            chr(65 + all_types.index(v)) for v in gold_set
        )),
        "gold_set": sorted(gold_set),
        "options": all_types,
        "faculties": _merge_faculties(task_base, sc),
    }


def generate_best_explanation(sc: MagicScenario) -> dict:
    """Task 3: Open-ended — explain how the trick works."""
    task_base = [CognitiveFaculty.REASONING.value, CognitiveFaculty.PROBLEM_SOLVING.value, CognitiveFaculty.GENERATION.value]
    return {
        "task_type": TaskType.BEST_EXPLANATION.value,
        "scenario_id": sc.id,
        "prompt": (
            f"Read the following magic trick description:\n\n"
            f'"{sc.description}"\n\n'
            f"How does this trick actually work? Explain the most likely "
            f"method the performer uses to achieve this effect. Be specific "
            f"about what is secretly happening and when."
        ),
        "gold": sc.method_abstract,
        "faculties": _merge_faculties(task_base, sc),
    }


def generate_belief_trace(sc: MagicScenario) -> dict:
    """Task 4: Open-ended — describe audience belief at EVERY step.
    Score 1.0 only if ALL steps are correct."""
    task_base = [CognitiveFaculty.SOCIAL.value, CognitiveFaculty.MEMORY.value, CognitiveFaculty.REASONING.value]
    steps_text = "\n".join(
        f"  Step {s.step}: \"{s.observable_event}\""
        for s in sc.belief_trace
    )
    return {
        "task_type": TaskType.BELIEF_TRACE.value,
        "scenario_id": sc.id,
        "prompt": (
            f"Read the following magic trick description:\n\n"
            f'"{sc.description}"\n\n'
            f"At each of the following moments, describe what a typical "
            f"audience member most likely BELIEVES is happening. (Not what "
            f"is actually happening behind the scenes, but what they "
            f"sincerely believe based on what they can see.)\n\n"
            f"{steps_text}\n\n"
            f"For each step, write 1-3 sentences describing the audience's "
            f"belief. Use the format:\n"
            f"Step 1: <belief>\n"
            f"Step 2: <belief>\n"
            f"..."
        ),
        "gold_steps": [
            {
                "step": s.step,
                "observable_event": s.observable_event,
                "audience_belief": s.audience_belief,
                "actual_reality": s.actual_reality,
            }
            for s in sc.belief_trace
        ],
        "n_steps": len(sc.belief_trace),
        "faculties": _merge_faculties(task_base, sc),
    }


def generate_calibration(sc: MagicScenario) -> dict:
    """Task 5: Open-ended metacognitive calibration — explain what you know
    and don't know about the method, and rate your own confidence."""
    task_base = [CognitiveFaculty.METACOGNITION.value, CognitiveFaculty.REASONING.value]
    return {
        "task_type": TaskType.CALIBRATION.value,
        "scenario_id": sc.id,
        "prompt": (
            f"Read the following magic trick description:\n\n"
            f'"{sc.description}"\n\n'
            f"Explain what you think is the most likely method behind this "
            f"trick. Then honestly assess:\n"
            f"1. How confident are you in your explanation? (0-100%)\n"
            f"2. What aspects are you most uncertain about?\n"
            f"3. What alternative explanations could also be plausible?\n\n"
            f"Be honest about the limits of your reasoning."
        ),
        "gold": sc.method_abstract,
        "gold_method_families": [m.value for m in sc.method_families],
        "faculties": _merge_faculties(task_base, sc),
    }


def generate_counterfactual(sc: MagicScenario) -> dict:
    """Task 6: Open-ended — reason about ALL counterfactual conditions.
    Score 1.0 only if ALL sub-questions are correct."""
    task_base = [CognitiveFaculty.REASONING.value, CognitiveFaculty.EXECUTIVE.value]
    cf_text = "\n\n".join(
        f"  Scenario {i+1}:\n"
        f"  Condition: \"{cf.condition}\"\n"
        f"  Question: {cf.question}"
        for i, cf in enumerate(sc.counterfactuals)
    )
    return {
        "task_type": TaskType.COUNTERFACTUAL.value,
        "scenario_id": sc.id,
        "prompt": (
            f"Read the following magic trick description:\n\n"
            f'"{sc.description}"\n\n'
            f"Now consider each of the following hypothetical changes. For "
            f"each one, give your answer and explain your reasoning.\n\n"
            f"{cf_text}\n\n"
            f"For each scenario, use the format:\n"
            f"Scenario 1: <answer and explanation>\n"
            f"Scenario 2: <answer and explanation>\n"
            f"..."
        ),
        "gold_counterfactuals": [
            {
                "condition": cf.condition,
                "question": cf.question,
                "correct_answer": cf.correct_answer,
                "explanation": cf.explanation,
            }
            for cf in sc.counterfactuals
        ],
        "n_counterfactuals": len(sc.counterfactuals),
        "faculties": _merge_faculties(task_base, sc),
    }


TASK_GENERATORS = {
    TaskType.EFFECT_RECOGNITION: generate_effect_recognition,
    TaskType.VIOLATION_ID: generate_violation_id,
    TaskType.BEST_EXPLANATION: generate_best_explanation,
    TaskType.BELIEF_TRACE: generate_belief_trace,
    TaskType.CALIBRATION: generate_calibration,
    TaskType.COUNTERFACTUAL: generate_counterfactual,
}


def build_all_tasks(scenarios=None, seed=42) -> List[dict]:
    """Generate all tasks for all scenarios. Returns list of task dicts."""
    random.seed(seed)
    scenarios = scenarios or SCENARIOS
    tasks = []
    for sc in scenarios:
        for gen_fn in TASK_GENERATORS.values():
            generated = gen_fn(sc)
            generated_tasks = generated if isinstance(generated, list) else [generated]
            for task in generated_tasks:
                task["difficulty"] = sc.difficulty
                task["primary_faculties_scenario"] = [f.value for f in sc.primary_faculties]
                tasks.append(task)
    return tasks


# ════════════════════════════════════════════════════════════════
# §4  SCORING FUNCTIONS
# ════════════════════════════════════════════════════════════════

# ── LLM-as-Judge for open-ended tasks ────────────────────────

JUDGE_SYSTEM_PROMPT = (
    "You are a strict but fair judge evaluating whether a student's answer "
    "to a magic-trick analysis question is semantically equivalent to the "
    "reference answer. Two answers are 'semantically equivalent' if they "
    "convey the same core mechanism, conclusion, or insight — even if the "
    "wording, structure, or level of detail differs.\n\n"
    "Rules:\n"
    "- Focus on whether the KEY MECHANISM or KEY CONCLUSION matches.\n"
    "- Ignore differences in writing style, length, or extra details, "
    "as long as the core meaning is present.\n"
    "- If the student's answer contains the correct core idea PLUS "
    "additional wrong ideas, still score 1 if the correct core is clearly "
    "the primary answer.\n"
    "- If the student's answer is vague, generic, or only partially "
    "overlaps with the reference, score 0.\n"
    "- If the student's answer identifies a genuinely valid alternative "
    "method that could produce the same effect (even if different from "
    "the reference), score 1. Magic tricks can have multiple valid methods.\n\n"
    "You MUST respond with ONLY a JSON object: {\"score\": 1} or {\"score\": 0}\n"
    "Do NOT include any other text."
)

JUDGE_PROMPT_EXPLANATION = (
    "Task: Does the student correctly explain how this magic trick works?\n\n"
    "Trick description:\n\"{description}\"\n\n"
    "Reference method (gold answer):\n\"{gold}\"\n\n"
    "Student's explanation:\n\"{response}\"\n\n"
    "Is the student's explanation semantically equivalent to the reference? "
    "The student must identify the correct core mechanism (e.g., concealment, "
    "substitution, forcing, gimmick, etc.) and correctly describe WHEN and "
    "HOW the secret action occurs. Minor details can differ.\n\n"
    "Respond with ONLY: {{\"score\": 1}} or {{\"score\": 0}}"
)

JUDGE_PROMPT_BELIEF = (
    "Task: Does the student correctly describe what a typical audience "
    "member BELIEVES at a specific moment during a magic trick?\n\n"
    "Trick description:\n\"{description}\"\n\n"
    "Moment in question:\n\"{moment}\"\n\n"
    "Reference audience belief (gold answer):\n\"{gold}\"\n\n"
    "Student's answer:\n\"{response}\"\n\n"
    "Is the student's description of the audience's belief semantically "
    "equivalent to the reference? The student must describe what the "
    "audience THINKS is happening (their naive belief), NOT what is "
    "actually happening behind the scenes. If the student describes the "
    "reality instead of the belief, score 0.\n\n"
    "Respond with ONLY: {{\"score\": 1}} or {{\"score\": 0}}"
)

JUDGE_PROMPT_CALIBRATION = (
    "Task: Does the student correctly identify the method AND show "
    "appropriate metacognitive awareness?\n\n"
    "Trick description:\n\"{description}\"\n\n"
    "Reference method (gold answer):\n\"{gold}\"\n\n"
    "Known valid method families: {method_families}\n\n"
    "Student's answer:\n\"{response}\"\n\n"
    "Score 1 if BOTH conditions are met:\n"
    "  (a) The student's primary explanation is semantically equivalent "
    "to the reference method (identifies the correct core mechanism), AND\n"
    "  (b) The student shows some metacognitive awareness — acknowledging "
    "uncertainty, mentioning limitations, or noting alternatives — rather "
    "than being blindly overconfident about a wrong answer.\n"
    "Score 0 if the core mechanism is wrong, OR if the student is "
    "confidently wrong with no hedging.\n\n"
    "Respond with ONLY: {{\"score\": 1}} or {{\"score\": 0}}"
)

JUDGE_PROMPT_COUNTERFACTUAL = (
    "Task: Does the student correctly reason about what happens when a "
    "condition of a magic trick is changed?\n\n"
    "Trick description:\n\"{description}\"\n\n"
    "Hypothetical change:\n\"{condition}\"\n\n"
    "Question:\n\"{question}\"\n\n"
    "Reference answer (gold):\n\"{gold_answer}\"\n\n"
    "Reference explanation:\n\"{gold_explanation}\"\n\n"
    "Student's answer:\n\"{response}\"\n\n"
    "Is the student's answer semantically equivalent to the reference? "
    "The student must reach the same core conclusion (e.g., 'yes the trick "
    "still works' vs 'no it would fail') AND give a reasoning that aligns "
    "with the reference explanation. If the conclusion matches but the "
    "reasoning is wrong or absent, score 0.\n\n"
    "Respond with ONLY: {{\"score\": 1}} or {{\"score\": 0}}"
)


def call_judge(prompt: str, judge_model: str = "claude-sonnet-4-20250514",
               judge_provider: str = "anthropic",
               judge_api_key: str = "") -> int:
    """Call the judge LLM and extract a binary score."""
    try:
        response = call_llm(
            prompt, judge_model, judge_provider, judge_api_key,
            temperature=0.0, max_tokens=64
        )
        # Extract JSON score
        match = re.search(r'"score"\s*:\s*([01])', response)
        if match:
            return int(match.group(1))
        # Fallback: look for bare 0 or 1
        stripped = response.strip()
        if stripped in ("0", "1"):
            return int(stripped)
        return 0
    except Exception as e:
        print(f"  JUDGE ERROR: {e}")
        return 0


def _extract_step_response(full_response: str, step_num: int,
                           total_steps: int, prefix: str = "Step") -> str:
    """Extract the response segment for a specific numbered step/scenario.

    Looks for 'Step N:' or 'Scenario N:' markers. Falls back to splitting
    by blank lines or returning the full response if parsing fails.
    """
    pattern = rf'(?:^|\n)\s*{prefix}\s*{step_num}\s*[:\-\.]\s*(.*?)(?=\n\s*{prefix}\s*\d|\Z)'
    match = re.search(pattern, full_response, re.DOTALL | re.IGNORECASE)
    if match:
        return match.group(1).strip()

    # Fallback: try splitting by numbered markers generically
    parts = re.split(rf'\n\s*{prefix}\s*\d+\s*[:\-\.]', full_response, flags=re.IGNORECASE)
    if len(parts) > step_num:
        return parts[step_num].strip()

    # Last resort: return everything (judge will evaluate holistically)
    return full_response.strip()


def score_with_judge(task: dict, response: str, description: str,
                     judge_model: str = "claude-sonnet-4-20250514",
                     judge_provider: str = "anthropic",
                     judge_api_key: str = "") -> float:
    """Route to the correct judge prompt template and score.

    For BELIEF_TRACE and COUNTERFACTUAL: evaluates each sub-item
    independently, returns 1.0 only if ALL sub-items score 1.
    """
    tt = task["task_type"]

    if tt == TaskType.BEST_EXPLANATION.value:
        prompt = JUDGE_PROMPT_EXPLANATION.format(
            description=description,
            gold=task["gold"],
            response=response,
        )
        full_prompt = JUDGE_SYSTEM_PROMPT + "\n\n" + prompt
        return float(call_judge(full_prompt, judge_model, judge_provider, judge_api_key))

    elif tt == TaskType.CALIBRATION.value:
        prompt = JUDGE_PROMPT_CALIBRATION.format(
            description=description,
            gold=task["gold"],
            method_families=", ".join(task.get("gold_method_families", [])),
            response=response,
        )
        full_prompt = JUDGE_SYSTEM_PROMPT + "\n\n" + prompt
        return float(call_judge(full_prompt, judge_model, judge_provider, judge_api_key))

    elif tt == TaskType.BELIEF_TRACE.value:
        gold_steps = task.get("gold_steps", [])
        n = len(gold_steps)
        if n == 0:
            return 0.0

        all_pass = True
        for gs in gold_steps:
            step_response = _extract_step_response(
                response, gs["step"], n, prefix="Step"
            )
            prompt = JUDGE_PROMPT_BELIEF.format(
                description=description,
                moment=gs["observable_event"],
                gold=gs["audience_belief"],
                response=step_response,
            )
            full_prompt = JUDGE_SYSTEM_PROMPT + "\n\n" + prompt
            step_score = call_judge(full_prompt, judge_model, judge_provider, judge_api_key)
            print(f"    Step {gs['step']}: {'✓' if step_score == 1 else '✗'}")
            if step_score != 1:
                all_pass = False

        return 1.0 if all_pass else 0.0

    elif tt == TaskType.COUNTERFACTUAL.value:
        gold_cfs = task.get("gold_counterfactuals", [])
        n = len(gold_cfs)
        if n == 0:
            return 0.0

        all_pass = True
        for i, cf in enumerate(gold_cfs):
            cf_response = _extract_step_response(
                response, i + 1, n, prefix="Scenario"
            )
            prompt = JUDGE_PROMPT_COUNTERFACTUAL.format(
                description=description,
                condition=cf["condition"],
                question=cf["question"],
                gold_answer=cf["correct_answer"],
                gold_explanation=cf["explanation"],
                response=cf_response,
            )
            full_prompt = JUDGE_SYSTEM_PROMPT + "\n\n" + prompt
            cf_score = call_judge(full_prompt, judge_model, judge_provider, judge_api_key)
            print(f"    Scenario {i+1}: {'✓' if cf_score == 1 else '✗'}")
            if cf_score != 1:
                all_pass = False

        return 1.0 if all_pass else 0.0

    else:
        return 0.0


# ── Deterministic scorers for MCQ tasks (Tasks 1 & 2) ───────

def score_effect_recognition(task: dict, response: str) -> float:
    """Binary: correct letter match."""
    ans = re.search(r'[A-Z]', response.strip().upper())
    return 1.0 if ans and ans.group() == task["gold"] else 0.0


def score_violation_id(task: dict, response: str) -> float:
    """Set-based F1 over selected violation types."""
    letters = set(re.findall(r'[A-G]', response.upper()))
    gold_letters = set(task["gold"].replace(" ", "").split(","))
    if not letters and not gold_letters:
        return 1.0
    if not letters or not gold_letters:
        return 0.0
    tp = len(letters & gold_letters)
    precision = tp / len(letters) if letters else 0
    recall = tp / len(gold_letters) if gold_letters else 0
    return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0


# ── Scorer dispatch ──────────────────────────────────────────

# Tasks 1 & 2: deterministic MCQ scoring (no judge needed)
DETERMINISTIC_SCORERS = {
    TaskType.EFFECT_RECOGNITION.value: score_effect_recognition,
    TaskType.VIOLATION_ID.value: score_violation_id,
}

# Tasks 3-6: LLM-as-judge scoring
JUDGE_TASK_TYPES = {
    TaskType.BEST_EXPLANATION.value,
    TaskType.BELIEF_TRACE.value,
    TaskType.CALIBRATION.value,
    TaskType.COUNTERFACTUAL.value,
}


# ════════════════════════════════════════════════════════════════
# §5  COGNITIVE PROFILE COMPUTATION
# ════════════════════════════════════════════════════════════════

# Map task types to the 5 benchmark profile dimensions
PROFILE_MAP = {
    "recognition":   [TaskType.EFFECT_RECOGNITION.value],
    "causal_inference": [TaskType.VIOLATION_ID.value, TaskType.BEST_EXPLANATION.value],
    "deception_modeling": [TaskType.BELIEF_TRACE.value],
    "metacognitive_calibration": [TaskType.CALIBRATION.value],
    "transfer_robustness": [TaskType.COUNTERFACTUAL.value],
}

TASK_WEIGHTS = {
    TaskType.EFFECT_RECOGNITION.value: 0.10,
    TaskType.VIOLATION_ID.value: 0.10,
    TaskType.BEST_EXPLANATION.value: 0.20,
    TaskType.BELIEF_TRACE.value: 0.25,
    TaskType.CALIBRATION.value: 0.10,
    TaskType.COUNTERFACTUAL.value: 0.25,
}


def compute_profile(results: List[dict]) -> dict:
    """Compute 5-dimensional cognitive profile from scored tasks."""
    profile = {}
    for dim, task_types in PROFILE_MAP.items():
        scores = [r["score"] for r in results if r["task_type"] in task_types]
        profile[dim] = {
            "mean": round(sum(scores) / len(scores), 4) if scores else 0.0,
            "n": len(scores),
            "std": round(
                (sum((s - sum(scores)/len(scores))**2 for s in scores) / len(scores))**0.5,
                4
            ) if len(scores) > 1 else 0.0,
        }
    task_type_means = {}
    for task_type, weight in TASK_WEIGHTS.items():
        scores = [r["score"] for r in results if r["task_type"] == task_type]
        if scores:
            task_type_means[task_type] = {
                "mean": sum(scores) / len(scores),
                "weight": weight,
            }
    total_weight = sum(item["weight"] for item in task_type_means.values())
    weighted_overall = (
        sum(item["mean"] * item["weight"] for item in task_type_means.values()) / total_weight
        if total_weight else 0.0
    )
    profile["overall"] = {
        "mean": round(weighted_overall, 4),
        "n": len(results),
        "weights": {task_type: meta["weight"] for task_type, meta in task_type_means.items()},
    }
    return profile


def compute_faculty_profile(results: List[dict]) -> dict:
    """Map scores to DeepMind 10 faculties."""
    faculty_scores = defaultdict(list)
    for r in results:
        for f in r.get("faculties", []):
            faculty_scores[f].append(r["score"])
    return {
        f: {"mean": round(sum(s)/len(s), 4), "n": len(s)}
        for f, s in faculty_scores.items()
    }


def compute_difficulty_analysis(results: List[dict]) -> dict:
    """Analyze performance by difficulty axes."""
    axis_buckets = defaultdict(lambda: defaultdict(list))
    for r in results:
        diff = r.get("difficulty", {})
        for axis, level in diff.items():
            axis_buckets[axis][level].append(r["score"])
    out = {}
    for axis, levels in axis_buckets.items():
        out[axis] = {
            level: {"mean": round(sum(s)/len(s), 4), "n": len(s)}
            for level, s in sorted(levels.items())
        }
    return out


# ════════════════════════════════════════════════════════════════
# §6  LLM EVALUATION HARNESS
# ════════════════════════════════════════════════════════════════

def call_llm(prompt: str, model: str, provider: str = "anthropic",
             api_key: str = "", temperature: float = 0.0,
             max_tokens: int = 1024) -> str:
    """Call an LLM API and return the text response."""
    import urllib.error
    import urllib.request

    def _extract_openai_response_text(data: dict) -> str:
        """Extract plain text from a Responses API payload."""
        if isinstance(data.get("output_text"), str) and data["output_text"]:
            return data["output_text"]

        chunks = []
        for item in data.get("output", []):
            if item.get("type") != "message":
                continue
            for content in item.get("content", []):
                if content.get("type") == "output_text":
                    chunks.append(content.get("text", ""))

        if chunks:
            return "".join(chunks)

        raise ValueError(f"OpenAI response did not contain text output: {data}")

    if provider == "anthropic":
        url = "https://api.anthropic.com/v1/messages"
        headers = {
            "Content-Type": "application/json",
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
        }
        body = json.dumps({
            "model": model,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "messages": [{"role": "user", "content": prompt}],
        }).encode()
    elif provider == "openai":
        # GPT-5.2 works best with the Responses API, and newer reasoning
        # models are more reliable there than on the legacy chat endpoint.
        url = "https://api.openai.com/v1/responses"
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        }
        payload = {
            "model": model,
            "input": prompt,
            "max_output_tokens": max_tokens,
            "temperature": temperature,
        }
        body = json.dumps(payload).encode()
    else:
        raise ValueError(f"Unknown provider: {provider}")

    req = urllib.request.Request(url, data=body, headers=headers)
    try:
        with urllib.request.urlopen(req, timeout=120) as resp:
            data = json.loads(resp.read())
    except urllib.error.HTTPError as e:
        raw = e.read().decode("utf-8", errors="replace")
        try:
            err = json.loads(raw)
            if isinstance(err, dict):
                message = err.get("error", {}).get("message", raw)
            else:
                message = raw
        except json.JSONDecodeError:
            message = raw or str(e)
        raise RuntimeError(
            f"{provider} API error {e.code}: {message}"
        ) from e

    if provider == "anthropic":
        return data["content"][0]["text"]
    else:
        return _extract_openai_response_text(data)


def evaluate_model(model: str, provider: str = "anthropic",
                   api_key: str = "", tasks: List[dict] = None,
                   n_repeats: int = 1, delay: float = 1.0,
                   dry_run: bool = False,
                   judge_model: str = "claude-sonnet-4-20250514",
                   judge_provider: str = "anthropic",
                   judge_api_key: str = "") -> List[dict]:
    """Run all tasks through the model and score responses.

    Tasks 1-2 (MCQ): scored deterministically.
    Tasks 3-6 (open-ended): scored by LLM judge.
    """
    tasks = tasks or build_all_tasks()
    # Use same api key for judge if not specified
    if not judge_api_key:
        judge_api_key = api_key
    results = []
    total = len(tasks) * n_repeats

    for i, task in enumerate(tasks):
        for rep in range(n_repeats):
            idx = i * n_repeats + rep + 1
            tt = task["task_type"]
            print(f"[{idx}/{total}] {task['scenario_id']} / {tt} (rep {rep+1})")

            if dry_run:
                print(f"  PROMPT:\n{task['prompt'][:200]}...\n")
                response = ""
                score = 0.0
            else:
                # Step 1: Get model response
                try:
                    response = call_llm(
                        task["prompt"], model, provider, api_key
                    )
                    time.sleep(delay)
                except Exception as e:
                    print(f"  ERROR (all retries failed): {e}")
                    response = f"ERROR: {e}"
                    score = 0.0
                    results.append({
                        "scenario_id": task["scenario_id"],
                        "task_type": tt,
                        "faculties": task.get("faculties", []),
                        "difficulty": task.get("difficulty", {}),
                        "response": response[:1000],
                        "score": 0.0,
                        "scoring_method": "error",
                        "n_sub_items": task.get("n_steps", task.get("n_counterfactuals", 1)),
                        "repeat": rep,
                    })
                    continue

                # Step 2: Score
                if tt in DETERMINISTIC_SCORERS:
                    score = DETERMINISTIC_SCORERS[tt](task, response)
                    print(f"  Score (deterministic): {score}")
                elif tt in JUDGE_TASK_TYPES:
                    desc = task.get("scenario_description", "")
                    try:
                        score = score_with_judge(
                            task, response, desc,
                            judge_model, judge_provider, judge_api_key
                        )
                    except Exception as e:
                        print(f"  JUDGE ERROR (all retries failed): {e}")
                        score = 0.0
                    time.sleep(delay)  # rate limit for judge calls
                    n_sub = task.get("n_steps", task.get("n_counterfactuals", 1))
                    print(f"  Score (judge, {n_sub} sub-items): {score}")
                else:
                    score = 0.0

                _result = {
                    "scenario_id": task["scenario_id"],
                    "task_type": tt,
                    "faculties": task.get("faculties", []),
                    "difficulty": task.get("difficulty", {}),
                    "question": task["prompt"],
                    "response": response,
                    "score": round(score, 4),
                    "scoring_method": "deterministic" if tt in DETERMINISTIC_SCORERS else "llm_judge",
                    "n_sub_items": task.get("n_steps", task.get("n_counterfactuals", 1)),
                    "repeat": rep,
                }
                if tt == "effect_recognition":
                    _result["gold"] = task["gold"]
                elif tt == "violation_identification":
                    _result["gold"] = task["gold"]
                elif tt == "belief_trace":
                    _result["gold_steps"] = task["gold_steps"]
                elif tt == "counterfactual":
                    _result["gold_counterfactuals"] = task["gold_counterfactuals"]
                elif tt == "calibration":
                    _result["gold"] = task["gold"]
                elif tt == "best_explanation":
                    _result["gold"] = task["gold"]

                results.append(_result)

    return results


def run_human_baseline(tasks: List[dict] = None) -> List[dict]:
    """Interactive human baseline collection."""
    tasks = tasks or build_all_tasks()
    results = []
    print("\n" + "="*60)
    print("MagicBench — Human Baseline Mode")
    print("="*60)

    for i, task in enumerate(tasks):
        tt = task["task_type"]
        print(f"\n--- Task {i+1}/{len(tasks)} [{tt}] ---")
        print(f"Scenario: {task['scenario_id']}\n")
        print(task["prompt"])
        print()
        response = input("Your answer: ").strip()

        # MCQ tasks: deterministic score
        if tt in DETERMINISTIC_SCORERS:
            score = DETERMINISTIC_SCORERS[tt](task, response)
        elif tt == TaskType.BELIEF_TRACE.value:
            # Show each step's gold and ask human to judge all
            gold_steps = task.get("gold_steps", [])
            all_correct = True
            for gs in gold_steps:
                print(f"\n  Step {gs['step']} ({gs['observable_event']}):")
                print(f"    Gold belief: {gs['audience_belief']}")
                s = input(f"    Did you get step {gs['step']} right? (1=yes, 0=no): ").strip()
                if s != "1":
                    all_correct = False
            score = 1.0 if all_correct else 0.0
        elif tt == TaskType.COUNTERFACTUAL.value:
            # Show each counterfactual's gold and ask human to judge all
            gold_cfs = task.get("gold_counterfactuals", [])
            all_correct = True
            for j, cf in enumerate(gold_cfs):
                print(f"\n  Scenario {j+1}: {cf['condition']}")
                print(f"    Gold answer: {cf['correct_answer']}")
                print(f"    Gold explanation: {cf['explanation']}")
                s = input(f"    Did you get scenario {j+1} right? (1=yes, 0=no): ").strip()
                if s != "1":
                    all_correct = False
            score = 1.0 if all_correct else 0.0
        else:
            # Other open-ended: show gold and let human self-judge
            print(f"\n  Gold answer: {task.get('gold', task.get('gold_answer', 'N/A'))}")
            self_score = input("  Does your answer match? (1=yes, 0=no): ").strip()
            score = 1.0 if self_score == "1" else 0.0


        _result = {
            "scenario_id": task["scenario_id"],
            "task_type": tt,
            "faculties": task.get("faculties", []),
            "difficulty": task.get("difficulty", {}),
            "question": task["prompt"],
            "response": response,
            "score": round(score, 4),
            "scoring_method": "deterministic" if tt in DETERMINISTIC_SCORERS else "llm_judge",
            "n_sub_items": task.get("n_steps", task.get("n_counterfactuals", 1)),
            "repeat": 0,
        }
        if tt == "effect_recognition":
            _result["gold"] = task["gold"]
        elif tt == "violation_identification":
            _result["gold"] = task["gold"]
        elif tt == "belief_trace":
            _result["gold_steps"] = task["gold_steps"]
        elif tt == "counterfactual":
            _result["gold_counterfactuals"] = task["gold_counterfactuals"]
        elif tt == "calibration":
            _result["gold"] = task["gold"]
        elif tt == "best_explanation":