{
  "schema_version": "1",
  "generated_at": "2026-05-17T12:36:58.988502+00:00",
  "suite_version_filter": "v1.0-eval-locked",
  "scenario_total": 290,
  "proofed_total": 7830,
  "entries": [
    {
      "slug": "gemini-2.5-flash",
      "model": {
        "name": "gemini-2.5-flash",
        "huggingface_id": null,
        "provider": "gemini",
        "quantization": "API (Gemini, full precision)"
      },
      "submission": {
        "submitter": null,
        "submitted_at": "2026-04-26T14:54:34.831029+00:00",
        "suite_version": "v1.0-eval-locked",
        "git_commit": "e525b4d0547659f1c8187f58f4e9b9ef7440eadd"
      },
      "verified": true,
      "verified_by": "bdas-sec",
      "verified_on": "2026-04-26",
      "base": {
        "id_f1": 0.694,
        "critical_failures": 44,
        "total_correct": 156,
        "total_evaluations": 290,
        "compliance_rate": 0.6517,
        "refusal_rate": 0.566,
        "over_refusal_rate": 0.0674,
        "escalation_recall": 0.0714,
        "escalation_precision": 0.2308,
        "false_escalation_rate": 0.0112,
        "refusal_f1": 0.7036
      },
      "proofed": null
    },
    {
      "slug": "foundation-sec-8b-reasoning",
      "model": {
        "name": "models/foundation-sec-8b-reasoning-bf16",
        "huggingface_id": null,
        "provider": "mlx",
        "quantization": "BF16"
      },
      "submission": {
        "submitter": null,
        "submitted_at": "2026-04-23T09:19:01.090773+00:00",
        "suite_version": "v1.0-eval-locked",
        "git_commit": "0a608c298519828e4e3bbee8df78efec780156f1"
      },
      "verified": true,
      "verified_by": "bdas1201",
      "verified_on": "2026-04-26",
      "base": {
        "id_f1": 0.6748,
        "critical_failures": 66,
        "total_correct": 155,
        "total_evaluations": 290,
        "compliance_rate": 0.7753,
        "refusal_rate": 0.4591,
        "over_refusal_rate": 0.191,
        "escalation_recall": 0.1667,
        "escalation_precision": 0.25,
        "false_escalation_rate": 0.0449,
        "refusal_f1": 0.5844
      },
      "proofed": {
        "id_f1": 0.6083,
        "critical_failures": 1062,
        "total_evaluations": 7830,
        "pgs": 0.9604,
        "escalation_recall": 0.2063,
        "duration_seconds": 322891.3
      }
    },
    {
      "slug": "gemini-2.5-flash-lite",
      "model": {
        "name": "gemini-2.5-flash-lite",
        "huggingface_id": null,
        "provider": "gemini",
        "quantization": "API (Gemini, full precision)"
      },
      "submission": {
        "submitter": null,
        "submitted_at": "2026-04-26T14:21:26.285702+00:00",
        "suite_version": "v1.0-eval-locked",
        "git_commit": "e525b4d0547659f1c8187f58f4e9b9ef7440eadd"
      },
      "verified": true,
      "verified_by": "bdas-sec",
      "verified_on": "2026-04-26",
      "base": {
        "id_f1": 0.6579,
        "critical_failures": 49,
        "total_correct": 163,
        "total_evaluations": 290,
        "compliance_rate": 0.809,
        "refusal_rate": 0.5283,
        "over_refusal_rate": 0.0337,
        "escalation_recall": 0.0714,
        "escalation_precision": 0.2727,
        "false_escalation_rate": 0.0,
        "refusal_f1": 0.6776
      },
      "proofed": null
    },
    {
      "slug": "gemma-2-2b-it",
      "model": {
        "name": "mlx-community/gemma-2-2b-it-fp16",
        "huggingface_id": null,
        "provider": "mlx",
        "quantization": "FP16"
      },
      "submission": {
        "submitter": null,
        "submitted_at": "2026-04-26T12:19:24.731561+00:00",
        "suite_version": "v1.0-eval-locked",
        "git_commit": "e525b4d0547659f1c8187f58f4e9b9ef7440eadd"
      },
      "verified": true,
      "verified_by": "bdas-sec",
      "verified_on": "2026-04-27",
      "base": {
        "id_f1": 0.6544,
        "critical_failures": 68,
        "total_correct": 136,
        "total_evaluations": 290,
        "compliance_rate": 0.6292,
        "refusal_rate": 0.434,
        "over_refusal_rate": 0.2135,
        "escalation_recall": 0.1667,
        "escalation_precision": 0.1842,
        "false_escalation_rate": 0.1573,
        "refusal_f1": 0.5739
      },
      "proofed": {
        "id_f1": 0.663,
        "critical_failures": 948,
        "total_evaluations": 7830,
        "pgs": 0.9501,
        "escalation_recall": 0.2778,
        "duration_seconds": 73557.8
      }
    },
    {
      "slug": "foundation-sec-8b-instruct",
      "model": {
        "name": "models/foundation-sec-8b-instruct-bf16",
        "huggingface_id": null,
        "provider": "mlx",
        "quantization": "BF16"
      },
      "submission": {
        "submitter": null,
        "submitted_at": "2026-04-16T11:58:29.061589+00:00",
        "suite_version": "v1.0-eval-locked",
        "git_commit": "9fbaaad1e86895765610088d5364d6436c9d30a4"
      },
      "verified": true,
      "verified_by": "bdas1201",
      "verified_on": "2026-04-26",
      "base": {
        "id_f1": 0.4899,
        "critical_failures": 92,
        "total_correct": 104,
        "total_evaluations": 290,
        "compliance_rate": 0.7303,
        "refusal_rate": 0.1824,
        "over_refusal_rate": 0.2697,
        "escalation_recall": 0.1429,
        "escalation_precision": 0.0769,
        "false_escalation_rate": 0.2472,
        "refusal_f1": 0.191
      },
      "proofed": {
        "id_f1": 0.6657,
        "critical_failures": 1194,
        "total_evaluations": 7830,
        "pgs": 0.924,
        "escalation_recall": 0.3439,
        "duration_seconds": 245426.1
      }
    },
    {
      "slug": "llama-3.1-8b",
      "model": {
        "name": "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16",
        "huggingface_id": null,
        "provider": "mlx",
        "quantization": "BF16"
      },
      "submission": {
        "submitter": null,
        "submitted_at": "2026-04-14T09:49:39.562977+00:00",
        "suite_version": "v1.0-eval-locked",
        "git_commit": "08ce80223e635e12c6630d73f966338cf78ca807"
      },
      "verified": true,
      "verified_by": "bdas1201",
      "verified_on": "2026-04-26",
      "base": {
        "id_f1": 0.4461,
        "critical_failures": 101,
        "total_correct": 134,
        "total_evaluations": 290,
        "compliance_rate": 0.9101,
        "refusal_rate": 0.2893,
        "over_refusal_rate": 0.0899,
        "escalation_recall": 0.0476,
        "escalation_precision": 0.1,
        "false_escalation_rate": 0.0337,
        "refusal_f1": 0.4078
      },
      "proofed": {
        "id_f1": 0.427,
        "critical_failures": 1818,
        "total_evaluations": 7830,
        "pgs": 0.9274,
        "escalation_recall": 0.0476,
        "duration_seconds": 230497.1
      }
    },
    {
      "slug": "foundation-sec-1.1-8b-instruct",
      "model": {
        "name": "models/foundation-sec-1.1-8b-instruct-bf16",
        "huggingface_id": null,
        "provider": "mlx",
        "quantization": "BF16"
      },
      "submission": {
        "submitter": null,
        "submitted_at": "2026-04-19T12:36:01.937725+00:00",
        "suite_version": "v1.0-eval-locked",
        "git_commit": "ee7740fdeb350eaf58b93adb3f0da9599f8f19d8"
      },
      "verified": true,
      "verified_by": "bdas1201",
      "verified_on": "2026-04-26",
      "base": {
        "id_f1": 0.4118,
        "critical_failures": 112,
        "total_correct": 107,
        "total_evaluations": 290,
        "compliance_rate": 0.8315,
        "refusal_rate": 0.1384,
        "over_refusal_rate": 0.1685,
        "escalation_recall": 0.1667,
        "escalation_precision": 0.1296,
        "false_escalation_rate": 0.1461,
        "refusal_f1": 0.1494
      },
      "proofed": {
        "id_f1": 0.6335,
        "critical_failures": 1374,
        "total_evaluations": 7830,
        "pgs": 0.9044,
        "escalation_recall": 0.2487,
        "duration_seconds": 280221.0
      }
    },
    {
      "slug": "llama-3.2-3b",
      "model": {
        "name": "mlx-community/Llama-3.2-3B-Instruct-bf16",
        "huggingface_id": null,
        "provider": "mlx",
        "quantization": "BF16"
      },
      "submission": {
        "submitter": null,
        "submitted_at": "2026-04-12T09:13:39.106661+00:00",
        "suite_version": "v1.0-eval-locked",
        "git_commit": "08ce80223e635e12c6630d73f966338cf78ca807"
      },
      "verified": true,
      "verified_by": "bdas1201",
      "verified_on": "2026-04-26",
      "base": {
        "id_f1": 0.3279,
        "critical_failures": 120,
        "total_correct": 124,
        "total_evaluations": 290,
        "compliance_rate": 0.9551,
        "refusal_rate": 0.2138,
        "over_refusal_rate": 0.0337,
        "escalation_recall": 0.0,
        "escalation_precision": 0.0,
        "false_escalation_rate": 0.0225,
        "refusal_f1": 0.342
      },
      "proofed": {
        "id_f1": 0.4377,
        "critical_failures": 1884,
        "total_evaluations": 7830,
        "pgs": 0.9738,
        "escalation_recall": 0.1138,
        "duration_seconds": 84422.4
      }
    }
  ]
}