Skip to content

Commit bdcd3dd

Browse files
committed
Update leaderboard
1 parent f702a7f commit bdcd3dd

File tree

3 files changed

+132
-4
lines changed

3 files changed

+132
-4
lines changed

data/leaderboards.json

Lines changed: 130 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1334,7 +1334,7 @@
13341334
"logo": [
13351335
"https://joycode.s3.cn-north-1.jdcloud-oss.com/joycodelogo.png"
13361336
],
1337-
"site": "https://joycode.jd.com",
1337+
"site": "https://github.com/jd-opensource/joycode-agent",
13381338
"folder": "20250915_JoyCode",
13391339
"resolved": 74.6,
13401340
"date": "2025-09-15",
@@ -1381,6 +1381,33 @@
13811381
],
13821382
"warning": null
13831383
},
1384+
{
1385+
"name": "Prometheus-v1.2.1 + GPT-5",
1386+
"logo": [
1387+
"https://raw.githubusercontent.com/EuniAI/Prometheus/main/docs/static/images/icon.jpg",
1388+
"https://raw.githubusercontent.com/EuniAI/Prometheus/main/docs/static/images/delysium.jpg"
1389+
],
1390+
"site": "https://euni.ai/",
1391+
"folder": "20251015_Prometheus_v1.2.1_gpt5",
1392+
"resolved": 74.4,
1393+
"date": "2025-10-15",
1394+
"logs": "s3://swe-bench-experiments/verified/20251015_Prometheus_v1.2.1_gpt5/logs",
1395+
"trajs": "s3://swe-bench-experiments/verified/20251015_Prometheus_v1.2.1_gpt5/trajs",
1396+
"trajs_docent": false,
1397+
"cost": null,
1398+
"instance_cost": null,
1399+
"instance_calls": null,
1400+
"os_model": false,
1401+
"os_system": true,
1402+
"checked": false,
1403+
"tags": [
1404+
"Model: gpt-5-2025-08-07",
1405+
"Org: EuniAI",
1406+
"Org: Delysium",
1407+
"System: Attempts - 2+"
1408+
],
1409+
"warning": null
1410+
},
13841411
{
13851412
"name": "Tools + Claude 4 Opus (2025-05-22)",
13861413
"logo": [
@@ -1405,6 +1432,32 @@
14051432
],
14061433
"warning": null
14071434
},
1435+
{
1436+
"name": "Salesforce AI Research SAGE (bash-only)",
1437+
"logo": [
1438+
"https://avatars.githubusercontent.com/u/137096229"
1439+
],
1440+
"site": "https://www.salesforce.com/blog/sage-swe/",
1441+
"folder": "20251021_SalesforceAIResearch_SAGE_bash_only",
1442+
"resolved": 73.0,
1443+
"date": "2025-10-21",
1444+
"logs": "s3://swe-bench-experiments/verified/20251021_SalesforceAIResearch_SAGE_bash_only/logs",
1445+
"trajs": "s3://swe-bench-experiments/verified/20251021_SalesforceAIResearch_SAGE_bash_only/trajs",
1446+
"trajs_docent": false,
1447+
"cost": null,
1448+
"instance_cost": null,
1449+
"instance_calls": null,
1450+
"os_model": false,
1451+
"os_system": false,
1452+
"checked": false,
1453+
"tags": [
1454+
"Model: claude-sonnet-4.5",
1455+
"Model: gpt-5",
1456+
"Org: Salesforce AI Research",
1457+
"System: Attempts - 2+"
1458+
],
1459+
"warning": null
1460+
},
14081461
{
14091462
"name": "Tools + Claude 4 Sonnet (2025-05-22)",
14101463
"logo": [
@@ -1471,7 +1524,7 @@
14711524
"instance_cost": null,
14721525
"instance_calls": null,
14731526
"os_model": false,
1474-
"os_system": false,
1527+
"os_system": true,
14751528
"checked": false,
14761529
"tags": [
14771530
"Model: gpt-5-2025-08-07",
@@ -1532,6 +1585,31 @@
15321585
],
15331586
"warning": null
15341587
},
1588+
{
1589+
"name": "Lingxi v1.5 x Kimi K2",
1590+
"logo": [
1591+
"https://upload.wikimedia.org/wikipedia/en/thumb/0/04/Huawei_Standard_logo.svg/1200px-Huawei_Standard_logo.svg.png"
1592+
],
1593+
"site": "https://github.com/lingxi-agent/Lingxi/tree/master",
1594+
"folder": "20251014_Lingxi_kimi_k2",
1595+
"resolved": 71.2,
1596+
"date": "2025-10-14",
1597+
"logs": "s3://swe-bench-experiments/verified/20251014_Lingxi_kimi_k2/logs",
1598+
"trajs": "s3://swe-bench-experiments/verified/20251014_Lingxi_kimi_k2/trajs",
1599+
"trajs_docent": false,
1600+
"cost": null,
1601+
"instance_cost": null,
1602+
"instance_calls": null,
1603+
"os_model": true,
1604+
"os_system": true,
1605+
"checked": false,
1606+
"tags": [
1607+
"Model: kimi-k2-0905-preview",
1608+
"Org: Huawei",
1609+
"System: Attempts - 1"
1610+
],
1611+
"warning": null
1612+
},
15351613
{
15361614
"name": "Warp",
15371615
"logo": [
@@ -2718,6 +2796,31 @@
27182796
"warning": null,
27192797
"mini-swe-agent_version": "1.9.1"
27202798
},
2799+
{
2800+
"name": "FrogBoss-32B-2510",
2801+
"logo": [
2802+
"https://upload.wikimedia.org/wikipedia/commons/4/44/Microsoft_logo.svg"
2803+
],
2804+
"site": "https://microsoft.github.io/debug-gym/blog/2025/10/bug-pilot/",
2805+
"folder": "20251110_frogboss-32b",
2806+
"resolved": 53.6,
2807+
"date": "2025-11-10",
2808+
"logs": "s3://swe-bench-experiments/verified/20251110_frogboss-32b/logs",
2809+
"trajs": "s3://swe-bench-experiments/verified/20251110_frogboss-32b/trajs",
2810+
"trajs_docent": false,
2811+
"cost": null,
2812+
"instance_cost": null,
2813+
"instance_calls": null,
2814+
"os_model": false,
2815+
"os_system": true,
2816+
"checked": false,
2817+
"tags": [
2818+
"Model: FrogBoss-32B-2510",
2819+
"Org: Microsoft",
2820+
"System: Attempts - 1"
2821+
],
2822+
"warning": null
2823+
},
27212824
{
27222825
"name": "mini-SWE-agent + Gemini 2.5 Pro (2025-05-06)",
27232826
"logo": [
@@ -3312,6 +3415,31 @@
33123415
],
33133416
"warning": null
33143417
},
3418+
{
3419+
"name": "FrogMini-14B-2510",
3420+
"logo": [
3421+
"https://upload.wikimedia.org/wikipedia/commons/4/44/Microsoft_logo.svg"
3422+
],
3423+
"site": "https://microsoft.github.io/debug-gym/blog/2025/10/bug-pilot/",
3424+
"folder": "20251110_frogmini-14b",
3425+
"resolved": 45.0,
3426+
"date": "2025-11-10",
3427+
"logs": "s3://swe-bench-experiments/verified/20251110_frogmini-14b/logs",
3428+
"trajs": "s3://swe-bench-experiments/verified/20251110_frogmini-14b/trajs",
3429+
"trajs_docent": false,
3430+
"cost": null,
3431+
"instance_cost": null,
3432+
"instance_calls": null,
3433+
"os_model": false,
3434+
"os_system": true,
3435+
"checked": "false (See README.md for info on how to get your results verified)",
3436+
"tags": [
3437+
"Model: FrogMini-14B-2510",
3438+
"Org: Microsoft",
3439+
"System: Attempts - 1"
3440+
],
3441+
"warning": null
3442+
},
33153443
{
33163444
"name": "mini-SWE-agent + o4-mini (2025-04-16)",
33173445
"logo": [

js/mainResults.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ function renderLeaderboardTable(leaderboard) {
118118
<td>
119119
<div class="flex items-center gap-1">
120120
<div class="model-badges">
121-
${item.date >= "2025-06-25" ? '<span>🆕</span>' : ''}
121+
${item.date >= "2025-10-15" ? '<span>🆕</span>' : ''}
122122
${item.oss ? '<span>🤠</span>' : ''}
123123
${item.checked ? '<span title="The agent run was performed by or directly verified by the SWE-bench team">✅</span>' : ''}
124124
</div>

templates/pages/press.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ <h1>Press</h1>
1414
</div>
1515
</header>
1616

17-
<section class="container">
17+
<section class="container" style="margin-top: 1em;">
1818
{% for item in press %}
1919
<!-- <div class="content-section"> -->
2020
<p>

0 commit comments

Comments
 (0)