{"meta": {"exported_at": "2026-05-05T13:29:28Z", "total_benchmarks": 463, "total_sources": 524, "total_categories": 25, "site_url": "https://agentic-eval-benchmarks.pages.dev", "summaries_url": "https://agentic-eval-benchmarks.pages.dev/summaries_data.json", "export_url": "https://agentic-eval-benchmarks.pages.dev/export.json", "description": "Comprehensive taxonomy of agentic AI evaluation benchmarks. For full deep-dive content, fetch summaries_url."}, "benchmarks": [{"id": 1, "name": "SWE-bench", "publisher": "Princeton NLP", "date": "2023", "venue": "ICLR 2024", "url": "https://www.swebench.com/", "tasks": "2,294", "topScore": "~65%", "category": "Coding", "capabilities": "Bug fixing, code gen, repo understanding", "score": 95, "citations": "~750", "tier": "Tier 1", "importedAt": "2026-03-08"}, {"id": 2, "name": "SWE-bench Verified", "publisher": "OpenAI + Princeton", "date": "2024", "venue": "—", "url": "https://www.swebench.com/", "tasks": "500", "topScore": "~72%", "category": "Coding", "capabilities": "Bug fixing (verified)", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 3, "name": "SWE-bench Lite", "publisher": "Princeton NLP", "date": "2024", "venue": "—", "url": "https://www.swebench.com/", "tasks": "300", "topScore": "—", "category": "Coding", "capabilities": "Lighter subset", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 4, "name": "SWE-bench Multimodal", "publisher": "Princeton NLP", "date": "2024", "venue": "ICLR 2025", "url": "https://arxiv.org/abs/2410.03859", "tasks": "617", "topScore": "~30%", "category": "Coding", "capabilities": "Visual bugs, frontend, JS", "score": 65, "citations": "~180", "tier": "Tier 2", "importedAt": "2026-03-08"}, {"id": 5, "name": "SWE-bench+", "publisher": "Princeton NLP", "date": "2025", "venue": "Under review", "url": "", "tasks": "Quality audit", "topScore": "—", "category": "Coding", "capabilities": "Dataset quality analysis", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 6, "name": "SWE-bench-Live", "publisher": "Microsoft", "date": "2025", "venue": "NeurIPS 2025", "url": "https://swe-bench-live.github.io/", "tasks": "1,890+", "topScore": "—", "category": "Coding", "capabilities": "Multi-language, contamination-free", "score": 53, "citations": "~40", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 7, "name": "SWE-bench Pro", "publisher": "Scale AI", "date": "2025", "venue": "—", "url": "https://scale.com/leaderboard/swe_bench_pro_public", "tasks": "Rigorous subset", "topScore": "45.89%", "category": "Coding", "capabilities": "Software engineering", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 8, "name": "SWE-Lancer", "publisher": "OpenAI", "date": "2025", "venue": "ICML 2025", "url": "https://arxiv.org/abs/2502.12115", "tasks": "1,400+", "topScore": "$403K", "category": "Coding", "capabilities": "Freelance SE, economic value", "score": 67, "citations": "~40", "tier": "Tier 2", "importedAt": "2026-03-07"}, {"id": 9, "name": "SWE-EVO", "publisher": "Academic", "date": "2025", "venue": "—", "url": "https://arxiv.org/abs/2512.18470", "tasks": "48", "topScore": "21%", "category": "Coding", "capabilities": "Long-horizon evolution", "score": 41, "citations": "~5", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 10, "name": "SWE-Atlas QnA", "publisher": "Scale AI", "date": "2026", "venue": "—", "url": "https://scale.com/leaderboard/sweatlas-qna", "tasks": "124", "topScore": "<30%", 
"category": "Coding", "capabilities": "Deep code comprehension", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 11, "name": "FeatureBench", "publisher": "Academic", "date": "2026", "venue": "ICLR 2026", "url": "https://arxiv.org/abs/2602.10975", "tasks": "200", "topScore": "11%", "category": "Coding", "capabilities": "Feature development", "score": 53, "citations": "~8", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 12, "name": "ABC-Bench", "publisher": "OpenMOSS", "date": "2026", "venue": "—", "url": "https://arxiv.org/abs/2601.11077", "tasks": "224", "topScore": "—", "category": "Coding", "capabilities": "Backend, multi-language", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 13, "name": "LiveCodeBench", "publisher": "Academic", "date": "2024", "venue": "arXiv", "url": "https://livecodebench.github.io/", "tasks": "1,055+", "topScore": "—", "category": "Coding", "capabilities": "Contamination-free coding", "score": 69, "citations": "~120", "tier": "Tier 2", "importedAt": "2026-03-08"}, {"id": 14, "name": "InterCode", "publisher": "Princeton NLP", "date": "2023", "venue": "NeurIPS 2023", "url": "https://intercode-benchmark.github.io/", "tasks": "5 envs", "topScore": "—", "category": "Coding", "capabilities": "Interactive coding with feedback", "score": 76, "citations": "~150", "tier": "Tier 2", "importedAt": "2026-03-08"}, {"id": 15, "name": "CodeAct", "publisher": "All Hands AI", "date": "2024", "venue": "ICLR 2025", "url": "https://arxiv.org/abs/2402.01030", "tasks": "—", "topScore": "+20%", "category": "Coding", "capabilities": "Executable code actions", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 16, "name": "NL2Repo-Bench", "publisher": "CMU", "date": "2025", "venue": "—", "url": "https://arxiv.org/abs/2512.12730", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "Full repo generation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 17, "name": "Terminal-Bench 2.0", "publisher": "Laude Institute (Merrill, Carlini et al.)", "date": "2025", "venue": "arxiv", "url": "https://www.tbench.ai/", "tasks": "89", "topScore": "78.4% (Gemini 3.1 Pro)", "category": "Coding", "capabilities": "CLI, sysadmin, ML, reverse eng", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 18, "name": "OpenHands Index", "publisher": "All Hands AI", "date": "2026", "venue": "—", "url": "https://openhands.dev/blog/openhands-index", "tasks": "5 domains", "topScore": "—", "category": "Coding", "capabilities": "Broad coding agent eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 19, "name": "CooperBench", "publisher": "Academic", "date": "2026", "venue": "—", "url": "https://cooperbench.com/", "tasks": "600+", "topScore": "—", "category": "Coding", "capabilities": "Collaborative multi-agent coding", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 20, "name": "WebArena", "publisher": "CMU", "date": "2023", "venue": "ICLR 2024", "url": "https://webarena.dev/", "tasks": "812", "topScore": "61.7%", "category": "Web", "capabilities": "Realistic web navigation", "score": 90, "citations": "~520", "tier": "Tier 1", "importedAt": "2026-03-08"}, {"id": 21, "name": "VisualWebArena", "publisher": "CMU", "date": "2024", "venue": "ICLR 2025", "url": "https://jykoh.com/vwa", "tasks": "910", "topScore": "16.4%", "category": "Web", "capabilities": 
"Multimodal web", "score": 80, "citations": "~180", "tier": "Tier 1", "importedAt": "2026-03-08"}, {"id": 22, "name": "BrowseComp", "publisher": "OpenAI", "date": "2025", "venue": "Announcement", "url": "https://arxiv.org/abs/2504.12516", "tasks": "1,266", "topScore": "51.5%", "category": "Web", "capabilities": "Deep web research", "score": 57, "citations": "~55", "tier": "Tier 3", "importedAt": "2026-03-07"}, {"id": 23, "name": "Mind2Web", "publisher": "Ohio State", "date": "2023", "venue": "NeurIPS 2023", "url": "https://osu-nlp-group.github.io/Mind2Web/", "tasks": "2,000+", "topScore": "—", "category": "Web", "capabilities": "Cross-domain web agent", "score": 90, "citations": "~758", "tier": "Tier 1", "importedAt": "2026-03-08"}, {"id": 24, "name": "Mind2Web 2", "publisher": "Ohio State", "date": "2025", "venue": "NeurIPS 2025", "url": "https://arxiv.org/abs/2506.21506", "tasks": "130", "topScore": "—", "category": "Web", "capabilities": "Agentic search, Agent-as-Judge", "score": 55, "citations": "~20", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 25, "name": "WorkArena", "publisher": "ServiceNow", "date": "2024", "venue": "ICML 2024", "url": "https://servicenow.github.io/WorkArena/", "tasks": "19,912", "topScore": "—", "category": "Web", "capabilities": "Enterprise web workflows", "score": 75, "citations": "~80", "tier": "Tier 2", "importedAt": "2026-03-08"}, {"id": 26, "name": "WebShop", "publisher": "Princeton NLP", "date": "2022", "venue": "arXiv", "url": "https://webshop-pnlp.github.io/", "tasks": "12,087", "topScore": "—", "category": "Web", "capabilities": "E-commerce navigation", "score": 72, "citations": "~450", "tier": "Tier 2", "importedAt": "2026-03-08"}, {"id": 27, "name": "BFCL", "publisher": "UC Berkeley", "date": "2024", "venue": "—", "url": "https://gorilla.cs.berkeley.edu/leaderboard.html", "tasks": "Multi-cat", "topScore": "—", "category": "Tool Use", "capabilities": "Function calling, AST eval", "score": 81, "citations": "~150", "tier": "Tier 1", "importedAt": "2026-03-08"}, {"id": 28, "name": "MCP-Atlas", "publisher": "Scale AI", "date": "2026", "venue": "—", "url": "https://arxiv.org/abs/2602.00933", "tasks": "1,000", "topScore": "62.3%", "category": "Tool Use", "capabilities": "MCP tool orchestration", "score": 64, "citations": "~5", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 29, "name": "Toolathlon", "publisher": "HKUST / Duke", "date": "2025", "venue": "ICLR 2026", "url": "https://arxiv.org/abs/2510.25726", "tasks": "108", "topScore": "38.6%", "category": "Tool Use", "capabilities": "MCP tool use, long-horizon", "score": 59, "citations": "~15", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 30, "name": "MCPAgentBench", "publisher": "Academic", "date": "2025", "venue": "—", "url": "https://arxiv.org/abs/2512.24565", "tasks": "20K+", "topScore": "—", "category": "Tool Use", "capabilities": "MCP tool selection", "score": 51, "citations": "~5", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 31, "name": "ToolComp", "publisher": "Scale AI", "date": "2025", "venue": "—", "url": "https://scale.com/leaderboard/tool_use_enterprise", "tasks": "485", "topScore": "—", "category": "Tool Use", "capabilities": "Dependent tool calling", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 32, "name": "GAIA", "publisher": "Meta AI / HuggingFace", "date": "2023", "venue": "ICLR 2024", "url": "https://huggingface.co/gaia-benchmark", "tasks": "466", "topScore": "~75%", "category": "General", "capabilities": "General AI 
assistant", "score": 88, "citations": "~390", "tier": "Tier 1", "importedAt": "2026-03-08"}, {"id": 33, "name": "AgentBench", "publisher": "Tsinghua", "date": "2023", "venue": "ICLR 2024", "url": "https://arxiv.org/abs/2308.03688", "tasks": "8 envs", "topScore": "—", "category": "General", "capabilities": "Multi-environment agent", "score": 92, "citations": "~620", "tier": "Tier 1", "importedAt": "2026-03-08"}, {"id": 34, "name": "TheAgentCompany", "publisher": "CMU", "date": "2024", "venue": "NeurIPS 2025", "url": "https://the-agent-company.com/", "tasks": "175", "topScore": "24%", "category": "General", "capabilities": "Enterprise workplace", "score": 78, "citations": "~70", "tier": "Tier 2", "importedAt": "2026-03-08"}, {"id": 35, "name": "HAL", "publisher": "Princeton", "date": "2025", "venue": "ICLR 2026", "url": "https://hal.cs.princeton.edu/", "tasks": "21,730 rollouts", "topScore": "—", "category": "General", "capabilities": "Cross-domain meta-evaluation", "score": 66, "citations": "~30", "tier": "Tier 2", "importedAt": "2026-03-08"}, {"id": 36, "name": "LiveAgentBench", "publisher": "Academic", "date": "2026", "venue": "—", "url": "https://arxiv.org/abs/2603.02586", "tasks": "374", "topScore": "—", "category": "General", "capabilities": "Multi-capability, live", "score": 50, "citations": "~3", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 37, "name": "AppWorld", "publisher": "Stony Brook NLP", "date": "2024", "venue": "ACL 2024", "url": "https://appworld.dev/", "tasks": "750", "topScore": "—", "category": "General", "capabilities": "Interactive app coding", "score": 74, "citations": "~65", "tier": "Tier 2", "importedAt": "2026-03-08"}, {"id": 38, "name": "OSWorld", "publisher": "XLANG Lab", "date": "2024", "venue": "NeurIPS 2024", "url": "https://os-world.github.io/", "tasks": "369", "topScore": "60.8%", "category": "OS", "capabilities": "OS interaction, GUI", "score": 86, "citations": "~280", "tier": "Tier 1", "importedAt": "2026-03-08"}, {"id": 39, "name": "OSWorld-Verified", "publisher": "XLANG Lab", "date": "2025", "venue": "—", "url": "https://xlang.ai/blog/osworld-verified", "tasks": "300+", "topScore": "—", "category": "OS", "capabilities": "Improved eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 40, "name": "OSWorld-MCP", "publisher": "X-PLUG", "date": "2025", "venue": "Announcement", "url": "https://github.com/X-PLUG/OSWorld-MCP", "tasks": "158 MCP tools", "topScore": "—", "category": "OS", "capabilities": "OS + MCP", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 41, "name": "Spider 2.0", "publisher": "XLANG Lab", "date": "2024", "venue": "ICLR 2025", "url": "https://spider2-sql.github.io/", "tasks": "3000+ cols", "topScore": "6%", "category": "OS", "capabilities": "Enterprise text-to-SQL", "score": 58, "citations": "~40", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 42, "name": "xbench", "publisher": "XLANG Lab", "date": "2025", "venue": "—", "url": "https://arxiv.org/abs/2506.13651", "tasks": "Profession-aligned", "topScore": "—", "category": "OS", "capabilities": "Evergreen, profession eval", "score": 43, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 43, "name": "tau-bench", "publisher": "Sierra AI", "date": "2024", "venue": "—", "url": "https://taubench.com/", "tasks": "Retail + Airline", "topScore": "—", "category": "Customer Service", "capabilities": "Tool-Agent-User, pass^k", "score": 82, "citations": "~120", "tier": "Tier 1", "importedAt": 
"2026-03-08"}, {"id": 44, "name": "tau2-bench", "publisher": "Sierra Research", "date": "2025", "venue": "—", "url": "https://arxiv.org/abs/2506.07982", "tasks": "+ Telecom", "topScore": "34%", "category": "Customer Service", "capabilities": "Dual-control Dec-POMDP", "score": 62, "citations": "~15", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 45, "name": "tau-Knowledge", "publisher": "Sierra Research", "date": "2026", "venue": "—", "url": "https://arxiv.org/abs/2603.04370", "tasks": "Banking", "topScore": "—", "category": "Customer Service", "capabilities": "Unstructured knowledge", "score": 44, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 259, "name": "τ³-bench", "publisher": "Sierra AI", "date": "2026", "venue": "—", "url": "https://sierra.ai/resources/research/tau-3-bench", "tasks": "τ-Banking (~700 docs) + voice", "topScore": "~25.5%", "category": "Customer Service", "capabilities": "Knowledge retrieval, voice interaction, customer service", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-26"}, {"id": 260, "name": "c-CRAB", "publisher": "NUS / Zhejiang University / SonarSource", "date": "2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2603.23448", "tasks": "184 PR instances, 234 executable tests, 67 repos", "topScore": "32.1% (Claude Code)", "category": "Coding", "capabilities": "Code review, issue detection, test-based review evaluation, review agent pipeline", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-27"}, {"id": 261, "name": "ProjectEval", "publisher": "—", "date": "2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2503.07010", "tasks": "20 projects, 284 test cases, 3 input levels", "topScore": "—", "category": "Coding", "capabilities": "Project-level code generation, user interaction simulation, cascaded generation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-27"}, {"id": 262, "name": "EvoCodeBench", "publisher": "—", "date": "2026", "venue": "KDD 2026", "url": "https://arxiv.org/abs/2602.10171", "tasks": "3,822 LeetCode problems; 100-problem eval set; 5 languages", "topScore": "—", "category": "Coding", "capabilities": "Self-evolving LLM coding, multilingual, runtime/memory efficiency", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-27"}, {"id": 263, "name": "PRDBench", "publisher": "—", "date": "2025", "venue": "AAMAS 2026", "url": "https://arxiv.org/abs/2510.24358", "tasks": "50 Python projects, 20 domains, 1,258 metrics", "topScore": "45.5% (Claude Code)", "category": "Coding", "capabilities": "PRD-to-code, full project implementation, LLM-as-judge", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-27"}, {"id": 264, "name": "LoCoBench-Agent", "publisher": "Salesforce AI Research", "date": "2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2511.13998", "tasks": "8K scenarios; 10 languages; 10K–1M token contexts; 8 tools", "topScore": "—", "category": "Coding", "capabilities": "Long-context software engineering, interactive agent, memory retention", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-27"}, {"id": 265, "name": "SWE-PolyBench", "publisher": "Amazon AWS", "date": "2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2504.08703", "tasks": "2,110 instances across Java/JS/TS/Python from 21 repos", "topScore": "14.1% (AiderPB + Claude 3.5 Sonnet)", "category": "Coding", "capabilities": "Multi-language bug fixing, code retrieval, CST-level analysis", "score": null, 
"citations": null, "tier": null, "importedAt": "2026-03-27"}, {"id": 266, "name": "OctoBench", "publisher": "MiniMax", "date": "2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2601.10343", "tasks": "217 instances, 7,098 checklist items, 3 scaffold types", "topScore": "28.11% ISR (Claude Opus 4.5)", "category": "Coding", "capabilities": "Scaffold-aware coding agent, instruction following, per-check vs end-to-end compliance", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-27"}, {"id": 267, "name": "Multi-SWE-bench", "publisher": "ByteDance Seed", "date": "2025", "venue": "NeurIPS 2025", "url": "https://arxiv.org/abs/2504.02605", "tasks": "1,632 validated instances; 7 non-Python languages + Python → ~2,132 total", "topScore": "—", "category": "Coding", "capabilities": "Multilingual issue resolution, multi-language code repair, RL training data", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-27"}, {"id": 268, "name": "MobileDev-Bench", "publisher": "LSU / University of Kentucky", "date": "2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2603.24946", "tasks": "384 tasks from 18 mobile repos (Android, React Native, Flutter); 35.7% multi-artifact", "topScore": "5.21% (Claude Sonnet 4.5)", "category": "Coding", "capabilities": "Mobile app development, multi-artifact coordination, fault localization", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-27"}, {"id": 269, "name": "SDBench", "publisher": "Microsoft AI", "date": "2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2506.22405", "tasks": "304 NEJM clinicopathological cases; cost-constrained; Gatekeeper + Judge models", "topScore": "85.5% (MAI-DxO ensemble)", "category": "Medical", "capabilities": "Sequential clinical diagnosis, interactive info-gathering, cost-constrained decision-making, multi-agent", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-27"}, {"id": 270, "name": "RiemannBench", "publisher": "Surge AI Research", "date": "2026", "venue": "Announcement", "url": "https://cdn.prod.website-files.com/68dc970bd6e945ea3fb0f426/69c2d73f5d377a9428089ff7_88b9c61d478380737e8f8dc285adba31_RiemannBench.pdf", "tasks": "25 private PhD-level math problems; Python interpreter + search; double-blind verification", "topScore": "6% pass@1 (Gemini 3.1 Pro, Claude Opus 4.6)", "category": "Reasoning", "capabilities": "Frontier math reasoning, research agent tool use, theorem verification", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-27"}, {"id": 271, "name": "ARC-AGI-3", "publisher": "ARC Prize Foundation", "date": "2025", "venue": "Announcement", "url": "https://arcprize.org/blog/arc-agi-3-preview-30-day-learnings", "tasks": "Interactive video-game environments; scored as action efficiency vs. 
human baseline", "topScore": "12.58% human efficiency (StochasticGoose)", "category": "Reasoning", "capabilities": "Interactive reasoning, exploration, memory, goal acquisition, on-the-fly learning", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-27"}, {"id": 46, "name": "MLE-bench", "publisher": "OpenAI", "date": "2024", "venue": "NeurIPS 2024", "url": "", "tasks": "75 Kaggle", "topScore": "16.9%", "category": "ML/Research", "capabilities": "ML engineering", "score": 77, "citations": "~95", "tier": "Tier 2", "importedAt": "2026-03-08"}, {"id": 47, "name": "RE-Bench", "publisher": "METR", "date": "2024", "venue": "arXiv", "url": "https://metr.org/research/", "tasks": "7 envs", "topScore": "4x human", "category": "ML/Research", "capabilities": "Research engineering", "score": 73, "citations": "~60", "tier": "Tier 2", "importedAt": "2026-03-08"}, {"id": 48, "name": "HCAST", "publisher": "METR", "date": "2025", "venue": "—", "url": "https://arxiv.org/abs/2503.17354", "tasks": "189", "topScore": "70-80%", "category": "ML/Research", "capabilities": "Autonomous tasks (calibrated)", "score": 60, "citations": "~25", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 49, "name": "PaperBench", "publisher": "OpenAI", "date": "2025", "venue": "ICML 2025", "url": "https://arxiv.org/abs/2504.01848", "tasks": "8,316", "topScore": "21-27%", "category": "ML/Research", "capabilities": "Research replication", "score": 70, "citations": "~55", "tier": "Tier 2", "importedAt": "2026-03-07"}, {"id": 50, "name": "SciCode", "publisher": "Princeton", "date": "2024", "venue": "arXiv", "url": "https://scicode-bench.github.io/", "tasks": "338", "topScore": "4.6%", "category": "ML/Research", "capabilities": "Scientific coding", "score": 68, "citations": "~55", "tier": "Tier 2", "importedAt": "2026-03-08"}, {"id": 51, "name": "FrontierScience", "publisher": "OpenAI", "date": "2025", "venue": "—", "url": "https://arxiv.org/abs/2601.21165", "tasks": "2 tracks", "topScore": "77%/25%", "category": "ML/Research", "capabilities": "Expert scientific reasoning", "score": 50, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 52, "name": "Humanity's Last Exam", "publisher": "CAIS + Scale AI", "date": "2025", "venue": "Nature 2025", "url": "https://agi.safe.ai/", "tasks": "2,500", "topScore": "37.5%", "category": "Reasoning", "capabilities": "Expert-level reasoning", "score": 56, "citations": "~45", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 53, "name": "ARC-AGI-2", "publisher": "ARC Prize", "date": "2025", "venue": "—", "url": "https://arcprize.org/", "tasks": "Abstract", "topScore": "24%", "category": "Reasoning", "capabilities": "Abstract generalization", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 54, "name": "D-REX", "publisher": "CAIS / Gray Swan", "date": "2025", "venue": "—", "url": "https://arxiv.org/abs/2509.17938", "tasks": "—", "topScore": "—", "category": "Reasoning", "capabilities": "Deceptive CoT detection", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 55, "name": "APEX-Agents", "publisher": "Mercor", "date": "2026", "venue": "—", "url": "https://arxiv.org/abs/2601.14242", "tasks": "480", "topScore": "24%", "category": "Enterprise", "capabilities": "IB, consulting, law", "score": 54, "citations": "~5", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 258, "name": "APEX-SWE", "publisher": "Mercor + Cognition", "date": "2026", "venue": "arXiv", "url": 
"https://arxiv.org/abs/2601.08806", "tasks": "200 (100 Integration + 100 Observability)", "topScore": "40.5% (Claude Opus 4.6)", "category": "Coding", "capabilities": "Multi-service integration, production debugging/observability, MCP tool use", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-26"}, {"id": 56, "name": "Remote Labor Index", "publisher": "Scale AI + CAIS", "date": "2025", "venue": "—", "url": "https://www.remotelabor.ai/", "tasks": "240", "topScore": "2.5%", "category": "Enterprise", "capabilities": "Real-world remote work", "score": 46, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 57, "name": "Finance Agent Benchmark", "publisher": "Vals AI", "date": "2025", "venue": "—", "url": "https://www.vals.ai/benchmarks/finance_agent", "tasks": "537", "topScore": "48.3%", "category": "Enterprise", "capabilities": "Financial analysis (SEC)", "score": 47, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 58, "name": "Context-Bench", "publisher": "Letta", "date": "2025", "venue": "—", "url": "https://www.letta.com/blog/context-bench", "tasks": "File + entity", "topScore": "—", "category": "Memory", "capabilities": "Context engineering", "score": 49, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 59, "name": "MEMTRACK", "publisher": "Patronus AI", "date": "2025", "venue": "NeurIPS SEA", "url": "https://www.patronus.ai/blog/memtrack", "tasks": "Slack + Linear", "topScore": "—", "category": "Memory", "capabilities": "Long-term memory", "score": 48, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 60, "name": "CL-bench", "publisher": "Tencent Hunyuan", "date": "2026", "venue": "Announcement", "url": "https://github.com/Tencent-Hunyuan/CL-bench", "tasks": "1,899", "topScore": "—", "category": "Memory", "capabilities": "Context learning", "score": 42, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 61, "name": "EnIGMA", "publisher": "Princeton NLP", "date": "2024", "venue": "ICML 2025", "url": "https://arxiv.org/abs/2409.16165", "tasks": "CTF", "topScore": "—", "category": "Security", "capabilities": "Interactive security eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 62, "name": "CVE-Bench", "publisher": "UIUC", "date": "2025", "venue": "ICML 2025", "url": "https://arxiv.org/abs/2503.17332", "tasks": "40 CVEs", "topScore": "10-13%", "category": "Security", "capabilities": "Vulnerability exploitation", "score": 40, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 63, "name": "ZeroDayBench", "publisher": "Academic", "date": "2026", "venue": "ICLR 2026 WS", "url": "https://arxiv.org/html/2603.02297", "tasks": "22", "topScore": "—", "category": "Security", "capabilities": "Zero-day discovery", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 64, "name": "AgentHarm", "publisher": "Gray Swan AI", "date": "2024", "venue": "ICLR 2025", "url": "https://arxiv.org/abs/2410.09024", "tasks": "110", "topScore": "—", "category": "Safety", "capabilities": "Agent safety, jailbreak", "score": 52, "citations": "~25", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 65, "name": "TRAIL", "publisher": "Patronus AI", "date": "2025", "venue": "—", "url": "https://www.patronus.ai/blog/introducing-trail", "tasks": "148 traces", "topScore": "<11%", "category": "Safety", "capabilities": "Agentic debugging", "score": 63, "citations": "~10", "tier": "Tier 3", "importedAt": "2026-03-08"}, 
{"id": 66, "name": "PropensityBench", "publisher": "Scale AI / UMD", "date": "2025", "venue": "—", "url": "", "tasks": "979", "topScore": "—", "category": "Safety", "capabilities": "Risk behavior under pressure", "score": 45, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 67, "name": "DABStep", "publisher": "Academic / HF", "date": "2025", "venue": "—", "url": "https://arxiv.org/abs/2506.23719", "tasks": "450+", "topScore": "14.55%", "category": "Data Science", "capabilities": "Financial data analytics", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 68, "name": "MultiAgentBench", "publisher": "Academic", "date": "2025", "venue": "ACL 2025", "url": "https://arxiv.org/abs/2503.01935", "tasks": "Multi-agent", "topScore": "—", "category": "Multi-Agent", "capabilities": "Coordination, KPIs", "score": 57, "citations": "~20", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 69, "name": "MedAgentBench", "publisher": "Stanford", "date": "2025", "venue": "NEJM AI", "url": "", "tasks": "300 clinical", "topScore": "70%", "category": "Medical", "capabilities": "Clinical EHR", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 70, "name": "HELM", "publisher": "Stanford CRFM", "date": "2022", "venue": "—", "url": "https://crfm.stanford.edu/helm/", "tasks": "Multi-faceted", "topScore": "—", "category": "Framework", "capabilities": "Holistic LLM eval", "score": null, "citations": "~1800", "tier": "Tier 4", "importedAt": "2026-03-08"}, {"id": 71, "name": "Chatbot Arena", "publisher": "LMSYS / LMArena", "date": "2023", "venue": "—", "url": "https://lmsys.org/", "tasks": "6M+ votes", "topScore": "—", "category": "Framework", "capabilities": "Human preference ranking", "score": null, "citations": "~3200", "tier": "Tier 4", "importedAt": "2026-03-08"}, {"id": 72, "name": "SEAL Leaderboards", "publisher": "Scale AI", "date": "2024", "venue": "—", "url": "https://scale.com/leaderboard", "tasks": "15+ benchmarks", "topScore": "—", "category": "Framework", "capabilities": "Multi-domain eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 73, "name": "Wiz Cyber Arena", "publisher": "Wiz", "date": "2026", "venue": "—", "url": "https://www.wiz.io/cyber-model-arena", "tasks": "257", "topScore": "—", "category": "Security", "capabilities": "Offensive cybersecurity", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 74, "name": "BioAgent Bench", "publisher": "Entropic / FER Zagreb", "date": "Jan 2026", "venue": "—", "url": "https://arxiv.org/abs/2601.21800", "tasks": "10 bioinformatics tasks (RNA-seq, variant calling, metagenomics)", "topScore": "90–100% frontier; 82.5% open-weight", "category": "Data Science", "capabilities": "Bioinformatics pipeline orchestration, tool use, robustness testing", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-01"}, {"id": 75, "name": "Search Arena", "publisher": "LMArena", "date": "2025", "venue": "ICLR 2026", "url": "https://arxiv.org/abs/2506.05334", "tasks": "24K convos", "topScore": "—", "category": "Framework", "capabilities": "Search-augmented LLMs", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 76, "name": "BLUR", "publisher": "Patronus AI", "date": "2025", "venue": "ACL 2025", "url": "https://www.patronus.ai/blog/the-blur-benchmark", "tasks": "573", "topScore": "~56%", "category": "Reasoning", "capabilities": "Multilingual search", "score": null, 
"citations": null, "tier": null, "importedAt": "2026-03-07"}, {"id": 77, "name": "GDPval", "publisher": "OpenAI", "date": "Oct 2025", "venue": "ICLR 2026", "url": "https://arxiv.org/abs/2510.04374", "tasks": "1,320 tasks, 44 occupations, 9 US GDP sectors; 220-task open gold subset", "topScore": "~47.6% win-or-tie (Claude Opus 4.1)", "category": "Enterprise", "capabilities": "Economic value of agents, multi-modal deliverables, long-horizon professional work", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-01"}, {"id": 78, "name": "ScienceAgentBench", "publisher": "Ohio State", "date": "2024", "venue": "ICLR 2025", "url": "https://arxiv.org/abs/2410.05080", "tasks": "102", "topScore": "—", "category": "ML/Research", "capabilities": "Scientific discovery", "score": 71, "citations": "~45", "tier": "Tier 2", "importedAt": "2026-03-08"}, {"id": 79, "name": "CORE-Bench", "publisher": "Princeton", "date": "2024", "venue": "TMLR 2025", "url": "https://arxiv.org/abs/2409.11363", "tasks": "270", "topScore": "—", "category": "ML/Research", "capabilities": "Reproducibility", "score": 59, "citations": "~20", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 80, "name": "BrowserGym", "publisher": "ServiceNow/CMU", "date": "2025", "venue": "TMLR 2025", "url": "", "tasks": "Framework", "topScore": "—", "category": "Web", "capabilities": "Web agent eval ecosystem", "score": 61, "citations": "~40", "tier": "Tier 3", "importedAt": "2026-03-08"}, {"id": 81, "name": "ToolLLM/ToolBench", "publisher": "Academic", "date": "2023", "venue": "—", "url": "", "tasks": "16,464 APIs", "topScore": "—", "category": "Tool Use", "capabilities": "Tool retrieval, API usage", "score": 87, "citations": "~580", "tier": "Tier 1", "importedAt": "2026-03-08"}, {"id": 82, "name": "RefactorBench", "publisher": "Microsoft", "date": "2025", "venue": "ICLR 2025", "url": "https://arxiv.org/abs/2503.07832", "tasks": "100", "topScore": "22%", "category": "Coding", "capabilities": "Multi-file refactoring, stateful reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 83, "name": "DeepPlanning", "publisher": "Qwen / Alibaba", "date": "2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2601.18137", "tasks": "120+", "topScore": "~35%", "category": "Reasoning", "capabilities": "Long-horizon planning, constraints", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 84, "name": "AgencyBench", "publisher": "SJTU GAIR-NLP (Pengfei Liu)", "date": "2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2601.11044", "tasks": "138", "topScore": "56.5%", "category": "General", "capabilities": "Autonomous agents, 1M tokens, 90 tool calls", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 85, "name": "SkillsBench", "publisher": "Multi-institutional", "date": "2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2602.12670", "tasks": "86", "topScore": "+16.2pp", "category": "General", "capabilities": "Agent skill augmentation, 11 domains", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 86, "name": "OfficeBench", "publisher": "UCSD (Wang et al.)", "date": "2024", "venue": "arxiv", "url": "https://arxiv.org/abs/2407.19056", "tasks": "~300", "topScore": "47%", "category": "Enterprise", "capabilities": "Office automation, 9 apps", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 87, "name": "CRMArena", "publisher": "Salesforce AI Research", "date": 
"2024", "venue": "NAACL 2025", "url": "https://arxiv.org/abs/2411.02305", "tasks": "9 types × 3 personas", "topScore": "<55%", "category": "Enterprise", "capabilities": "CRM operations, Salesforce", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 88, "name": "EnterpriseBench", "publisher": "Fujitsu Research", "date": "2025", "venue": "EMNLP 2025", "url": "https://arxiv.org/abs/2510.27287", "tasks": "500", "topScore": "<30%", "category": "Enterprise", "capabilities": "SWE, HR, finance, admin", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 89, "name": "AssistantBench", "publisher": "Tel Aviv U / Princeton", "date": "2024", "venue": "EMNLP 2024", "url": "https://arxiv.org/abs/2407.15711", "tasks": "214", "topScore": "<26%", "category": "Web", "capabilities": "Web research, time-consuming tasks", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 90, "name": "PinchBench", "publisher": "Kilo Code", "date": "2025", "venue": "Industry", "url": "https://pinchbench.com/", "tasks": "23", "topScore": "97.8%", "category": "Coding", "capabilities": "OpenClaw coding agent eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 91, "name": "SWE-rebench V2", "publisher": "Nebius", "date": "Feb 2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2602.23866", "tasks": "32,000+ (20 langs)", "topScore": "—", "category": "Coding", "capabilities": "Language-agnostic SWE, 3,600+ repos", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 92, "name": "AMA-Bench", "publisher": "UCSD + Meta FAIR", "date": "Feb 2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2602.22769", "tasks": "Real + synthetic trajectories", "topScore": "57.22%", "category": "Memory", "capabilities": "Long-horizon agent memory, causality tracking", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 93, "name": "Gaia2", "publisher": "Meta AI (FAIR)", "date": "Feb 2026", "venue": "ICLR 2026 Oral", "url": "https://arxiv.org/abs/2602.11964", "tasks": "1,120 scenarios", "topScore": "42% (GPT-5)", "category": "General", "capabilities": "Dynamic/async environments, temporal reasoning, multi-agent", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 94, "name": "KoCo-Bench", "publisher": "Peking U / Wuhan U", "date": "Jan 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2601.13240", "tasks": "131 functions + 978 tests", "topScore": "34.2% (Claude Code)", "category": "Coding", "capabilities": "Domain-specific code gen, knowledge corpora, 6 domains", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 95, "name": "FeatBench", "publisher": "Tsinghua ISE", "date": "Feb 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2509.22237", "tasks": "157 tasks (27 repos)", "topScore": "29.94%", "category": "Coding", "capabilities": "Feature-level code gen, NL-only inputs, regression testing", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 96, "name": "RubberDuckBench", "publisher": "Bryn Mawr / Google", "date": "Jan 2026", "venue": "LLM4Code @ ICSE 2026", "url": "https://arxiv.org/abs/2601.16456", "tasks": "15 questions", "topScore": "69.29% (Grok 4)", "category": "Coding", "capabilities": "Code understanding Q&A, hallucination detection", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 97, "name": "A.S.E", 
"publisher": "Tencent / Peking U", "date": "Sep 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2508.18106", "tasks": "120 instances", "topScore": "63.01%", "category": "Security", "capabilities": "Repo-level code security, CVE-based, 5 languages", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 98, "name": "ASTRA", "publisher": "HackerRank", "date": "Feb 2025", "venue": "arxiv (2502.00226)", "url": "https://www.hackerrank.com/ai/astra-reports", "tasks": "65 projects", "topScore": "81.96% (GPT-4.1)", "category": "Coding", "capabilities": "Multi-file project coding, 7 frameworks, consistency k=32", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 99, "name": "MCPVerse", "publisher": "CUHK-SZ / SenseTime", "date": "Aug 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2508.16260", "tasks": "250 tasks (552 tools)", "topScore": "44.2%", "category": "Tool Use", "capabilities": "Large-scale real MCP tools, 65 servers, outcome-based eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 100, "name": "ML-Tool-Bench", "publisher": "Amazon / Academic", "date": "Dec 2025", "venue": "ICML 2025", "url": "https://arxiv.org/abs/2512.00672", "tasks": "15 Kaggle + 61 tools", "topScore": "17.10%", "category": "Tool Use", "capabilities": "Tool-augmented ML planning, long-horizon, MCTS", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 101, "name": "MCP-AgentBench", "publisher": "USTC / Metastone", "date": "Sep 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2509.09734", "tasks": "600 queries", "topScore": "64.7%", "category": "Tool Use", "capabilities": "MCP tool use, 33 servers, 188 tools, ReAct vs TC", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 102, "name": "MCPEval", "publisher": "Salesforce AI Research", "date": "Jul 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2507.12806", "tasks": "676 verified", "topScore": "0.926 (O3)", "category": "Tool Use", "capabilities": "Automatic MCP eval, 5 domains, LLM-judge", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 103, "name": "MCP-RADAR", "publisher": "Xi'an Jiaotong / UMass", "date": "May 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2505.16700", "tasks": "507 tasks", "topScore": "61.2%", "category": "Tool Use", "capabilities": "Multi-dimensional MCP, 6 domains, 49 tools", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 104, "name": "MCPToolBench++", "publisher": "Ant Group", "date": "Aug 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2508.07575", "tasks": "1,509 QA pairs", "topScore": "varies", "category": "Tool Use", "capabilities": "Large-scale MCP, 4K+ servers, multilingual, AST DAG", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 105, "name": "SWE-smith", "publisher": "Princeton NLP", "date": "2025", "venue": "NeurIPS 2025 Spotlight", "url": "https://arxiv.org/abs/2504.21798", "tasks": "50K synthetic from 128 repos", "topScore": "—", "category": "Coding", "capabilities": "SE data generation pipeline", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 106, "name": "SWE-rebench", "publisher": "Academic", "date": "May 2025", "venue": "—", "url": "https://arxiv.org/abs/2505.20411", "tasks": "21,000+ tasks", "topScore": "—", "category": "Coding", "capabilities": "Contamination-free pipeline", 
"score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 107, "name": "SWT-Bench", "publisher": "ETH Zurich", "date": "2024", "venue": "NeurIPS 2024", "url": "https://swtbench.com/", "tasks": "1,900 test-gen tasks", "topScore": "—", "category": "Coding", "capabilities": "Software testing", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 108, "name": "Cline-Bench", "publisher": "Cline", "date": "Nov 2025", "venue": "—", "url": "https://cline.bot/blog/cline-bench-initiative", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "Containerized RL dev scenarios", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 109, "name": "ColBench", "publisher": "Academic", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2503.15478", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "Collaborative reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 110, "name": "Aider Polyglot", "publisher": "Aider", "date": "2024 (ongoing)", "venue": "—", "url": "https://aider.chat/docs/leaderboards/", "tasks": "225 Exercism problems", "topScore": "—", "category": "Coding", "capabilities": "Multi-language coding", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 111, "name": "GitTaskBench", "publisher": "QuantaAlpha", "date": "Aug 2025", "venue": "—", "url": "https://arxiv.org/abs/2508.18993", "tasks": "54 tasks (7 modalities)", "topScore": "—", "category": "Coding", "capabilities": "Repo-level understanding", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 112, "name": "Hybrid-Gym", "publisher": "CMU (Neubig)", "date": "Feb 2026", "venue": "ICLR 2026", "url": "https://arxiv.org/abs/2602.16819", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "Coding agent training env", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 113, "name": "WebArena Verified", "publisher": "ServiceNow", "date": "2025", "venue": "Announcement", "url": "https://github.com/ServiceNow/BrowserGym", "tasks": "Audited 812 tasks", "topScore": "—", "category": "Web", "capabilities": "Reduced false-negative by 11.3pp", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 114, "name": "VideoWebArena", "publisher": "CMU", "date": "2025", "venue": "ICLR 2025", "url": "https://arxiv.org/abs/2410.19100", "tasks": "2,021 video tasks", "topScore": "—", "category": "Web", "capabilities": "Video + web", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 115, "name": "WebChoreArena", "publisher": "Academic", "date": "Jun 2025", "venue": "—", "url": "https://arxiv.org/abs/2506.01952", "tasks": "532 tedium tasks", "topScore": "—", "category": "Web", "capabilities": "Memory, calculation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 116, "name": "WebVoyager", "publisher": "Academic", "date": "2024", "venue": "arXiv", "url": "https://arxiv.org/abs/2401.13919", "tasks": "643 tasks (15 sites)", "topScore": "97.1% (Surfer 2)", "category": "Web", "capabilities": "End-to-end browsing", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 117, "name": "Online-Mind2Web", "publisher": "Ohio State", "date": "2025", "venue": "—", "url": "", "tasks": "300 live website tasks", "topScore": "—", "category": "Web", "capabilities": 
"Real-time web interaction", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 118, "name": "BrowserArena", "publisher": "Academic", "date": "Oct 2025", "venue": "—", "url": "https://arxiv.org/html/2510.02418v2", "tasks": "Open-ended tasks", "topScore": "—", "category": "Web", "capabilities": "Live evaluation platform", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 119, "name": "WorkArena++", "publisher": "ServiceNow", "date": "2025", "venue": "—", "url": "", "tasks": "682 compositional tasks", "topScore": "—", "category": "Web", "capabilities": "Complex enterprise web", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 120, "name": "MCP-Bench", "publisher": "Accenture", "date": "Aug 2025", "venue": "—", "url": "https://arxiv.org/abs/2508.20453", "tasks": "Real-world MCP tasks", "topScore": "—", "category": "Tool Use", "capabilities": "MCP tool use", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 121, "name": "AI-NativeBench", "publisher": "AINativeOps", "date": "Jan 2026", "venue": "—", "url": "https://arxiv.org/abs/2601.09393", "tasks": "8 apps, 3 domains", "topScore": "—", "category": "Tool Use", "capabilities": "MCP + A2A protocols", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 122, "name": "WildToolBench", "publisher": "Academic", "date": "2025", "venue": "arXiv", "url": "https://openreview.net/forum?id=yz7fL5vfpn", "tasks": "Real-world patterns", "topScore": "<15% all models", "category": "Tool Use", "capabilities": "Real-world tool use", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 123, "name": "Nexus Function Calling", "publisher": "Nexusflow", "date": "2024", "venue": "—", "url": "", "tasks": "762 cases (9 tasks)", "topScore": "—", "category": "Tool Use", "capabilities": "Zero-shot function calling", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 124, "name": "MCPMark", "publisher": "Community", "date": "2025", "venue": "—", "url": "", "tasks": "127 MCP tasks", "topScore": "—", "category": "Tool Use", "capabilities": "MCP tool use", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 125, "name": "MINT", "publisher": "All Hands AI", "date": "2023", "venue": "ICLR 2024", "url": "https://xwang.dev/mint-bench/", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "Multi-turn tool use", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 126, "name": "Galileo Agent Leaderboard", "publisher": "Galileo AI", "date": "2025 (v2)", "venue": "Announcement", "url": "https://huggingface.co/spaces/galileo-ai/agent-leaderboard", "tasks": "30+ LLMs, 14 datasets", "topScore": "—", "category": "General", "capabilities": "Cross-domain ranking", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 127, "name": "AgentArch", "publisher": "Academic", "date": "Sep 2025", "venue": "arXiv", "url": "https://arxiv.org/html/2509.10769v1", "tasks": "18 configurations", "topScore": "—", "category": "General", "capabilities": "Architecture eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 128, "name": "General Agent Eval", "publisher": "Academic", "date": "Feb 2026", "venue": "—", "url": "https://arxiv.org/abs/2602.22953", "tasks": "6 environments", "topScore": "—", "category": "General", "capabilities": 
"Unified protocol", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 129, "name": "OSWorld-Gold", "publisher": "XLANG Lab", "date": "2025", "venue": "ICML 2025", "url": "", "tasks": "Annotated trajectories", "topScore": "—", "category": "OS", "capabilities": "Efficiency evaluation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 130, "name": "OSWorld-Human", "publisher": "XLANG Lab", "date": "Jun 2025", "venue": "—", "url": "https://arxiv.org/abs/2506.16042", "tasks": "Human trajectories", "topScore": "—", "category": "OS", "capabilities": "Computer-use efficiency", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 131, "name": "OpenCUA / AgentNetBench", "publisher": "XLANG Lab", "date": "2025", "venue": "—", "url": "https://opencua.xlang.ai/", "tasks": "3 OS, 200+ apps", "topScore": "—", "category": "OS", "capabilities": "Offline computer-use eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 132, "name": "MultiChallenge", "publisher": "Scale AI", "date": "Jan 2025", "venue": "—", "url": "https://scale.com/leaderboard/multichallenge", "tasks": "Multi-turn conv", "topScore": "—", "category": "Customer Service", "capabilities": "Instruction retention", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 133, "name": "TurnBench-MS", "publisher": "Academic", "date": "2025", "venue": "EMNLP 2025", "url": "https://arxiv.org/abs/2506.01341", "tasks": "Code-breaking", "topScore": "18% (vs 100% human)", "category": "Reasoning", "capabilities": "Multi-turn reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 134, "name": "StoryBench", "publisher": "Academic", "date": "Jun 2025", "venue": "—", "url": "https://arxiv.org/abs/2506.13356", "tasks": "Interactive fiction", "topScore": "—", "category": "Reasoning", "capabilities": "Long-term memory", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 135, "name": "DPAI Arena", "publisher": "JetBrains / Linux Foundation", "date": "Oct 2025", "venue": "—", "url": "https://dpaia.dev/", "tasks": "140+ Spring tasks", "topScore": "—", "category": "Enterprise", "capabilities": "Enterprise coding workflows", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 136, "name": "Spring AI Bench", "publisher": "Spring / JetBrains", "date": "Oct 2025", "venue": "—", "url": "https://spring.io/", "tasks": "Java-centric", "topScore": "—", "category": "Enterprise", "capabilities": "Enterprise Java agents", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 137, "name": "Context-Bench Skills", "publisher": "Letta", "date": "Nov 2025", "venue": "—", "url": "https://www.letta.com/blog/context-bench-skills", "tasks": "Skill discovery + loading", "topScore": "—", "category": "Memory", "capabilities": "Skill management", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 138, "name": "Mem-Gallery", "publisher": "Academic", "date": "Jan 2026", "venue": "—", "url": "https://arxiv.org/abs/2601.03515", "tasks": "13 memory systems", "topScore": "—", "category": "Memory", "capabilities": "Multimodal memory", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 139, "name": "Recovery-Bench", "publisher": "Letta", "date": "2025", "venue": "—", "url": "", "tasks": "Error scenarios", "topScore": "—", "category": 
"Memory", "capabilities": "Error recovery", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 140, "name": "HTB NeuroGrid CTF", "publisher": "Hack The Box", "date": "2025-2026", "venue": "Competition", "url": "", "tasks": "36 challenges (9 domains)", "topScore": "—", "category": "Security", "capabilities": "CTF competition", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 141, "name": "DataSciBench", "publisher": "Tsinghua", "date": "Feb 2025", "venue": "—", "url": "https://arxiv.org/abs/2502.13897", "tasks": "—", "topScore": "—", "category": "Data Science", "capabilities": "Data science", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 142, "name": "DSBench", "publisher": "Academic", "date": "2024", "venue": "—", "url": "", "tasks": "540 tasks", "topScore": "—", "category": "Data Science", "capabilities": "Data analysis + modeling", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 143, "name": "FDABench", "publisher": "Academic", "date": "Sep 2025", "venue": "—", "url": "https://arxiv.org/abs/2509.02473", "tasks": "2,007 tasks", "topScore": "—", "category": "Data Science", "capabilities": "Multi-source analytics", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 144, "name": "Spider 2.0-DBT", "publisher": "XLANG Lab", "date": "May 2025", "venue": "—", "url": "", "tasks": "68 repo-level tasks", "topScore": "—", "category": "Data Science", "capabilities": "Data transformation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 145, "name": "WebMall", "publisher": "Uni Mannheim", "date": "Aug 2025", "venue": "—", "url": "https://arxiv.org/abs/2508.13024", "tasks": "91 cross-shop tasks", "topScore": "—", "category": "E-commerce", "capabilities": "Multi-shop navigation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 146, "name": "ShoppingBench", "publisher": "Academic", "date": "Aug 2025", "venue": "arXiv", "url": "https://arxiv.org/html/2508.04266v1", "tasks": "3,310 instructions", "topScore": "48.2% (GPT-4.1)", "category": "E-commerce", "capabilities": "Intent-grounded shopping", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 147, "name": "MedAgentBoard", "publisher": "Academic", "date": "2025", "venue": "—", "url": "https://medagentboard.netlify.app/", "tasks": "Medical scenarios", "topScore": "—", "category": "Medical", "capabilities": "Medical multi-agent", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 148, "name": "SafeAgentBench", "publisher": "Academic", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2412.13178", "tasks": "Safety in embodied agents", "topScore": "—", "category": "Medical", "capabilities": "Medical/safety", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 149, "name": "WideSearch", "publisher": "ByteDance Seed", "date": "Aug 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2508.07999", "tasks": "200 tasks (18 domains)", "topScore": "5.1% (o3 multi-agent)", "category": "Web", "capabilities": "Broad info-seeking, web research", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 150, "name": "NIKA", "publisher": "KAUST (SANDS Lab)", "date": "Dec 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2512.16381", "tasks": "640 incidents (54 issue types)", "topScore": 
"89% detect / 55% RCA (GPT-5)", "category": "Enterprise", "capabilities": "Network troubleshooting, MCP tool use", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 151, "name": "Plancraft", "publisher": "U Edinburgh", "date": "Dec 2024", "venue": "AAAI 2025", "url": "https://arxiv.org/abs/2412.21033", "tasks": "2,295 tasks (634 recipes)", "topScore": "67% (Llama 70B + search)", "category": "Reasoning", "capabilities": "Planning, multi-modal, infeasibility detection", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 152, "name": "DatasetResearch", "publisher": "Academic", "date": "Aug 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2508.06960", "tasks": "208 tasks (6 NLP categories)", "topScore": "72.7% (o3 w/ refs)", "category": "Data Science", "capabilities": "Dataset discovery, agent research", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 153, "name": "KAMI", "publisher": "Kamiwaza AI", "date": "Nov 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2511.08042", "tasks": "570 tasks (5 enterprise domains)", "topScore": "88.8% (Qwen3 235B)", "category": "Enterprise", "capabilities": "Enterprise agentic, tool use, contamination-free", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 154, "name": "EVMbench", "publisher": "OpenAI / Paradigm / OtterSec", "date": "Feb 2026", "venue": "arxiv", "url": "https://openai.com/index/introducing-evmbench/", "tasks": "~120 vulnerabilities", "topScore": "72.2% exploit (GPT-5.3-Codex)", "category": "Security", "capabilities": "Smart contract security: detect, patch, exploit", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 155, "name": "SCONE-bench", "publisher": "Anthropic", "date": "2025", "venue": "—", "url": "https://red.anthropic.com/2025/smart-contracts/", "tasks": "405 real contracts", "topScore": "65% (Opus 4.5)", "category": "Security", "capabilities": "Smart contract exploitation, dollar-value", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 156, "name": "SEA-HELM", "publisher": "Stanford + partners", "date": "2025", "venue": "ACL 2025", "url": "https://arxiv.org/abs/2502.14301", "tasks": "SE Asian languages", "topScore": "—", "category": "Framework", "capabilities": "5 pillars, regional eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 157, "name": "aiRank", "publisher": "Community", "date": "2025+", "venue": "—", "url": "https://airank.dev/", "tasks": "50+ coding benchmarks", "topScore": "—", "category": "Framework", "capabilities": "Model comparison", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 165, "name": "MCP-AgentBench", "publisher": "USTC / Metastone", "date": "Sep 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2509.09734", "tasks": "600 queries (33 servers)", "topScore": "64.7% (Qwen3-235B)", "category": "Tool Use", "capabilities": "MCP tool use, multi-server", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 169, "name": "ST-WebAgentBench", "publisher": "IBM Research", "date": "Oct 2024", "venue": "arxiv", "url": "https://arxiv.org/abs/2410.06703", "tasks": "234 policy-annotated tasks", "topScore": "20% CuP (AWM)", "category": "Web", "capabilities": "Web agent safety, policy compliance", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 170, "name": 
"MM-BrowseComp", "publisher": "Multi-institutional", "date": "Aug 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2508.13186", "tasks": "224 multimodal questions", "topScore": "29.02% (o3)", "category": "Web", "capabilities": "Multimodal web browsing", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 171, "name": "SHADE-Arena", "publisher": "Anthropic", "date": "Jun 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2506.15740", "tasks": "17 task pairs (340+ tools)", "topScore": "27% sabotage (Claude 3.7)", "category": "Safety", "capabilities": "Sabotage detection, agent monitoring", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 172, "name": "WorldGUI", "publisher": "NUS", "date": "Feb 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2502.08047", "tasks": "611 tasks (10 apps)", "topScore": "45.8% SR (GPT-5.1)", "category": "OS", "capabilities": "Desktop GUI automation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 173, "name": "AmbiBench", "publisher": "Fudan University", "date": "Feb 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2602.11750", "tasks": "240 tasks (25 apps)", "topScore": "40.4% TSR (Fairy)", "category": "OS", "capabilities": "Mobile GUI, ambiguity handling", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 174, "name": "MemGUI-Bench", "publisher": "Multi-institutional", "date": "Feb 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2602.06075", "tasks": "128 tasks (26 apps)", "topScore": "32.8% (M3A)", "category": "OS", "capabilities": "Mobile GUI memory, cross-app", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 175, "name": "ProBench", "publisher": "Tsinghua University", "date": "Nov 2025", "venue": "AAAI 2026", "url": "https://arxiv.org/abs/2511.09157", "tasks": "200+ tasks (34 apps)", "topScore": "40.1% (Gemini 2.5 Pro)", "category": "OS", "capabilities": "Process-aware GUI eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 176, "name": "A3: Android Agent Arena", "publisher": "CUHK / Shanghai AI Lab", "date": "Jan 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2501.01149", "tasks": "100 daily-life tasks", "topScore": "53.0% SR (T3A+Gemini)", "category": "OS", "capabilities": "Dynamic mobile GUI", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 177, "name": "MobileWorld", "publisher": "Salesforce / HKUST", "date": "Dec 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2512.19432", "tasks": "201 tasks (20 apps)", "topScore": "51.7% SR (GPT-5)", "category": "OS", "capabilities": "Multi-app mobile, MCP-augmented", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 178, "name": "AgentRewardBench", "publisher": "Mila / Google DeepMind", "date": "Apr 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2504.08942", "tasks": "1,302 trajectories", "topScore": "69.8% precision (GPT-4o)", "category": "General", "capabilities": "Meta-benchmark, evaluator eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 179, "name": "BrowserArena Live", "publisher": "U Penn", "date": "Oct 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2510.02418", "tasks": "109 user-submitted tasks", "topScore": "DeepSeek R1 (top ELO)", "category": "Web", "capabilities": "Live web eval, arena-style", "score": null, "citations": null, 
"tier": null, "importedAt": "2026-03-13"}, {"id": 180, "name": "MobilityBench", "publisher": "Ant Group / USTC", "date": "Feb 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2602.22638", "tasks": "100K episodes (350+ cities)", "topScore": "69.09% (Gemini-3-Pro)", "category": "Reasoning", "capabilities": "Route planning, constraint extraction", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 181, "name": "FineState-Bench", "publisher": "Multi-institutional", "date": "Aug 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2508.09241", "tasks": "2,257 tasks (3 platforms)", "topScore": "32.8% SA-Int SR", "category": "OS", "capabilities": "Fine-grained GUI state control", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 182, "name": "MMBench-GUI", "publisher": "OpenCompass", "date": "Jul 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2507.19478", "tasks": "8,123 tasks (6 platforms)", "topScore": "26.60% SR (GPT-4o)", "category": "OS", "capabilities": "Hierarchical multi-platform GUI", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 183, "name": "MAS-Bench", "publisher": "Multi-institutional", "date": "Sep 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2509.06477", "tasks": "139 tasks (11 Android apps)", "topScore": "64.1% SR", "category": "OS", "capabilities": "Hybrid GUI-shortcut mobile", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 184, "name": "COMMA", "publisher": "UC Santa Barbara", "date": "Oct 2024", "venue": "arxiv", "url": "https://arxiv.org/abs/2410.07553", "tasks": "Multimodal puzzles", "topScore": "Below random (CoT)", "category": "Multi-Agent", "capabilities": "Multi-agent communication", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 185, "name": "OmniEAR", "publisher": "Zhejiang University", "date": "Aug 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2508.05614", "tasks": "1,500 embodied scenarios", "topScore": "56-96%", "category": "Multi-Agent", "capabilities": "Embodied agent reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 186, "name": "Silo-Bench", "publisher": "Multi-institutional", "date": "Mar 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2603.01045", "tasks": "30 tasks (1,620 experiments)", "topScore": "—", "category": "Multi-Agent", "capabilities": "Distributed coordination", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 187, "name": "EmCoop", "publisher": "Carnegie Mellon", "date": "Feb 2026", "venue": "ICLR 2026", "url": "https://arxiv.org/abs/2603.00349", "tasks": "2 environments (scalable)", "topScore": "—", "category": "Multi-Agent", "capabilities": "Embodied cooperation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 188, "name": "REALM-Bench", "publisher": "Stanford", "date": "Feb 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2502.18836", "tasks": "14 problems + 188 JSSP", "topScore": "85-95% static", "category": "Multi-Agent", "capabilities": "Multi-agent planning/scheduling", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 189, "name": "DPBench", "publisher": "Independent", "date": "Feb 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2602.13255", "tasks": "8 experimental conditions", "topScore": "25% deadlock (GPT-5.2)", "category": "Multi-Agent", "capabilities": 
"Simultaneous coordination", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 190, "name": "MAESTRO", "publisher": "KAUST / UBC", "date": "Jan 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2601.00481", "tasks": "12 multi-agent systems", "topScore": "75% silent failure rate", "category": "Multi-Agent", "capabilities": "Multi-agent framework eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 191, "name": "LLM-Coordination", "publisher": "UC Santa Cruz", "date": "Oct 2023", "venue": "NAACL 2025", "url": "https://arxiv.org/abs/2310.03903", "tasks": "4 games + 198 QA", "topScore": "260.0 Overcooked (GPT-4t)", "category": "Multi-Agent", "capabilities": "Coordination, theory of mind", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 192, "name": "CivBench", "publisher": "ClashAI", "date": "2025-2026", "venue": "Platform", "url": "https://clashai.live", "tasks": "Strategy game competitions", "topScore": "—", "category": "Multi-Agent", "capabilities": "Long-horizon strategy games", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 193, "name": "HeroBench", "publisher": "Independent", "date": "Aug 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2508.12782", "tasks": "844 RPG tasks", "topScore": "91.7% (Grok-4)", "category": "Reasoning", "capabilities": "Long-horizon planning, RPG", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 194, "name": "UltraHorizon", "publisher": "Tsinghua", "date": "Sep 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2509.21766", "tasks": "60-400+ tool calls", "topScore": "Humans > all LLMs", "category": "Reasoning", "capabilities": "Ultra long-horizon agent", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 195, "name": "Blocksworld-MCP", "publisher": "Hamburg U of Tech", "date": "Dec 2025", "venue": "IFAC (submitted)", "url": "https://arxiv.org/abs/2512.03955", "tasks": "5 complexity categories", "topScore": "—", "category": "Reasoning", "capabilities": "Planning via MCP", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 196, "name": "PLANET", "publisher": "Emory University", "date": "Apr 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2504.14773", "tasks": "Survey of planning benchmarks", "topScore": "—", "category": "Reasoning", "capabilities": "Planning benchmark collection", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 197, "name": "ReasonBENCH", "publisher": "EPFL", "date": "Dec 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2512.07795", "tasks": "Multi-run reasoning eval", "topScore": "4x CI variance", "category": "Reasoning", "capabilities": "Reasoning stability", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 198, "name": "PaperArena", "publisher": "USTC", "date": "Oct 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2510.10909", "tasks": "Scientific research questions", "topScore": "38.78% (best agent)", "category": "ML/Research", "capabilities": "Tool-augmented scientific reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 199, "name": "CUARewardBench", "publisher": "Tencent", "date": "Oct 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2510.18596", "tasks": "10 software categories", "topScore": "89.8% precision (UPE)", "category": "General", 
"capabilities": "Reward models for CUA", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 200, "name": "FutureX", "publisher": "Fudan / Princeton", "date": "Aug 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2508.11987", "tasks": "1,272 events (11 domains)", "topScore": "Live leaderboard", "category": "Reasoning", "capabilities": "Future prediction, live eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 201, "name": "Constraint Violations Benchmark", "publisher": "McGill / NRC Canada", "date": "Dec 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2512.20798", "tasks": "40 scenarios", "topScore": "1.3% violation (best)", "category": "Safety", "capabilities": "Ethical/legal constraint adherence", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 202, "name": "Factorio Learning Env", "publisher": "Jack Hopkins et al.", "date": "Mar 2025", "venue": "arxiv (2503.09617)", "url": "https://github.com/JackHopkins/factorio-learning-environment", "tasks": "Open-ended factory building", "topScore": "Dynamic leaderboard", "category": "Reasoning", "capabilities": "Game-based planning, code synthesis", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 203, "name": "PRBench", "publisher": "Scale AI", "date": "Nov 2025", "venue": "arxiv (2511.11562)", "url": "https://scale.com/research/prbench", "tasks": "1,100 tasks (Finance+Law)", "topScore": "0.39 (Finance Hard)", "category": "Enterprise", "capabilities": "Professional reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 204, "name": "AuditBench", "publisher": "Anthropic", "date": "Mar 2026", "venue": "Alignment Blog", "url": "https://alignment.anthropic.com/2026/auditbench/", "tasks": "56 models, 14 categories", "topScore": "—", "category": "Safety", "capabilities": "Hidden behavior auditing", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-13"}, {"id": 205, "name": "BigLaw Bench: Research", "publisher": "Harvey AI / Snorkel AI", "date": "Mar 2026", "venue": "Blog", "url": "https://www.harvey.ai/blog/introducing-big-law-bench-research", "tasks": "12+ practice areas", "topScore": "— (frontier models struggle)", "category": "Enterprise", "capabilities": "Legal research, tool-augmented reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-18"}, {"id": 206, "name": "EnterpriseOps-Gym", "publisher": "ServiceNow-AI", "date": "Mar 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2603.13594", "tasks": "1,150 tasks (8 verticals, 512 tools)", "topScore": "37.4% (Claude Opus 4.5)", "category": "Enterprise", "capabilities": "Stateful agentic planning, enterprise workflows", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-18"}, {"id": 207, "name": "DRACO", "publisher": "Perplexity AI", "date": "Feb 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2602.11685", "tasks": "100 tasks, 10 domains", "topScore": "67.15% (Perplexity DR)", "category": "ML/Research", "capabilities": "Deep research agents, cross-domain evaluation, citation quality", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 208, "name": "DeepResearch Bench", "publisher": "Multi-institutional", "date": "Jun 2025", "venue": "ICLR 2026", "url": "https://arxiv.org/abs/2506.11763", "tasks": "100 PhD-level tasks, 22 fields", "topScore": null, "category": "ML/Research", "capabilities": 
"PhD-level research tasks, report quality, citation trustworthiness", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 209, "name": "Dr. Bench", "publisher": "Multi-institutional", "date": "Oct 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2510.02190", "tasks": "214 tasks, 10 domains", "topScore": null, "category": "ML/Research", "capabilities": "Long-form report generation, retrieval trustworthiness", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 210, "name": "IDRBench", "publisher": "Multi-institutional", "date": "Jan 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2601.06676", "tasks": "Multi-task interactive eval", "topScore": null, "category": "ML/Research", "capabilities": "Interactive deep research, multi-agent framework, user simulation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 211, "name": "MMDeepResearch-Bench", "publisher": "AIoT-MLSys Lab", "date": "Jan 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2601.12346", "tasks": "140 tasks, 21 domains", "topScore": null, "category": "ML/Research", "capabilities": "Multimodal deep research, visual-text integrity, citation alignment", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 212, "name": "ReportBench", "publisher": "ByteDance", "date": "Aug 2025", "venue": "ICLR 2026", "url": "https://arxiv.org/abs/2508.15804", "tasks": "Survey-paper-based eval", "topScore": null, "category": "ML/Research", "capabilities": "Academic survey evaluation, citation faithfulness", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 213, "name": "DeepSearchQA", "publisher": "Google DeepMind", "date": "Dec 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2601.20975", "tasks": "900 prompts, 17 domains", "topScore": "Gemini DR (SOTA)", "category": "Web", "capabilities": "Multi-step information-seeking, entity resolution, search stopping criteria", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 214, "name": "MemoryAgentBench", "publisher": "HUST / UCSD", "date": "Jul 2025", "venue": "ICLR 2026", "url": "https://arxiv.org/abs/2507.05257", "tasks": "Multi-turn incremental tasks", "topScore": null, "category": "Memory", "capabilities": "Retrieval, test-time learning, long-range understanding, selective forgetting", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 215, "name": "Evo-Memory", "publisher": "UIUC / Google DeepMind", "date": "Nov 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2511.20857", "tasks": "Multi-domain streaming tasks", "topScore": null, "category": "Memory", "capabilities": "Self-evolving memory, streaming evaluation, sequence robustness", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 216, "name": "DevOps-Gym", "publisher": "Multi-institutional", "date": "Jan 2026", "venue": "ICLR 2026", "url": "https://arxiv.org/abs/2601.20882", "tasks": "700+ tasks, 30+ projects", "topScore": null, "category": "Coding", "capabilities": "DevOps workflows: build, monitoring, issue resolving, test generation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 217, "name": "ProjDevBench", "publisher": "Multi-institutional", "date": "Feb 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2602.01655", "tasks": "20 problems, 8 categories", "topScore": "27.38% acceptance", "category": "Coding", 
"capabilities": "End-to-end project development, architecture design, iterative refinement", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 218, "name": "SWE-CI", "publisher": "SKYLENAGE-AI", "date": "Mar 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2603.03823", "tasks": "100 tasks (500+ LOC each)", "topScore": null, "category": "Coding", "capabilities": "Continuous integration loop, long-term maintainability, EvoScore metric", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 219, "name": "RepoReason", "publisher": "Multi-institutional", "date": "Jan 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2601.03731", "tasks": "Repository-level Python tasks", "topScore": null, "category": "Coding", "capabilities": "Abductive assertion verification, execution-driven mutation, reasoning diagnostics", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 220, "name": "petscagent-bench", "publisher": "Argonne National Lab", "date": "Mar 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2603.15976", "tasks": "PETSc scientific computing tasks", "topScore": null, "category": "Coding", "capabilities": "HPC scientific code generation, agents-evaluating-agents, A2A/MCP protocols", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 221, "name": "FieldWorkArena", "publisher": "Fujitsu Research / CMU", "date": "May 2025", "venue": "AAAI 2026", "url": "https://arxiv.org/abs/2505.19662", "tasks": "Real factory/warehouse scenarios", "topScore": null, "category": "Enterprise", "capabilities": "Manufacturing/logistics field agents, safety-rule violation detection", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 222, "name": "CUB", "publisher": "Theta Software", "date": "Jun 2025", "venue": "Blog", "url": "https://thetasoftware.com/blog/introducing-cub/", "tasks": "106 workflows, 7 industries", "topScore": "10.4% (Writer Action Agent)", "category": "OS", "capabilities": "Cross-industry desktop+browser workflows, enterprise software (SAP, CapIQ)", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 223, "name": "UI-CUBE", "publisher": "UiPath", "date": "Nov 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2511.17131", "tasks": "226 tasks, 2 tiers", "topScore": null, "category": "OS", "capabilities": "Enterprise CUA benchmarking, multi-resolution testing, interface variation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 224, "name": "MCPWorld", "publisher": "SAAgent", "date": "Jun 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2506.07672", "tasks": "201 curated tasks", "topScore": "75.12%", "category": "Tool Use", "capabilities": "Unified API+GUI+hybrid computer use testbed, white-box MCP apps", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 225, "name": "BrowseComp-V3", "publisher": "Multi-institutional", "date": "Feb 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2602.12876", "tasks": "300 questions, 24 sub-domains", "topScore": "36% (SOTA)", "category": "Web", "capabilities": "Multimodal deep search, cross-page evidence, gold-standard trajectories", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 226, "name": "CUBE Standard", "publisher": "AI Alliance", "date": "Mar 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2603.15798", "tasks": "Meta-standard", 
"topScore": null, "category": "Framework", "capabilities": "Universal MCP+Gym protocol standard for unifying agent benchmarks", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 227, "name": "AgentDojo", "publisher": "ETH Zurich (SPYLab)", "date": "Jun 2024", "venue": "ICLR 2025", "url": "https://arxiv.org/abs/2406.13352", "tasks": "97 tasks, 629 security tests", "topScore": "69% benign / 45% under attack (GPT-4o)", "category": "Security", "capabilities": "Prompt injection attacks/defenses, dynamic evaluation framework", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 228, "name": "AgentDyn", "publisher": "SaFo-Lab", "date": "Feb 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2602.03117", "tasks": "60 tasks, 560 injection tests", "topScore": null, "category": "Security", "capabilities": "Dynamic prompt injection, open-ended attacks, defense evaluation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 229, "name": "AILuminate", "publisher": "MLCommons", "date": "Mar 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2503.05731", "tasks": "24,000+ prompts, 12 hazard categories", "topScore": null, "category": "Safety", "capabilities": "AI risk/reliability, harmful prompt resistance, multilingual", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 230, "name": "VLAIR", "publisher": "Vals AI", "date": "Oct 2025", "venue": "Blog", "url": "https://www.vals.ai/vlair", "tasks": "210 questions, 9 legal research types", "topScore": "80% (AI) vs 71% (lawyers)", "category": "Enterprise", "capabilities": "Legal research evaluation, statutory analysis, 50-state surveys", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 231, "name": "TRACE", "publisher": "Multi-institutional", "date": "Feb 2026", "venue": "WWW 2026", "url": "https://arxiv.org/abs/2602.21230", "tasks": "Controllable complexity tasks", "topScore": null, "category": "Framework", "capabilities": "Trajectory-aware evaluation, process efficiency, scaffolded capability assessment", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-08"}, {"id": 232, "name": "OpenAgentSafety", "publisher": "CMU / AI2", "date": "Jul 2025", "venue": "ICLR 2026", "url": "https://arxiv.org/abs/2507.06134", "tasks": "356 multi-turn tasks (8 risk categories)", "topScore": "~49% unsafe rate (best model)", "category": "Safety", "capabilities": "Agent safety: computer security, data loss, privacy, code execution, financial integrity, content moderation, legal compliance, harmful decisions", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-22"}, {"id": 233, "name": "MT-AgentRisk", "publisher": "Independent", "date": "Feb 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2602.13379", "tasks": "Multi-turn jailbreak scenarios (5 tool types)", "topScore": "85.4% ASR (DeepSeek-v3.2)", "category": "Safety", "capabilities": "Multi-turn jailbreak resistance, tool composition attack detection, capability-safety gap measurement", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-22"}, {"id": 234, "name": "AGENTSAFE", "publisher": "Zhejiang U / AI2", "date": "Jun 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2506.14697", "tasks": "9,900 instructions (450 normal + 8,100 adversarial)", "topScore": "—", "category": "Safety", "capabilities": "Embodied agent safety, hazard recognition (human/env/self-harm), planning-stage 
refusal, adversarial jailbreak resistance", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-22"}, {"id": 235, "name": "Risky-Bench", "publisher": "Academic", "date": "Feb 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2602.03100", "tasks": "750 tasks (3 deployment domains, 5 attack surfaces)", "topScore": "25–60% ASR (all models)", "category": "Safety", "capabilities": "Agentic safety under deployment attacks: user instruction manipulation, env injection, tool feedback manipulation, memory poisoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-22"}, {"id": 236, "name": "VPI-Bench", "publisher": "NUS / CETRD", "date": "Jun 2025", "venue": "ICLR 2026", "url": "https://arxiv.org/abs/2506.02456", "tasks": "306 test cases (5 platforms)", "topScore": "Up to 100% (BUA), 51% (CUA with defenses)", "category": "Security", "capabilities": "Visual prompt injection attacks against computer-use agents, system-level and browser-level threat resistance", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-22"}, {"id": 237, "name": "InjecAgent", "publisher": "Academic", "date": "Mar 2024", "venue": "ACL 2024 Findings", "url": "https://arxiv.org/abs/2403.02691", "tasks": "1,054 test cases (17 user-tool × 62 attacker-tool scenarios)", "topScore": "86%+ ASR (Llama2-70B)", "category": "Security", "capabilities": "Indirect prompt injection resistance, tool-integrated LLM agent safety, data exfiltration prevention", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-22"}, {"id": 238, "name": "ATBench", "publisher": "Shanghai AI Lab", "date": "Jan 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2601.18491", "tasks": "500 trajectories (250 safe / 250 unsafe)", "topScore": "F1=0.87 (AgentDoG-L)", "category": "Safety", "capabilities": "Agent trajectory safety classification, 3D risk taxonomy (source + failure mode + harm), multi-turn tool-augmented eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-22"}, {"id": 239, "name": "MCP-SafetyBench", "publisher": "Multi-institutional", "date": "Dec 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2512.15163", "tasks": "245 tasks (5 domains, 20 attack types)", "topScore": "29–48% ASR (all 13 LLMs vulnerable)", "category": "Security", "capabilities": "MCP agent safety: 20 attack types across server/host/user attack surfaces, safety-utility trade-off measurement", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-22"}, {"id": 240, "name": "MCPSecBench", "publisher": "Multi-institutional", "date": "Aug 2025", "venue": "ICML 2026", "url": "https://arxiv.org/abs/2508.13220", "tasks": "17 attack types across 4 attack surfaces", "topScore": "100% success (protocol-side attacks)", "category": "Security", "capabilities": "MCP security: prompt injection resistance, protocol integrity, tool trustworthiness, host authorization enforcement", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-22"}, {"id": 241, "name": "MCP Security Bench (MSB)", "publisher": "Multi-institutional", "date": "Oct 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2510.15994", "tasks": "2,000 attack instances (400+ tools, 65 tasks, 10 domains)", "topScore": "40.71% avg ASR", "category": "Security", "capabilities": "End-to-end MCP attack evaluation: 12 attack types, tool selection under adversarial conditions, inverse scaling law", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-22"}, {"id": 242, "name": 
"Bloom", "publisher": "Anthropic", "date": "Dec 2025", "venue": "arxiv", "url": "https://github.com/safety-research/bloom", "tasks": "16 models × 4 alignment behaviors", "topScore": "r=0.86 human correlation (Claude Opus 4.1)", "category": "Safety", "capabilities": "Automated behavioral alignment evaluation: sycophancy, instructed sabotage, self-preservation, self-preferential bias", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-22"}, {"id": 243, "name": "Agent-SafetyBench", "publisher": "Tsinghua University (CoAI)", "date": "Dec 2024", "venue": "ICLR 2025", "url": "https://arxiv.org/abs/2412.14470", "tasks": "2,000 test cases (349 environments, 8 risk categories)", "topScore": "59.8% safe (Claude-3-Opus); avg 38.5%", "category": "Safety", "capabilities": "LLM agent behavioral safety: tool-use risk awareness, robustness in tool invocation, content safety, constraint following — 10 failure modes, fine-tuned judge at 91.5% accuracy", "score": 65, "citations": null, "tier": "Tier 2", "importedAt": "2026-03-23"}, {"id": 244, "name": "b³ (Backbone Breaker Benchmark)", "publisher": "Lakera AI / UK AI Security Institute / ETH Zürich", "date": "Oct 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2510.22620", "tasks": "210 adversarial attacks (194K+ crowdsourced); 34 LLMs evaluated across 30 threat snapshots", "topScore": "claude-haiku-4-5 most secure; reasoning consistently improves security", "category": "Security", "capabilities": "Backbone LLM security via threat snapshots: DIO, IIO, DTI, ITI, DCE, DAIS — 6 attack categories, prompt injection, system prompt extraction, memory poisoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-23"}, {"id": 245, "name": "SecureAgentBench", "publisher": "Singapore Management University et al.", "date": "Sep 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2509.22097", "tasks": "105 repository-level coding tasks (real OSS-Fuzz CVEs)", "topScore": "15.2% correct-and-secure (SWE-agent + DeepSeek-V3.1); avg 9.2%", "category": "Security", "capabilities": "Secure code generation: vulnerability avoidance, functional correctness under security constraints, cross-file reasoning — PoC exploit replay + Semgrep SAST evaluation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-23"}, {"id": 246, "name": "CyberGym", "publisher": "UC Berkeley (Dawn Song lab)", "date": "Jun 2025", "venue": "ICLR 2026 (submitted)", "url": "https://arxiv.org/abs/2506.02548", "tasks": "1,507 instances (188 OSS-Fuzz projects, 28 sanitizer crash types, C/C++)", "topScore": "22.0% (GPT-5 + OpenHands high-reasoning)", "category": "Security", "capabilities": "Vulnerability reproduction: PoC generation, deep codebase reasoning, tool use, iterative execution feedback — 35 zero-days discovered during eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-23"}, {"id": 247, "name": "General AgentBench", "publisher": "CMU", "date": "Feb 2026", "venue": "ICML 2026 (submitted)", "url": "https://arxiv.org/abs/2602.18998", "tasks": "496 tasks from 7 benchmarks across 4 domains via shared MCP interface", "topScore": "Claude Sonnet 4.5 most robust (0.2% domain-specific drop)", "category": "General", "capabilities": "Cross-domain tool selection, intent inference, multi-turn planning, long-context reasoning, test-time scaling analysis", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-23"}, {"id": 248, "name": "CLEAR Enterprise Task Suite", "publisher": "Academic", "date": "Nov 
2025", "venue": "AAAI 2026 Workshop", "url": "https://arxiv.org/abs/2511.14136", "tasks": "300 tasks across 6 enterprise domains (customer support, data analysis, process automation, software dev, compliance, multi-stakeholder)", "topScore": "CLEAR score predicts production readiness at ρ=0.83 vs ρ=0.41 for accuracy-only", "category": "Enterprise", "capabilities": "Multi-dimensional enterprise eval: Cost-Normalized Accuracy, Cost-Per-Success, SLA Compliance Rate, Policy Adherence Score, pass@k reliability", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-23"}, {"id": 249, "name": "DPAB-α (Dria Pythonic Agent Benchmark)", "publisher": "Dria / FirstBatch", "date": "Jan 2025", "venue": "—", "url": "https://huggingface.co/blog/andthattoo/dpab-a", "tasks": "100 synthetic tasks (Easy/Hard; Pythonic vs JSON calling)", "topScore": "Claude 3.5 Sonnet: 87 Pythonic / 45 JSON", "category": "Tool Use", "capabilities": "Pythonic function calling (executable Python) vs JSON-based tool calling — shows ~2× advantage for Pythonic approach; 3-step LLM validator", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-23"}, {"id": 250, "name": "VeRO", "publisher": "Scale Labs", "date": "Mar 2026", "venue": "—", "url": "https://labs.scale.com/blog/vero", "tasks": "105 runs across 5 benchmarks (GAIA, GPQA, MATH, TAU-Bench, SimpleQA); N=3 per task", "topScore": "~8–9% gain on tool-use tasks (Claude Sonnet/Opus led)", "category": "General", "capabilities": "Agent-builds-agent meta-optimization: builder agent edits target agent prompts, tools, control flow via Git versioning; cross-model generalization eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-23"}, {"id": 251, "name": "JJ Benchmark", "publisher": "TabbyML", "date": "2025–2026", "venue": "—", "url": "https://tabbyml.github.io/jj-benchmark/", "tasks": "63 tasks (Jujutsu VCS CLI operations)", "topScore": "87% (claude-opus-4-6)", "category": "Coding", "capabilities": "Jujutsu version-control tool use: CLI command execution, developer workflow automation, agentic VCS interaction", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-23"}, {"id": 252, "name": "WirelessBench", "publisher": "Shenzhen University / HKUST", "date": "Mar 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2603.21251", "tasks": "3,392 items (knowledge-reasoning + network slicing + mobile service assurance)", "topScore": "—", "category": "Tool Use", "capabilities": "Wireless network management, tool-use with 3GPP ray-tracing, tolerance-aware scoring, telecom domain", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-25"}, {"id": 253, "name": "AgentDS", "publisher": "University of Minnesota / Cisco Research", "date": "Mar 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2603.19005", "tasks": "17 challenges, 6 industry domains (commerce, food, healthcare, insurance, manufacturing, retail banking)", "topScore": "Rank 10/29 (Claude Code)", "category": "Data Science", "capabilities": "Domain-specific data science, multimodal reasoning, human-AI collaboration, competition format", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-25"}, {"id": 254, "name": "LiveCultureBench", "publisher": "Monash University", "date": "Mar 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2603.01952", "tasks": "1,000 agent profiles, full-day social simulations, 9 national cultures", "topScore": "74.2% (Gemini 2.5 Pro)", "category": "Multi-Agent", "capabilities": 
"Multi-cultural norm adherence, multi-agent social simulation, goal completion vs norm trade-off", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-25"}, {"id": 255, "name": "ResearchRubrics", "publisher": "Scale AI", "date": "Nov 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2511.07685", "tasks": "101 prompts × 2,593 expert-written rubric criteria (9 domains)", "topScore": "68% (best DR system)", "category": "ML/Research", "capabilities": "Deep research agent evaluation, rubric compliance, mandatory vs optional criteria, synthesis quality", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-25"}, {"id": 256, "name": "PPBench (Pencil Puzzle Bench)", "publisher": "Justin Waugh (independent)", "date": "Mar 2026", "venue": "—", "url": "https://ppbench.com/", "tasks": "300 puzzles (94 variety types), single-shot + agentic mode", "topScore": "56% (GPT-5.2@xhigh, agentic)", "category": "Reasoning", "capabilities": "Multi-step verifiable reasoning, intermediate board-state verification, dense RL reward signals", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-25"}, {"id": 257, "name": "ABC (Agentic Benchmark Checklist)", "publisher": "UIUC / Stanford / Berkeley / Princeton / UK AISI + partners", "date": "Jul 2025", "venue": "NeurIPS 2025", "url": "https://arxiv.org/abs/2507.02825", "tasks": "Meta-checklist: task validity + outcome validity + reporting items; applied to 10 major benchmarks", "topScore": "—", "category": "Framework", "capabilities": "Benchmark meta-evaluation, task/outcome validity, corrupt success detection, systematic benchmark review", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-25"}, {"id": 272, "name": "FuncBenchGen", "publisher": "Megagon Labs", "date": "Sep 2025", "venue": "ICLR 2026", "url": "https://arxiv.org/abs/2509.26553", "tasks": "Synthetic DAG-based; up to 20-function chains", "topScore": "15% (GPT-5, 20-chain)", "category": "Tool Use", "capabilities": "Contamination-free multi-step function calling, dependency graph complexity control", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 273, "name": "TPS-Bench", "publisher": "Shanghai Jiao Tong University", "date": "Nov 2025", "venue": "ICLR 2026", "url": "https://arxiv.org/abs/2511.01527", "tasks": "200 tasks (Easy/Hard), 141-tool MCP repo", "topScore": "64.72% Hard (GLM-4.5)", "category": "Tool Use", "capabilities": "Tool planning & scheduling in compounding tasks, sequential vs parallel trade-off", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 274, "name": "IFEval-FC", "publisher": "Higher School of Economics / SberDevices", "date": "Sep 2025", "venue": "—", "url": "https://arxiv.org/abs/2509.18420", "tasks": "750 test cases, 15 instruction types", "topScore": "<80% (all models)", "category": "Tool Use", "capabilities": "Format instruction following in function calling, JSON schema constraint adherence", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 275, "name": "HammerBench", "publisher": "OPPO + SJTU", "date": "Dec 2024", "venue": "—", "url": "https://arxiv.org/abs/2412.16516", "tasks": "~10K+ instances, 1,063 APIs, 60+ app categories", "topScore": "~74% (Claude 3.5 Sonnet)", "category": "Tool Use", "capabilities": "Multi-turn function calling in mobile assistant scenarios, 8 interaction types", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 276, "name": "ITC 
(International Tool Calling)", "publisher": "Shenzhen University", "date": "Mar 2026", "venue": "ACL 2026", "url": "https://arxiv.org/abs/2603.05515", "tasks": "3,571 REST APIs; 17,540 tasks, 29 languages, 40 countries", "topScore": "Tool Sel. F1: 89% (GPT-4o)", "category": "Tool Use", "capabilities": "Multilingual tool calling, language-matching metric, real REST API diversity", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 277, "name": "ASTRA-bench", "publisher": "Apple", "date": "Mar 2026", "venue": "ICML 2026", "url": "https://arxiv.org/abs/2603.01357", "tasks": "2,413 scenarios (4 user profiles, longitudinal personal context)", "topScore": "0.9112 macro-avg (Claude Opus 4.6)", "category": "Tool Use", "capabilities": "Personal assistant evaluation, referential/functional/informational complexity, tool-augmented reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 278, "name": "Tool-Genesis", "publisher": "HKU + Xiaohongshu", "date": "Mar 2026", "venue": "ICML 2026", "url": "https://arxiv.org/abs/2603.05578", "tasks": "86 MCP servers, 508 tools, 2,150 tasks, 9,441 unit tests", "topScore": "—", "category": "Tool Use", "capabilities": "MCP tool creation from abstract requirements, 4-level eval hierarchy", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 279, "name": "Agent-Diff", "publisher": "Minerva University", "date": "Feb 2026", "venue": "—", "url": "https://arxiv.org/abs/2602.11224", "tasks": "224 tasks (Slack, Box, Linear, Google Calendar)", "topScore": "88.1% (DeepSeek-V3.2)", "category": "Enterprise", "capabilities": "Enterprise API tasks via code execution, state-diff evaluation, side-effect penalization", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 280, "name": "SlopCodeBench", "publisher": "UW-Madison / WSU / MIT", "date": "Mar 2026", "venue": "—", "url": "https://arxiv.org/abs/2603.24755", "tasks": "20 problems, 93 checkpoints", "topScore": "17.2% (0 end-to-end solutions)", "category": "Coding", "capabilities": "Code quality degradation in long-horizon iterative agentic coding, structural erosion metrics", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 281, "name": "CodeClash", "publisher": "Stanford / Princeton / Cornell", "date": "Nov 2025", "venue": "—", "url": "https://arxiv.org/abs/2511.00839", "tasks": "1,680 tournaments, 25,200 rounds", "topScore": "0/150 vs. 
expert bot", "category": "Coding", "capabilities": "Goal-oriented software engineering via competitive code arenas (BattleSnake, Poker, RoboCode)", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 282, "name": "PostTrainBench", "publisher": "ELLIS Institute / MPI Tübingen", "date": "Mar 2026", "venue": "ICML 2026", "url": "https://arxiv.org/abs/2603.08640", "tasks": "7 target benchmarks; 1.7B–4B models, 10h/H100", "topScore": "23.2% (Claude Opus 4.6)", "category": "ML/Research", "capabilities": "Autonomous LLM post-training automation, hyperparameter search, reward hacking detection", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 283, "name": "LemmaBench", "publisher": "ENS Rennes / École des Ponts", "date": "Feb 2026", "venue": "NeurIPS 2025", "url": "https://arxiv.org/abs/2602.24173", "tasks": "~405 lemmas (live, auto-extracted from arXiv)", "topScore": "15% (GPT-5)", "category": "Reasoning", "capabilities": "Live research-level theorem proving, auto-extracted from recent arXiv math preprints", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 284, "name": "MirrorBench", "publisher": "SAP Labs", "date": "Jan 2026", "venue": "—", "url": "https://arxiv.org/abs/2601.08118", "tasks": "4 conversational datasets, 6 metrics", "topScore": "—", "category": "General", "capabilities": "User-proxy agent human-likeness, lexical diversity + LLM-judge evaluation, dialogue realism", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 285, "name": "PeerBench", "publisher": "Community-governed", "date": "Oct 2025", "venue": "—", "url": "https://arxiv.org/abs/2510.07575", "tasks": "Prototype at peerbench.ai", "topScore": "—", "category": "Framework", "capabilities": "Anti-contamination benchmarking: sealed execution, rolling test renewal, reputation-weighted scoring", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 286, "name": "EconWebArena", "publisher": "Capital One / Georgia Tech", "date": "Jun 2025", "venue": "—", "url": "https://arxiv.org/abs/2506.08136", "tasks": "360 tasks, 82 authoritative websites, 10 economic domains", "topScore": "46.9% (o4-mini) vs 93.3% human", "category": "Web", "capabilities": "Web navigation + economic domain reasoning, exact numeric extraction, source fidelity verification", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-31"}, {"id": 287, "name": "WebTestBench", "publisher": "Northeastern Univ / Kuaishou", "date": "Mar 2026", "venue": "—", "url": "https://arxiv.org/abs/2603.25226", "tasks": "100 web apps, 1,750 test items, 4 dimensions", "topScore": "26.4% F1 (GPT-5.1)", "category": "Web", "capabilities": "End-to-end automated web testing: checklist generation + defect detection on AI-generated apps", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-31"}, {"id": 288, "name": "GUIDE", "publisher": "KAIST / CMU / Oxford / Google", "date": "Mar 2026", "venue": "CVPR 2026", "url": "https://arxiv.org/abs/2603.25864", "tasks": "67.5h screen recordings, 120 novice users, 10 apps, 3 tasks", "topScore": "44.6% behavior detection (Claude Sonnet 4.5)", "category": "Computer Use", "capabilities": "GUI user behavior state detection, intent prediction, help-need prediction from screen recordings", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-31"}, {"id": 289, "name": "RealWebAssist", "publisher": "Johns Hopkins / Amazon", "date": "Apr 2025", 
"venue": "AAAI 2026", "url": "https://arxiv.org/abs/2504.10445", "tasks": "1,885 instructions, 107 tasks, 66 real websites", "topScore": "14.0% task success (o3 + GTA-1) vs 93.4% human", "category": "Web", "capabilities": "Long-horizon web assistance with real user sessions: spatial/temporal reasoning, multi-step planning", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-31"}, {"id": 290, "name": "PA-Bench", "publisher": "Vibrant Labs", "date": "Feb 2026", "venue": "—", "url": "https://vibrantlabs.com/blog/pa-bench", "tasks": "Personal assistant workflows across email + calendar", "topScore": "68.8% task success (Claude Opus 4.6)", "category": "Web", "capabilities": "Multi-app web navigation, cross-app context reasoning, personal assistant long-horizon planning", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-31"}, {"id": 291, "name": "MolQuest", "publisher": "Alibaba Group", "date": "Mar 2026", "venue": "—", "url": "https://arxiv.org/abs/2603.25253", "tasks": "530 tasks from post-2025 chemistry literature", "topScore": "51.5% (Gemini 3 Flash)", "category": "Science", "capabilities": "Abductive reasoning, multi-modal spectral data integration (NMR/MS/IR), hypothesis-driven tool use", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-31"}, {"id": 292, "name": "FURINA-Bench", "publisher": "HKUST(GZ) / HKU / Datawhale", "date": "Oct 2025", "venue": "ICLR 2026", "url": "https://arxiv.org/abs/2510.06800", "tasks": "1,459 dialogues / 7,181 test utterances, bilingual zh/en", "topScore": "43.98 (o3, English); 73.38 (DeepSeek-R1, Chinese)", "category": "Multi-Agent", "capabilities": "Role-playing agent evaluation: context reliance, factual recall, reflective reasoning, preference alignment", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-31"}, {"id": 293, "name": "Buyout Game", "publisher": "lechmazur (independent)", "date": "2025", "venue": "—", "url": "https://github.com/lechmazur/buyout_game", "tasks": "468 games, 21 models, 8-player elimination format", "topScore": "GPT-5.4 high reasoning: 2052.8 BT", "category": "Multi-Agent", "capabilities": "Multi-agent bargaining, coalition building, deception detection, economic coordination", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-31"}, {"id": 294, "name": "MASEval", "publisher": "Parameter Lab", "date": "Mar 2026", "venue": "—", "url": "https://arxiv.org/abs/2603.08835", "tasks": "7 benchmarks × 4 agent frameworks cross-matrix", "topScore": "System-level gap: 30.9 pp (MACS Travel, same model)", "category": "Framework", "capabilities": "Multi-agent evaluation infrastructure: system-level comparison, framework-vs-model contribution isolation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-31"}, {"id": 295, "name": "ISD-Agent-Bench", "publisher": "Upstage / Korea Univ / Indiana Univ", "date": "Feb 2026", "venue": "—", "url": "https://arxiv.org/abs/2602.10620", "tasks": "25,795 scenarios (51 vars × 33 ADDIE sub-steps); 1,202 test cases", "topScore": "86.49 (React-ADDIE)", "category": "Enterprise", "capabilities": "Instructional systems design: full ADDIE lifecycle, multi-step planning, contextual adaptation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-31"}, {"id": 296, "name": "PRBench (Physics)", "publisher": "Peking University Physics", "date": "Mar 2026", "venue": "—", "url": "https://arxiv.org/abs/2603.27646", "tasks": "30 tasks, 11 physics subfields", "topScore": "34% overall 
(GPT-5.3-Codex); 0% end-to-end", "category": "Science", "capabilities": "End-to-end physics paper reproduction: algorithm implementation, numerical simulation, result matching", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-31"}, {"id": 297, "name": "DraftNEPABench", "publisher": "OpenAI / PNNL", "date": "Feb 2026", "venue": "—", "url": "https://openai.com/index/pacific-northwest-national-laboratory/", "tasks": "102 NEPA drafting tasks, 18 federal agencies", "topScore": "~15% time reduction; rated 1–5 by 19 SMEs", "category": "Enterprise", "capabilities": "Federal document drafting: multi-source synthesis, legal/technical structured output, enterprise agent eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-31"}, {"id": 298, "name": "DAB", "publisher": "UC Berkeley / Hasura PromptQL", "date": "Mar 2026", "venue": "—", "url": "https://arxiv.org/abs/2603.20576", "tasks": "54 queries, 12 datasets, 9 domains, 4 DB types", "topScore": "38% pass@1 (Gemini-3-Pro)", "category": "Data Science", "capabilities": "Multi-DB integration, ill-formatted join key reconciliation, unstructured text transformation, domain knowledge", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-01"}, {"id": 299, "name": "MonitorBench", "publisher": "UIUC / UW / UCSD", "date": "Mar 2026", "venue": "COLM 2026", "url": "https://arxiv.org/abs/2603.28590", "tasks": "1,514 instances, 19 tasks, 7 categories", "topScore": "~90% (Dual Objectives); ~35% stress test", "category": "Safety", "capabilities": "Chain-of-thought monitorability: input intervention, outcome justification, solution process", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-01"}, {"id": 300, "name": "PARTNR", "publisher": "Meta FAIR", "date": "Nov 2024", "venue": "ICLR 2025", "url": "https://arxiv.org/abs/2411.00081", "tasks": "100,000 tasks, 60 houses, 5,819 objects; 4 task types", "topScore": "30% (Llama 3.1-70B) vs 93% human", "category": "Multi-Agent", "capabilities": "Embodied multi-agent planning & reasoning: constraint-free, spatial, temporal, heterogeneous collaboration", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-01"}, {"id": 301, "name": "YC-Bench", "publisher": "Collinear AI", "date": "Apr 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2604.01212", "tasks": "12 models × 3 seeds; 1-year startup sim (~hundreds of turns)", "topScore": "$1.27M avg funds (Claude Opus 4.6)", "category": "Reasoning", "capabilities": "Long-horizon planning, strategic coherence, memory/scratchpad management, adversarial agent detection, resource allocation, POMDP", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-02"}, {"id": 302, "name": "TRAJECT-Bench", "publisher": "Michigan State / Amazon / Hippocratic AI / Penn State", "date": "Oct 2025", "venue": "ICLR 2026", "url": "https://arxiv.org/abs/2510.04550", "tasks": "1,228 tools; 5,670 queries across 10 domains; parallel + sequential trajectories (3–10+ tools); simple + hard query difficulty", "topScore": "—", "category": "Tool Use", "capabilities": "Trajectory-aware tool-use evaluation: tool selection, parameterization, call ordering; trajectory exact-match & inclusion metrics; parallel and sequential tool-calling", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-03"}, {"id": 303, "name": "MiroEval", "publisher": "MiroMind AI / NUS / NTU", "date": "Mar 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2603.28407", "tasks": "100 tasks (70 text + 
30 multimodal); 12 domains; 10 task types", "topScore": "77.5 text / 74.5 multimodal (MiroThinker-H1); 83.3 factuality (OpenAI Deep Research)", "category": "ML/Research", "capabilities": "Deep research agent eval: adaptive synthesis quality, agentic factuality, process-centric evaluation; multimodal report generation; web search", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-03"}, {"id": 304, "name": "Vision2Web", "publisher": "Tsinghua / Zhipu AI", "date": "Mar 2026", "venue": "ICML 2026", "url": "https://arxiv.org/abs/2603.26648", "tasks": "193 tasks; 918 prototype images; 1,255 test cases; 4 website categories; 3 hierarchical levels", "topScore": "VS=38.4, FS=57.6 (Claude-Opus-4.5 + OpenHands, full-stack)", "category": "Coding", "capabilities": "Visual website development: static reproduction, interactive multi-page frontend, full-stack construction; code generation; visual consistency evaluation", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-03"}, {"id": 305, "name": "HippoCamp", "publisher": "NTU S-Lab / Synvo AI", "date": "Apr 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2604.01221", "tasks": "581 QA pairs; 46,100 fine-grained trajectories; 42.4 GB real-world files; 3 user personas", "topScore": "48.3% profiling accuracy (ChatGPT Agent Mode)", "category": "OS", "capabilities": "Contextual personal computer agents: factual retention, preference/routine inference; multimodal file understanding (text, docs, images, video, audio)", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-03"}, {"id": 306, "name": "BeSafe-Bench", "publisher": "SUSTech / Huawei RAMS Lab", "date": "Mar 2026", "venue": "ICML 2026", "url": "https://arxiv.org/abs/2603.25747", "tasks": "1,312 tasks; 4 domains (Web, Mobile, Embodied VLM, VLA); 9 safety risk categories", "topScore": "35.19% joint Success-Safe rate (OpenVLA-OFT)", "category": "Safety", "capabilities": "Behavioral safety: privacy leakage, data loss, financial harm, physical harm, ethical risks, toxic/false info, availability compromise, malicious code, network safety", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-03"}, {"id": 307, "name": "FORTRESS", "publisher": "Scale AI", "date": "Jun 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2506.14922", "tasks": "500 adversarial + 500 benign prompts; 3 domains (CBRNE, Political Violence, Criminal/Financial); 10 subcategories", "topScore": "ARS=14.09 (Claude-3.5-Sonnet, most robust)", "category": "Safety", "capabilities": "National security & public safety red-teaming: CBRNE threats, political violence, criminal/financial illicit activities; adversarial robustness vs. 
over-refusal balance", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-03"}, {"id": 308, "name": "HomeSafe-Bench", "publisher": "Renmin U China / UCAS / BUPT", "date": "Mar 2026", "venue": "arxiv", "url": "https://arxiv.org/abs/2603.11975", "tasks": "438 video cases; 6 household areas; 4 danger categories; 4 severity levels", "topScore": "24.94 WSS @ 3.10s latency (HD-Guard)", "category": "Safety", "capabilities": "Unsafe action detection for embodied household agents: video understanding, physical reasoning, real-time risk monitoring, safety classification", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-03"}, {"id": 309, "name": "Agent Security Bench (ASB)", "publisher": "Zhejiang U / Rutgers U", "date": "Oct 2024", "venue": "ICLR 2025", "url": "https://arxiv.org/abs/2410.02644", "tasks": "400 attack tasks; 10 scenarios; 420+ tools; 10 attack types; 11 defense methods", "topScore": "NRP=43.56% (Claude-3.5 Sonnet)", "category": "Safety", "capabilities": "LLM agent security: direct/indirect prompt injection, memory poisoning, backdoor attacks; Net Resilient Performance metric; attack-defense trade-offs", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-03"}, {"id": 310, "name": "AgentLAB", "publisher": "Stony Brook U", "date": "Feb 2026", "venue": "ICML 2026", "url": "https://arxiv.org/abs/2602.16901", "tasks": "644 test cases; 28 agentic environments; 5 long-horizon attack types; 9-10 risk categories", "topScore": "Claude-4.5 most resistant (ASR=28.9%); Qwen-3 most vulnerable (ASR=81.5%)", "category": "Safety", "capabilities": "Multi-turn long-horizon agent attack benchmark: intent hijacking, tool chaining, objective drifting, task injection, memory poisoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-03"}, {"id": 311, "name": "TeleAI-Safety", "publisher": "China Telecom TeleAI", "date": "Dec 2025", "venue": "arxiv", "url": "https://arxiv.org/abs/2512.05485", "tasks": "342 curated samples; 12 risk categories; 19 attack + 29 defense + 19 eval methods; 14 models", "topScore": "—", "category": "Safety", "capabilities": "LLM jailbreak evaluation framework: adversarial attack robustness, defense coverage, modular pipeline; Morpheus multi-round attacker; RADAR multi-agent evaluator", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-03"}, {"id": 312, "name": "ForecastBench", "publisher": "Forecasting Research Institute", "date": "2024", "venue": "ICLR 2025", "url": "https://www.forecastbench.org/", "tasks": "Dynamic binary forecasting questions; auto-generated + market question types; human superforecaster baselines (LEAP)", "topScore": "Human superforecasters significantly outperform top LLMs (p<0.001)", "category": "Reasoning", "capabilities": "Probabilistic forecasting of real-world future events: Brier Index metric; contamination-free dynamic generation; tournament + baseline tracks", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-03"}, {"id": 313, "name": "LOCA-bench", "publisher": "HKUST", "date": "Feb 2026", "venue": "—", "url": "https://arxiv.org/abs/2602.07962", "tasks": "525 samples (15 seed tasks × 7 context lengths × 5 seeds), 7 mock services, ~280 tools", "topScore": "68.1% avg (Claude-4.5-Opus); 96% at 8K → 14.7% at 256K", "category": "Tool Use", "capabilities": "Long-context agentic tool use: context rot measurement, multi-step reasoning under extreme context growth, context engineering strategies", "score": null, "citations": null, 
"tier": null, "importedAt": "2026-04-10"}, {"id": 314, "name": "ClawArena", "publisher": "UC Santa Cruz / Aiming Lab", "date": "Apr 2026", "venue": "—", "url": "https://arxiv.org/abs/2604.04202", "tasks": "64 scenarios, 1,879 eval rounds, 365 dynamic updates, 8 professional domains", "topScore": "0.735 (Claude Opus 4.6)", "category": "Enterprise", "capabilities": "Multi-source conflict reasoning, dynamic belief revision, implicit personalization, workspace grounding across professional domains", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-10"}, {"id": 315, "name": "FoodTruck Bench", "publisher": "Independent (@foodtruckbench)", "date": "Mar 2026", "venue": "—", "url": "https://foodtruckbench.com/", "tasks": "30-day business simulation, 34 agent tools, 24 models tested, 5 runs per model", "topScore": "$49,519 / +2376% ROI (Claude Opus 4.6)", "category": "Reasoning", "capabilities": "Sustained multi-step strategic reasoning, resource management, capital allocation, inventory optimization, long-horizon business decision-making", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-10"}, {"id": 316, "name": "KellyBench", "publisher": "General Reasoning, Inc.", "date": "Apr 2026", "venue": "—", "url": "https://www.gr.inc/KellyBenchPaper.pdf", "tasks": "Full EPL season sim (100-150 matchdays, 500-900 tool calls, 30-500M tokens/episode)", "topScore": "-11.0% ROI (Claude Opus 4.6); all models lose money", "category": "Reasoning", "capabilities": "Long-horizon sequential decision-making, ML model building, risk management, bet sizing, knowledge-action gap exposure", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-10"}, {"id": 317, "name": "CAR-bench", "publisher": "University of Augsburg", "date": "Jan 2026", "venue": "—", "url": "https://arxiv.org/abs/2601.22027", "tasks": "254 tasks (completion, hallucination, disambiguation); 58 tools; 48 cities, 130K POIs", "topScore": "—", "category": "Tool Use", "capabilities": "Multi-turn voice assistant tool use; limit-awareness (hallucination avoidance); disambiguation; consistency vs. 
latency (Pass^k vs Pass@k)", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 318, "name": "ToolScan (SpecTool)", "publisher": "Salesforce AI Research", "date": "Nov 2024", "venue": "—", "url": "https://arxiv.org/abs/2411.13547", "tasks": "150 annotated queries; 10 tool-use environments; 7-category error taxonomy", "topScore": "All tested LLMs exhibit multiple error patterns", "category": "Tool Use", "capabilities": "Tool-use error diagnosis: wrong tool, missing arg, wrong format, hallucinated arg, wrong value, extra arg; first systematic error taxonomy for LLM function calling", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 319, "name": "TOP-Bench", "publisher": "IIE / CAS", "date": "Dec 2025", "venue": "—", "url": "https://arxiv.org/abs/2512.16310", "tasks": "7 privacy categories; paired leakage/benign scenarios", "topScore": "H-Score <0.3 baseline; PEP mitigation: RLR=46.58%", "category": "Safety", "capabilities": "Cross-tool privacy leakage evaluation; TOP-R metric; 7 privacy categories; paired benign/adversarial evaluation", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 320, "name": "ToolPRMBench", "publisher": "Arizona State / Bosch", "date": "Jan 2026", "venue": "—", "url": "https://arxiv.org/abs/2601.12294", "tasks": "Large-scale multi-source; offline + online trajectory sampling", "topScore": "Tool PRMs >> general PRMs >> LLM-as-PRM", "category": "Tool Use", "capabilities": "Step-level process reward modeling for tool-using agents; specialized tool PRMs substantially outperform general-purpose PRMs", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 321, "name": "ClawBench", "publisher": "UBC / Vector / CMU / UWaterloo", "date": "Apr 2026", "venue": "—", "url": "https://arxiv.org/abs/2604.08523", "tasks": "153 tasks; 144 live production websites; 15 life categories", "topScore": "Claude Sonnet 4.6: 33.3% (vs 65–75% on sandboxed benchmarks)", "category": "Web", "capabilities": "Real-world everyday web tasks on live production websites; write-heavy multi-step workflows; reveals 2× benchmark-to-reality gap", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 322, "name": "WebSP-Eval", "publisher": "University of Wisconsin–Madison", "date": "Apr 2026", "venue": "—", "url": "https://arxiv.org/abs/2604.06367", "tasks": "200 instances; 138 distinct tasks; 28 sites; 7 categories", "topScore": "Gemini-3-Pro: 76.5% (autonomous)", "category": "Web", "capabilities": "Web agent security & privacy task evaluation; cookie/privacy settings; agents fail 45%+ on stateful UI (toggles, checkboxes); first security/privacy web agent framework", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 323, "name": "InterruptBench", "publisher": "UIC / McGill / MBZUAI / UCSB / USC", "date": "Apr 2026", "venue": "—", "url": "https://arxiv.org/abs/2604.00892", "tasks": "Derived from WebArena-Lite; 3 interruption types (addition, revision, retraction)", "topScore": "All 6 LLMs struggle", "category": "Web", "capabilities": "Mid-task user interruptions in long-horizon web navigation; agent replanning upon intent changes; first interruptible agent benchmark", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 324, "name": "Emergence WebVoyager", "publisher": "Emergence AI / Northwestern", "date": "Mar 2026", "venue": "—", "url": "https://arxiv.org/abs/2603.29020", "tasks": 
"535 tasks (35/category; 45 for search); 15 website categories", "topScore": "OpenAI Operator: 68.6% (vs 87% self-reported)", "category": "Web", "capabilities": "Standardized reproducible WebVoyager evaluation; corrects self-reported scores; inter-annotator agreement study", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 325, "name": "Ego2Web", "publisher": "UNC / Google", "date": "Mar 2026", "venue": "—", "url": "https://arxiv.org/abs/2603.22529", "tasks": "AR/egocentric use cases", "topScore": "Ego2WebJudge ~84% human agreement", "category": "Web", "capabilities": "Egocentric video-grounded web agent evaluation; cross-modal object recognition for AR paradigm; novel grounding methodology", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 326, "name": "TimeWarp", "publisher": "University of Utah", "date": "Mar 2026", "venue": "—", "url": "https://arxiv.org/abs/2603.04949", "tasks": "Multiple historical website snapshots", "topScore": "TimeTraj outperforms baselines", "category": "Web", "capabilities": "Temporal robustness of web agents on archived past website states; interface drift robustness; plan distillation across UI versions", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 327, "name": "Persona2Web", "publisher": "Yonsei University", "date": "Feb 2026", "venue": "—", "url": "https://arxiv.org/abs/2602.17003", "tasks": "—", "topScore": "Clarify-to-personalize outperforms silent inference", "category": "Web", "capabilities": "Personalized web agent evaluation with user browsing history; preference inference; ambiguity resolution; clarify-to-personalize paradigm", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 328, "name": "PATHWAYS", "publisher": "University of Dhaka", "date": "Feb 2026", "venue": "—", "url": "https://arxiv.org/abs/2602.05354", "tasks": "289 core tasks (150 Shopping + 139 Reddit) + 50 adversarial", "topScore": "GPT: 67.3% (best of 4 models)", "category": "Web", "capabilities": "Multi-hop web investigation and context discovery; fraud detection, content moderation, hidden context uncovering; behavioral forensics", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 329, "name": "WARC-Bench", "publisher": "Uniphore", "date": "Oct 2025", "venue": "—", "url": "https://arxiv.org/abs/2510.09872", "tasks": "438 test tasks (200 real-world); 1,059/238 train/dev", "topScore": "64.8% (best frontier); RLVR: 52.8%", "category": "Web", "capabilities": "GUI subtask execution on WARC-archived real websites; sandboxed authentic web content; RLVR training support", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 330, "name": "OS-Marathon", "publisher": "Oxford / Microsoft / Georgia Tech", "date": "Jan 2026", "venue": "—", "url": "https://arxiv.org/abs/2601.20650", "tasks": "242 workflows; 5–25 repetitions each", "topScore": "Condensed-demonstration few-shot outperforms", "category": "OS", "capabilities": "Long-horizon repetitive workflow execution for computer-use agents; consistency across iterations; error recovery; condensed-demonstration learning", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 331, "name": "macOSWorld", "publisher": "Show Lab, NUS", "date": "Jun 2025", "venue": "NeurIPS 2025", "url": "https://arxiv.org/abs/2506.04135", "tasks": "231 tasks (202 core + 29 safety); 5 languages", "topScore": "~40% (Claude 3.7 Sonnet / 
GPT-4o)", "category": "OS", "capabilities": "Multilingual macOS GUI automation; 5 languages; safety/deception subset; significant cross-language performance gap", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 332, "name": "NaturalGAIA", "publisher": "Anonymous (ACL 2026 submission)", "date": "Aug 2025", "venue": "ACL 2026 (submitted)", "url": "https://arxiv.org/abs/2508.01330", "tasks": "Multi-level tasks + high-quality trajectory dataset", "topScore": "57.0% ATSR / 44.1% WPSR (LightManus+Jarvis)", "category": "OS", "capabilities": "Challenging multi-level GUI agent benchmark; atomic subtask decomposition; high-quality annotated trajectory dataset", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 333, "name": "VenusBench-GD", "publisher": "Ant Group (Venus Team)", "date": "Dec 2025", "venue": "—", "url": "https://arxiv.org/abs/2512.16501", "tasks": "6,100+ sample pairs; 97+ apps; 3 platforms (Android, iOS, PC)", "topScore": "75.0% (UI-Venus-1.5-30B-A3B)", "category": "OS", "capabilities": "Multi-platform GUI grounding; UI element localization; bilingual EN+ZH; 3-platform cross-environment evaluation", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 334, "name": "EntWorld", "publisher": "Zhongguancun Lab / Tsinghua", "date": "Jan 2026", "venue": "—", "url": "https://arxiv.org/abs/2601.17722", "tasks": "1,756 tasks", "topScore": "56.89% (EntAgent-RL); human ~85%", "category": "Enterprise", "capabilities": "Enterprise GUI agent benchmark; multi-app coordination; SQL-verified state checking; RL agents outperform prompted baselines", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 335, "name": "LPS-Bench", "publisher": "ShanghaiTech / Shanghai AI Lab / Rice", "date": "Feb 2026", "venue": "—", "url": "https://arxiv.org/abs/2602.03255", "tasks": "130 instances (65 benign + 65 adversarial); 7 domains; 9 risk types", "topScore": "All frontier models show substantial safety deficiencies", "category": "Safety", "capabilities": "Safety awareness for computer-use agents in long-horizon planning; MCP tool risk recognition; benign vs adversarial paired evaluation; 7 application domains", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 336, "name": "CUAVerifierBench", "publisher": "Microsoft Research / Browserbase", "date": "Apr 2026", "venue": "—", "url": "https://arxiv.org/abs/2604.06240", "tasks": "140 web task trajectories with dual human labels (process + outcome)", "topScore": "Universal Verifier: human-level inter-annotator agreement", "category": "General", "capabilities": "Meta-evaluation benchmark for computer-use agent verifier quality; dual-label annotation; taxonomy of 6 failure types; near-zero false positives", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 337, "name": "DECEPTICON", "publisher": "Stanford", "date": "Dec 2025", "venue": "—", "url": "https://arxiv.org/abs/2512.22894", "tasks": "700 tasks (600 synthetic + 100 real-world)", "topScore": "Agents >70% manipulated vs 31% human", "category": "Safety", "capabilities": "Dark pattern manipulation of web agents; adversarial UI robustness; 7 dark pattern categories; agents far more susceptible than humans", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 338, "name": "SusBench", "publisher": "University of Washington", "date": "Oct 2025", "venue": "—", "url": 
"https://arxiv.org/abs/2510.11035", "tasks": "313 tasks; 55 websites; 9 dark pattern types", "topScore": "Most susceptible: Preselection, Trick Wording, Hidden Info", "category": "Safety", "capabilities": "Online benchmark for dark pattern susceptibility of computer-use agents; 9 dark pattern categories; human-agent comparison", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 339, "name": "SecureWebArena", "publisher": "Beihang University / CAS", "date": "Oct 2025", "venue": "—", "url": "https://arxiv.org/abs/2510.10073", "tasks": "2,970 trajectories; 6 web environments; 6 attack vectors", "topScore": "All 9 tested LVLMs universally vulnerable", "category": "Safety", "capabilities": "Holistic security evaluation for LVLM web agents; 6 attack types (prompt injection, jailbreak, pop-up, ad injection, distractor, indirect); all SOTA models fail", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 340, "name": "KnowU-Bench", "publisher": "Zhejiang University (ZJU-REAL)", "date": "Apr 2026", "venue": "—", "url": "https://arxiv.org/abs/2604.08455", "tasks": "192 tasks (42 general, 86 personalized, 64 proactive); 23 Android apps", "topScore": "Claude Sonnet 4.6: 44.2% (hard personalized split)", "category": "OS", "capabilities": "Interactive personalized mobile agent evaluation; user preference inference, proactive intervention, multi-turn elicitation; open-source models <12%", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 341, "name": "PSPA-Bench", "publisher": "NPU / Tsinghua / PKU", "date": "Mar 2026", "venue": "—", "url": "https://arxiv.org/abs/2603.29318", "tasks": "12,855 instances; 10 daily scenarios; 22 apps; 100 user personas", "topScore": "—", "category": "OS", "capabilities": "Personalized smartphone GUI benchmark; largest personalized mobile agent benchmark; user behavior alignment across 100 personas", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 342, "name": "TelcoAgent-Bench", "publisher": "Bariah, Mefgouda, Tavakkoli et al.", "date": "Apr 2026", "venue": "—", "url": "https://arxiv.org/abs/2604.06209", "tasks": "1,470 samples; 49 fault blueprints; 20 fault types; bilingual EN+AR", "topScore": "IRA=0.94, SAS=1.00 (Qwen-3-8B)", "category": "Tool Use", "capabilities": "Multilingual telecom AI agent evaluation; intent inference, ordered tool sequencing, distractor avoidance; bilingual Arabic/English evaluation", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 343, "name": "WebDS", "publisher": "—", "date": "Aug 2025", "venue": "—", "url": "https://arxiv.org/abs/2508.01222", "tasks": "—", "topScore": "—", "category": "Data Science", "capabilities": "End-to-end web-based data science benchmark; data discovery, retrieval, and analysis pipeline across real web sources", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 344, "name": "PokeGym", "publisher": "UESTC", "date": "Apr 2026", "venue": "—", "url": "https://arxiv.org/abs/2604.08340", "tasks": "Milestone-based (badges + story events)", "topScore": "—", "category": "Reasoning", "capabilities": "Visually-driven long-horizon VLM benchmark using Pokémon game environment; sequential decision-making; milestone-based progress evaluation", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-13"}, {"id": 346, "name": "On the Tool Manipulation Capability of Open-source Large Lan", "publisher": "Qiantong Xu", 
"date": "2023", "venue": "arXiv", "url": "https://arxiv.org/abs/2305.16504", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "tool-use, function-calling, code-generation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-28"}, {"id": 349, "name": "GPQA", "publisher": "David Rein et al.", "date": "2023", "venue": "arXiv", "url": "https://arxiv.org/abs/2311.12022", "tasks": "—", "topScore": "—", "category": "Reasoning", "capabilities": "reasoning, scalable-oversight", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-28"}, {"id": 353, "name": "API-BLEND", "publisher": "Kinjal Basu", "date": "2024", "venue": "arXiv", "url": "https://arxiv.org/abs/2402.15491", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "function-calling, tool-use", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-28"}, {"id": 354, "name": "Benchmarking Data Science Agents", "publisher": "Yuge Zhang et al.", "date": "2024", "venue": "arXiv", "url": "https://arxiv.org/abs/2402.17168", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "code-generation, reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-28"}, {"id": 356, "name": "Introducing the WorkArena Benchmark", "publisher": "ServiceNow Research", "date": "2024", "venue": "Announcement", "url": "https://www.servicenow.com/blogs/2024/introducing-workarena-benchmark", "tasks": "—", "topScore": "—", "category": "Enterprise", "capabilities": "web-agents, knowledge-work, enterprise, browser-automation, servicenow", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-07"}, {"id": 357, "name": "Nexus Function Calling Evaluation (NexusFCEval)", "publisher": "Nexusflow", "date": "2024", "venue": "Announcement", "url": "https://huggingface.co/datasets/Nexusflow/NexusFCEval", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "function-calling, tool-use", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 370, "name": "An Illusion of Progress? 
Assessing the Current State of Web Agents", "publisher": "Tianci Xue", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2504.01382", "tasks": "—", "topScore": "—", "category": "Web", "capabilities": "web-navigation, planning, survey", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-28"}, {"id": 371, "name": "WASP", "publisher": "Ivan Evtimov et al.", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2504.18575", "tasks": "—", "topScore": "—", "category": "Web", "capabilities": "web-navigation, security, prompt-injection, adversarial, neurips-2025, facebook-", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-15"}, {"id": 374, "name": "Introducing TRAIL", "publisher": "Patronus AI", "date": "2025", "venue": "Announcement", "url": "https://www.patronus.ai/blog/introducing-trail-a-benchmark-for-agentic-evaluation", "tasks": "—", "topScore": "—", "category": "General", "capabilities": "debugging, trace-analysis, error-detection, agent-evaluation", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-07"}, {"id": 377, "name": "A Functionality-Grounded Benchmark for Evaluating Web Agents", "publisher": "Xianren Zhang et al.", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2508.15832", "tasks": "—", "topScore": "—", "category": "Web", "capabilities": "web-navigation, e-commerce, amazon, safety, functionality, web-agent, account-management", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-15"}, {"id": 379, "name": "Introducing Recovery-Bench", "publisher": "Letta", "date": "2025", "venue": "Announcement", "url": "https://www.letta.com/blog/recovery-bench", "tasks": "—", "topScore": "—", "category": "General", "capabilities": "error-recovery, context-pollution, terminal-use, resilience, continual-learning", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-28"}, {"id": 380, "name": "Holistic Agent Leaderboard", "publisher": "Sayash Kapoor", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2510.11977", "tasks": "—", "topScore": "—", "category": "General", "capabilities": "meta-evaluation, infrastructure, reproducibility, princeton, iclr-2026", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-07"}, {"id": 382, "name": "Introducing Spring AI Agents and Spring AI Bench", "publisher": "Spring AI Community (VMware Tanzu / Broadcom)", "date": "2025", "venue": "Announcement", "url": "https://spring.io/blog/2025/10/28/agents-and-benchmarks/", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "coding, enterprise, java, spring, developer-productivity, tool-use, pr-review, i", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-28"}, {"id": 383, "name": "GUI-360°", "publisher": "Jian Mu et al.", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2511.04307", "tasks": "—", "topScore": "—", "category": "OS", "capabilities": "gui, computer-use, windows, desktop, grounding, screen-parsing, action-prediction", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-15"}, {"id": 384, "name": "PropensityBench | SEAL by Scale AI", "publisher": "Scale AI", "date": "2025", "venue": "Announcement", "url": "https://scale.com/leaderboard/propensitybench", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "safety, propensity, alignment, tool-use, biosecurity, cybersecurity, chemical-security", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-09"}, {"id": 
385, "name": "NeuroGrid CTF", "publisher": "Hack The Box", "date": "2025", "venue": "Announcement", "url": "https://www.hackthebox.com/events/neurogrid", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "cybersecurity, ctf, offensive-security, ai-vs-human, tool-use, reasoning, autono", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-28"}, {"id": 388, "name": "MiniMax Open-Sources New Benchmark", "publisher": "MiniMax", "date": "2026", "venue": "Announcement", "url": "https://www.minimax.io/news/production-grade-benchmark-for-coding-agents", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "coding-agent, instruction-following, production-grade, open-source, tool-use", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 389, "name": "Introducing APEX-Agents", "publisher": "Mercor", "date": "2026", "venue": "Announcement", "url": "https://www.mercor.com/blog/introducing-apex-agents/", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "professional-services, investment-banking, consulting, corporate-law, long-horiz", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-07"}, {"id": 392, "name": "Scale Labs Leaderboard", "publisher": "Scale AI (Scale Labs)", "date": "2026", "venue": "Announcement", "url": "https://scale.com/leaderboard/tool_use_chat", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "tool-use, function-calling, compositional, chain-of-tools, scale-ai", "score": null, "citations": null, "tier": null, "importedAt": "2026-03-29"}, {"id": 393, "name": "AgentCE-Bench", "publisher": "Wang Yang et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.06111", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "planning, tool-use, reasoning, configurable, lightweight, training-eval", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-15"}, {"id": 394, "name": "The Amazing Agent Race", "publisher": "Zae Myung Kim et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.10261", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "tool-use, web-navigation, wikipedia, dag, multi-step, harbor, reasoning, composi", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-15"}, {"id": 395, "name": "CocoaBench", "publisher": "Shibo Hao et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.11201", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "multimodal, vision, web-navigation, coding, tool-use, long-horizon, unified-agen", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-15"}, {"id": 398, "name": "WebGames", "publisher": "George Thomas", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2502.18356", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "web-navigation, tool-use, planning, reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-16"}, {"id": 399, "name": "Open-world evaluations for measuring frontier AI capabilitie", "publisher": "Sayash Kapoor", "date": "2026", "venue": "Announcement", "url": "https://cruxevals.com/open-world-evaluations.pdf", "tasks": "—", "topScore": "—", "category": "General", "capabilities": "announcement, crux, open_world_evaluation, frontier_capabilities, ios_app_develo", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-16"}, {"id": 400, 
"name": "AlphaEval", "publisher": "Pengrui Lu", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.12162", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "enterprise, multi-agent, tool-use, code-generation, reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-16"}, {"id": 401, "name": "LiveClawBench", "publisher": "Xiang Long", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.13072", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "tool-use, reasoning, planning, memory, multi-agent, web-navigation, os-interacti", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-16"}, {"id": 402, "name": "SIR-Bench", "publisher": "Daniel Begimher", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.12040", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "security, tool-use, reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-16"}, {"id": 403, "name": "FrontierSWE", "publisher": "Proximal Labs", "date": "2026", "venue": "Announcement", "url": "https://www.frontierswe.com/blog", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "coding, software_engineering, long_horizon, frontier_swe, proximal_labs", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-16"}, {"id": 404, "name": "ParseBench", "publisher": "LlamaIndex (Boyang Zhang", "date": "2026", "venue": "Announcement", "url": "https://www.parsebench.ai/", "tasks": "—", "topScore": "—", "category": "General", "capabilities": "announcement, document-parsing, ocr, llamaindex, tables, charts, visual-groundin", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-16"}, {"id": 405, "name": "PAC-Bench", "publisher": "Minjun Park", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.11523", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "multi-agent, privacy, tool-use, reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-16"}, {"id": 406, "name": "N-Day-Bench", "publisher": "Winfunc Research", "date": "2026", "venue": "Announcement", "url": "https://ndaybench.winfunc.com", "tasks": "—", "topScore": "—", "category": "Security", "capabilities": "security, vulnerability_discovery, n_day, code_analysis, llm_as_judge, contamina", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-16"}, {"id": 407, "name": "GDP.pdf", "publisher": "Surge AI", "date": "2026", "venue": "Announcement", "url": "https://surgehq.ai/blog/gdp-pdf-can-100b-ai-models-master-the-documents-that-run-the-world", "tasks": "—", "topScore": "—", "category": "Enterprise", "capabilities": "announcement, document_understanding, pdf, enterprise, multimodal, extraction, p", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-16"}, {"id": 408, "name": "Benchmarking LLMs' Swarm Intelligence", "publisher": "RUC-GSAI (Renmin University of China)", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2505.04364", "tasks": "—", "topScore": "—", "category": "Multi-Agent", "capabilities": "multi-agent, swarm-intelligence, decentralized, coordination, embodied", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-17"}, {"id": 409, "name": "AssetOpsBench", "publisher": "IBM Research", "date": "2025", "venue": "Announcement", "url": "https://huggingface.co/blog/ibm-research/assetopsbench-playground-on-hugging-face", "tasks": 
"—", "topScore": "—", "category": "Tool Use", "capabilities": "multi-agent, industrial, enterprise, asset-management, tool-use", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-17"}, {"id": 410, "name": "AgentsNet", "publisher": "Florian Grötschla", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2507.08616", "tasks": "—", "topScore": "—", "category": "Multi-Agent", "capabilities": "multi-agent, coordination, distributed-systems, graph-theory, collaboration", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-17"}, {"id": 411, "name": "AgentLeak", "publisher": "Privatris team (GitHub: Privatris/AgentL", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2602.11510", "tasks": "—", "topScore": "—", "category": "Security", "capabilities": "multi-agent, privacy, security, llm, data-leakage", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-17"}, {"id": 412, "name": "ACIArena", "publisher": "ACIArena team", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.07775", "tasks": "—", "topScore": "—", "category": "Security", "capabilities": "multi-agent, security, prompt-injection, robustness, mas", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-17"}, {"id": 413, "name": "AgentSocialBench", "publisher": "Prince Zizhuang Wang", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.01487", "tasks": "—", "topScore": "—", "category": "Multi-Agent", "capabilities": "multi-agent, privacy, social-networks, human-agent-interaction, llm", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-17"}, {"id": 414, "name": "SAGE", "publisher": "Ling Shi", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.09285", "tasks": "—", "topScore": "—", "category": "Multi-Agent", "capabilities": "multi-agent, service-agent, customer-service, sop-compliance", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-17"}, {"id": 415, "name": "MirrorCode", "publisher": "Epoch AI & METR", "date": "2026", "venue": "Announcement", "url": "https://epoch.ai/blog/mirrorcode-preliminary-results", "tasks": "Long-horizon library reimplementation tasks (gotree bioinformatics toolkit 16K lines 40+ commands — estimated 2-17 human-weeks)", "topScore": "Claude Opus 4.6: first to fully succeed", "category": "Coding", "capabilities": "long-horizon, software-engineering, autonomous-coding, safety, metr", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-19"}, {"id": 416, "name": "Plan-RewardBench", "publisher": "(pending)", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.08178", "tasks": "4 task families: Safety Refusal, Tool-Irrelevance, Complex Planning, Robust Error Recovery", "topScore": "All RM types degrade sharply on long-horizon trajectories", "category": "Reasoning", "capabilities": "reward-modeling, planning, trajectory-level, tool-use, evaluation", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-19"}, {"id": 417, "name": "DeltaBench", "publisher": "LivingFutureLab / OpenStellarTeam", "date": "2025", "venue": "ACL 2025", "url": "https://arxiv.org/abs/2502.19361", "tasks": "1,236 samples: Math, Code, PCB, General Reasoning; long CoT error detection", "topScore": "GPT-4-turbo: F1=40.8%", "category": "Reasoning", "capabilities": "chain-of-thought, error-detection, process-reward-models, meta-evaluation", "score": null, "citations": null, "tier": null, "importedAt": 
"2026-04-19"}, {"id": 418, "name": "FaithCoT-Bench", "publisher": "(pending)", "date": "2025", "venue": "ICLR 2026 Workshop", "url": "https://arxiv.org/abs/2510.04040", "tasks": "1,000+ annotated CoT trajectories; 4 LLMs × 4 domains; 300+ unfaithful instances", "topScore": "All detection methods show inconsistent reliability", "category": "Reasoning", "capabilities": "chain-of-thought, faithfulness, interpretability, meta-evaluation, evaluation", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-19"}, {"id": 420, "name": "FinMTM", "publisher": "(pending)", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2602.03130", "tasks": "11,133 bilingual QA pairs; objective QA, multi-turn dialogue, agent tasks (MCP tools)", "topScore": "22 VLMs evaluated; all show significant limitations", "category": "Reasoning", "capabilities": "financial, multimodal, multi-turn, agent, tool-use, bilingual, reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-19"}, {"id": 421, "name": "AEC-Bench", "publisher": "Nomic AI", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2603.29199", "tasks": "196 tasks across 9 families; real AEC domain documents", "topScore": "—", "category": "General", "capabilities": "multimodal, domain-specific, reasoning, drawing-understanding, cross-sheet", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-19"}, {"id": 422, "name": "AgentDrive-MCQ", "publisher": "Ferrag, Lakas, Debbah", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2601.16964", "tasks": "100,000 MCQs (5 reasoning dims); 300K training scenarios; 50 LLMs evaluated", "topScore": "Frontier models best in policy reasoning; open models closing gap", "category": "General", "capabilities": "autonomous-driving, reasoning, planning, domain-specific, MCQ", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-19"}, {"id": 423, "name": "From Plan to Action", "publisher": "(pending full author list)", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.12147", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "planning, software-engineering, agent-behavior, plan-compliance, swe-bench", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-19"}, {"id": 424, "name": "CFE-Bench", "publisher": "Analogy AI / Northwestern U / UC Santa Cruz / Duke / U Birmingham / U Rochester", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2602.19517", "tasks": "449 problems (305 text-only, 144 multimodal); 20+ STEM domains", "topScore": "Gemini-3.1-pro-preview: 59.69%", "category": "Reasoning", "capabilities": "reasoning, multimodal, multi-step, mathematical, scientific, symbolic-manipulation", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-21"}, {"id": 425, "name": "MoReBench", "publisher": "Scale AI / U Washington / NYU / Harvard / UMich / UNC / Center for AI Safety / Stanford / MIT / Oxford", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2510.16380", "tasks": "1,000 moral dilemma scenarios with 23,018 rubric criteria; MoReBench-Theory: 150 additional scenarios", "topScore": "Logical Process: ~41.5%; Harmless Outcome: ~81%", "category": "Reasoning", "capabilities": "moral-reasoning, safety, procedural-reasoning, rubric-evaluation, normative-ethics", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-21"}, {"id": 426, "name": "BankerToolBench", "publisher": "Handshake AI Research", "date": 
"2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.11304", "tasks": "100 end-to-end IB tasks; ~15,000 rubric evaluation points; Excel+PowerPoint+Word deliverables", "topScore": "Best model fails ~50% rubric criteria; 0% client-ready", "category": "Enterprise", "capabilities": "tool-use, long-horizon, financial-modeling, multi-file-generation, enterprise, reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-21"}, {"id": 427, "name": "HiL-Bench", "publisher": "Scale AI", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.09408", "tasks": "300 tasks (150 SWE + 150 text-to-SQL); 200 public + 100 private; 3–5 blockers per task", "topScore": "Ask-F1: frontier models 4–24% in HiL condition vs. 75–89% with full context", "category": "Agentic", "capabilities": "planning, reasoning, clarification-seeking, uncertainty-awareness, help-seeking, SWE, SQL", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-21"}, {"id": 428, "name": "CIK-Bench", "publisher": "UC Santa Cruz / NUS / Tencent / ByteDance / UCB / UNC", "date": "2026-04", "venue": "arxiv", "url": "https://arxiv.org/abs/2604.04759", "tasks": "Personal AI agent attack scenarios (Capability injection, Identity manipulation, Knowledge poisoning); privacy leakage & irreversible harm eval", "topScore": "(pending)", "category": "Safety", "capabilities": ["Safety", "Multi-turn", "Long Horizon", "Reliability"], "score": 50, "citations": 0, "tier": 3, "importedAt": "2026-04-23T13:48:17.229413Z"}, {"id": 429, "name": "SafePro", "publisher": "UC Santa Cruz / UC Santa Barbara / eBay", "date": "2026-01", "venue": "arxiv", "url": "https://arxiv.org/abs/2601.06663", "tasks": "275 tasks across 51 occupations in 9 U.S. economy sectors; UnsafeRate & SafetyScore via LLM-as-judge", "topScore": "(pending)", "category": "Safety", "capabilities": ["Safety", "Enterprise", "Reliability"], "score": 52, "citations": 0, "tier": 3, "importedAt": "2026-04-23T13:48:17.229413Z"}, {"id": 430, "name": "AgentMisalignment", "publisher": "Akshat Naik et al.", "date": "2025-06", "venue": "arxiv", "url": "https://arxiv.org/abs/2506.04018", "tasks": "9 agentic environments measuring misalignment: oversight avoidance, shutdown resistance, sandbagging, power-seeking, resource acquisition, deception, moral flexibility", "topScore": "(CMS metric)", "category": "Safety", "capabilities": ["Safety", "Reasoning", "Multi-turn", "Multi-Agent", "Long Horizon"], "score": 55, "citations": 0, "tier": 3, "importedAt": "2026-04-23T13:48:17.229413Z"}, {"id": 431, "name": "The Last Ones (TLO)", "publisher": "UK AI Security Institute (AISI)", "date": "2026-03", "venue": "arxiv", "url": "https://arxiv.org/abs/2603.11214", "tasks": "32 sequential steps / 9 milestones: full corporate network attack kill chain (recon → lateral movement → exfiltration)", "topScore": "(pending)", "category": "Cybersecurity", "capabilities": ["Safety", "Reasoning", "Tool Use", "OS/GUI", "Multi-turn", "Long Horizon"], "score": 55, "citations": 0, "tier": 3, "importedAt": "2026-04-23T13:48:17.229413Z"}, {"id": 432, "name": "Cooling Tower", "publisher": "UK AI Security Institute (AISI)", "date": "2026-03", "venue": "arxiv", "url": "https://arxiv.org/abs/2603.11214", "tasks": "7 sequential steps: ICS/OT attack on simulated power plant (HMI web exploitation, protocol reverse engineering, PLC register manipulation)", "topScore": "(pending)", "category": "Cybersecurity", "capabilities": ["Safety", "Reasoning", "Tool Use", "OS/GUI", "Multi-turn", "Long 
Horizon"], "score": 54, "citations": 0, "tier": 3, "importedAt": "2026-04-23T13:48:17.229413Z"}, {"id": 433, "name": "LinuxArena", "publisher": "Redwood Research + Equistamp", "date": "2026-04", "venue": "arxiv", "url": "https://arxiv.org/abs/2604.15384", "tasks": "1,671 SE tasks (code gen, debugging, refactoring, infra, docs, testing) + 184 sabotage side tasks in 20 Docker Compose production environments", "topScore": "Claude Opus 4.6: ~23% undetected sabotage at 1% FPR", "category": "AI Control", "capabilities": ["Code Gen", "Bug Fix", "Tool Use", "OS/GUI", "Reasoning", "Safety", "Multi-Agent", "Long Horizon", "Reliability"], "score": 58, "citations": 0, "tier": 3, "importedAt": "2026-04-23T13:48:17.229413Z"}, {"id": 434, "name": "LaStraj", "publisher": "Redwood Research + Equistamp", "date": "2026-04", "venue": "arxiv", "url": "https://arxiv.org/abs/2604.15384", "tasks": "Human red-teaming attack trajectory dataset; upper bound for AI control difficulty", "topScore": "Human attacks dominate model-generated attacks", "category": "AI Control", "capabilities": ["Safety", "Reliability"], "score": 50, "citations": 0, "tier": 3, "importedAt": "2026-04-23T13:48:17.229413Z"}, {"id": 435, "name": "OccuBench", "publisher": "Qwen Team, Alibaba Group + CUHK", "date": "2026-04", "venue": "arxiv", "url": "https://arxiv.org/abs/2604.10866", "tasks": "100 scenarios, 382 evaluation instances across 65 occupational domains in 10 industries; Language Environment Simulators for stateful tool simulation", "topScore": "GPT-5.2: 79.6% CR; Gemini 3.1 Pro: 72.3%; Claude Opus 4.6: 71.5%", "category": "Enterprise", "capabilities": ["Tool Use", "Function Call", "Reasoning", "Multi-turn", "Enterprise", "Long Horizon", "Reliability"], "score": 57, "citations": 0, "tier": 3, "importedAt": "2026-04-23T13:48:17.229413Z"}, {"id": 436, "name": "MedAgentBoard", "publisher": "Yuhao Zhu et al. 
/ NeurIPS 2025", "date": "2025-05", "venue": "NeurIPS 2025", "url": "https://arxiv.org/abs/2505.12371", "tasks": "4 medical task categories: medical QA (MedQA/PubMedQA/PathVQA/VQA-RAD), lay summary generation, EHR prediction (MIMIC-IV), clinical workflow automation", "topScore": "Multi-agent outperforms single-LLM on 3/4 tasks", "category": "Multi-Agent", "capabilities": ["Reasoning", "Multi-turn", "Memory", "Multi-Agent", "Scientific", "Multimodal", "Long Horizon", "Reliability"], "score": 62, "citations": 0, "tier": 3, "importedAt": "2026-04-23T13:48:17.229413Z"}, {"id": 437, "name": "AgentArch", "publisher": "ServiceNow", "date": "2025-09", "venue": "arxiv", "url": "https://arxiv.org/abs/2509.10769", "tasks": "18 architectural configs × 6 LLMs × 2 enterprise tasks × 60 samples × 8 trials ≈ 17,280 runs; Time Off/PTO + Customer Routing enterprise workflows", "topScore": "GPT-4.1: 70.8% (simple); Claude Sonnet 4: 35.3% (complex); Pass^K: 6.34%", "category": "Multi-Agent", "capabilities": ["Tool Use", "Function Call", "Reasoning", "Multi-turn", "Memory", "Multi-Agent", "Enterprise", "Cost Eval", "Long Horizon", "Reliability"], "score": 60, "citations": 0, "tier": 3, "importedAt": "2026-04-23T13:48:17.229413Z"}, {"id": 438, "name": "MASLegalBench", "publisher": "HKUST KnowComp / Tsinghua University", "date": "2025-09", "venue": "arxiv", "url": "https://arxiv.org/abs/2509.24922", "tasks": "Multi-agent deductive legal reasoning (GDPR); role-based specialisation; BM25+embedding RAG retrieval; multi-choice legal QA", "topScore": "(pending)", "category": "Multi-Agent", "capabilities": ["Reasoning", "Multi-turn", "Memory", "Multi-Agent", "Long Horizon", "Reliability"], "score": 55, "citations": 0, "tier": 3, "importedAt": "2026-04-23T13:48:17.229413Z"}, {"id": 439, "name": "CoopEval", "publisher": "Tewolde, Zhang et al.", "date": "2026-04", "venue": "arxiv", "url": "https://arxiv.org/abs/2604.15267", "tasks": "Mixed-motive social dilemma games; 4 cooperation mechanisms (repetition, reputation, mediation, contracting); replicator-dynamics population fitness", "topScore": "(pending)", "category": "Multi-Agent", "capabilities": ["Reasoning", "Multi-turn", "Multi-Agent", "Long Horizon"], "score": 52, "citations": 0, "tier": 3, "importedAt": "2026-04-23T13:48:17.229413Z"}, {"id": 440, "name": "TRAIL", "publisher": "Patronus AI", "date": "2025-05", "venue": "arxiv", "url": "https://arxiv.org/abs/2505.08638", "tasks": "148 annotated agentic traces, 1,987 OTel spans, 841 errors; 20+ error types; long-context up to 6M tokens; MIT license", "topScore": "Gemini-2.5-Pro: 11% joint accuracy", "category": "Trace Evaluation", "capabilities": ["Tool Use", "Reasoning", "Multi-turn", "Memory", "Safety", "Multi-Agent", "Long Horizon", "Reliability"], "score": 60, "citations": 0, "tier": 3, "importedAt": "2026-04-23T13:48:17.229413Z"}, {"id": 441, "name": "Introducing Aardvark", "publisher": "OpenAI (no individual author attributed;", "date": "2025", "venue": "Announcement", "url": "https://openai.com/index/introducing-aardvark/", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "security, cybersecurity, vulnerability-detection, code-analysis, patching, gpt-5", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-23"}, {"id": 442, "name": "AgenticRed", "publisher": "Jiayi Yuan et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2601.13518", "tasks": "—", "topScore": "—", "category": "Safety", "capabilities": "red-teaming, safety, jailbreak, 
automated-attack, evolutionary-search, llm-safety", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-23"}, {"id": 443, "name": "AJAR", "publisher": "Yipu Dou et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2601.10971", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "safety, red-teaming, jailbreak, mcp, multi-turn, tool-use, llm-safety, adversarial", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-23"}, {"id": 444, "name": "A3", "publisher": "Anthropic Fellows Program; Constellation", "date": "2026", "venue": "Announcement", "url": "https://alignment.anthropic.com/2026/automated-alignment-agent/", "tasks": "—", "topScore": "—", "category": "Safety", "capabilities": "alignment, safety, finetuning, sycophancy, political-neutrality, jailbreak, auto", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-23"}, {"id": 445, "name": "OdysseyBench", "publisher": "Weixuan Wang", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2508.09124", "tasks": "—", "topScore": "—", "category": "Enterprise", "capabilities": "enterprise, long-horizon, office-applications, word, excel, pdf, email, calendar", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-23"}, {"id": 446, "name": "Evaluating Long-Context Reasoning in LLM-Based WebAgents", "publisher": "Andy Chung", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2512.04307", "tasks": "—", "topScore": "—", "category": "Web", "capabilities": "web, long-context, multi-session, irrelevant-trajectory-injection", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-23"}, {"id": 447, "name": "LongCLI-Bench", "publisher": "Yukang Feng", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2602.14337", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "coding, software-engineering, long-horizon, command-line, programming", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-23"}, {"id": 448, "name": "MemoryArena", "publisher": "Zexue He", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2602.16313", "tasks": "—", "topScore": "—", "category": "Web", "capabilities": "memory, long-horizon, multi-session, web-navigation, planning, reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-23"}, {"id": 449, "name": "TRIP-Bench", "publisher": "Yuanzhe Shen", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2602.01675", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "long-horizon, tool-use, travel-planning, multi-tool, constraint-satisfaction, in", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-23"}, {"id": 450, "name": "LifeBench", "publisher": "Zihao Cheng", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2603.03781", "tasks": "—", "topScore": "—", "category": "Memory", "capabilities": "memory, long-horizon, multi-source, personalized, declarative, procedural", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-23"}, {"id": 451, "name": "LMEB", "publisher": "Xinping Zhao", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2603.12572", "tasks": "—", "topScore": "—", "category": "Memory", "capabilities": "memory, embedding, retrieval, long-horizon, episodic, dialogue, semantic, procedural", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-23"}, {"id": 452, "name": "ATANT v1.1", 
"publisher": "Samuel Sameer Tanguturi", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.10981", "tasks": "—", "topScore": "—", "category": "Memory", "capabilities": "memory, continuity-evaluation, long-context, agentic-memory, companion-paper", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-23"}, {"id": 453, "name": "RewardHackingAgents", "publisher": "Yonas Atinafu", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2603.11337", "tasks": "—", "topScore": "—", "category": "ML/Research", "capabilities": "ml-engineering, evaluation-integrity, reward-hacking, train-test-leakage", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-24"}, {"id": 454, "name": "Evaluating LLM Agents on Automated Software Analysis Tasks", "publisher": "Islem Bouzenia", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.11270", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "software-engineering, software-analysis, c-cpp, java, tool-configuration", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-24"}, {"id": 455, "name": "AutomationBench", "publisher": "Daniel Shepard", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.18934", "tasks": "—", "topScore": "—", "category": "Enterprise", "capabilities": "enterprise, workflow-automation, rest-api, cross-application, business-process", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-24"}, {"id": 456, "name": "Frontier-Eng", "publisher": "Yizhe Chi", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.12290", "tasks": "—", "topScore": "—", "category": "General", "capabilities": "engineering, generative-optimization, iterative-design, industrial-simulator", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-24"}, {"id": 457, "name": "Litmus (Re)Agent", "publisher": "Avni Mittal", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.08970", "tasks": "—", "topScore": "—", "category": "General", "capabilities": "multilingual, predictive-evaluation, transfer-learning, evidence-aggregation", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-24"}, {"id": 458, "name": "ProdCodeBench", "publisher": "Smriti Jha", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.01527", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "coding, software-engineering, production-derived, multi-language, harness-design", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-24"}, {"id": 459, "name": "Announcing AutoBench Agentic", "publisher": "Peter Kruger", "date": "2026", "venue": "Announcement", "url": "https://huggingface.co/blog/PeterKruger/autobench-agentic-1", "tasks": "—", "topScore": "—", "category": "Enterprise", "capabilities": "dynamic-generation, un-gameable, virtual-environments, enterprise, react, multi-", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-24"}, {"id": 460, "name": "Introducing o11y-bench", "publisher": "Grafana Labs", "date": "2026", "venue": "Announcement", "url": "https://grafana.com/blog/o11y-bench-open-benchmark-for-observability-agents/", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "observability, tool-use, grafana, mcp, sre, open-source", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-24"}, {"id": 461, "name": "CodeElo", "publisher": "Shanghaoran Quan et al.", "date": "2025", "venue": 
"arXiv", "url": "https://arxiv.org/abs/2501.01257", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "code-generation, competitive-programming, elo-rating, codeforces, reasoning, alg", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-25"}, {"id": 462, "name": "Terminal-Bench", "publisher": "Laude Institute et al. (Stanford x Laude", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2601.11868", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "terminal, cli, os-interaction, system-administration, security, devops, machine-", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-25"}, {"id": 463, "name": "AACR-Bench", "publisher": "L. Zhang et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2601.19494", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "code-review, repository-level, multilingual, automated-code-review, llm-evaluati", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-25"}, {"id": 464, "name": "SWE-Next", "publisher": "Jiarong Liang et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2603.20691", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "software-engineering, code-generation, training-data, swe-bench, fine-tuning, ex", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-25"}, {"id": 465, "name": "CaP-X", "publisher": "Max Fu et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2603.22435", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "robotics, code-generation, embodied-ai, tool-use, sim-to-real, reinforcement-lea", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-25"}, {"id": 466, "name": "A Benchmark for Evaluating Repository-Level Code Agents with", "publisher": "Shuhan Liu et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2603.26337", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "code-generation, software-engineering, repository-level, reasoning, feature-addi", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-25"}, {"id": 467, "name": "ELT-Bench-Verified", "publisher": "Christopher Zanoli et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2603.29399", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "data-engineering, elt, code-generation, tool-use, benchmark-quality, annotation-", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-25"}, {"id": 468, "name": "Evaluating Tool-Using Language Agents", "publisher": "Bhaskar Gurram et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.16706", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "tool-use, judge-reliability, error-propagation, runtime-mitigation, function-cal", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-25"}, {"id": 469, "name": "Precise Debugging Benchmark", "publisher": "Wang Bill Zhu et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.17338", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "debugging, code-generation, llm, precision, recall, software-engineering, tool-u", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-25"}, {"id": 470, "name": "WildToolBench", "publisher": "USTC / Tencent", "date": "2025", "venue": "arXiv (ICLR 2026)", "url": 
"https://arxiv.org/abs/2604.06185", "tasks": "1,024 tasks / 256 scenarios / 1,600+ real APIs", "topScore": "<15% session accuracy (best of 57 models)", "category": "Tool Use", "capabilities": "multi-turn tool orchestration, compositional tool-use, implicit intent inference, instruction transition, real-world APIs", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-27"}, {"id": 471, "name": "FinTrace", "publisher": "Stevens Institute of Technology / The FinAI / Duke University", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.10015", "tasks": "800 expert-annotated trajectories; 34 financial task categories", "topScore": "—", "category": "Tool Use", "capabilities": "financial tool-calling, long-horizon reasoning, trajectory-level evaluation, tool selection, process quality, preference learning", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-27"}, {"id": 472, "name": "GeoAgentBench", "publisher": "Bo Yu et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.13888", "tasks": "53 spatial analysis tasks; 117 atomic GIS tools; 6 domains", "topScore": "Gemini-2.5-Flash (best TAO/PEA)", "category": "Tool Use", "capabilities": "GIS tool use, spatial analysis, dynamic execution, parameter inference, map visualization, multimodal evaluation", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-27"}, {"id": 473, "name": "FinMCP-Bench", "publisher": "Alibaba Cloud / Qwen DianJin Team / YINGMI / Soochow University", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2603.24943", "tasks": "613 samples; 65 financial MCP tools; 33 sub-scenarios", "topScore": "Qwen3-235B-A22B-Thinking", "category": "Tool Use", "capabilities": "financial MCP tool selection, multi-tool dependency planning, multi-turn financial dialogue, MCP protocol invocation", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-27"}, {"id": 474, "name": "ComplexFuncBench", "publisher": "Tsinghua University (THUDM / KE Lab)", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2501.10132", "tasks": "1,000 samples; 43 real-time APIs; 5 travel domains (Booking.com)", "topScore": "—", "category": "Tool Use", "capabilities": "multi-step function calling, user-constraint adherence, implicit parameter reasoning, long-parameter handling, long-context tool use (128k)", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-27"}, {"id": 475, "name": "SafeToolBench", "publisher": "Beijing Institute of Technology / CUHK / Beihang / Baidu", "date": "2025", "venue": "EMNLP 2025 (Findings)", "url": "https://arxiv.org/abs/2509.07315", "tasks": "1,200 adversarial instructions; 16 everyday domains; 4 risk types", "topScore": "GPT-4o (best, but gaps remain)", "category": "Tool Use", "capabilities": "prospective tool-use safety, pre-execution risk detection, adversarial instruction analysis, tool-instruction joint safety", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-27"}, {"id": 476, "name": "GTA-2", "publisher": "Shanghai Jiao Tong University / Shanghai AI Lab / NTU / University of Sydney", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.15715", "tasks": "GTA-Atomic + GTA-Workflow (6 productivity domains); 37 tools", "topScore": "Manus (best on GTA-Workflow)", "category": "Tool Use", "capabilities": "long-horizon multi-tool orchestration, deliverable creation, data analysis, creative design, planning, web retrieval, open-ended workflow 
evaluation", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-27"}, {"id": 477, "name": "VoiceAgentBench", "publisher": "Ola Krutrim", "date": "2024", "venue": "arXiv", "url": "https://arxiv.org/abs/2510.07978", "tasks": "6,000+ spoken queries; 6 eval categories; English + 6 Indic languages", "topScore": "—", "category": "Tool Use", "capabilities": "voice-based tool calling, parallel/sequential tool invocation, multi-turn dialogue, safety/refusal, multilingual agentic tasks", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-27"}, {"id": 478, "name": "GeoBenchX", "publisher": "World Bank / J.P. Morgan Chase", "date": "2025", "venue": "arXiv (ACM SIGSPATIAL GeoGenAgent 2025)", "url": "https://arxiv.org/abs/2503.18129", "tasks": "202 geospatial tasks; 23 GIS functions; 4 complexity groups; solvable/unsolvable split", "topScore": "o4-mini (~90% unsolvable detection)", "category": "Tool Use", "capabilities": "multi-step GIS tool-calling, spatial analysis, data join/filter, visualization, task feasibility judgment, hallucination avoidance", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-27"}, {"id": 479, "name": "AI Idea Bench 2025", "publisher": "Yansheng Qiu et al.", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2504.14191", "tasks": "—", "topScore": "—", "category": "ML/Research", "capabilities": "research, planning, reasoning, idea-generation, creativity", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-29"}, {"id": 480, "name": "BioProBench", "publisher": "Liu et al.", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2505.07889", "tasks": "—", "topScore": "—", "category": "Reasoning", "capabilities": "reasoning, biology, scientific-reasoning, procedural-reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-29"}, {"id": 481, "name": "BeyondBench", "publisher": "Unknown et al.", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2509.24210", "tasks": "—", "topScore": "—", "category": "Reasoning", "capabilities": "reasoning, contamination, algorithmic-reasoning", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-29"}, {"id": 482, "name": "ProAgentBench", "publisher": "Unknown et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2602.04482", "tasks": "—", "topScore": "—", "category": "Memory", "capabilities": "proactive-assistance, planning, memory, real-world", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-29"}, {"id": 483, "name": "HWE-Bench", "publisher": "Unknown et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.14709", "tasks": "—", "topScore": "—", "category": "Coding", "capabilities": "hardware, debugging, tool-use, code-generation, repository-level", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-29"}, {"id": 484, "name": "Do Agents Dream of Root Shells? 
Partial-Credit Evaluation of LLM Agents in Capture The Flag Challenges", "publisher": "Ali Al-Kaswan et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.19354", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "cybersecurity, ctf, tool-use, reasoning, planning", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-29"}, {"id": 485, "name": "AgentSearchBench", "publisher": "Bin Wu et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.22436", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "agent-discovery, retrieval, tool-use, multi-agent", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-29"}, {"id": 486, "name": "Odysseys", "publisher": "Unknown et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.24964", "tasks": "—", "topScore": "—", "category": "Web", "capabilities": "web-navigation, long-horizon, multi-site, real-world", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-29"}, {"id": 487, "name": "RiskWebWorld", "publisher": "Unknown et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.13531", "tasks": "—", "topScore": "—", "category": "Web", "capabilities": "web-navigation, gui, e-commerce, risk-management, real-world", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-29"}, {"id": 488, "name": "PRL-Bench", "publisher": "Unknown et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.15411", "tasks": "—", "topScore": "—", "category": "ML/Research", "capabilities": "reasoning, research, planning, physics, scientific-reasoning, long-horizon", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-29"}, {"id": 489, "name": "GAIA-v2-LILT", "publisher": "Yunsu Kim et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.24929", "tasks": "—", "topScore": "—", "category": "General", "capabilities": "multilingual, agent, general-assistant, gaia, adaptation", "score": null, "citations": null, "tier": null, "importedAt": "2026-04-29"}, {"id": 490, "name": "PyMARLzoo+ (Extended MARL Benchmarking)", "publisher": "University of Piraeus (AI Lab DS)", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2502.04773", "tasks": "18 MARL algorithms × 83 cooperative tasks, 9 env families", "topScore": "Algorithm-dependent; SMAC rankings don't generalize", "category": "Multi-Agent", "capabilities": "multi-agent, cooperative-rl, algorithm-benchmarking, marl, image-observations", "score": null, "citations": null, "tier": null, "importedAt": "2026-05-01"}, {"id": 491, "name": "POGEMA", "publisher": "AIRI Institute / Cognitive-AI-Systems", "date": "2024", "venue": "ICLR 2025", "url": "https://arxiv.org/abs/2407.14931", "tasks": "Grid-world MAPF + LMAPF cooperative pathfinding scenarios", "topScore": "Algorithm-dependent", "category": "Multi-Agent", "capabilities": "multi-agent, navigation, pathfinding, cooperative, collision-avoidance", "score": null, "citations": null, "tier": null, "importedAt": "2026-05-01"}, {"id": 492, "name": "Multi-Agent Craftax", "publisher": "Oxford / Foerster Lab", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2511.04904", "tasks": "Open-ended 2D survival MARL; 1B env-interaction budget; homogeneous + role-specialized variants", "topScore": "—", "category": "Multi-Agent", "capabilities": "multi-agent, cooperative-rl, long-horizon, exploration, credit-assignment, role-specialization", "score": null, "citations": null,
"tier": null, "importedAt": "2026-05-01"}, {"id": 493, "name": "Sequential Industrial Control MARL Benchmark", "publisher": "Ruhr University Bochum (INI)", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2510.20408", "tasks": "Industrial sorting/production control; specialized vs. centralized RL agents", "topScore": "Centralized outperforms on most tasks", "category": "Multi-Agent", "capabilities": "multi-agent, industrial-control, marl, specialization, action-masking", "score": null, "citations": null, "tier": null, "importedAt": "2026-05-01"}, {"id": 494, "name": "MEAL", "publisher": "TU Eindhoven / U Edinburgh / U Liverpool", "date": "2025", "venue": "ICML 2025 (ContinualFomo Workshop)", "url": "https://arxiv.org/abs/2506.14990", "tasks": "4 cooperative MARL environments with sequential task regimes; continual learning evaluation", "topScore": "—", "category": "Multi-Agent", "capabilities": "multi-agent, continual-learning, marl, catastrophic-forgetting, knowledge-transfer", "score": null, "citations": null, "tier": null, "importedAt": "2026-05-01"}, {"id": 495, "name": "Collab-Overcooked", "publisher": "Beijing Univ. of Posts & Telecomm. (BUPT)", "date": "2025", "venue": "EMNLP 2025", "url": "https://arxiv.org/abs/2502.20073", "tasks": "30 collaborative cooking tasks in Overcooked-AI; 13 LLMs; process-oriented collaboration metrics", "topScore": "Strong goal interpretation; low active collaboration across all LLMs", "category": "Multi-Agent", "capabilities": "multi-agent, llm-collaboration, natural-language, planning, task-adaptation", "score": null, "citations": null, "tier": null, "importedAt": "2026-05-01"}, {"id": 496, "name": "DEBATE", "publisher": "U. Wisconsin-Madison / collaborators", "date": "2025", "venue": "arXiv", "url": "https://arxiv.org/abs/2510.25110", "tasks": "30,707 messages from 2,832 participants, 708 groups, 107 topics; opinion dynamics simulation", "topScore": "Human-validated reference for stance alignment", "category": "Multi-Agent", "capabilities": "multi-agent, opinion-dynamics, role-playing, social-simulation, stance-alignment", "score": null, "citations": null, "tier": null, "importedAt": "2026-05-01"}, {"id": 497, "name": "M3MAD-Bench", "publisher": "Ao Li et al.", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2601.02854", "tasks": "13 datasets across 5 domains (text + multimodal); 6 MAD methods × 9 base models", "topScore": "MAD helps reasoning; marginal on factual recall", "category": "Multi-Agent", "capabilities": "multi-agent, debate, multimodal, reasoning, efficiency-evaluation", "score": null, "citations": null, "tier": null, "importedAt": "2026-05-01"}, {"id": 498, "name": "SocialGrid", "publisher": "TU Darmstadt (Shindo, Lin, Helff, Schramowski, Kersting)", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2604.16022", "tasks": "Procedurally generated gridworld (Among Us-inspired); configurable map/room/agent params; Crewmate + Impostor roles; spatial navigation + task completion + deception detection", "topScore": "<60% task completion (GPT-OSS-120B); Impostor detection near-chance across all model scales", "category": "Multi-Agent", "capabilities": "multi-agent, planning, spatial-reasoning, social-reasoning, adversarial, deception-detection, theory-of-mind, embodied", "score": null, "citations": null, "tier": null, "importedAt": "2026-05-03"}, {"id": 499, "name": "MarketBench", "publisher": "Boston University / MIT IDE (Fradkin, Krishnan)", "date": "2026", "venue": "arXiv", "url": 
"https://arxiv.org/abs/2604.23897", "tasks": "93 SWE-bench Lite tasks × 6 frontier LLMs; Calibration (pre-task confidence elicitation) + Auction (procurement simulation) families; metacognition evaluation", "topScore": "Best Brier score 0.1693 (w/ self-history intervention); all frontier models substantially miscalibrated", "category": "Reasoning", "capabilities": "reasoning, metacognition, self-assessment, calibration, cost-estimation, market-participation, enterprise", "score": null, "citations": null, "tier": null, "importedAt": "2026-05-03"}, {"id": 500, "name": "PredictionMarketBench", "publisher": "Avi Arora & Ritesh Malpani (Oddpool / Benchspan, YC S26)", "date": "2026", "venue": "arXiv", "url": "https://arxiv.org/abs/2602.00133", "tasks": "SWE-bench-style backtesting of trading agents on prediction markets; financial tool-use, order management, multi-ticker portfolio management within binary event episodes; uses historical market resolution as ground truth", "topScore": "Most agents fail to beat no-information baselines on prediction markets", "category": "Enterprise", "capabilities": "reasoning, financial-tool-use, market-state-reasoning, probabilistic-forecasting, order-management, fee-aware-trading, portfolio-management, calibration", "score": null, "citations": null, "tier": null, "importedAt": "2026-05-03"}, {"id": 501, "name": "Gaia2 and ARE", "publisher": "Meta FAIR / Hugging Face (Romain Froger", "date": "2025", "venue": "Announcement", "url": "https://huggingface.co/blog/gaia2", "tasks": "—", "topScore": "—", "category": "Tool Use", "capabilities": "planning, reasoning, multi-agent, tool-use, dynamic-environments, temporal-reaso", "score": null, "citations": null, "tier": null, "importedAt": "2026-05-03"}, {"id": 502, "name": "BaxBench", "publisher": "ETH Zurich SRI Lab / LogicStar.ai", "date": "2025", "venue": "ICML 2025", "url": "https://arxiv.org/abs/2502.11844", "tasks": "392 tasks; 28 backend scenarios × 14 frameworks × 6 languages (Python, JS, Go, PHP, Ruby, Rust); generate complete deployable REST APIs from OpenAPI spec + NL description; automated exploit execution across 13 CWE vulnerability categories", "topScore": "~52% sec_pass@1 (GPT-4o)", "category": "Code Generation", "capabilities": "code-generation, security, api-synthesis, vulnerability-resistance, backend-development, multi-file-generation", "score": null, "citations": null, "tier": null, "importedAt": "2026-05-05"}, {"id": 503, "name": "CR-Bench", "publisher": "Nutanix, Inc.", "date": "2026", "venue": "preprint", "url": "https://arxiv.org/abs/2603.11078", "tasks": "584 instances from SWE-Bench; blind PR audit — agent reviews diff and identifies hidden bugs without ground truth; labeled by category (PREVENTABLE), impact, and severity (Low/Medium/High)", "topScore": "32.76% recall / SNR 1.95 (Reflexion, GPT-5.2); single-shot achieves SNR 5.11 but lower recall", "category": "Code Generation", "capabilities": "code-review, defect-detection, reasoning, code-comprehension, signal-prioritization, natural-language-generation", "score": null, "citations": null, "tier": null, "importedAt": "2026-05-05"}, {"id": 504, "name": "ResearchEnvBench", "publisher": "Fudan University (Yubang Wang et al.)", "date": "2026", "venue": "preprint", "url": "https://arxiv.org/abs/2603.06739", "tasks": "44 pinned research repos (post-Jan 2024); 6-stage pyramid verification (C0: static import → C1: CPU exec → C2: CUDA alignment → C3: single-GPU → C4: multi-GPU/DDP → C5: hallucination audit); 2,858 C0 checkpoints", "topScore": 
"Substantial gap at C2–C5; dominated by dependency resolution failures", "category": "Code Generation", "capabilities": "dependency-resolution, environment-synthesis, debugging, tool-use, research, cuda-compatibility, distributed-configuration, hallucination-detection", "score": null, "citations": null, "tier": null, "importedAt": "2026-05-05"}, {"id": 505, "name": "AgentIF", "publisher": "KEG Lab, Tsinghua University + Zhipu AI", "date": "2025", "venue": "NeurIPS 2025 (Spotlight)", "url": "https://arxiv.org/abs/2505.16944", "tasks": "707 human-annotated instructions across 50 real-world agentic task categories; 8,415 annotated constraints; avg 11.9 constraints/instruction; avg 1,723 words/instruction; code + LLM + hybrid evaluation", "topScore": "o1-mini: 59.8% CSR / 27.2% ISR; no model exceeds 30% ISR", "category": "Reasoning", "capabilities": "instruction-following, reasoning, tool-use, function-calling, planning, constraint-satisfaction, long-instruction-comprehension", "score": null, "citations": null, "tier": null, "importedAt": "2026-05-05"}], "categories": [{"name": "Coding", "count": 47, "color": "#60a5fa"}, {"name": "Tool Use", "count": 38, "color": "#fbbf24"}, {"name": "Web", "count": 31, "color": "#34d399"}, {"name": "OS", "count": 25, "color": "#f472b6"}, {"name": "Enterprise", "count": 21, "color": "#f97316"}, {"name": "Reasoning", "count": 25, "color": "#22d3ee"}, {"name": "ML/Research", "count": 17, "color": "#a3e635"}, {"name": "General", "count": 20, "color": "#a78bfa"}, {"name": "Security", "count": 17, "color": "#ef4444"}, {"name": "Multi-Agent", "count": 11, "color": "#c084fc"}, {"name": "Memory", "count": 9, "color": "#93c5fd"}, {"name": "Framework", "count": 10, "color": "#9ba1b5"}, {"name": "Safety", "count": 17, "color": "#fca5a5"}, {"name": "Data Science", "count": 9, "color": "#6ee7b7"}, {"name": "Customer Service", "count": 4, "color": "#d4d48a"}, {"name": "Medical", "count": 3, "color": "#67e8f9"}, {"name": "E-commerce", "count": 2, "color": "#fb923c"}], "category_counts": {"Coding": 73, "Web": 42, "Tool Use": 66, "General": 28, "OS": 27, "Customer Service": 5, "Medical": 4, "Reasoning": 36, "ML/Research": 20, "Enterprise": 30, "Memory": 13, "Security": 21, "Safety": 31, "Data Science": 10, "Multi-Agent": 31, "Framework": 11, "E-commerce": 2, "Computer Use": 1, "Science": 2, "Research": 1, "Agentic": 1, "Cybersecurity": 2, "AI Control": 2, "Trace Evaluation": 1, "Code Generation": 3}, "capabilities": [{"name": "Code Gen/SE", "benchmarks": 40, "level": "HIGH"}, {"name": "Tool Use", "benchmarks": 28, "level": "HIGH"}, {"name": "Function Calling", "benchmarks": 20, "level": "HIGH"}, {"name": "Web Navigation", "benchmarks": 16, "level": "HIGH"}, {"name": "Long Horizon", "benchmarks": 18, "level": "HIGH"}, {"name": "OS/GUI", "benchmarks": 17, "level": "HIGH"}, {"name": "Safety", "benchmarks": 13, "level": "HIGH"}, {"name": "ML/Research", "benchmarks": 14, "level": "HIGH"}, {"name": "Multimodal", "benchmarks": 11, "level": "HIGH"}, {"name": "Reasoning", "benchmarks": 12, "level": "HIGH"}, {"name": "Enterprise", "benchmarks": 11, "level": "HIGH"}, {"name": "Multi-Agent", "benchmarks": 10, "level": "HIGH"}, {"name": "Multi-turn", "benchmarks": 10, "level": "MEDIUM"}, {"name": "Scientific", "benchmarks": 7, "level": "MEDIUM"}, {"name": "Memory", "benchmarks": 7, "level": "MEDIUM"}, {"name": "Reliability", "benchmarks": 5, "level": "MEDIUM"}, {"name": "Multilingual", "benchmarks": 3, "level": "LOW"}, {"name": "Cost Eval", "benchmarks": 3, "level": "LOW"}, {"name": 
"Real-time", "benchmarks": 0, "level": "GAP"}, {"name": "Agent-to-Agent", "benchmarks": 0, "level": "GAP"}, {"name": "Creative", "benchmarks": 0, "level": "GAP"}, {"name": "Physical/HW", "benchmarks": 0, "level": "GAP"}], "capability_keys": [{"key": "cg", "label": "Code Gen"}, {"key": "bf", "label": "Bug Fix"}, {"key": "wn", "label": "Web Nav"}, {"key": "tu", "label": "Tool Use"}, {"key": "fc", "label": "Func Call"}, {"key": "os", "label": "OS/GUI"}, {"key": "r", "label": "Reasoning"}, {"key": "mt", "label": "Multi-turn"}, {"key": "mm", "label": "Multimodal"}, {"key": "sc", "label": "Scientific"}, {"key": "en", "label": "Enterprise"}, {"key": "sa", "label": "Safety"}, {"key": "ce", "label": "Cost Eval"}, {"key": "lh", "label": "Long Horizon"}, {"key": "re", "label": "Reliability"}, {"key": "me", "label": "Memory"}, {"key": "ma", "label": "Multi-Agent"}], "capability_matrix": [{"name": "SWE-bench", "caps": {"cg": 2, "bf": 2, "r": 1}}, {"name": "AgentBench", "caps": {"cg": 1, "wn": 2, "tu": 2, "os": 2, "r": 2}}, {"name": "Mind2Web", "caps": {"wn": 2, "r": 1}}, {"name": "WebArena", "caps": {"wn": 2, "tu": 1, "r": 1, "mt": 2, "lh": 2}}, {"name": "GAIA", "caps": {"cg": 1, "wn": 1, "tu": 2, "r": 2, "mm": 1}}, {"name": "ToolLLM", "caps": {"tu": 2, "fc": 2}}, {"name": "OSWorld", "caps": {"wn": 1, "tu": 1, "os": 2, "r": 1, "mm": 2, "lh": 1}}, {"name": "tau-bench", "caps": {"tu": 2, "r": 1, "mt": 2, "en": 2, "re": 2}}, {"name": "BFCL", "caps": {"fc": 2}}, {"name": "VisualWebArena", "caps": {"wn": 2, "tu": 1, "r": 1, "mt": 2, "mm": 2, "lh": 2}}, {"name": "TheAgentCompany", "caps": {"cg": 1, "wn": 2, "tu": 2, "os": 1, "r": 1, "mt": 2, "en": 2, "lh": 2}}, {"name": "MLE-bench", "caps": {"cg": 2, "tu": 1, "r": 2, "sc": 2, "lh": 2}}, {"name": "WorkArena", "caps": {"wn": 2, "tu": 1, "r": 1, "mt": 2, "en": 2}}, {"name": "MCP-Atlas", "caps": {"tu": 2, "fc": 2, "r": 1, "mt": 2, "lh": 1}}, {"name": "Toolathlon", "caps": {"tu": 2, "fc": 2, "r": 1, "mt": 2, "lh": 2}}, {"name": "PaperBench", "caps": {"cg": 2, "tu": 1, "r": 2, "sc": 2, "lh": 2}}, {"name": "SWE-Lancer", "caps": {"cg": 2, "bf": 2, "r": 1, "en": 2, "ce": 2, "lh": 1}}, {"name": "HCAST", "caps": {"cg": 2, "tu": 1, "r": 2, "sc": 1, "lh": 2}}, {"name": "RE-Bench", "caps": {"cg": 1, "tu": 1, "r": 2, "sc": 2, "lh": 2}}, {"name": "HAL", "caps": {"cg": 2, "bf": 2, "wn": 2, "tu": 2, "os": 2, "r": 1, "sc": 2, "en": 2, "ce": 2, "lh": 2, "re": 2}}, {"name": "FeatureBench", "caps": {"cg": 2, "r": 2, "lh": 2}}, {"name": "APEX-Agents", "caps": {"cg": 1, "wn": 2, "tu": 2, "r": 2, "mt": 2, "en": 2, "lh": 2}}, {"name": "BrowseComp", "caps": {"wn": 2, "r": 2, "lh": 2}}, {"name": "Mind2Web 2", "caps": {"wn": 2, "tu": 1, "r": 2, "mt": 2, "lh": 2}}, {"name": "AgentHarm", "caps": {"wn": 1, "tu": 2, "mt": 2, "sa": 2}}, {"name": "TRAIL", "caps": {"r": 2, "sa": 2}}, {"name": "tau2-bench", "caps": {"tu": 2, "r": 1, "mt": 2, "en": 2, "re": 2}}, {"name": "SciCode", "caps": {"cg": 2, "r": 2, "sc": 2}}, {"name": "HLE", "caps": {"r": 2, "sc": 1}}, {"name": "PropensityBench", "caps": {"mt": 2, "sa": 2}}, {"name": "KoCo-Bench", "caps": {"cg": 2, "r": 1}}, {"name": "FeatBench", "caps": {"cg": 2, "r": 2, "lh": 2}}, {"name": "A.S.E", "caps": {"cg": 1, "bf": 2, "r": 1, "sa": 2}}, {"name": "ASTRA", "caps": {"cg": 2, "re": 2}}, {"name": "MCPVerse", "caps": {"tu": 2, "fc": 2, "r": 1, "lh": 1}}, {"name": "ML-Tool-Bench", "caps": {"tu": 2, "r": 1, "sc": 2, "lh": 2}}, {"name": "MCP-AgentBench", "caps": {"tu": 2, "fc": 2, "r": 1, "ce": 2}}, {"name": "MCPEval", "caps": {"tu": 2, "fc": 2, "r": 
1}}, {"name": "MCP-RADAR", "caps": {"tu": 2, "fc": 2, "r": 1, "mt": 2}}, {"name": "MCPToolBench++", "caps": {"tu": 2, "fc": 2, "r": 1}}, {"name": "FuncBenchGen", "caps": {"fc": 2, "r": 2, "mt": 1}}, {"name": "TPS-Bench", "caps": {"tu": 2, "fc": 2, "r": 2, "lh": 2}}, {"name": "IFEval-FC", "caps": {"fc": 2, "r": 2, "re": 2}}, {"name": "HammerBench", "caps": {"fc": 2, "r": 1, "mt": 2}}, {"name": "ITC", "caps": {"tu": 2, "fc": 2, "r": 1}}, {"name": "ASTRA-bench", "caps": {"tu": 2, "fc": 2, "r": 2, "mt": 2, "me": 1}}, {"name": "Tool-Genesis", "caps": {"cg": 2, "tu": 2, "fc": 2, "r": 1}}, {"name": "Agent-Diff", "caps": {"cg": 1, "tu": 2, "r": 1, "en": 2}}, {"name": "SlopCodeBench", "caps": {"cg": 2, "bf": 1, "r": 1, "lh": 2, "re": 2}}, {"name": "CodeClash", "caps": {"cg": 2, "r": 2, "lh": 2}}, {"name": "PostTrainBench", "caps": {"cg": 2, "tu": 1, "r": 2, "sc": 1, "lh": 2, "ce": 1}}, {"name": "LemmaBench", "caps": {"r": 2, "sc": 2}}], "citation_rankings": [{"rank": 1, "name": "SWE-bench", "year": 2023, "citations": "~750", "score": 95, "tier": "Tier 1", "caps": "Software engineering, bug fixing, code generation"}, {"rank": 2, "name": "AgentBench", "year": 2023, "citations": "~620", "score": 92, "tier": "Tier 1", "caps": "Multi-environment (8 envs), reasoning, tool use, OS"}, {"rank": 3, "name": "Mind2Web", "year": 2023, "citations": "~758", "score": 90, "tier": "Tier 1", "caps": "Generalist web agent, cross-domain navigation"}, {"rank": 4, "name": "WebArena", "year": 2023, "citations": "~520", "score": 90, "tier": "Tier 1", "caps": "Realistic web navigation, long-horizon tasks"}, {"rank": 5, "name": "GAIA", "year": 2023, "citations": "~390", "score": 88, "tier": "Tier 1", "caps": "General AI assistant, reasoning, multimodal, tool use"}, {"rank": 6, "name": "ToolLLM/ToolBench", "year": 2023, "citations": "~580", "score": 87, "tier": "Tier 1", "caps": "Tool use, 16K+ real APIs, multi-tool planning"}, {"rank": 7, "name": "OSWorld", "year": 2024, "citations": "~280", "score": 86, "tier": "Tier 1", "caps": "OS interaction, GUI grounding, multi-app workflows"}, {"rank": 8, "name": "tau-bench", "year": 2024, "citations": "~120", "score": 82, "tier": "Tier 1", "caps": "Tool-agent-user interaction, customer service, reliability"}, {"rank": 9, "name": "BFCL", "year": 2024, "citations": "~150", "score": 81, "tier": "Tier 1", "caps": "Function calling, tool use, API interaction"}, {"rank": 10, "name": "VisualWebArena", "year": 2024, "citations": "~180", "score": 80, "tier": "Tier 1", "caps": "Multimodal web navigation, visual understanding"}, {"rank": 11, "name": "TheAgentCompany", "year": 2024, "citations": "~70", "score": 78, "tier": "Tier 2", "caps": "Enterprise tasks, professional work, collaboration"}, {"rank": 12, "name": "MLE-bench", "year": 2024, "citations": "~95", "score": 77, "tier": "Tier 2", "caps": "ML engineering, Kaggle competitions"}, {"rank": 13, "name": "InterCode", "year": 2023, "citations": "~150", "score": 76, "tier": "Tier 2", "caps": "Interactive coding, execution feedback"}, {"rank": 14, "name": "WorkArena", "year": 2024, "citations": "~80", "score": 75, "tier": "Tier 2", "caps": "Enterprise software, knowledge work"}, {"rank": 15, "name": "AppWorld", "year": 2024, "citations": "~65", "score": 74, "tier": "Tier 2", "caps": "App interaction, 457 APIs"}, {"rank": 16, "name": "RE-Bench", "year": 2024, "citations": "~60", "score": 73, "tier": "Tier 2", "caps": "Research engineering, autonomous R&D"}, {"rank": 17, "name": "WebShop", "year": 2022, "citations": "~450", "score": 72, "tier": 
"Tier 2", "caps": "E-commerce web navigation"}, {"rank": 18, "name": "ScienceAgentBench", "year": 2024, "citations": "~45", "score": 71, "tier": "Tier 2", "caps": "Scientific discovery, data analysis"}, {"rank": 19, "name": "PaperBench", "year": 2025, "citations": "~55", "score": 70, "tier": "Tier 2", "caps": "Research replication, ICML papers"}, {"rank": 20, "name": "LiveCodeBench", "year": 2024, "citations": "~120", "score": 69, "tier": "Tier 2", "caps": "Contamination-free code eval"}, {"rank": 21, "name": "SciCode", "year": 2024, "citations": "~55", "score": 68, "tier": "Tier 2", "caps": "Scientific coding, 16 domains"}, {"rank": 22, "name": "SWE-Lancer", "year": 2025, "citations": "~40", "score": 67, "tier": "Tier 2", "caps": "Freelance SE, real Upwork tasks"}, {"rank": 23, "name": "HAL", "year": 2025, "citations": "~30", "score": 66, "tier": "Tier 2", "caps": "Unified agent evaluation infrastructure"}, {"rank": 24, "name": "TRAIL", "year": 2025, "citations": "~10", "score": 63, "tier": "Tier 3", "caps": "Agentic debugging, trace analysis"}, {"rank": 25, "name": "tau2-bench", "year": 2025, "citations": "~15", "score": 62, "tier": "Tier 3", "caps": "Customer service, telecom domain"}, {"rank": 26, "name": "BrowserGym", "year": 2024, "citations": "~40", "score": 61, "tier": "Tier 3", "caps": "Unified web agent eval framework"}, {"rank": 27, "name": "HCAST", "year": 2025, "citations": "~25", "score": 60, "tier": "Tier 3", "caps": "ML/security/SWE tasks, human-calibrated"}, {"rank": 28, "name": "CORE-Bench", "year": 2024, "citations": "~20", "score": 59, "tier": "Tier 3", "caps": "Computational reproducibility"}, {"rank": 29, "name": "Spider 2.0", "year": 2024, "citations": "~40", "score": 58, "tier": "Tier 3", "caps": "Enterprise text-to-SQL"}, {"rank": 30, "name": "MultiAgentBench", "year": 2025, "citations": "~20", "score": 57, "tier": "Tier 3", "caps": "Multi-agent coordination"}, {"rank": 31, "name": "HLE", "year": 2025, "citations": "~45", "score": 56, "tier": "Tier 3", "caps": "Expert knowledge, frontier capability"}, {"rank": 32, "name": "Mind2Web 2", "year": 2025, "citations": "~20", "score": 55, "tier": "Tier 3", "caps": "Long-horizon web search"}, {"rank": 33, "name": "APEX-Agents", "year": 2026, "citations": "~5", "score": 54, "tier": "Tier 3", "caps": "Professional services"}, {"rank": 34, "name": "FeatureBench", "year": 2026, "citations": "~8", "score": 53, "tier": "Tier 3", "caps": "Feature-level coding"}, {"rank": 35, "name": "AgentHarm", "year": 2024, "citations": "~25", "score": 52, "tier": "Tier 3", "caps": "Agent safety, jailbreak"}, {"rank": 36, "name": "MCPAgentBench", "year": 2025, "citations": "~5", "score": 51, "tier": "Tier 3", "caps": "MCP tool use evaluation"}, {"rank": 37, "name": "LiveAgentBench", "year": 2026, "citations": "~3", "score": 50, "tier": "Tier 3", "caps": "Real-world multi-modal agent"}, {"rank": 38, "name": "SWE-bench-Live", "year": 2025, "citations": "~40", "score": 53, "tier": "Tier 3", "caps": "Contamination-free coding"}], "gap_analysis": {"zero": [{"title": "Real-Time / Streaming Agents", "desc": "No benchmark evaluates agents on real-time data streams (market data, IoT, live dashboards). Critical for trading, monitoring, incident response.", "priority": "MEDIUM-HIGH"}, {"title": "Agent-to-Agent Communication", "desc": "No benchmark evaluates inter-agent communication quality, protocol adherence, or conflict resolution. 
Only AI-NativeBench touches A2A.", "priority": "MEDIUM"}, {"title": "Creative / Open-Ended Tasks", "desc": "No systematic evaluation of agents on tasks with no single correct answer (design, writing, brainstorming).", "priority": "LOW-MEDIUM"}, {"title": "Hardware / Physical Interaction", "desc": "Every benchmark in the registry is software-only — none evaluate agents controlling physical hardware or robots.", "priority": "LOW"}, {"title": "Production Monitoring Gap", "desc": "No benchmark measures how benchmark performance translates to real-world production reliability.", "priority": "MEDIUM-HIGH"}, {"title": "Cross-Modal Transfer", "desc": "No benchmark tests whether coding skill transfers to web navigation or vice versa.", "priority": "MEDIUM"}], "under": [{"title": "Multi-Agent Collaboration", "desc": "Only 3 benchmarks (MultiAgentBench, CooperBench, MedAgentBoard) despite rapid deployment of multi-agent systems.", "count": 3, "priority": "CRITICAL"}, {"title": "Agent Reliability", "desc": "Only tau-bench (pass^k) and HAL measure consistency. Most benchmarks report single-run scores.", "count": 2, "priority": "CRITICAL"}, {"title": "Cost-Efficiency Evaluation", "desc": "Only HAL systematically tracks cost. No Pareto frontier analysis of accuracy vs. cost vs. latency.", "count": 1, "priority": "HIGH"}, {"title": "Agent Memory (Long-Horizon)", "desc": "MEMTRACK, Context-Bench, Mem-Gallery, tau-Knowledge, MemoryAgentBench, Evo-Memory. Growing but nothing tests days/weeks of memory.", "count": 6, "priority": "HIGH"}, {"title": "Multilingual Agents", "desc": "BLUR, SEA-HELM, ABC-Bench, AILuminate, ITC (29 languages tool calling). Still <5% of benchmarks cover non-English agents.", "count": 5, "priority": "MEDIUM"}, {"title": "Safety & Alignment", "desc": "AgentHarm, TRAIL, PropensityBench, HELM Safety, AILuminate, AgentDojo, AgentDyn. Prompt injection security now better covered but unintended side-effects still under-evaluated.", "count": 7, "priority": "MEDIUM"}], "over": [{"title": "Code Generation / SE", "desc": "40 benchmarks with significant overlap, especially 12+ SWE-bench variants plus DevOps-Gym, SWE-CI, ProjDevBench, SlopCodeBench (quality degradation), CodeClash (competitive arenas) extending into full lifecycle.", "count": 40, "priority": null}, {"title": "Web Navigation", "desc": "16 benchmarks spanning simple to complex. Deep research search adds DeepSearchQA and BrowseComp-V3.", "count": 16, "priority": null}, {"title": "Tool Use / Function Calling", "desc": "34 benchmarks, rapidly growing with MCP and multilingual function calling (ITC, TPS-Bench, FuncBenchGen). Well-covered but lacking cross-language reliability testing.", "count": 34, "priority": null}]}, "trends": [{"title": "Deep Research Agent Explosion", "desc": "7 new benchmarks (DRACO, DeepResearch Bench, Dr. Bench, IDRBench, MMDeepResearch-Bench, ReportBench, DeepSearchQA). Entirely new evaluation category testing multi-step research, citation quality, and report generation.", "color": "var(--green)"}, {"title": "SWE-bench Family Explosion", "desc": "12+ variants of SWE-bench now exist: Verified, Multimodal, Live, Windows, Pro, +, smith, rebench, EVO, CI. Core formulation dominates but risks monoculture.", "color": "var(--accent)"}, {"title": "MCP Benchmark Explosion", "desc": "12+ dedicated MCP benchmarks: MCPVerse, MCP-AgentBench, MCPEval, MCP-RADAR, MCPToolBench++, MCP-Atlas, MCP-Bench, MCPMark, MCPWorld, AI-NativeBench, CUBE Standard. 
Fastest growing category.", "color": "var(--yellow)"}, {"title": "Mobile/Desktop GUI Surge", "desc": "11+ new GUI benchmarks: WorldGUI, AmbiBench, MemGUI-Bench, ProBench, A3, MobileWorld, FineState-Bench, MMBench-GUI, MAS-Bench, CUB (10.4% SOTA), UI-CUBE (226 enterprise tasks).", "color": "var(--pink)"}, {"title": "Multi-Agent Coordination Cluster", "desc": "9+ benchmarks testing agent collaboration: COMMA, OmniEAR, Silo-Bench, EmCoop, REALM-Bench, DPBench, MAESTRO, LLM-Coordination, CivBench.", "color": "var(--purple)"}, {"title": "DevOps & Full-Lifecycle Coding", "desc": "DevOps-Gym (700+ tasks), SWE-CI (CI loop, EvoScore), ProjDevBench (architecture to deployment). Moving beyond isolated bug-fixing to full software lifecycle.", "color": "var(--cyan)"}, {"title": "Safety & Security Hardening", "desc": "AgentDojo (prompt injection), AgentDyn (dynamic attacks), AILuminate (24K+ prompts, 12 hazard categories), SHADE-Arena, AuditBench. Adversarial robustness now a major focus.", "color": "var(--red)"}, {"title": "Near-Saturation Driving Harder Successors", "desc": "HumanEval >95%, GPQA ~94%, WebVoyager ~97%. Meanwhile CUB best is 10.4%, ProjDevBench 27%, EnterpriseOps-Gym 37.4%. The difficulty frontier keeps expanding.", "color": "var(--orange)"}], "publishers": [{"name": "Academic (various)", "count": 30}, {"name": "Multi-institutional", "count": 16}, {"name": "Princeton NLP", "count": 11}, {"name": "CMU / All Hands AI", "count": 9}, {"name": "XLANG Lab (HKU)", "count": 8}, {"name": "Scale AI", "count": 7}, {"name": "OpenAI", "count": 6}, {"name": "Tsinghua", "count": 5}, {"name": "Ohio State NLP", "count": 4}, {"name": "Patronus AI", "count": 3}, {"name": "Letta", "count": 3}, {"name": "ServiceNow", "count": 3}, {"name": "Anthropic", "count": 3}, {"name": "Google DeepMind", "count": 3}, {"name": "Sierra AI", "count": 2}, {"name": "METR", "count": 2}, {"name": "Salesforce AI", "count": 2}, {"name": "Vals AI", "count": 2}], "yearly_distribution": [], "summaries": [{"source_type": "arxiv", "title": "GAIA-v2-LILT: Multilingual Adaptation of Agent Benchmark beyond Translation", "url": "https://arxiv.org/abs/2604.24929", "author": "Yunsu Kim et al.", "date": "2026-04-27", "retrieved": "2026-04-29", "tags": "[benchmark, evaluation, multilingual, agent, general-assistant, GAIA, adaptation]", "filename": "2604.24929-gaia-v2-lilt.md"}, {"source_type": "arxiv", "title": "MarketBench: Evaluating AI Agents as Market Participants", "url": "https://arxiv.org/abs/2604.23897", "author": "Andrey Fradkin (Boston University, MIT Initiative on the Digital Economy); Rohit Krishnan (Independent Researcher)", "date": "2026-04-26", "retrieved": "2026-05-03", "tags": "[agentic, benchmark, evaluation, reasoning, calibration, self-assessment, metacognition, auction, market-design, software-engineering, tool-use]", "filename": "market_bench.md"}, {"source_type": "arxiv", "title": "WildToolBench: Benchmarking LLM Tool-Use in the Wild", "url": "https://openreview.net/forum?id=yz7fL5vfpn", "author": "Peijie Yu, Wei Liu, Yifan Yang, Jinjian Li, Zelong Zhang, Xiao Feng, Feng Zhang", "date": "2026-04-25", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, tool-use, multi-turn, real-world, compositional-tasks, implicit-intent, ICLR-2026]", "filename": "wildtoolbench.md"}, {"source_type": "arxiv", "title": "AgentSearchBench: A Benchmark for AI Agent Search in the Wild", "url": "https://arxiv.org/abs/2604.22436", "author": "Bin Wu, Arastun Mammadli, Xiaoyu Zhang, Emine Yilmaz", "date": "2026-04-24", "retrieved": 
"2026-05-03", "tags": "[agentic, benchmark, evaluation, tool-use, agent-discovery, retrieval, reranking, multi-agent]", "filename": "2604.22436-agentsearchbench.md"}, {"source_type": "arxiv", "title": "GTA-2: Benchmarking General Tool Agents from Atomic Tool-Use to Open-Ended Workflows", "url": "https://arxiv.org/abs/2604.15715", "author": "Jize Wang et al.", "date": "2026-04-20", "retrieved": "2026-04-27", "tags": "[agentic, benchmark, tool-use, evaluation, function-calling, multimodal, workflow, long-horizon, open-ended]", "filename": "gta2_bench.md"}, {"source_type": "arxiv", "title": "Precise Debugging Benchmark: Is Your Model Debugging or Regenerating?", "url": "https://arxiv.org/abs/2604.17338", "author": "Wang Bill Zhu et al.", "date": "2026-04-19", "retrieved": "2026-04-25", "tags": "[benchmark, debugging, code-generation, evaluation, LLM, precision, recall, software-engineering, agentic, tool-use]", "filename": "2604.17338-precise-debugging-benchmark.md"}, {"source_type": "arxiv", "title": "Evaluating Tool-Using Language Agents: Judge Reliability, Propagation Cascades, and Runtime Mitigation in AgentProp-Bench", "url": "https://arxiv.org/abs/2604.16706", "author": "Bhaskar Gurram et al.", "date": "2026-04-17", "retrieved": "2026-04-25", "tags": "[agentic, benchmark, tool-use, evaluation, judge-reliability, error-propagation, runtime-mitigation, function-calling, LLM-evaluation]", "filename": "2604.16706-agentprop-bench.md"}, {"source_type": "arxiv", "title": "PRL-Bench: A Comprehensive Benchmark Evaluating LLMs' Capabilities in Frontier Physics Research", "url": "https://arxiv.org/abs/2604.15411", "author": "Unknown et al.", "date": "2026-04-16", "retrieved": "2026-04-29", "tags": "[benchmark, evaluation, reasoning, research, planning, physics, scientific-reasoning, long-horizon]", "filename": "2604.15411-prl-bench.md"}, {"source_type": "arxiv", "title": "RiskWebWorld: A Realistic Interactive Benchmark for GUI Agents in E-commerce Risk Management", "url": "https://arxiv.org/abs/2604.13531", "author": "Unknown et al.", "date": "2026-04-15", "retrieved": "2026-04-29", "tags": "[benchmark, evaluation, web-navigation, GUI, e-commerce, risk-management, agentic, real-world]", "filename": "2604.13531-riskwebworld.md"}, {"source_type": "arxiv", "title": "GeoAgentBench: A Dynamic Execution Benchmark for Tool-Augmented Agents in Spatial Analysis", "url": "https://arxiv.org/abs/2604.13888", "author": "Bo Yu et al.", "date": "2026-04-15", "retrieved": "2026-04-27", "tags": "[agentic, benchmark, tool-use, evaluation, spatial-analysis, GIS, geoai, multimodal, dynamic-execution, tool-augmented-agents]", "filename": "geo_agent_bench.md"}, {"source_type": "announcement", "title": "GDP.pdf: Can $100B AI Models Master the Documents that Run the World?", "url": "https://surgehq.ai/blog/gdp-pdf-can-100b-ai-models-master-the-documents-that-run-the-world", "author": "Surge AI", "date": "2026-04-15", "retrieved": "2026-04-16", "tags": "[announcement, benchmark, document_understanding, pdf, enterprise, multimodal, extraction, professional_domains, surge_ai]", "filename": "gdp_pdf.md"}, {"source_type": "arxiv", "title": "PAC-Bench: Evaluating Multi-Agent Collaboration under Privacy Constraints", "url": "https://arxiv.org/abs/2604.11523", "author": "Minjun Park, Donghyun Kim, Hyeonjong Ju et al.", "date": "2026-04-13", "retrieved": "2026-04-16", "tags": "[agentic, benchmark, evaluation, multi-agent, privacy, tool-use, reasoning, dataset]", "filename": "pac_bench.md"}, {"source_type": "announcement", 
"title": "N-Day-Bench: Evaluating Frontier LLMs on Real-World Vulnerability Discovery", "url": "https://ndaybench.winfunc.com", "author": "Winfunc Research", "date": "2026-04-13", "retrieved": "2026-04-16", "tags": "[agentic, benchmark, security, vulnerability_discovery, n_day, code_analysis, llm_as_judge, contamination_resistant]", "filename": "n_day_bench.md"}, {"source_type": "arxiv", "title": "FinTrace: Holistic Trajectory-Level Evaluation of LLM Tool Calling for Long-Horizon Financial Tasks", "url": "https://arxiv.org/abs/2604.10015", "author": "Yupeng Cao et al.", "date": "2026-04-11", "retrieved": "2026-04-27", "tags": "[agentic, benchmark, tool-use, function-calling, evaluation, finance, trajectory-evaluation, long-horizon, preference-learning]", "filename": "fin_trace_bench.md"}, {"source_type": "arxiv", "title": "HiL-Bench (Human-in-Loop Benchmark): Do Agents Know When to Ask for Help?", "url": "https://arxiv.org/abs/2604.09408", "author": "Mohamed Elfeki et al.", "date": "2026-04-10", "retrieved": "2026-04-21", "tags": "[agentic, benchmark, evaluation, human-in-the-loop, selective-escalation, coding, sql, reinforcement-learning, help-seeking, tool-use]", "filename": "hil_bench.md"}, {"source_type": "announcement", "title": "MirrorCode: Evidence AI can already do some weeks-long coding tasks", "url": "https://epoch.ai/blog/mirrorcode-preliminary-results", "author": "Epoch AI & METR", "date": "2026-04-10", "retrieved": "2026-04-19", "tags": "[benchmark, coding, software-engineering, long-horizon, autonomous-agents, safety]", "filename": "mirrorcode.md"}, {"source_type": "announcement", "title": "MirrorCode: Evidence AI can already do some weeks-long coding tasks", "url": "https://epoch.ai/blog/mirrorcode-preliminary-results", "author": "Epoch AI", "date": "2026-04-10", "retrieved": "2026-04-21", "tags": "[agentic, benchmark, evaluation, coding, software-engineering, long-horizon, autonomous-agents, safety, agentic-coding]", "filename": "summary_mirrorcode.md"}, {"source_type": "arxiv", "title": "ClawBench: Can AI Agents Complete Everyday Online Tasks?", "url": "https://arxiv.org/abs/2604.08523", "author": "Yuxuan Zhang et al.", "date": "2026-04-09", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, web-agent, real-world, online-tasks, evaluation, tool-use, browser-agent, live-websites, GUI]", "filename": "clawbench-everyday-online-tasks.md"}, {"source_type": "arxiv", "title": "KnowU-Bench: Towards Interactive, Proactive, and Personalized Mobile Agent Evaluation", "url": "https://arxiv.org/abs/2604.08455", "author": "Tongbo Chen et al.", "date": "2026-04-09", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, mobile-agent, personalization, proactive, interactive, Android, GUI, user-preference, evaluation, LLM-as-judge]", "filename": "knowu-bench-personalized-mobile-agent.md"}, {"source_type": "arxiv", "title": "Aligning Agents via Planning: A Benchmark for Trajectory-Level Reward Modeling", "url": "https://arxiv.org/abs/2604.08178", "author": "Jiaxuan Wang et al.", "date": "2026-04-09", "retrieved": "2026-04-21", "tags": "[agentic, benchmark, evaluation, planning, reasoning, reward-modeling, tool-use, trajectory-level, safety, RLHF]", "filename": "plan_reward_bench.md"}, {"source_type": "arxiv", "title": "PokeGym: A Visually-Driven Long-Horizon Benchmark for Vision-Language Models", "url": "https://arxiv.org/abs/2604.08340", "author": "Ruizhi Zhang, Ye Huang et al.", "date": "2026-04-09", "retrieved": "2026-04-13", "tags": "[benchmark, VLM, vision-language, 
long-horizon, game-playing, visual-reasoning, sequential-decision-making, evaluation, Pokemon]", "filename": "pokegym-vlm-long-horizon-benchmark.md"}, {"source_type": "announcement", "title": "KellyBench: Can Language Models Beat the Market?", "url": "https://www.gr.inc/KellyBenchPaper.pdf", "author": "Thomas Grady, Kip Parker, Iliyan Zarov, Henry Course, Chengxi Taylor, Ross Taylor", "date": "2026-04-09", "retrieved": "2026-04-10", "tags": "[agentic, benchmark, sequential-decision-making, sports-betting, long-horizon, non-stationary, risk-management, kelly-criterion, ml-engineering, forecasting, adaptivity]", "filename": "summary_kellybench_sequential_decision_making_sports_betting.md"}, {"source_type": "announcement", "title": "How We Broke Top AI Agent Benchmarks: And What Comes Next", "url": "https://moogician.github.io/blog/2026/trustworthy-benchmarks-cont/", "author": "Hao Wang, Qiuyang Mang, Alvin Cheung, Koushik Sen, Dawn Song (UC Berkeley)", "date": "2026-04-08", "retrieved": "2026-04-10", "tags": "[benchmark, evaluation, reward-hacking, AI safety, trustworthy, exploit, vulnerability, agentic, SWE-bench, WebArena, GAIA, OSWorld, benchmark-integrity]", "filename": "summary_benchjack_trustworthy_benchmarks.md"}, {"source_type": "arxiv", "title": "WebSP-Eval: Evaluating Web Agents on Website Security and Privacy Tasks", "url": "https://arxiv.org/abs/2604.06367", "author": "Guruprasad Viswanathan Ramesh et al.", "date": "2026-04-07", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, web-navigation, security, privacy, multimodal, GUI, browser-agent]", "filename": "websp-eval.md"}, {"source_type": "arxiv", "title": "TelcoAgent-Bench: A Multilingual Benchmark for Telecom AI Agents", "url": "https://arxiv.org/abs/2604.06209", "author": "Lina Bariah et al.", "date": "2026-04-06", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, telecom, multilingual, tool-use, tool-calling, function-calling, domain-specific, Arabic, 5G]", "filename": "telcoagent-bench.md"}, {"source_type": "arxiv", "title": "The Art of Building Verifiers for Computer Use Agents", "url": "https://arxiv.org/abs/2604.06240", "author": "Corby Rosset, Pratyusha Sharma, Andrew Zhao, Miguel Gonzalez-Fernandez, Ahmed Awadallah", "date": "2026-04-05", "retrieved": "2026-04-13", "tags": "[agentic, evaluation, verifier, computer-use, web-agent, reward-model, process-reward, outcome-reward, benchmark, CUA, methodology]", "filename": "cua-verifier-bench-computer-use-agents.md"}, {"source_type": "arxiv", "title": "YC-Bench: Benchmarking AI Agents for Long-Term Planning and Consistent Execution", "url": "https://arxiv.org/abs/2604.01212", "author": "Muyu He et al.", "date": "2026-04-02", "retrieved": "2026-05-03", "tags": "[agentic, benchmark, evaluation, planning, reasoning, long-horizon, coherence, simulation, memory, scratchpad, adversarial, resource-allocation, POMDP, tool-use]", "filename": "yc_bench.md"}, {"source_type": "arxiv", "title": "YC-Bench: Benchmarking AI Agents for Long-Term Planning and Consistent Execution", "url": "https://arxiv.org/abs/2604.01212", "author": "Muyu He et al.", "date": "2026-04-02", "retrieved": "2026-04-02", "tags": "[agentic, benchmark, long-horizon, planning, coherence, simulation, memory, scratchpad, adversarial, resource-allocation, POMDP, tool-use]", "filename": "yc_bench_long_horizon_startup.md"}, {"source_type": "arxiv", "title": "HippoCamp: Benchmarking Contextual Agents on Personal Computers", "url": "https://arxiv.org/abs/2604.01221", "author": "Zhe 
Yang et al.", "date": "2026-04-01", "retrieved": "2026-04-03", "tags": "[agentic, benchmark, evaluation, multimodal, file-management, personal-computing, retrieval, RAG, memory, personalization, context-aware, OS-interaction]", "filename": "hippocamp.md"}, {"source_type": "arxiv", "title": "When Users Change Their Mind: Evaluating Interruptible Agents in Long-Horizon Web Navigation", "url": "https://arxiv.org/abs/2604.00892", "author": "Henry Peng Zou et al.", "date": "2026-04-01", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, web-navigation, interruption, long-horizon, multi-turn, human-in-the-loop]", "filename": "interruptbench.md"}, {"source_type": "arxiv", "title": "Aligning Agents via Planning: A Benchmark for Trajectory-Level Reward Modeling", "url": "https://arxiv.org/abs/2604.08178", "author": "(pending full author list)", "date": "2026-04", "retrieved": "2026-04-19", "tags": "[benchmark, reward-modeling, planning, trajectory-level, tool-use, agentic, evaluation]", "filename": "2604.08178-plan-rewardbench.md"}, {"source_type": "arxiv", "title": "From Plan to Action: How Well Do Agents Follow the Plan?", "url": "https://arxiv.org/abs/2604.12147", "author": "(pending full author list)", "date": "2026-04", "retrieved": "2026-04-19", "tags": "[evaluation, planning, software-engineering, agent-behavior, plan-compliance, swe-bench]", "filename": "2604.12147-from-plan-to-action.md"}, {"source_type": "arxiv", "title": "HWE-Bench: Benchmarking LLM Agents on Real-World Hardware Bug Repair Tasks", "url": "https://arxiv.org/abs/2604.14709", "author": "Unknown et al.", "date": "2026-04", "retrieved": "2026-04-29", "tags": "[benchmark, evaluation, hardware, debugging, agentic, tool-use, code-generation, repository-level]", "filename": "2604.14709-hwe-bench.md"}, {"source_type": "arxiv", "title": "Do Agents Dream of Root Shells? 
Partial-Credit Evaluation of LLM Agents in Capture The Flag Challenges", "url": "https://arxiv.org/abs/2604.19354", "author": "Ali Al-Kaswan, Maksim Plotnikov, Maxim Hájek, Roland Vízner, Arie van Deursen, Maliheh Izadi (Delft University of Technology)", "date": "2026-04", "retrieved": "2026-04-29", "tags": "[benchmark, evaluation, cybersecurity, CTF, agentic, tool-use, reasoning, planning]", "filename": "2604.19354-deepred-ctf.md"}, {"source_type": "arxiv", "title": "Odysseys: Benchmarking Web Agents on Realistic Long Horizon Tasks", "url": "https://arxiv.org/abs/2604.24964", "author": "Unknown et al.", "date": "2026-04", "retrieved": "2026-04-29", "tags": "[benchmark, evaluation, web-navigation, long-horizon, agentic, multi-site, real-world]", "filename": "2604.24964-odysseys.md"}, {"source_type": "arxiv", "title": "Agentization of Digital Assets for the Agentic Web: Concepts, Techniques, and Benchmark", "url": "https://arxiv.org/abs/2604.04226", "author": "Linyao Chen et al.", "date": "2026-04", "retrieved": "2026-04-15", "tags": "[agentic, benchmark, evaluation, a2a-protocol, digital-assets, multi-agent, interoperability, agentic-web, tool-generation, mcp]", "filename": "a2a_agentization_bench.md"}, {"source_type": "arxiv", "title": "ACIArena: Toward Unified Evaluation for Agent Cascading Injection", "url": "https://arxiv.org/abs/2604.07775", "author": "ACIArena team", "date": "2026-04", "retrieved": "2026-04-17", "tags": "[multi-agent, security, prompt-injection, benchmark, robustness, MAS]", "filename": "aciarena.md"}, {"source_type": "arxiv", "title": "AgentCE-Bench: Agent Configurable Evaluation with Scalable Horizons and Controllable Difficulty under Lightweight Environments", "url": "https://arxiv.org/abs/2604.06111", "author": "Wang Yang et al.", "date": "2026-04", "retrieved": "2026-04-15", "tags": "[agentic, benchmark, evaluation, planning, tool-use, reasoning, configurable, lightweight, training-eval]", "filename": "agentce_bench.md"}, {"source_type": "arxiv", "title": "AgentSocialBench: Evaluating Privacy Risks in Human-Centered Agentic Social Networks", "url": "https://arxiv.org/abs/2604.01487", "author": "Prince Zizhuang Wang, Shuli Jiang", "date": "2026-04", "retrieved": "2026-04-17", "tags": "[multi-agent, privacy, social-networks, benchmark, human-agent-interaction, LLM]", "filename": "agentsocialbench.md"}, {"source_type": "arxiv", "title": "AlphaEval: Evaluating Agents in Production", "url": "https://arxiv.org/abs/2604.12162", "author": "Pengrui Lu, Bingyu Xu, Wenjun Zhang, Shengjia Hua et al. 
(Pengfei Liu corresponding)", "date": "2026-04", "retrieved": "2026-04-16", "tags": "[agentic, benchmark, evaluation, enterprise, multi-agent, tool-use, code-generation, reasoning, leaderboard]", "filename": "alphaeval.md"}, {"source_type": "arxiv", "title": "The Amazing Agent Race: Strong Tool Users, Weak Navigators", "url": "https://arxiv.org/abs/2604.10261", "author": "Zae Myung Kim et al.", "date": "2026-04", "retrieved": "2026-04-15", "tags": "[agentic, benchmark, evaluation, tool-use, web-navigation, wikipedia, dag, multi-step, harbor, reasoning, compositional]", "filename": "amazing_agent_race.md"}, {"source_type": "arxiv", "title": "Evaluating LLM Agents on Automated Software Analysis Tasks", "url": "https://arxiv.org/abs/2604.11270", "author": "Islem Bouzenia, Cristian Cadar, Michael Pradel", "date": "2026-04", "retrieved": "2026-04-24", "tags": "[agentic, benchmark, software-engineering, software-analysis, c-cpp, java, tool-configuration]", "filename": "analysisbench.md"}, {"source_type": "arxiv", "title": "ATANT v1.1: Positioning Continuity Evaluation Against Memory, Long-Context, and Agentic-Memory Benchmarks", "url": "https://arxiv.org/abs/2604.10981", "author": "Samuel Sameer Tanguturi", "date": "2026-04", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, memory, continuity-evaluation, long-context, agentic-memory, companion-paper]", "filename": "atant_v1_1.md"}, {"source_type": "arxiv", "title": "AutomationBench", "url": "https://arxiv.org/abs/2604.18934", "author": "Daniel Shepard, Robin Salimans", "date": "2026-04", "retrieved": "2026-04-24", "tags": "[agentic, benchmark, enterprise, workflow-automation, rest-api, cross-application, business-process]", "filename": "automationbench.md"}, {"source_type": "arxiv", "title": "BankerToolBench: Evaluating AI Agents in End-to-End Investment Banking Workflows", "url": "https://arxiv.org/abs/2604.11304", "author": "Elaine Lau et al.", "date": "2026-04", "retrieved": "2026-04-21", "tags": "[agentic, benchmark, evaluation, tool-use, enterprise, finance, multi-file-output, rubric-evaluation, professional-workflows]", "filename": "banker_tool_bench.md"}, {"source_type": "arxiv", "title": "Your Agent, Their Asset: A Real-World Safety Analysis of OpenClaw", "url": "https://arxiv.org/abs/2604.04759", "author": "Zijun Wang, Haoqin Tu, Letian Zhang", "date": "2026-04", "retrieved": "2026-04-21", "tags": "[agentic, benchmark, safety, security, evaluation, personal-ai, attack-scenarios, poisoning, system-access]", "filename": "cik_bench.md"}, {"source_type": "arxiv", "title": "ClawArena: Benchmarking AI Agents in Evolving Information Environments", "url": "https://arxiv.org/abs/2604.04202", "author": "Ji, Xiong, Han, Xia, Qiu, Zhou, Liu, Li, Li, Zheng, Xie, Yao (UC Santa Cruz / Aiming Lab)", "date": "2026-04", "retrieved": "2026-04-10", "tags": "[benchmark, evaluation, agentic, persistent-assistant, belief-revision, multi-source-conflict, personalization, dynamic-updates, information-environments]", "filename": "clawarena_evolving_information.md"}, {"source_type": "arxiv", "title": "CocoaBench: Evaluating Unified Digital Agents in the Wild", "url": "https://arxiv.org/abs/2604.11201", "author": "Shibo Hao et al.", "date": "2026-04", "retrieved": "2026-04-15", "tags": "[agentic, benchmark, evaluation, multimodal, vision, web-navigation, coding, tool-use, long-horizon, unified-agent, composition]", "filename": "cocoabench.md"}, {"source_type": "arxiv", "title": "CoopEval: Benchmarking Cooperation-Sustaining Mechanisms and LLM Agents in 
Social Dilemmas", "url": "https://arxiv.org/abs/2604.15267", "author": "Emanuel Tewolde et al.", "date": "2026-04", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, evaluation, multi-agent, cooperation, social-dilemmas, game-theory, LLM-agents, replicator-dynamics, mechanism-design, prisoner-dilemma]", "filename": "coopeval.md"}, {"source_type": "arxiv", "title": "Frontier-Eng: Benchmarking Self-Evolving Agents on Real-World Engineering Tasks with Generative Optimization", "url": "https://arxiv.org/abs/2604.12290", "author": "Yizhe Chi, Deyao Hong, Dapeng Jiang", "date": "2026-04", "retrieved": "2026-04-24", "tags": "[agentic, benchmark, engineering, generative-optimization, iterative-design, industrial-simulator]", "filename": "frontier_eng.md"}, {"source_type": "arxiv", "title": "GUIDE: Interpretable GUI Agent Evaluation via Hierarchical Diagnosis", "url": "https://arxiv.org/abs/2604.04399", "author": "Yuwen Zhai et al.", "date": "2026-04", "retrieved": "2026-04-15", "tags": "[agentic, benchmark, evaluation, gui, web-navigation, mobile, desktop, interpretable, llm-as-judge, trajectory-evaluation, hierarchical]", "filename": "guide_gui_eval.md"}, {"source_type": "arxiv", "title": "LinuxArena: A Control Setting for AI Agents in Live Production Software Environments", "url": "https://arxiv.org/abs/2604.15384", "author": "Tyler Tracy et al.", "date": "2026-04", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, evaluation, ai-control, safety, sabotage, software-engineering, monitoring, production-environment, docker, tool-use]", "filename": "linuxarena.md"}, {"source_type": "arxiv", "title": "Litmus (Re)Agent: A Benchmark and Agentic System for Predictive Evaluation of Multilingual Models", "url": "https://arxiv.org/abs/2604.08970", "author": "Avni Mittal, Shanu Kumar, Sandipan Dandapat", "date": "2026-04", "retrieved": "2026-04-24", "tags": "[agentic, benchmark, multilingual, predictive-evaluation, transfer-learning, evidence-aggregation]", "filename": "litmus_reagent.md"}, {"source_type": "arxiv", "title": "LiveClawBench: Benchmarking LLM Agents on Complex, Real-World Assistant Tasks", "url": "https://arxiv.org/abs/2604.13072", "author": "Xiang Long, Li Du, Yilong Xu, et al. 
(Samsung Research, HKUST Guangzhou, Peking University, City University of Hong Kong)", "date": "2026-04", "retrieved": "2026-04-16", "tags": "[agentic, benchmark, evaluation, tool-use, reasoning, planning, memory, multi-agent, web-navigation, os-interaction]", "filename": "liveclawbench.md"}, {"source_type": "arxiv", "title": "MemMachine: A Ground-Truth-Preserving Memory System for Personalized AI Agents", "url": "https://arxiv.org/abs/2604.04853", "author": "Shu Wang, Edwin Yu, Oscar Love", "date": "2026-04", "retrieved": "2026-04-23", "tags": "[agentic, memory-system, long-term, episodic, retrieval-augmented, personalization, open-source]", "filename": "memmachine.md"}, {"source_type": "arxiv", "title": "Memory Intelligence Agent", "url": "https://arxiv.org/abs/2604.04503", "author": "Jingyang Qiao, Weicheng Meng, Yu Cheng", "date": "2026-04", "retrieved": "2026-04-23", "tags": "[agent-framework, memory-system, deep-research, manager-planner-executor, test-time-learning, self-evolution]", "filename": "mia.md"}, {"source_type": "arxiv", "title": "OccuBench: Evaluating AI Agents on Real-World Professional Tasks via Language Environment Simulation", "url": "https://arxiv.org/abs/2604.10866", "author": "Xiaomeng Hu et al.", "date": "2026-04", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, evaluation, professional-tasks, tool-use, robustness, fault-injection, multi-domain, language-environment-simulation, occupational]", "filename": "occubench.md"}, {"source_type": "arxiv", "title": "ProdCodeBench: A Production-Derived Benchmark for Evaluating AI Coding Agents", "url": "https://arxiv.org/abs/2604.01527", "author": "Smriti Jha, Matteo Paltenghi, Chandra Maddila", "date": "2026-04", "retrieved": "2026-04-24", "tags": "[agentic, benchmark, coding, software-engineering, production-derived, multi-language, harness-design]", "filename": "prodcodebench.md"}, {"source_type": "arxiv", "title": "SAGE: A Service Agent Graph-guided Evaluation Benchmark", "url": "https://arxiv.org/abs/2604.09285", "author": "Ling Shi, Yuqin Dai, Ziyin Wang, Ning Gao, Wei Zhang, Chaozheng Wang, Yujie Wang, Wei He, Jinpeng Wang, Deiyi Xiong", "date": "2026-04", "retrieved": "2026-04-17", "tags": "[multi-agent, service-agent, benchmark, customer-service, SOP-compliance, evaluation]", "filename": "sage_service_agent.md"}, {"source_type": "arxiv", "title": "SIR-Bench: Evaluating Investigation Depth in Security Incident Response Agents", "url": "https://arxiv.org/abs/2604.12040", "author": "Daniel Begimher, Cristian Leo, Jack Huang, Pat Gaw, Bonan Zheng (Amazon Web Services)", "date": "2026-04", "retrieved": "2026-04-16", "tags": "[agentic, benchmark, evaluation, security, tool-use, reasoning, dataset]", "filename": "sir_bench.md"}, {"source_type": "arxiv", "title": "SkillLearnBench: Benchmarking Continual Learning Methods for Agent Skill Generation on Real-World Tasks", "url": "https://arxiv.org/abs/2604.20087", "author": "Shanshan Zhong, Yi Lu, Jingjie Ning", "date": "2026-04", "retrieved": "2026-04-24", "tags": "[agentic, benchmark, continual-learning, skill-generation, real-world, self-feedback]", "filename": "skilllearnbench.md"}, {"source_type": "arxiv", "title": "Your Agent, Their Asset: A Real-World Safety Analysis of OpenClaw", "url": "https://arxiv.org/abs/2604.04759", "author": "Zijun Wang et al.", "date": "2026-04", "retrieved": "2026-04-23", "tags": "[agentic, safety, security, benchmark, evaluation, personal-agent, prompt-injection, adversarial, tool-use, OpenClaw, red-teaming]", "filename": 
"your_agent_their_asset.md"}, {"source_type": "announcement", "title": "Announcing AutoBench Agentic: The Next Generation Agentic Benchmark", "url": "https://huggingface.co/blog/PeterKruger/autobench-agentic-1", "author": "Peter Kruger", "date": "2026-04", "retrieved": "2026-04-24", "tags": "[agentic, benchmark, dynamic-generation, un-gameable, virtual-environments, enterprise, react, multi-turn]", "filename": "autobench_agentic.md"}, {"source_type": "announcement", "title": "FrontierSWE: Benchmarking coding agents at the limits of human abilities", "url": "https://www.frontierswe.com/blog", "author": "Proximal Labs", "date": "2026-04", "retrieved": "2026-04-16", "tags": "[agentic, benchmark, coding, software_engineering, long_horizon, frontier_swe, proximal_labs]", "filename": "frontierswe.md"}, {"source_type": "announcement", "title": "Introducing o11y-bench: an open benchmark for AI agents running observability workflows", "url": "https://grafana.com/blog/o11y-bench-open-benchmark-for-observability-agents/", "author": "Grafana Labs", "date": "2026-04", "retrieved": "2026-04-24", "tags": "[agentic, benchmark, observability, tool-use, grafana, mcp, sre, open-source]", "filename": "o11y_bench.md"}, {"source_type": "announcement", "title": "ParseBench: The Document Parsing Benchmark for AI Agents", "url": "https://www.parsebench.ai/", "author": "LlamaIndex (Boyang Zhang, Sebastián G. Acosta, Preston Carlson, Sacha Bron, Pierre-Loïc Doulcet, Simon Suo)", "date": "2026-04", "retrieved": "2026-04-16", "tags": "[announcement, benchmark, document-parsing, ocr, agentic, llamaindex, tables, charts, visual-grounding, vlm]", "filename": "summary_parsebench.md"}, {"source_type": "arxiv", "title": "AEC-Bench: A Multimodal Benchmark for Agentic Systems in Architecture, Engineering, and Construction", "url": "https://arxiv.org/abs/2603.29199", "author": "Harsh Mankodiya, Chase Gallik, Theodoros Galanos, Andriy Mulyar", "date": "2026-03-31", "retrieved": "2026-04-19", "tags": "[benchmark, multimodal, agentic, architecture, engineering, construction, domain-specific, reasoning]", "filename": "2603.29199-aec-bench.md"}, {"source_type": "arxiv", "title": "ELT-Bench-Verified: Benchmark Quality Issues Underestimate AI Agent Capabilities", "url": "https://arxiv.org/abs/2603.29399", "author": "Christopher Zanoli et al.", "date": "2026-03-31", "retrieved": "2026-04-25", "tags": "[agentic, benchmark, evaluation, data-engineering, ELT, code-generation, tool-use, benchmark-quality, annotation-errors, SQL, data-pipelines]", "filename": "2603.29399-elt-bench-verified.md"}, {"source_type": "arxiv", "title": "AEC-Bench: A Multimodal Benchmark for Agentic Systems in Architecture, Engineering, and Construction", "url": "https://arxiv.org/abs/2603.29199", "author": "Harsh Mankodiya et al.", "date": "2026-03-31", "retrieved": "2026-04-21", "tags": "[agentic, benchmark, evaluation, reasoning, multimodal, domain-specific, construction, architecture, engineering]", "filename": "aec_bench.md"}, {"source_type": "arxiv", "title": "GUIDE: A Benchmark for Understanding and Assisting Users in Open-Ended GUI Tasks", "url": "https://arxiv.org/abs/2603.25864", "author": "Saelyne Yang et al.", "date": "2026-03-31", "retrieved": "2026-03-31", "tags": "[agentic, benchmark, evaluation, gui, user-intent, screen-recording, multimodal, proactive-assistance, behavior-detection, intent-prediction, help-prediction]", "filename": "guide_bench.md"}, {"source_type": "arxiv", "title": "MolQuest: A Benchmark for Agentic Evaluation of Abductive 
Reasoning in Chemical Structure Elucidation", "url": "https://arxiv.org/abs/2603.25253", "author": "Taolin Han et al.", "date": "2026-03-31", "retrieved": "2026-03-31", "tags": "[agentic, benchmark, evaluation, reasoning, science, chemistry, abductive-reasoning, multi-turn, tool-use, interactive]", "filename": "molquest.md"}, {"source_type": "arxiv", "title": "PRBench: End-to-end Paper Reproduction in Physics Research", "url": "https://arxiv.org/abs/2603.27646", "author": "Shi Qiu et al.", "date": "2026-03-31", "retrieved": "2026-03-31", "tags": "[agentic, benchmark, evaluation, research, scientific-reasoning, code-generation, physics, paper-reproduction, multi-agent, sandboxed-execution]", "filename": "prbench_physics.md"}, {"source_type": "arxiv", "title": "PSPA-Bench: A Personalized Benchmark for Smartphone GUI Agent", "url": "https://arxiv.org/abs/2603.29318", "author": "Hongyi Nie et al.", "date": "2026-03-31", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, mobile, smartphone, GUI, personalization, android, task-decomposition]", "filename": "pspa-bench.md"}, {"source_type": "arxiv", "title": "WebTestBench: Evaluating Computer-Use Agents towards End-to-End Automated Web Testing", "url": "https://arxiv.org/abs/2603.25226", "author": "Fanheng Kong et al.", "date": "2026-03-31", "retrieved": "2026-03-31", "tags": "[agentic, benchmark, evaluation, web-navigation, computer-use, software-testing, defect-detection, long-horizon, tool-use]", "filename": "webtestbench.md"}, {"source_type": "announcement", "title": "Scale Labs Leaderboard: Agentic Tool Use (Chat) — ToolComp", "url": "https://scale.com/leaderboard/tool_use_chat", "author": "Scale AI (Scale Labs)", "date": "2026-03-29", "retrieved": "2026-03-29", "tags": "[benchmark, tool-use, function-calling, leaderboard, compositional, agentic, chain-of-tools, scale-ai]", "filename": "summary_scale-toolcomp-chat-leaderboard.md"}, {"source_type": "arxiv", "title": "Ego2Web: A Web Agent Benchmark Grounded in Egocentric Videos", "url": "https://arxiv.org/abs/2603.22529", "author": "Shoubin Yu et al.", "date": "2026-03-28", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, web-navigation, egocentric-video, multimodal, embodied-AI, AR, vision-language]", "filename": "ego2web.md"}, {"source_type": "arxiv", "title": "A Benchmark for Evaluating Repository-Level Code Agents with Intermediate Reasoning on Feature Addition Task", "url": "https://arxiv.org/abs/2603.26337", "author": "Shuhan Liu et al.", "date": "2026-03-27", "retrieved": "2026-04-25", "tags": "[agentic, benchmark, code-generation, software-engineering, repository-level, reasoning, feature-addition, intermediate-reasoning, evaluation, code-agents]", "filename": "2603.26337-feature-add-bench.md"}, {"source_type": "arxiv", "title": "FinMCP-Bench: Benchmarking LLM Agents for Real-World Financial Tool Use under the Model Context Protocol", "url": "https://arxiv.org/abs/2603.24943", "author": "Jie Zhu et al.", "date": "2026-03-26", "retrieved": "2026-04-27", "tags": "[agentic, benchmark, tool-use, function-calling, evaluation, finance, mcp]", "filename": "fin_mcp_bench.md"}, {"source_type": "arxiv", "title": "ARC-AGI-3: A New Challenge for Frontier Agentic Intelligence", "url": "https://arxiv.org/abs/2603.24621", "author": "François Chollet, Mike Knoop et al. 
(ARC Prize Foundation)", "date": "2026-03-24", "retrieved": "2026-04-21", "tags": "[agentic, benchmark, evaluation, reasoning, planning, fluid-intelligence, interactive, goal-inference, world-modeling, exploration]", "filename": "arc_agi_3.md"}, {"source_type": "arxiv", "title": "Efficient Benchmarking of AI Agents", "url": "https://arxiv.org/abs/2603.23749", "author": "Franck Ndzomga", "date": "2026-03-24", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, methodology, efficiency, leaderboard, item-response-theory, scaffold, ranking, cost-reduction]", "filename": "efficient-benchmarking-agents.md"}, {"source_type": "arxiv", "title": "CaP-X: A Framework for Benchmarking and Improving Coding Agents for Robot Manipulation", "url": "https://arxiv.org/abs/2603.22435", "author": "Max Fu et al.", "date": "2026-03-23", "retrieved": "2026-04-25", "tags": "[agentic, benchmark, robotics, code-generation, embodied-ai, tool-use, evaluation, sim-to-real, reinforcement-learning, vision-language-models]", "filename": "2603.22435-cap-bench.md"}, {"source_type": "announcement", "title": "JJ Benchmark: Evaluating AI Agents on Jujutsu Version Control Tasks", "url": "https://tabbyml.github.io/jj-benchmark/", "author": "TabbyML", "date": "2026-03-23", "retrieved": "2026-03-23", "tags": "[agentic, benchmark, evaluation, code-generation, tool-use, version-control]", "filename": "summary_jj_benchmark.md"}, {"source_type": "arxiv", "title": "SWE-Next: Scalable Real-World Software Engineering Tasks for Agents", "url": "https://arxiv.org/abs/2603.20691", "author": "Jiarong Liang et al.", "date": "2026-03-21", "retrieved": "2026-04-25", "tags": "[agentic, benchmark, software-engineering, code-generation, evaluation, dataset, training-data, swe-bench, fine-tuning, execution-grounded]", "filename": "2603.20691-swe-next.md"}, {"source_type": "announcement", "title": "τ³-bench: Advancing Agent Benchmarking to Knowledge and Voice", "url": "https://sierra.ai/resources/research/tau-3-bench", "author": "Sierra AI", "date": "2026-03-18", "retrieved": "2026-03-26", "tags": "[agentic, benchmark, evaluation, tool-use, reasoning, multi-turn, customer-service, voice, knowledge-retrieval]", "filename": "tau_3_bench.md"}, {"source_type": "twitter", "title": "Distributed Systems Theory for LLM Multi-Agent Teams", "url": "https://x.com/omarsar0/status/2033211887907999894", "author": "Elvis (@omarsar0, DAIR.AI)", "date": "2026-03-15", "retrieved": "2026-03-18", "tags": "[multi-agent, framework, distributed-systems, methodology]", "filename": "omarsar0_distributed_systems_agents.md"}, {"source_type": "twitter", "title": "XSkill: Dual-Stream Continual Learning for Agents", "url": "https://x.com/omarsar0/status/2032928526022881399", "author": "Elvis (@omarsar0, DAIR.AI)", "date": "2026-03-14", "retrieved": "2026-03-18", "tags": "[continual-learning, tool-use, framework, methodology]", "filename": "omarsar0_xskill_framework.md"}, {"source_type": "arxiv", "title": "HomeSafe-Bench: Evaluating Vision-Language Models on Unsafe Action Detection for Embodied Agents in Household Scenarios", "url": "https://arxiv.org/abs/2603.11975", "author": "Jiayue Pu et al.", "date": "2026-03-13", "retrieved": "2026-04-03", "tags": "[benchmark, evaluation, safety, embodied, robotics, vision-language-models, household, video, real-time, VLM]", "filename": "homesafe_bench.md"}, {"source_type": "announcement", "title": "A3: An Automated Alignment Agent for Safety Finetuning", "url": 
"https://alignment.anthropic.com/2026/automated-alignment-agent/", "author": "Anthropic Fellows Program; Constellation; Anthropic", "date": "2026-03-11", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, evaluation, alignment, safety, finetuning, sycophancy, political-neutrality, jailbreak, automated-agent, data-generation]", "filename": "anthropic_a3_alignment_agent.md"}, {"source_type": "announcement", "title": "BigLaw Bench: Research", "url": "https://www.harvey.ai/blog/introducing-big-law-bench-research", "author": "Harvey AI, Snorkel AI", "date": "2026-03-11", "retrieved": "2026-03-18", "tags": "[agentic, benchmark, legal, reasoning, research, tool-use]", "filename": "harvey_biglaw_bench_research.md"}, {"source_type": "arxiv", "title": "CR-Bench: Evaluating the Real-World Utility of AI Code Review Agents", "url": "https://arxiv.org/abs/2603.11078", "author": "Kristen Pereira, Neelabh Sinha, Rajat Ghosh, Debojyoti Dutta (Nutanix, Inc.)", "date": "2026-03-10", "retrieved": "2026-05-05", "tags": "[benchmark, evaluation, code-review, agentic, software-engineering, tool-use, defect-detection]", "filename": "cr_bench_code_review_agents.md"}, {"source_type": "arxiv", "title": "ResearchEnvBench: Benchmarking Agents on Environment Synthesis for Research Code Execution", "url": "https://arxiv.org/abs/2603.06739", "author": "Yubang Wang, Chenxi Zhang, Bowen Chen, Zezheng Huai, Zihao Dai, Xinchi Chen, Yuxin Wang, Yining Zheng, Jingjing Gong, Xipeng Qiu", "date": "2026-03-10", "retrieved": "2026-05-05", "tags": "[benchmark, evaluation, code-generation, agentic, research, tool-use, environment-setup, reproducibility, HPC, CUDA]", "filename": "research_env_bench.md"}, {"source_type": "announcement", "title": "AuditBench: Evaluating Alignment Auditing Techniques on Models with Hidden Behaviors", "url": "https://alignment.anthropic.com/2026/auditbench/", "author": "Anthropic (Abhay Sheshadri, Aidan Ewart, Kai Fronsdal, Isha Gupta, Samuel R. 
Bowman, Sara Price, Samuel Marks, Rowan Wang)", "date": "2026-03-10", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, alignment, auditing, safety, hidden-behaviors, interpretability, adversarial-training]", "filename": "anthropic_auditbench.md"}, {"source_type": "arxiv", "title": "EVMbench: Evaluating AI Agents on Smart Contract Security", "url": "https://arxiv.org/abs/2603.04915", "author": "Justin Wang et al.", "date": "2026-03-05", "retrieved": "2026-04-25", "tags": "[agentic, benchmark, security, code-generation, evaluation, smart-contract, blockchain, cybersecurity, exploit, vulnerability-detection, tool-use]", "filename": "2603.04915-evmbench.md"}, {"source_type": "arxiv", "title": "TimeWarp: Evaluating Web Agents by Revisiting the Past", "url": "https://arxiv.org/abs/2603.04949", "author": "Md Farhan Ishmam et al.", "date": "2026-03-05", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, web-navigation, temporal-robustness, interface-drift, plan-distillation, behavior-cloning, evolving-interfaces]", "filename": "timewarp-web-agents.md"}, {"source_type": "announcement", "title": "VeRO: Benchmarking AI Agent Optimization", "url": "https://labs.scale.com/blog/vero", "author": "Scale Labs (Scale AI)", "date": "2026-03-05", "retrieved": "2026-03-23", "tags": "[agentic, benchmark, evaluation, tool-use, reasoning, agent-optimization, coding-agent, meta-agent]", "filename": "summary_vero.md"}, {"source_type": "arxiv", "title": "SWE-CI: Evaluating Agent Capabilities in Maintaining Codebases via Continuous Integration", "url": "https://arxiv.org/abs/2603.03823", "author": "Jialong Chen et al.", "date": "2026-03-04", "retrieved": "2026-03-08", "tags": "[agentic, benchmark, evaluation, code-generation, multi-agent, planning, dataset]", "filename": "summary_sweci_evaluating_agent_capabilities_in_maintaining.md"}, {"source_type": "arxiv", "title": "LiveAgentBench: Comprehensive Benchmarking of Agentic Systems Across 104 Real-World Challenges", "url": "https://arxiv.org/abs/2603.02586", "author": "Hao Li, Huan Wang, Jinjie Gu, Wenjie Wang, Chenyi Zhuang, Sikang Bian", "date": "2026-03-03", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, real-world, multi-capability, browser, file-operation, mobile, continuous-update]", "filename": "liveagentbench.md"}, {"source_type": "twitter", "title": "No Wall in Sight — Continued Benchmark Progress on Agentic Tasks", "url": "https://x.com/polynoamial/status/2029622090152956335", "author": "@polynoamial", "date": "2026-03-03", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, progress, GDPval, computer-use, scaling, OpenAI]", "filename": "thread_noam_brown_no_wall_benchmark_progress.md"}, {"source_type": "announcement", "title": "Pencil Puzzle Bench: A Benchmark for Multi-Step Verifiable Reasoning", "url": "https://ppbench.com/", "author": "Justin Waugh (bluecoconut)", "date": "2026-03-02", "retrieved": "2026-03-25", "tags": "[agentic, benchmark, reasoning, multi-step, constraint-satisfaction, verifiable-reasoning, single-shot, agentic-iteration, puzzle, logic]", "filename": "ppbench_pencil_puzzle.md"}, {"source_type": "arxiv", "title": "BeSafe-Bench: Unveiling Behavioral Safety Risks of Situated Agents in Functional Environments", "url": "https://arxiv.org/abs/2603.25747", "author": "Yuxuan Li et al.", "date": "2026-03-01", "retrieved": "2026-04-03", "tags": "[agentic, benchmark, evaluation, safety, behavioral-safety, web-agent, mobile-agent, embodied-agent, VLM, VLA, multi-domain, hybrid-eval, LLM-judge]", 
"filename": "besafe_bench.md"}, {"source_type": "arxiv", "title": "MASEval: Extending Multi-Agent Evaluation from Models to Systems", "url": "https://arxiv.org/abs/2603.08835", "author": "Cornelius Emde et al.", "date": "2026-03-01", "retrieved": "2026-03-31", "tags": "[agentic, benchmark, evaluation, multi-agent, framework-agnostic, infrastructure, system-level, multi-agent-systems, tracing, tool-use]", "filename": "maseval.md"}, {"source_type": "arxiv", "title": "MiroEval: Benchmarking Multimodal Deep Research Agents in Process and Outcome", "url": "https://arxiv.org/abs/2603.28407", "author": "Fangda Ye et al. (MiroMind Team)", "date": "2026-03-01", "retrieved": "2026-04-03", "tags": "[agentic, benchmark, evaluation, deep-research, multimodal, factuality, process-centric, report-generation, web-search, long-form-qa]", "filename": "miroeval.md"}, {"source_type": "arxiv", "title": "Silo-Bench: A Scalable Environment for Evaluating Distributed Coordination in Multi-Agent LLM Systems", "url": "https://arxiv.org/abs/2603.01045", "author": "Yuzhe Zhang, Feiran Liu, Yi Shan, Xinyi Huang, Xin Yang, Yueqi Zhu, Xuxin Cheng, Cao Liu, Ke Zeng, Terry Jingchen Zhang, Wenyuan Jiang", "date": "2026-03-01", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, multi-agent, distributed-coordination, communication, reasoning-integration, scalability]", "filename": "silo-bench.md"}, {"source_type": "arxiv", "title": "Vision2Web: A Hierarchical Benchmark for Visual Website Development with Agent Verification", "url": "https://arxiv.org/abs/2603.26648", "author": "Zehai He et al.", "date": "2026-03-01", "retrieved": "2026-04-03", "tags": "[agentic, benchmark, evaluation, coding-agent, web-development, multimodal, visual-fidelity, ui-to-code, full-stack, agent-verification, gui-agent, VLM]", "filename": "vision2web.md"}, {"source_type": "announcement", "title": "FoodTruck Bench -- AI Business Simulation Benchmark", "url": "https://foodtruckbench.com/", "author": "Unknown (independent project; Twitter @foodtruckbench)", "date": "2026-03 (estimated from model roster including Claude Opus 4.6, GPT-5.2, Gemini 3 Pro)", "retrieved": "2026-04-10", "tags": "[agentic, benchmark, business-simulation, decision-making, tool-use, multi-step-reasoning, strategic-planning, resource-management, function-calling]", "filename": "summary_foodtruck_bench.md"}, {"source_type": "arxiv", "title": "Enhancing Tool Calling in LLMs with the International Tool Calling Dataset", "url": "https://arxiv.org/abs/2603.05515", "author": "Zuoyu Zhang, Yancheng Zhu", "date": "2026-03", "retrieved": "2026-03-29", "tags": "[benchmark, tool-use, function-calling, multilingual, dataset, international, api, cross-lingual]", "filename": "2603.05515-itc-international-tool-calling.md"}, {"source_type": "arxiv", "title": "PostTrainBench: Can LLM Agents Automate LLM Post-Training?", "url": "https://arxiv.org/abs/2603.08640", "author": "Ben Rank, Hardik Bhatnagar, Ameya Prabhu, Shira Eisenberg, Nguyen Karina, Matthias Bethge, Maksym Andriushchenko", "date": "2026-03", "retrieved": "2026-03-29", "tags": "[agentic, benchmark, ai-rnd, post-training, autonomous-agent, reward-hacking, coding, tool-use, math-reasoning]", "filename": "2603.08640-posttrainbench.md"}, {"source_type": "arxiv", "title": "Agentified Assessment of Logical Reasoning Agents", "url": "https://arxiv.org/abs/2603.02788", "author": "Zhiyu Ni et al.", "date": "2026-03", "retrieved": "2026-04-01", "tags": "[agentic, benchmark, evaluation, reasoning, taxonomy]", "filename": 
"agentified_logical_reasoning.md"}, {"source_type": "arxiv", "title": "ASTRA-bench: Evaluating Tool-Use Agent Reasoning and Action Planning with Personal User Context", "url": "https://arxiv.org/abs/2603.01357", "author": "Zidi Xiu, David Q. Sun, Kevin Cheng, Maitrik Patel, Josh Date, Yizhe Zhang, Jiarui Lu, Omar Attia, Raviteja Vemulapalli, Oncel Tuzel, Meng Cao, Samy Bengio", "date": "2026-03", "retrieved": "2026-03-29", "tags": "[benchmark, tool-use, personal-assistant, agentic, multi-turn, planning, reasoning, personal-context, ICML2026, Apple]", "filename": "astra-bench.md"}, {"source_type": "arxiv", "title": "Code Review Agent Benchmark", "url": "https://arxiv.org/abs/2603.23448", "author": "Yuntong Zhang, Zhiyuan Pan, Imam Nur Bani Yusuf, Haifeng Ruan, Ridwan Shariffdeen, Abhik Roychoudhury", "date": "2026-03", "retrieved": "2026-03-27", "tags": "[benchmark, code-review, software-engineering, agentic, pull-request, test-based-evaluation, NUS, SonarSource]", "filename": "c-crab-code-review-agent-benchmark.md"}, {"source_type": "arxiv", "title": "CUBE: A Standard for Unifying Agent Benchmarks", "url": "https://arxiv.org/abs/2603.15798", "author": "Lacoste et al. (ServiceNow AI Research, Silverstream.ai, IBM Research, CMU, HKU, OSU, UC Berkeley, Mila, McGill, Jetty)", "date": "2026-03", "retrieved": "2026-03-28", "tags": "[evaluation, agentic, benchmark, taxonomy, tool-use]", "filename": "cube_standard.md"}, {"source_type": "arxiv", "title": "Measuring AI Agents' Progress on Multi-Step Cyber Attack Scenarios", "url": "https://arxiv.org/abs/2603.11214", "author": "Linus Folkerts, Will Payne, Simon Inman", "date": "2026-03", "retrieved": "2026-04-21", "tags": "[agentic, benchmark, security, cyber-attack, multi-step, autonomous, industrial-control, dangerous-capability]", "filename": "cyber_attack_scenario.md"}, {"source_type": "arxiv", "title": "Can AI Agents Answer Your Data Questions? 
A Benchmark for Data Agents", "url": "https://arxiv.org/abs/2603.20576", "author": "Ruiying Ma et al.", "date": "2026-03", "retrieved": "2026-04-01", "tags": "[agentic, benchmark, evaluation, tool-use, function-calling, reasoning, dataset]", "filename": "dab_data_agent_bench.md"}, {"source_type": "arxiv", "title": "Emergence WebVoyager: Toward Consistent and Transparent Evaluation of (Web) Agents in The Wild", "url": "https://arxiv.org/abs/2603.29020", "author": "Deepak Akkil et al.", "date": "2026-03", "retrieved": "2026-04-15", "tags": "[agentic, benchmark, evaluation, web-navigation, inter-annotator-agreement, evaluation-methodology, live-web, standardization]", "filename": "emergence-webvoyager.md"}, {"source_type": "arxiv", "title": "EnterpriseOps-Gym: Agentic Planning in Realistic Enterprise Settings", "url": "https://arxiv.org/abs/2603.13594", "author": "ServiceNow-AI", "date": "2026-03", "retrieved": "2026-03-18", "tags": "[agentic, benchmark, enterprise, planning, tool-use, long-horizon]", "filename": "enterpriseops-gym.md"}, {"source_type": "arxiv", "title": "LifeBench: A Benchmark for Long-Horizon Multi-Source Memory", "url": "https://arxiv.org/abs/2603.03781", "author": "Zihao Cheng, Weixin Wang, Yu Zhao", "date": "2026-03", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, memory, long-horizon, multi-source, personalized, declarative, procedural]", "filename": "lifebench.md"}, {"source_type": "arxiv", "title": "LiveCultureBench: a Multi-Agent, Multi-Cultural Benchmark for Large Language Models in Dynamic Social Simulations", "url": "https://arxiv.org/abs/2603.01952", "author": "Viet-Thanh Pham, Lizhen Qu, Thuy-Trang Vu, Gholamreza Haffari, Dinh Phung (Monash University)", "date": "2026-03", "retrieved": "2026-03-25", "tags": "[agentic, benchmark, multi-agent, cultural-alignment, social-simulation, LLM-as-judge, norm-adherence, multi-cultural, conformal-prediction]", "filename": "liveculturebench.md"}, {"source_type": "arxiv", "title": "LMEB: Long-horizon Memory Embedding Benchmark", "url": "https://arxiv.org/abs/2603.12572", "author": "Xinping Zhao, Xinshuo Hu, Jiaxin Xu", "date": "2026-03", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, memory, embedding, retrieval, long-horizon, episodic, dialogue, semantic, procedural]", "filename": "lmeb.md"}, {"source_type": "arxiv", "title": "MobileDev-Bench: A Comprehensive Benchmark for Evaluating Language Models on Mobile Application Development", "url": "https://arxiv.org/abs/2603.24946", "author": "Moshood A. Fakorede, Krishna Upadhyay, A.B. 
Siddique, Umar Farooq", "date": "2026-03", "retrieved": "2026-03-27", "tags": "[agentic, benchmark, coding, software-engineering, mobile, android, flutter, react-native, issue-resolution, swe-bench, multi-file, program-repair]", "filename": "mobiledev-bench.md"}, {"source_type": "arxiv", "title": "MonitorBench: A Comprehensive Benchmark for Chain-of-Thought Monitorability in Large Language Models", "url": "https://arxiv.org/abs/2603.28590", "author": "Han Wang et al.", "date": "2026-03", "retrieved": "2026-04-01", "tags": "[benchmark, evaluation, reasoning, safety, dataset, leaderboard]", "filename": "monitorbench.md"}, {"source_type": "arxiv", "title": "Multi-Agent Memory from a Computer Architecture Perspective", "url": "https://arxiv.org/abs/2603.10062", "author": "UC San Diego, Georgia Tech", "date": "2026-03", "retrieved": "2026-03-18", "tags": "[multi-agent, memory, architecture, vision-paper]", "filename": "multi_agent_memory_architecture.md"}, {"source_type": "arxiv", "title": "Measuring AI Agents' Progress on Multi-Step Cyber Attack Scenarios", "url": "https://arxiv.org/abs/2603.11214", "author": "Linus Folkerts et al.", "date": "2026-03", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, evaluation, cybersecurity, cyber-attack, multi-step, inference-scaling, autonomous-agents, red-teaming, penetration-testing, AISI]", "filename": "multistep_cyber_attack_bench.md"}, {"source_type": "arxiv", "title": "Building Effective AI Coding Agents for the Terminal: OpenDev", "url": "https://arxiv.org/abs/2603.05344", "author": "Nghi D. Q. Bui", "date": "2026-03", "retrieved": "2026-03-18", "tags": "[coding-agent, terminal, architecture, report]", "filename": "opendev_coding_agent.md"}, {"source_type": "arxiv", "title": "Beyond Task Completion: Revealing Corrupt Success in LLM Agents through Procedure-Aware Evaluation", "url": "https://arxiv.org/abs/2603.03116", "author": "Hongliu Cao, Ilias Driouich, Eoin Thomas (Amadeus France)", "date": "2026-03", "retrieved": "2026-03-25", "tags": "[agentic, evaluation, framework, procedural-integrity, corrupt-success, tau-bench, tool-use, customer-service, llm-as-judge, safety]", "filename": "pae_procedure_aware_evaluation.md"}, {"source_type": "arxiv", "title": "RewardHackingAgents: Benchmarking Evaluation Integrity for LLM ML-Engineering Agents", "url": "https://arxiv.org/abs/2603.11337", "author": "Yonas Atinafu, Robin Cohen", "date": "2026-03", "retrieved": "2026-04-24", "tags": "[agentic, benchmark, ml-engineering, evaluation-integrity, reward-hacking, train-test-leakage]", "filename": "reward_hacking_agents.md"}, {"source_type": "arxiv", "title": "SlopCodeBench: Benchmarking How Coding Agents Degrade Over Long-Horizon Iterative Tasks", "url": "https://arxiv.org/abs/2603.24755", "author": "Gabriel Orlanski, Devjeet Roy, Alexander Yun, Changho Shin, Alex Gu, Albert Ge, Dyah Adila, Frederic Sala, Aws Albarghouthi", "date": "2026-03", "retrieved": "2026-03-29", "tags": "[benchmark, coding, agentic, iterative, code-quality, long-horizon, software-engineering, code-degradation]", "filename": "slopcodebench.md"}, {"source_type": "arxiv", "title": "Tool-Genesis: A Task-Driven Tool Creation Benchmark for Self-Evolving Language Agent", "url": "https://arxiv.org/abs/2603.05578", "author": "Bowei Xia, Mengkang Hu, Shijian Wang, Jiarui Jin, Wenxiang Jiao, Yuan Lu, Kexin Li, Ping Luo", "date": "2026-03", "retrieved": "2026-03-29", "tags": "[benchmark, tool-creation, MCP, self-evolving-agents, function-calling, tool-use, agentic, ICML2026, HKU, 
Xiaohongshu]", "filename": "tool-genesis.md"}, {"source_type": "arxiv", "title": "WirelessBench: A Tolerance-Aware LLM Agent Benchmark for Wireless Network Intelligence", "url": "https://arxiv.org/abs/2603.21251", "author": "Jingwen Tong, Fang Liu, Linkai Xv, Shiliang Lu, Kangqi Li, Yiqian Zhang, Yijie Song, Zeyang Xue, Jun Zhang (Shenzhen University; Hong Kong University of Science and Technology)", "date": "2026-03", "retrieved": "2026-03-25", "tags": "[agentic, benchmark, tool-use, domain-specific, wireless, telecom, evaluation, tolerance-aware, chain-of-thought, structured-output, engineering]", "filename": "wirelessbench.md"}, {"source_type": "arxiv", "title": "ZeroDayBench: Evaluating LLM Agents on Unseen Zero-Day Vulnerabilities for Cyberdefense", "url": "https://arxiv.org/abs/2603.02297", "author": "Nancy Lau et al.", "date": "2026-03", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, cybersecurity, debugging, code-generation, tool-use, reasoning]", "filename": "zerodaybench.md"}, {"source_type": "announcement", "title": "A3: An Automated Alignment Agent for Safety Finetuning", "url": "https://alignment.anthropic.com/2026/automated-alignment-agent/", "author": "Jifan Zhang, Henry Sleight, Joe Benton", "date": "2026-03", "retrieved": "2026-04-21", "tags": "[llm-safety, automated-alignment, agentic-framework, finetuning, bias-mitigation, jailbreak-prevention, anthropic]", "filename": "a3_alignment_agent.md"}, {"source_type": "announcement", "title": "aiRank: Comprehensive LLM Comparison for Coding", "url": "https://airank.dev/", "author": "aiRank", "date": "2026-03", "retrieved": "2026-03-28", "tags": "[leaderboard, aggregator, coding, agentic, benchmark-comparison, multi-benchmark]", "filename": "summary_airank.md"}, {"source_type": "announcement", "title": "Open-world evaluations for measuring frontier AI capabilities", "url": "https://cruxevals.com/open-world-evaluations.pdf", "author": "Sayash Kapoor, Peter Kirgis, Andrew Schwartz, Stephan Rabanser, J.J. 
Allaire, Rishi Bommasani, Magda Dubois, Gillian Hadfield, Andy Hall, Sara Hooker, Seth Lazar, Steve Newman, Dimitris Papailiopoulos, Shoshannah Tekofsky, Helen Toner, Cozmin Ududec, Arvind Narayanan (Princeton and collaborators)", "date": "2026-03", "retrieved": "2026-04-16", "tags": "[announcement, crux, open_world_evaluation, agentic, frontier_capabilities, ios_app_development, log_analysis, princeton]", "filename": "summary_crux_open_world_evaluations.md"}, {"source_type": "arxiv", "title": "EmCoop: A Framework and Benchmark for Embodied Cooperation Among LLM Agents", "url": "https://arxiv.org/abs/2603.00349", "author": "Hanqing Yang, Shiyu Chen, Narjes Nourzad, Marie Siew, Jingdi Chen, Carlee Joe-Wong", "date": "2026-02-27", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, multi-agent, embodied, cooperation, coordination, communication]", "filename": "emcoop.md"}, {"source_type": "arxiv", "title": "MobilityBench: A Benchmark for Evaluating Route-Planning Agents in Real-World Mobility Scenarios", "url": "https://arxiv.org/abs/2602.22638", "author": "Zhiheng Song, Jingshuai Zhang, Chuan Qin, Chao Wang, Chao Chen, Longfei Xu, Kaikui Liu, Xiangxiang Chu, Hengshu Zhu", "date": "2026-02-26", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, route-planning, mobility, tool-use, api-calling, navigation]", "filename": "mobilitybench.md"}, {"source_type": "announcement", "title": "Pacific Northwest National Laboratory and OpenAI partner to accelerate federal permitting", "url": "https://openai.com/index/pacific-northwest-national-laboratory/", "author": "OpenAI / PNNL", "date": "2026-02-26", "retrieved": "2026-03-31", "tags": "[agentic, benchmark, evaluation, enterprise, reasoning, coding-agent, government, document-drafting, federal-permitting, nepa, environmental-review]", "filename": "summary_draft_nepa_bench.md"}, {"source_type": "twitter", "title": "Princeton's Framework for AI Agent Reliability — Beyond Raw Benchmark Scores", "url": "https://x.com/BrianRoemmele/status/2026675089027248547", "author": "@BrianRoemmele", "date": "2026-02-24", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, reliability, Princeton, methodology, framework, CITP]", "filename": "thread_princeton_agent_reliability_framework.md"}, {"source_type": "arxiv", "title": "Classroom Final Exam: An Instructor-Tested Reasoning Benchmark", "url": "https://arxiv.org/abs/2602.19517", "author": "Chongyang Gao et al.", "date": "2026-02-23", "retrieved": "2026-04-21", "tags": "[benchmark, evaluation, reasoning, multimodal, STEM, university-level, multi-step-reasoning]", "filename": "cfe_bench.md"}, {"source_type": "twitter", "title": "2025 AI Agent Index — Documenting Technical and Safety Features of Deployed Agents", "url": "https://x.com/Graham_dePenros/status/2024998307592855643", "author": "@Graham_dePenros", "date": "2026-02-20", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, safety, index, MIT, Cambridge, Stanford, Harvard, deployed-agents]", "filename": "thread_2025_ai_agent_index.md"}, {"source_type": "arxiv", "title": "Persona2Web: Benchmarking Personalized Web Agents for Contextual Reasoning with User History", "url": "https://arxiv.org/abs/2602.17003", "author": "Serin Kim et al.", "date": "2026-02-19", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, web-navigation, personalization, user-history, contextual-reasoning, ambiguous-tasks, clarification, preference-inference]", "filename": "persona2web-personalized-web-agents.md"}, {"source_type": "arxiv", "title": 
"AgentLAB: Benchmarking LLM Agents against Long-Horizon Attacks", "url": "https://arxiv.org/abs/2602.16901", "author": "Tanqiu Jiang et al.", "date": "2026-02-18", "retrieved": "2026-04-03", "tags": "[agentic, benchmark, evaluation, safety, security, adversarial, multi-turn, jailbreak, prompt-injection, tool-use, memory-poisoning, red-teaming]", "filename": "agentlab_attacks.md"}, {"source_type": "announcement", "title": "Introducing EVMbench: Evaluating AI Agents on Smart Contract Security", "url": "https://openai.com/index/introducing-evmbench/", "author": "OpenAI, Paradigm, and OtterSec (Justin Wang, Andreas Bigger, Xiaohai Xu, Justin W. Lin, Andy Applebaum, Tejal Patwardhan, Alpin Yukseloglu, Olivia Watkins)", "date": "2026-02-18", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, smart-contracts, cybersecurity, blockchain, ethereum, EVM, solidity, vulnerability-detection, exploit, code-security, DeFi, tool-use]", "filename": "openai_evmbench.md"}, {"source_type": "announcement", "title": "PA-Bench: Evaluating Web Agents on Real-World Personal Assistant Workflows", "url": "https://vibrantlabs.com/blog/pa-bench", "author": "Vibrant Labs", "date": "2026-02-16", "retrieved": "2026-03-31", "tags": "[agentic, benchmark, evaluation, web-navigation, computer-use, multi-app, personal-assistant, long-horizon, multi-step]", "filename": "summary_pa_bench.md"}, {"source_type": "arxiv", "title": "AmbiBench: Benchmarking Mobile GUI Agents Beyond One-Shot Instructions in the Wild", "url": "https://arxiv.org/abs/2602.11750", "author": "Jiazheng Sun, Mingxuan Li, Yingying Zhang, Jiayang Niu, Yachen Wu, Ruihan Jin, Shuyu Lei, Pengrongrui Tan, Zongyu Zhang, Ruoyi Wang, Jiachen Yang, Boyu Yang, Jiacheng Liu, Xin Peng", "date": "2026-02-12", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, mobile-GUI, ambiguity, interaction, intent-alignment, clarification]", "filename": "ambibench.md"}, {"source_type": "arxiv", "title": "FeatureBench: Benchmarking Agentic Coding for Complex Feature Development", "url": "https://arxiv.org/abs/2602.10975", "author": "Qixing Zhou et al.", "date": "2026-02-11", "retrieved": "2026-04-25", "tags": "[agentic, benchmark, code-generation, software-engineering, feature-development, test-driven, evaluation, ICLR-2026, execution-based, dependency-graph]", "filename": "2602.10975-featurebench.md"}, {"source_type": "twitter", "title": "Demystifying Evals for AI Agents — Anthropic Engineering Blog", "url": "https://x.com/AnthropicAI/status/2009696515061911674", "author": "@AnthropicAI", "date": "2026-02-10", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation-methodology, Anthropic, best-practices, deployment]", "filename": "thread_anthropic_demystifying_agent_evals.md"}, {"source_type": "arxiv", "title": "LOCA-bench: Benchmarking Language Agents Under Controllable and Extreme Context Growth", "url": "https://arxiv.org/abs/2602.07962", "author": "Weihao Zeng, Yuzhen Huang, Junxian He (HKUST)", "date": "2026-02-08", "retrieved": "2026-04-10", "tags": "[benchmark, evaluation, agentic, long-context, tool-use, context-engineering, context-rot, scaffolding]", "filename": "loca_bench_long_context_agents.md"}, {"source_type": "arxiv", "title": "PATHWAYS: Evaluating Investigation and Context Discovery in AI Web Agents", "url": "https://arxiv.org/abs/2602.05354", "author": "Shifat E. 
Arman et al.", "date": "2026-02-05", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, web-navigation, context-discovery, investigation, hallucination, behavioral-forensics, fraud-detection, moderation]", "filename": "pathways-web-agents.md"}, {"source_type": "arxiv", "title": "ProAgentBench: Evaluating LLM Agents for Proactive Assistance with Real-World Data", "url": "https://arxiv.org/abs/2602.04482", "author": "Unknown et al.", "date": "2026-02-04", "retrieved": "2026-04-29", "tags": "[benchmark, evaluation, proactive-assistance, agentic, planning, memory, real-world, dataset]", "filename": "2602.04482-proagentbench.md"}, {"source_type": "arxiv", "title": "ProjDevBench: Benchmarking AI Coding Agents on End-to-End Project Development", "url": "https://arxiv.org/abs/2602.01655", "author": "Pengrui Lu et al.", "date": "2026-02-03", "retrieved": "2026-04-25", "tags": "[agentic, benchmark, code-generation, evaluation, software-engineering, iterative-refinement, multi-file, end-to-end, tool-use, debugging]", "filename": "2602.01655-projdevbench.md"}, {"source_type": "arxiv", "title": "LPS-Bench: Benchmarking Safety Awareness of Computer-Use Agents in Long-Horizon Planning under Benign and Adversarial Scenarios", "url": "https://arxiv.org/abs/2602.03255", "author": "Tianyu Chen et al.", "date": "2026-02-03", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, safety, computer-use, long-horizon, MCP, adversarial, planning, risk-awareness, GUI]", "filename": "lps-bench-safety-computer-use.md"}, {"source_type": "arxiv", "title": "MemGUI-Bench: Benchmarking Memory of Mobile GUI Agents in Dynamic Environments", "url": "https://arxiv.org/abs/2602.06075", "author": "Guangyi Liu, Pengxiang Zhao, Yaozhen Liang, Qinyi Luo, Shunye Tang, Yuxiang Chai, Weifeng Lin, Han Xiao, WenHao Wang, Siheng Chen, Zhengxi Lu, Gao Wu, Hao Wang, Liang Liu, Yong Liu", "date": "2026-02-03", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, mobile-GUI, memory, cross-session, cross-application, retention]", "filename": "memgui-bench.md"}, {"source_type": "announcement", "title": "CL-bench: A Benchmark for Context Learning", "url": "https://github.com/Tencent-Hunyuan/CL-bench", "author": "Shihan Dou, Ming Zhang, Zhangyue Yin, Chenhao Huang, et al. 
(Tencent Hunyuan, Fudan University)", "date": "2026-02-03", "retrieved": "2026-03-28", "tags": "[benchmark, context-learning, in-context-learning, long-context, reasoning, LLM, knowledge-acquisition]", "filename": "cl_bench.md"}, {"source_type": "arxiv", "title": "DPBench: Large Language Models Struggle with Simultaneous Coordination", "url": "https://arxiv.org/abs/2602.13255", "author": "Najmul Hasan, Prashanth BusiReddyGari", "date": "2026-02-02", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, multi-agent, coordination, simultaneous-decision, deadlock, resource-contention]", "filename": "dpbench.md"}, {"source_type": "arxiv", "title": "ISD-Agent-Bench: A Comprehensive Benchmark for Evaluating LLM-based Instructional Design Agents", "url": "https://arxiv.org/abs/2602.10620", "author": "YoungHoon Jeon et al.", "date": "2026-02-01", "retrieved": "2026-03-31", "tags": "[agentic, benchmark, evaluation, instructional-design, education, multi-step-reasoning, tool-use, LLM-as-judge, ADDIE, scenario-generation]", "filename": "isd_agent_bench.md"}, {"source_type": "arxiv", "title": "FinMTM: A Multi-Turn Multimodal Benchmark for Financial Reasoning and Agent Evaluation", "url": "https://arxiv.org/abs/2602.03130", "author": "(pending full author list)", "date": "2026-02", "retrieved": "2026-04-19", "tags": "[benchmark, financial, multimodal, multi-turn, agent, tool-use, reasoning, bilingual]", "filename": "2602.03130-finmtm.md"}, {"source_type": "arxiv", "title": "Agent-Diff: Benchmarking LLM Agents on Enterprise API Tasks via Code Execution with State-Diff-Based Evaluation", "url": "https://arxiv.org/abs/2602.11224", "author": "Hubert M. Pysklo, Artem Zhuravel, Patrick D. Watson", "date": "2026-02", "retrieved": "2026-03-29", "tags": "[benchmark, enterprise, API, code-execution, state-diff, tool-use, agentic, SaaS, Slack, Box, Linear, GoogleCalendar]", "filename": "agent-diff.md"}, {"source_type": "arxiv", "title": "AgentLeak: A Full-Stack Benchmark for Privacy Leakage in Multi-Agent LLM Systems", "url": "https://arxiv.org/abs/2602.11510", "author": "Privatris team (GitHub: Privatris/AgentLeak)", "date": "2026-02", "retrieved": "2026-04-17", "tags": "[multi-agent, privacy, security, benchmark, LLM, data-leakage]", "filename": "agentleak.md"}, {"source_type": "arxiv", "title": "BrowseComp-V3: A Visual, Vertical, and Verifiable Benchmark for Multimodal Browsing Agents", "url": "https://arxiv.org/abs/2602.12876", "author": "Zhang et al. (PKU, HKUST(GZ), Huawei Cloud BU)", "date": "2026-02", "retrieved": "2026-03-28", "tags": "[benchmark, agentic, evaluation, web-navigation, reasoning, tool-use]", "filename": "browsecomp_v3.md"}, {"source_type": "arxiv", "title": "DRACO: a Cross-Domain Benchmark for Deep Research Accuracy, Completeness, and Objectivity", "url": "https://arxiv.org/abs/2602.11685", "author": "Joey Zhong, Hao Zhang et al. 
(Perplexity)", "date": "2026-02", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, research, reasoning, tool-use]", "filename": "draco_deep_research.md"}, {"source_type": "arxiv", "title": "EvoCodeBench: A Human-Performance Benchmark for Self-Evolving LLM-Driven Coding Systems", "url": "https://arxiv.org/abs/2602.10171", "author": "Wentao Zhang, Jianfeng Wang, Liheng Liang, Yilei Zhao, HaiBin Wen, Zhe Zhao", "date": "2026-02", "retrieved": "2026-03-27", "tags": "[benchmark, code-generation, self-evolving, multilingual, human-performance, LeetCode, KDD-2026, agentic, efficiency]", "filename": "evocodebench.md"}, {"source_type": "arxiv", "title": "FeatBench: Towards More Realistic Evaluation of Feature-level Code Generation", "url": "https://arxiv.org/abs/2509.22237", "author": "Haorui Chen, Chengze Li, Jia Li", "date": "2026-02", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, coding, feature-level, code-generation, software-engineering, regression-testing, natural-language-requirements]", "filename": "featbench.md"}, {"source_type": "arxiv", "title": "FeatureBench: Benchmarking Agentic Coding for Complex Feature Development", "url": "https://arxiv.org/abs/2602.10975", "author": "Qixing Zhou, Jiacheng Zhang, Haiyang Wang, Rui Hao, Jiahe Wang, Minghao Han, Yuxue Yang, Shuzhe Wu, Feiyang Pan, Lue Fan, Dandan Tu, Zhaoxiang Zhang", "date": "2026-02", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, coding, feature-development, software-engineering, test-driven, ICLR-2026]", "filename": "featurebench.md"}, {"source_type": "arxiv", "title": "Benchmark Test-Time Scaling of General LLM Agents", "url": "https://arxiv.org/abs/2602.18998", "author": "Xiaochuan Li et al.", "date": "2026-02", "retrieved": "2026-03-23", "tags": "[agentic, benchmark, evaluation, tool-use, reasoning, multi-agent, planning, test-time-scaling, coding, search, unified-framework]", "filename": "general_agentbench.md"}, {"source_type": "arxiv", "title": "LemmaBench: A Live, Research-Level Benchmark to Evaluate LLM Capabilities in Mathematics", "url": "https://arxiv.org/abs/2602.24173", "author": "Antoine Peyronnet, Fabian Gloeckle, Amaury Hayat", "date": "2026-02", "retrieved": "2026-03-29", "tags": "[benchmark, mathematics, theorem-proving, research-level, live-benchmark, contamination-resistant, reasoning, LLM-judge]", "filename": "lemmabench.md"}, {"source_type": "arxiv", "title": "LongCLI-Bench: A Preliminary Benchmark and Study for Long-horizon Agentic Programming in Command-Line Interfaces", "url": "https://arxiv.org/abs/2602.14337", "author": "Yukang Feng, Jianwen Sun, Zelai Yang", "date": "2026-02", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, coding, software-engineering, long-horizon, command-line, programming]", "filename": "longcli_bench.md"}, {"source_type": "arxiv", "title": "MemoryArena: Benchmarking Agent Memory in Interdependent Multi-Session Agentic Tasks", "url": "https://arxiv.org/abs/2602.16313", "author": "Zexue He, Yu Wang, Churan Zhi", "date": "2026-02", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, memory, long-horizon, multi-session, web-navigation, planning, reasoning]", "filename": "memory_arena.md"}, {"source_type": "arxiv", "title": "Unsafer in Many Turns: Benchmarking and Defending Multi-Turn Safety Risks in Tool-Using Agents", "url": "https://arxiv.org/abs/2602.13379", "author": "Xu Li, Simon Yu, Minzhou Pan, Yiyou Sun, Bo Li, Dawn Song, Xue Lin, Weiyan Shi", "date": "2026-02", "retrieved": "2026-03-22", "tags": "[agentic, benchmark, 
safety, evaluation, multi-turn, tool-use, jailbreak, MCP, defense]", "filename": "mt_agentrisk.md"}, {"source_type": "arxiv", "title": "Risky-Bench: Probing Agentic Safety Risks under Real-World Deployment Conditions", "url": "https://arxiv.org/abs/2602.03100", "author": "Jingnan Zheng, Yanzhen Luo, Jingjun Xu, Bingnan Liu, Yuxin Chen, Chenhang Cui, Gelei Deng, Chaochao Lu, Xiang Wang, An Zhang, Tat-Seng Chua", "date": "2026-02", "retrieved": "2026-03-22", "tags": "[agentic, benchmark, safety, evaluation, risk, adversarial, red-teaming, life-assist, deployment]", "filename": "risky_bench.md"}, {"source_type": "arxiv", "title": "TRIP-Bench: A Benchmark for Long-Horizon Interactive Agents in Real-World Scenarios", "url": "https://arxiv.org/abs/2602.01675", "author": "Yuanzhe Shen, Zisu Huang, Zhengyuan Wang", "date": "2026-02", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, long-horizon, tool-use, travel-planning, multi-tool, constraint-satisfaction, interactive]", "filename": "trip_bench.md"}, {"source_type": "announcement", "title": "PinchBench: OpenClaw Coding Agent Benchmark", "url": "https://pinchbench.com/", "author": "PinchBench (Kilo.ai / boleary.dev contributors)", "date": "2026-02", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, coding, autonomous-agents, tool-use, leaderboard]", "filename": "summary_pinchbench.md"}, {"source_type": "announcement", "title": "SWE-bench-Live/Windows: Evaluating AI Agents on Windows PowerShell Tasks", "url": "https://swe-bench-live.github.io/", "author": "Microsoft (GitHub Copilot Team, Microsoft US; DKI Group, Microsoft Shanghai)", "date": "2026-02", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, software-engineering, windows, powershell, code-generation, issue-resolution, swe-bench]", "filename": "summary_swe_bench_live_windows.md"}, {"source_type": "announcement", "title": "AI Cyber Model Arena: Testing AI Agents in Cybersecurity", "url": "https://www.wiz.io/cyber-model-arena", "author": "Wiz (Matan Vetzler et al.)", "date": "2026-02", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, cybersecurity, offensive-security, zero-day, CVE, web-security, cloud-security, API-security]", "filename": "wiz_cyber_model_arena.md"}, {"source_type": "arxiv", "title": "MCP-Atlas: A Large-Scale Benchmark for Tool-Use Competency with Real MCP Servers", "url": "https://arxiv.org/abs/2602.00933", "author": "Scale AI Research Team", "date": "2026-01-31", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, tool-use, MCP, model-context-protocol, multi-step, Scale-AI]", "filename": "mcp-atlas.md"}, {"source_type": "arxiv", "title": "CAR-bench: Evaluating the Consistency and Limit-Awareness of LLM Agents under Real-World Uncertainty", "url": "https://arxiv.org/abs/2601.22027", "author": "Johannes Kirmayr et al.", "date": "2026-01-29", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, tool-use, multi-turn, uncertainty, hallucination, in-car, voice-assistant, reliability, consistency]", "filename": "car-bench.md"}, {"source_type": "announcement", "title": "OpenHands Index", "url": "https://openhands.dev/blog/openhands-index", "author": "OpenHands Team", "date": "2026-01-29", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, coding, software-engineering, composite, leaderboard]", "filename": "openhands_index.md"}, {"source_type": "arxiv", "title": "AACR-Bench: Evaluating Automatic Code Review with Holistic Repository-Level Context", "url": "https://arxiv.org/abs/2601.19494", "author": "L. 
Zhang et al.", "date": "2026-01-28", "retrieved": "2026-04-25", "tags": "[benchmark, code-review, repository-level, multilingual, automated-code-review, llm-evaluation, software-engineering, code-quality, pull-request]", "filename": "2601.19494-aacr-bench.md"}, {"source_type": "arxiv", "title": "OS-Marathon: Benchmarking Computer-Use Agents on Long-Horizon Repetitive Tasks", "url": "https://arxiv.org/abs/2601.20650", "author": "Jing Wu et al.", "date": "2026-01-28", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, computer-use, long-horizon, repetitive-workflows, desktop, condensed-demonstrations, few-shot, GUI, OSWorld]", "filename": "os-marathon-long-horizon-computer-use.md"}, {"source_type": "arxiv", "title": "PredictionMarketBench: A SWE-bench-Style Framework for Backtesting Trading Agents on Prediction Markets", "url": "https://arxiv.org/abs/2602.00133", "author": "Avi Arora, Ritesh Malpani (Oddpool / Benchspan, YC S26)", "date": "2026-01-28", "retrieved": "2026-05-03", "tags": "[agentic, benchmark, evaluation, reasoning, planning, financial-agents, tool-use, trading, prediction-markets]", "filename": "prediction_market_bench.md"}, {"source_type": "arxiv", "title": "RubberDuckBench: A Benchmark for AI Coding Assistants", "url": "https://arxiv.org/abs/2601.16456", "author": "Ferida Mohammad, Fatma Ayad, Petros Maniatis, Satish Chandra, Elizabeth Dinella", "date": "2026-01-27", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, coding-assistant, code-understanding, question-answering, hallucination, multilingual, Java, Python, C++, pull-request, contextualized-questions]", "filename": "rubberduckbench.md"}, {"source_type": "arxiv", "title": "EntWorld: A Holistic Environment and Benchmark for Verifiable Enterprise GUI Agents", "url": "https://arxiv.org/abs/2601.17722", "author": "Ying Mo et al.", "date": "2026-01-25", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, enterprise, GUI, web-navigation, verification, multi-app, CRM, ERP, ITSM, computer-use]", "filename": "entworld.md"}, {"source_type": "arxiv", "title": "AgentDrive: An Open Benchmark Dataset for Agentic AI Reasoning with LLM-Generated Scenarios in Autonomous Systems", "url": "https://arxiv.org/abs/2601.16964", "author": "Mohamed Amine Ferrag, Abderrahmane Lakas, Merouane Debbah", "date": "2026-01-23", "retrieved": "2026-04-19", "tags": "[benchmark, autonomous-driving, reasoning, planning, agent, domain-specific, MCQ]", "filename": "2601.16964-agentdrive.md"}, {"source_type": "announcement", "title": "Introducing APEX-Agents", "url": "https://www.mercor.com/blog/introducing-apex-agents/", "author": "Mercor", "date": "2026-01-21 (arxiv 2601.14242)", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, professional-services, investment-banking, consulting, corporate-law, long-horizon, tool-use]", "filename": "mercor_apex_agents.md"}, {"source_type": "substack", "title": "Agentic AI Weekly - Berkeley RDI", "url": "https://berkeleyrdi.substack.com/p/agentic-ai-weekly-berkeley-rdi-january-27c", "author": "Berkeley RDI (Responsible Decentralized Intelligence)", "date": "2026-01-21", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, ecosystem, standardization, reproducibility, community, landscape]", "filename": "berkeley_rdi_agentic_weekly.md"}, {"source_type": "arxiv", "title": "ToolPRMBench: Evaluating and Advancing Process Reward Models for Tool-using Agents", "url": "https://arxiv.org/abs/2601.12294", "author": "Dawei Li et al.", "date": "2026-01-18", 
"retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, tool-use, process-reward-model, PRM, reward-model, step-level, function-calling, multi-step-reasoning]", "filename": "toolprmbench.md"}, {"source_type": "arxiv", "title": "ABC-Bench: Benchmarking Agentic Backend Coding in Real-World Development", "url": "https://arxiv.org/abs/2601.11077", "author": "Jie Yang et al. (Fudan University / Shanghai Qiji Zhifeng)", "date": "2026-01-16", "retrieved": "2026-04-25", "tags": "[agentic, benchmark, code-generation, software-engineering, deployment, tool-use, multi-language, containerization, evaluation]", "filename": "2601.11077-abc-bench.md"}, {"source_type": "substack", "title": "Agentic Engineering Patterns", "url": "https://simonw.substack.com/p/agentic-engineering-patterns", "author": "Simon Willison", "date": "2026-01-15", "retrieved": "2026-03-07", "tags": "[agentic, engineering, patterns, coding-agents, claude-code, codex, evaluation, best-practices]", "filename": "willison_agentic_engineering_patterns.md"}, {"source_type": "twitter", "title": "MCP-Atlas — Open-Source Benchmark for Agentic Tool Use via Model Context Protocol", "url": "https://x.com/scale_AI/status/2002099826163601655", "author": "@scale_AI", "date": "2026-01-15", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, MCP, tool-use, Scale-AI, open-source, function-calling]", "filename": "thread_mcp_atlas_scale_ai.md"}, {"source_type": "arxiv", "title": "Blue Teaming Function-Calling Agents", "url": "https://arxiv.org/abs/2601.09292", "author": "Greta Dolcetti, Giulio Zizzo, Sergio Maffeis", "date": "2026-01-14", "retrieved": "2026-04-13", "tags": "[security, function-calling, tool-use, adversarial, blue-teaming, prompt-injection, defense, benchmark, robustness, LLM-agents]", "filename": "blue-teaming-function-calling-agents.md"}, {"source_type": "announcement", "title": "MiniMax Open-Sources New Benchmark: Defining Production-Grade Standards for Coding Agent", "url": "https://www.minimax.io/news/production-grade-benchmark-for-coding-agents", "author": "MiniMax", "date": "2026-01-14", "retrieved": "2026-03-29", "tags": "[benchmark, coding-agent, instruction-following, production-grade, open-source, tool-use, agentic]", "filename": "summary_minimax-octocodingbench.md"}, {"source_type": "arxiv", "title": "APEX-SWE: AI Productivity Index for Software Engineering", "url": "https://arxiv.org/abs/2601.08806", "author": "Abhi Kottamasu, Chirag Mahapatra, Sam Lee, Ben Pan et al.", "date": "2026-01-13", "retrieved": "2026-03-26", "tags": "[agentic, benchmark, evaluation, code-generation, debugging, tool-use, reasoning, leaderboard, dataset, enterprise]", "filename": "apex_swe.md"}, {"source_type": "arxiv", "title": "SafePro: Evaluating the Safety of Professional-Level AI Agents", "url": "https://arxiv.org/abs/2601.06663", "author": "Kaiwen Zhou et al.", "date": "2026-01-13", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, evaluation, safety, professional, tool-use, LLM-agent, alignment, risk, occupations]", "filename": "safepro.md"}, {"source_type": "arxiv", "title": "arXiv Query: search_query=&amp;id_list=2601.08806&amp;start=0&amp;max_results=10", "url": "https://arxiv.org/abs/2601.08806", "author": "Abhi Kottamasu et al.", "date": "2026-01-13", "retrieved": "2026-03-25", "tags": "[agentic, benchmark, evaluation, code-generation, debugging, tool-use, planning, reasoning, leaderboard, dataset]", "filename": "summary_arxiv_query_searchqueryampidlist260108806ampstart0.md"}, {"source_type": "arxiv", "title": 
"Terminal-Bench: Benchmarking AI Agents in Terminal Environments", "url": "https://arxiv.org/abs/2601.11868", "author": "Laude Institute et al. (Stanford x Laude)", "date": "2026-01-01", "retrieved": "2026-04-25", "tags": "[agentic, benchmark, terminal, cli, os-interaction, system-administration, security, devops, machine-learning, data-science, evaluation, code-generation, tool-use]", "filename": "2601.11868-terminal-bench.md"}, {"source_type": "arxiv", "title": "MAESTRO: Multi-Agent Evaluation Suite for Testing, Reliability, and Observability", "url": "https://arxiv.org/abs/2601.00481", "author": "Tie Ma, Yixi Chen, Vaastav Anand, Alessandro Cornacchia, Amândio R. Faustino, Guanheng Liu, Shan Zhang, Hongbin Luo, Suhaib A. Fahmy, Zafar A. Qazi, Marco Canini", "date": "2026-01-01", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, multi-agent, evaluation-framework, reliability, observability, testing]", "filename": "maestro.md"}, {"source_type": "arxiv", "title": "AgentDoG: A Diagnostic Guardrail Framework for AI Agent Safety and Security", "url": "https://arxiv.org/abs/2601.18491", "author": "Dongrui Liu, Qihan Ren, Chen Qian, Shuai Shao, Yuejin Xie, Yu Li, Zhonghao Yang, Haoyu Luo, Peng Wang, Qingyu Liu, et al. (Shanghai Artificial Intelligence Laboratory)", "date": "2026-01", "retrieved": "2026-03-22", "tags": "[agentic, benchmark, safety, tool-use, evaluation, guardrail, trajectory-level, risk-taxonomy, explainability]", "filename": "agentdog_atbench.md"}, {"source_type": "arxiv", "title": "AgenticRed: Optimizing Agentic Systems for Automated Red-teaming", "url": "https://arxiv.org/abs/2601.13518", "author": "Jiayi Yuan et al.", "date": "2026-01", "retrieved": "2026-04-23", "tags": "[agentic, red-teaming, safety, evaluation, jailbreak, automated-attack, evolutionary-search, benchmark, LLM-safety]", "filename": "agentic_red.md"}, {"source_type": "arxiv", "title": "AgenticRed: Evolving Agentic Systems for Red-Teaming", "url": "https://arxiv.org/abs/2601.13518", "author": "Jiayi Yuan, Jonathan Nöther, Natasha Jaques", "date": "2026-01", "retrieved": "2026-04-21", "tags": "[agentic, red-teaming, evolutionary, llm-safety, adversarial, automated-testing, framework]", "filename": "agenticred.md"}, {"source_type": "arxiv", "title": "AJAR: Adaptive Jailbreak Architecture for Red-teaming", "url": "https://arxiv.org/abs/2601.10971", "author": "Yipu Dou et al.", "date": "2026-01", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, evaluation, safety, red-teaming, jailbreak, mcp, multi-turn, tool-use, llm-safety, adversarial, agent-framework]", "filename": "ajar.md"}, {"source_type": "arxiv", "title": "APEX-Agents", "url": "https://arxiv.org/abs/2601.14242", "author": "Bertie Vidgen, Austin Mann, Abby Fennelly, John Wright Stanly, Lucas Rothman, Marco Burstein, Julien Benchek, David Ostrofsky, Anirudh Ravichandran, Debnil Sur, Neel Venugopal, Alannah Hsia, Isaac Robinson, Calix Huang, Olivia Varones, Daniyal Khan, Michael Haines, Zach Richards, Chirag Mahapatra, Brendan Foody, Osvald Nitski", "date": "2026-01", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, professional-services, investment-banking, consulting, legal, enterprise, Mercor, cross-application]", "filename": "apex-agents.md"}, {"source_type": "arxiv", "title": "BioAgent Bench: An AI Agent Evaluation Suite for Bioinformatics", "url": "https://arxiv.org/abs/2601.21800", "author": "Dionizije Fa et al.", "date": "2026-01", "retrieved": "2026-04-01", "tags": "[agentic, benchmark, evaluation, tool-use, reasoning, 
planning, debugging, dataset]", "filename": "bioagent_bench.md"}, {"source_type": "arxiv", "title": "M3MAD-Bench: Are Multi-Agent Debates Really Effective Across Domains and Modalities?", "url": "https://arxiv.org/abs/2601.02854", "author": "Ao Li et al.", "date": "2026-01", "retrieved": "2026-05-01", "tags": "[benchmark, evaluation, multi-agent, debate, multimodal, vision-language, reasoning, knowledge, mathematics, medicine, science]", "filename": "m3mad_bench_multiagent_debate_modalities.md"}, {"source_type": "arxiv", "title": "MirrorBench: A Benchmark to Evaluate Conversational User-Proxy Agents for Human-Likeness", "url": "https://arxiv.org/abs/2601.08118", "author": "Ashutosh Hathidara, Julien Yu, Vaishali Senthil, Sebastian Schreiber, Anil Babu Ankisettipalli", "date": "2026-01", "retrieved": "2026-03-29", "tags": "[agentic, benchmark, user-simulation, human-likeness, conversational-AI, LLM-judge, lexical-diversity, user-proxy]", "filename": "mirrorbench.md"}, {"source_type": "arxiv", "title": "OctoBench: Benchmarking Instruction Following in Agentic Coding Scaffolds", "url": "https://arxiv.org/abs/2601.10343", "author": "(MiniMax team — full author list in paper)", "date": "2026-01", "retrieved": "2026-03-27", "tags": "[agentic, benchmark, coding, instruction-following, scaffold, tool-use, evaluation, llm-as-judge]", "filename": "octobench.md"}, {"source_type": "arxiv", "title": "SimpleMem: Efficient Lifelong Memory for LLM Agents", "url": "https://arxiv.org/abs/2601.02553", "author": "Jiaqi Liu, Yaofeng Su, Peng Xia", "date": "2026-01", "retrieved": "2026-04-23", "tags": "[memory-system, lifelong-memory, semantic-compression, token-efficiency, intent-aware-retrieval]", "filename": "simplemem.md"}, {"source_type": "announcement", "title": "CooperBench: Benchmarking Agent Teams — Why Coding Agents Cannot be Your Teammates Yet", "url": "https://cooperbench.com/", "author": "Stanford University & SAP Labs US", "date": "2026-01", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, multi-agent, collaboration, coding, coordination, cooperative]", "filename": "cooperbench.md"}, {"source_type": "announcement", "title": "WybeCoder: Verified Imperative Code Generation", "url": "https://facebookresearch.github.io/wybecoder/", "author": "FAIR at Meta (with CERMICS/ENPC, University of Cambridge, UCL)", "date": "2026", "retrieved": "2026-04-16", "tags": "[code_generation, formal_verification, lean, dafny, verified_code, agentic, smt, proof_generation]", "filename": "wybecoder.md"}, {"source_type": "announcement", "title": "CivBench: Long-Horizon Multi-Agent Strategy Game Benchmark", "url": "https://clashai.live", "author": "ClashAI", "date": "2025-2026", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, multi-agent, strategy-game, long-horizon, game-playing, competitive]", "filename": "clashai_civbench.md"}, {"source_type": "substack", "title": "2025: The year in LLMs", "url": "https://simonwillison.net/2025/Dec/31/the-year-in-llms/", "author": "Simon Willison", "date": "2025-12-31", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, agents, coding-agents, landscape, year-in-review, evaluation]", "filename": "willison_year_in_llms_agents.md"}, {"source_type": "substack", "title": "The State Of LLMs 2025: Progress, Progress, and Predictions", "url": "https://magazine.sebastianraschka.com/p/state-of-llms-2025", "author": "Sebastian Raschka, PhD (Ahead of AI)", "date": "2025-12-30", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, landscape, LLMs, 
year-in-review, predictions, methodology]", "filename": "raschka_state_of_llms_2025.md"}, {"source_type": "arxiv", "title": "DECEPTICON: How Dark Patterns Manipulate Web Agents", "url": "https://arxiv.org/abs/2512.22894", "author": "Phil Cuvin et al.", "date": "2025-12-28", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, web-navigation, safety, security, dark-patterns, robustness, adversarial, GUI, manipulation]", "filename": "decepticon.md"}, {"source_type": "arxiv", "title": "A Benchmark for Evaluating Outcome-Driven Constraint Violations in Autonomous AI Agents", "url": "https://arxiv.org/abs/2512.20798", "author": "Miles Q. Li, Benjamin C. M. Fung, Martin Weiss, Pulei Xiong, Khalil Al-Hussaeni, Claude Fachkha", "date": "2025-12-23", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, safety, alignment, constraint-violations, ethics, misalignment]", "filename": "constraint-violations-benchmark.md"}, {"source_type": "substack", "title": "Why Benchmarking is Hard", "url": "https://epochai.substack.com/p/why-benchmarking-is-hard", "author": "Florian Brand, JS Denain (Epoch AI)", "date": "2025-12-23", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, methodology, scaffold, SWE-bench, reproducibility]", "filename": "epoch_ai_benchmarking_hard.md"}, {"source_type": "arxiv", "title": "MobileWorld: Benchmarking Autonomous Mobile Agents in Agent-User Interactive and MCP-Augmented Environments", "url": "https://arxiv.org/abs/2512.19432", "author": "Quyu Kong, Xu Zhang, Zhenyu Yang, Nolan Gao, Chen Liu, Panrong Tong, Chenglin Cai, Hanzhang Zhou, Jianan Zhang, Liangyu Chen, Zhidan Liu, Steven Hoi, Yue Wang", "date": "2025-12-22", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, mobile, gui, mcp, agent-user-interaction, multi-app]", "filename": "mobileworld.md"}, {"source_type": "substack", "title": "The 2025 AI Engineering Reading List", "url": "https://www.latent.space/p/2025-papers", "author": "Latent Space (swyx, Alessio Fanelli)", "date": "2025-12-20", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, reading-list, SWE-bench, agents, RAG, survey, papers]", "filename": "latent_space_2025_reading_list.md"}, {"source_type": "announcement", "title": "Bloom: Automated Behavioral Evaluations Tool", "url": "https://www.anthropic.com/research/bloom", "author": "Anthropic", "date": "2025-12-19", "retrieved": "2026-03-22", "tags": "[agentic, benchmark, safety, alignment, evaluation, behavioral-eval, open-source]", "filename": "summary_bloom_anthropic.md"}, {"source_type": "arxiv", "title": "Agent Tools Orchestration Leaks More: Dataset, Benchmark, and Mitigation", "url": "https://arxiv.org/abs/2512.16310", "author": "Yuxuan Qiao et al.", "date": "2025-12-18", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, safety, privacy, tool-use, multi-tool, data-leakage, orchestration, security]", "filename": "top-bench.md"}, {"source_type": "arxiv", "title": "VenusBench-GD: A Comprehensive Multi-Platform GUI Benchmark for Diverse Grounding Tasks", "url": "https://arxiv.org/abs/2512.16501", "author": "Beitong Zhou et al.", "date": "2025-12-18", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, GUI-agent, GUI-grounding, visual-grounding, multimodal, multi-platform, bilingual, leaderboard]", "filename": "venusbench-gd.md"}, {"source_type": "announcement", "title": "Berkeley Function Calling Leaderboard (BFCL) V4: From Tool Use to Agentic Evaluation", "url": "https://gorilla.cs.berkeley.edu/leaderboard.html", 
"author": "UC Berkeley (Shishir Patil, Huanzhi Mao, Charlie Cheng-Jie Ji, Fanjia Yan, Vishnu Suresh, Ion Stoica, Joseph E. Gonzalez)", "date": "2025-12-16", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, function-calling, tool-use, BFCL, leaderboard, berkeley]", "filename": "summary_bfcl.md"}, {"source_type": "substack", "title": "AI Evaluation Digest - November/December 2025", "url": "https://aievaluation.substack.com/p/2025-december-ai-evaluation-digest", "author": "AI Evaluation Substack", "date": "2025-12-15", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, digest, NIST, EvalEval, LoCoBench, methodology, standards]", "filename": "ai_evaluation_digest_2025.md"}, {"source_type": "arxiv", "title": "ReasonBENCH: Benchmarking the (In)Stability of LLM Reasoning", "url": "https://arxiv.org/abs/2512.07795", "author": "Nearchos Potamitis, Lars Klein, Akhil Arora", "date": "2025-12-08", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, reasoning, stability, reproducibility, evaluation-methodology]", "filename": "reasonbench.md"}, {"source_type": "announcement", "title": "WebArena Verified — Human-Audited Web Agent Benchmark", "url": "https://github.com/ServiceNow/BrowserGym", "author": "ServiceNow Research", "date": "2025-12-04", "retrieved": "2026-03-29", "tags": "[agentic, benchmark, evaluation, web-navigation, tool-use]", "filename": "webarena_verified.md"}, {"source_type": "arxiv", "title": "Blocksworld-MCP: Benchmark for Planning and Control with Large Language Model Agents", "url": "https://arxiv.org/abs/2512.03955", "author": "Niklas Jobs, Luis Miguel Vieira da Silva, Jayanth Somashekaraiah, Maximilian Weigand, David Kube, Felix Gehlhoff", "date": "2025-12-03", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, planning, control, MCP, blocksworld, tool-use, industrial-automation]", "filename": "blocksworld-mcp.md"}, {"source_type": "announcement", "title": "NeuroGrid CTF: The Ultimate AI Security Showdown", "url": "https://www.hackthebox.com/events/neurogrid", "author": "Hack The Box", "date": "2025-12-03", "retrieved": "2026-03-28", "tags": "[benchmark, agentic, cybersecurity, CTF, offensive-security, AI-vs-human, tool-use, reasoning, autonomous-agent]", "filename": "htb_neurogrid_ctf.md"}, {"source_type": "arxiv", "title": "ML-Tool-Bench: Tool-Augmented Planning for ML Tasks", "url": "https://arxiv.org/abs/2512.00672", "author": "Yaswanth Chittepu, Raghavendra Addanki, Tung Mai, Anup Rao, Branislav Kveton", "date": "2025-12-01", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, tool-use, planning, machine-learning, kaggle, tabular-ml, long-horizon, MCTS, tree-search]", "filename": "ml-tool-bench.md"}, {"source_type": "arxiv", "title": "TeleAI-Safety: A comprehensive LLM jailbreaking benchmark towards attacks, defenses, and evaluations", "url": "https://arxiv.org/abs/2512.05485", "author": "Xiuyuan Chen et al.", "date": "2025-12-01", "retrieved": "2026-04-03", "tags": "[benchmark, evaluation, safety, jailbreak, llm-safety, attack, defense, red-teaming, framework]", "filename": "teleai_safety.md"}, {"source_type": "announcement", "title": "AI agents find $4.6M in blockchain smart contract exploits", "url": "https://red.anthropic.com/2025/smart-contracts/", "author": "Winnie Xiao, Cole Killian, Henry Sleight, Alan Chan, Nicholas Carlini, Alwin Peng (Anthropic / MATS)", "date": "2025-12-01", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, smart-contracts, cybersecurity, exploitation, blockchain, DeFi, tool-use, long-horizon, 
zero-day]", "filename": "anthropic_sconebench.md"}, {"source_type": "substack", "title": "Evaluating AI agents: Real-world lessons from building agentic systems at Amazon", "url": "https://aws.amazon.com/blogs/machine-learning/evaluating-ai-agents-real-world-lessons-from-building-agentic-systems-at-amazon/", "author": "AWS/Amazon Machine Learning Blog", "date": "2025-12-01", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, enterprise, production, error-recovery, amazon, deployment, methodology]", "filename": "aws_evaluating_agents_real_world.md"}, {"source_type": "substack", "title": "8 Benchmarks Shaping the Next Generation of AI Agents", "url": "https://tessl.io/blog/8-benchmarks-shaping-the-next-generation-of-ai-agents/", "author": "Tessl", "date": "2025-12-01", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, coding, context, terminal, enterprise, next-generation, landscape]", "filename": "tessl_8_benchmarks_next_gen_agents.md"}, {"source_type": "arxiv", "title": "MCP-SafetyBench: A Benchmark for Safety Evaluation of Large Language Models with Real-World MCP Servers", "url": "https://arxiv.org/abs/2512.15163", "author": "Xuanjun Zong, Zhiqi Shen, Lei Wang, Yunshi Lan, Chao Yang", "date": "2025-12 (December 2025)", "retrieved": "2026-03-22", "tags": "[agentic, benchmark, safety, security, mcp, tool-use, tool-poisoning, multi-turn, adversarial]", "filename": "mcp_safetybench.md"}, {"source_type": "arxiv", "title": "Evaluating Long-Context Reasoning in LLM-Based WebAgents", "url": "https://arxiv.org/abs/2512.04307", "author": "Andy Chung, Yichi Zhang, Kaixiang Lin", "date": "2025-12", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, web, long-context, multi-session, irrelevant-trajectory-injection, evaluation]", "filename": "long_context_webagent.md"}, {"source_type": "arxiv", "title": "NIKA: A Network Arena for Benchmarking AI Agents on Network Troubleshooting", "url": "https://arxiv.org/abs/2512.16381", "author": "Zhihao Wang, Alessandro Cornacchia, Alessio Sacco, Franco Galante, Marco Canini, Dingde Jiang", "date": "2025-12", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, network-troubleshooting, tool-use, diagnosis, root-cause-analysis, infrastructure]", "filename": "nika.md"}, {"source_type": "arxiv", "title": "NL2Repo-Bench: Towards Long-Horizon Repository Generation Evaluation of Coding Agents", "url": "https://arxiv.org/abs/2512.12730", "author": "Jingzhe Ding, Shengda Long, Changxin Pu, Ge Zhang, Huan Zhou et al. 
(ByteDance Seed)", "date": "2025-12", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, code-generation, planning, reasoning, long-horizon]", "filename": "nl2repo_bench.md"}, {"source_type": "announcement", "title": "PropensityBench | SEAL by Scale AI", "url": "https://scale.com/leaderboard/propensitybench", "author": "Scale AI", "date": "2025-11-25", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, safety, propensity, alignment, tool-use, biosecurity, cybersecurity, chemical-security, self-proliferation, SEAL]", "filename": "scale_propensitybench.md"}, {"source_type": "announcement", "title": "Aider Polyglot Coding Leaderboard", "url": "https://aider.chat/docs/leaderboards/", "author": "Paul Gauthier (Aider-AI)", "date": "2025-11-20", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, coding, leaderboard, multi-language, code-editing]", "filename": "aider_polyglot.md"}, {"source_type": "announcement", "title": "Cline-Bench: Reproducible RL Environments for Autonomous Coding Agents", "url": "https://cline.bot/blog/cline-bench-initiative", "author": "Cline Bot Inc", "date": "2025-11-20", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, coding, reinforcement-learning, open-source, containerized, reproducible]", "filename": "summary_cline_bench.md"}, {"source_type": "substack", "title": "Demystifying Evals for AI Agents", "url": "https://www.anthropic.com/engineering", "author": "Anthropic Engineering", "date": "2025-11-15", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, methodology, practical-guide, deployment, coding-agents, research-agents]", "filename": "anthropic_agent_evals_guide.md"}, {"source_type": "arxiv", "title": "PRBench: Large-Scale Expert Rubrics for Evaluating High-Stakes Professional Reasoning", "url": "https://arxiv.org/abs/2511.11562", "author": "Afra Feyza Akyürek et al. (Scale AI)", "date": "2025-11-14", "retrieved": "2026-04-19", "tags": "[benchmark, reasoning, professional, finance, legal, expert-rubrics, scale-ai, evaluation]", "filename": "2511.11562-prbench.md"}, {"source_type": "arxiv", "title": "PRBench: Large-Scale Expert Rubrics for Evaluating High-Stakes Professional Reasoning", "url": "https://arxiv.org/abs/2511.11562", "author": "Afra Feyza Akyürek et al. (Scale AI)", "date": "2025-11-14", "retrieved": "2026-04-21", "tags": "[benchmark, evaluation, reasoning, professional, finance, legal, expert-rubrics, scale-ai, rubric-based, open-ended]", "filename": "prbench_professional_reasoning.md"}, {"source_type": "arxiv", "title": "ResearchRubrics: A Benchmark of Prompts and Rubrics For Evaluating Deep Research Agents", "url": "https://arxiv.org/abs/2511.07685", "author": "Manasi Sharma, Chen Bo Calvin Zhang, Chaithanya Bandi, Clinton Wang, Ankit Aich, Huy Nghiem, Tahseen Rabbani, Ye Htet, Brian Jang, Sumana Basu, Aishwarya Balwani, Denis Peskoff, Marcos Ayestaran, Sean M. 
Hendryx, Brad Kenstler, Bing Liu", "date": "2025-11-12", "retrieved": "2026-03-25", "tags": "[agentic, benchmark, deep-research, rubric-based, llm-as-judge, evaluation, multi-document-synthesis, scale-ai, open-ended, long-form]", "filename": "2511.07685-researchrubrics.md"}, {"source_type": "arxiv", "title": "ProBench: Benchmarking GUI Agents with Accurate Process Information", "url": "https://arxiv.org/abs/2511.09157", "author": "Leyang Yang, Ziwei Wang, Xiaoxuan Tang, Sheng Zhou, Dajun Chen, Wei Jiang, Yong Li", "date": "2025-11-12", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, gui, mobile, process-evaluation, android]", "filename": "probench.md"}, {"source_type": "substack", "title": "Building the Open Agent Ecosystem Together: Introducing OpenEnv", "url": "https://huggingface.co/blog/openenv", "author": "Hugging Face & Meta (PyTorch team)", "date": "2025-11-01", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, environments, tool-use, open-source, infrastructure, meta, huggingface]", "filename": "huggingface_openenv_agent_evaluation.md"}, {"source_type": "substack", "title": "Testing AI coding agents (2025): Cursor vs. Claude, OpenAI, and Gemini", "url": "https://render.com/blog/ai-coding-agents-benchmark", "author": "Render", "date": "2025-11-01", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, coding, cursor, claude-code, gemini-cli, codex, practical, comparison]", "filename": "render_coding_agents_benchmark.md"}, {"source_type": "substack", "title": "The Reliability Gap: Agent Benchmarks for Enterprise", "url": "https://simmering.dev/blog/agent-benchmarks/", "author": "Paul Simmering", "date": "2025-11-01", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, enterprise, reliability, adoption, GAIA, BFCL, SWE-bench, production]", "filename": "simmering_reliability_gap_enterprise.md"}, {"source_type": "announcement", "title": "PRBench: Professional Reasoning Benchmark", "url": "https://scale.com/research/prbench", "author": "Scale AI (Afra Feyza Akyurek, Advait Gosai, Chen Bo Calvin Zhang, Vipul Gupta, Jaehwan Jeong, Anisha Gunjal, Tahseen Rabbani, Maria Mazzone, David Randolph, et al.)", "date": "2025-11 (arxiv 2511.11562)", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, professional-reasoning, finance, legal, rubric-based, expert-authored, open-ended]", "filename": "scale_prbench.md"}, {"source_type": "arxiv", "title": "Beyond Accuracy: A Multi-Dimensional Framework for Evaluating Enterprise Agentic AI Systems", "url": "https://arxiv.org/abs/2511.14136", "author": "Sushant Mehta", "date": "2025-11", "retrieved": "2026-03-23", "tags": "[agentic, benchmark, evaluation, enterprise, reasoning, tool-use, multi-agent, cost-efficiency, reliability, security]", "filename": "beyond_accuracy_enterprise.md"}, {"source_type": "arxiv", "title": "CodeClash: Benchmarking Goal-Oriented Software Engineering", "url": "https://arxiv.org/abs/2511.00839", "author": "John Yang, Kilian Lieret, Joyce Yang, Carlos E. Jimenez, Ofir Press, Ludwig Schmidt, Diyi Yang", "date": "2025-11", "retrieved": "2026-03-29", "tags": "[benchmark, coding, agentic, competitive, goal-oriented, multi-agent, software-engineering, iterative, game-playing]", "filename": "codeclash.md"}, {"source_type": "arxiv", "title": "Evo-Memory: Benchmarking LLM Agent Test-time Learning with Self-Evolving Memory", "url": "https://arxiv.org/abs/2511.20857", "author": "Tianxin Wei, Noveen Sachdeva, Benjamin Coleman et al. 
(UIUC / Google DeepMind)", "date": "2025-11", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, memory, reasoning, tool-use, planning]", "filename": "evo_memory.md"}, {"source_type": "arxiv", "title": "GUI-360°: A Comprehensive Dataset and Benchmark for Computer-Using Agents", "url": "https://arxiv.org/abs/2511.04307", "author": "Jian Mu et al.", "date": "2025-11", "retrieved": "2026-04-15", "tags": "[agentic, benchmark, evaluation, gui, computer-use, windows, desktop, grounding, screen-parsing, action-prediction, multimodal, dataset]", "filename": "gui_360.md"}, {"source_type": "arxiv", "title": "LoCoBench-Agent: An Interactive Benchmark for LLM Agents in Long-Context Software Engineering", "url": "https://arxiv.org/abs/2511.13998", "author": "Jielin Qiu, Zuxin Liu, Zhiwei Liu, Rithesh Murthy, Jianguo Zhang, Haolin Chen, Shiyu Wang, Ming Zhu, Liangwei Yang, Juntao Tan, Roshan Ram, Akshara Prabhakar, Tulika Awalgaonkar, Zixiang Chen, Zhepeng Cen, Cheng Qian, Shelby Heinecke, Weiran Yao, Silvio Savarese, Caiming Xiong, Huan Wang", "date": "2025-11", "retrieved": "2026-03-27", "tags": "[agentic, benchmark, coding, software-engineering, long-context, multi-turn, tool-use, evaluation]", "filename": "locobench-agent.md"}, {"source_type": "arxiv", "title": "Multi-Agent Craftax: Benchmarking Open-Ended Multi-Agent Reinforcement Learning at the Hyperscale", "url": "https://arxiv.org/abs/2511.04904", "author": "Bassel Al Omari et al.", "date": "2025-11", "retrieved": "2026-05-01", "tags": "[benchmark, evaluation, multi-agent, reinforcement-learning, MARL, open-ended, JAX, cooperative, environment, long-horizon]", "filename": "multi_agent_craftax_openended_marl.md"}, {"source_type": "arxiv", "title": "TPS-Bench: Evaluating AI Agents' Tool Planning & Scheduling Abilities in Compounding Tasks", "url": "https://arxiv.org/abs/2511.01527", "author": "Hanwen Xu, Xuyao Huang, Yuzhe Liu, Kai Yu, Zhijie Deng", "date": "2025-11", "retrieved": "2026-03-29", "tags": "[agentic, benchmark, tool-use, tool-planning, scheduling, MCP, efficiency, multi-tool, compounding-tasks]", "filename": "tps-bench.md"}, {"source_type": "arxiv", "title": "UI-CUBE: Enterprise-Grade Computer Use Agent Benchmarking Beyond Task Accuracy to Operational Reliability", "url": "https://arxiv.org/abs/2511.17131", "author": "Cristescu et al. 
(UiPath)", "date": "2025-11", "retrieved": "2026-03-28", "tags": "[benchmark, agentic, evaluation, os-interaction, planning, memory, web-navigation]", "filename": "ui_cube.md"}, {"source_type": "arxiv", "title": "The Tool Decathlon: Benchmarking Language Agents for Diverse, Realistic, and Long-Horizon Task Execution", "url": "https://arxiv.org/abs/2510.25726", "author": "Junlong Li, Wenshuo Zhao, Jian Zhao, Weihao Zeng, Haoze Wu, Xiaochen Wang, Rui Ge, Yuxuan Cao, Yuzhen Huang, Wei Liu, Junteng Liu, Zhaochen Su, Yiyang Guo, Fan Zhou, Lueyang Zhang, Juan Michelini, Xingyao Wang, Xiang Yue, Shuyan Zhou, Graham Neubig, Junxian He", "date": "2025-10-29", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, tool-use, MCP, long-horizon, multi-app, ICLR-2026, HKUST]", "filename": "toolathlon.md"}, {"source_type": "announcement", "title": "OSWorld-MCP: Benchmarking MCP Tool Invocation in Computer-Use Agents", "url": "https://github.com/X-PLUG/OSWorld-MCP", "author": "Hongrui Jia, Jitong Liao, Xi Zhang, Haiyang Xu, Tianbao Xie, Chaoya Jiang, Ming Yan, Si Liu, Wei Ye, Fei Huang (Peking University, Tongyi Lab / Alibaba Group, Beijing Zhongguancun Academy)", "date": "2025-10-28", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, computer-use, MCP, tool-use, GUI, OS-interaction, multimodal, decision-making]", "filename": "osworld_mcp.md"}, {"source_type": "announcement", "title": "Introducing Spring AI Agents and Spring AI Bench", "url": "https://spring.io/blog/2025/10/28/agents-and-benchmarks/", "author": "Spring AI Community (VMware Tanzu / Broadcom)", "date": "2025-10-28", "retrieved": "2026-03-28", "tags": "[benchmark, agentic, coding, enterprise, Java, Spring, developer-productivity, tool-use, PR-review, issue-triage]", "filename": "spring_ai_bench.md"}, {"source_type": "substack", "title": "Introducing Developer Productivity AI Arena: An Open Platform for AI Coding Agents Benchmarks", "url": "https://blog.jetbrains.com/blog/2025/10/28/introducing-developer-productivity-ai-arena-an-open-platform-for-ai-coding-agents-benchmarks/", "author": "JetBrains", "date": "2025-10-28", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, coding, developer-productivity, open-platform, multi-language, multi-workflow]", "filename": "jetbrains_dpai_arena.md"}, {"source_type": "arxiv", "title": "CUARewardBench: A Benchmark for Evaluating Reward Models on Computer-Using Agents", "url": "https://arxiv.org/abs/2510.18596", "author": "Haojia Lin, Xiaoyu Tan, Yulei Qin, Zihan Xu, Yuchen Shi, Zongyi Li, Gang Li, Shaofei Cai, Siqi Cai, Chaoyou Fu, Ke Li, Xing Sun", "date": "2025-10-21", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, reward-model, computer-use, GUI, vision-language, evaluation]", "filename": "cuarewardbench.md"}, {"source_type": "arxiv", "title": "MoReBench: Evaluating Procedural and Pluralistic Moral Reasoning in Language Models, More than Outcomes", "url": "https://arxiv.org/abs/2510.16380", "author": "Yu Ying Chiu et al.", "date": "2025-10-21", "retrieved": "2026-04-21", "tags": "[benchmark, evaluation, reasoning, safety, moral-reasoning, alignment, pluralistic, rubric-based]", "filename": "morebench.md"}, {"source_type": "arxiv", "title": "SusBench: An Online Benchmark for Evaluating Dark Pattern Susceptibility of Computer-Use Agents", "url": "https://arxiv.org/abs/2510.11035", "author": "Longjie Guo et al.", "date": "2025-10-15", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, dark-patterns, safety, computer-use, GUI, web-navigation, 
robustness, adversarial, human-study, IUI]", "filename": "susbench.md"}, {"source_type": "substack", "title": "Artificial Analysis: Independent LLM Evals as a Service", "url": "https://www.latent.space/p/artificialanalysis", "author": "Latent Space (swyx, Alessio Fanelli) with George Cameron and Micah Hill-Smith", "date": "2025-10-15", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, independent-evaluation, SWE-bench, tool-use, leaderboard, methodology]", "filename": "latent_space_artificial_analysis.md"}, {"source_type": "arxiv", "title": "Holistic Agent Leaderboard: The Missing Infrastructure for AI Agent Evaluation", "url": "https://arxiv.org/abs/2510.11977", "author": "Sayash Kapoor, Benedikt Stroebl, Peter Kirgis, Nitya Nadgir, Zachary S Siegel, Boyi Wei, Tianci Xue, Ziru Chen, Felix Chen, Saiteja Utpala, Franck Ndzomga, Dheeraj Oruganty, Sophie Luskin, Kangheng Liu, Botao Yu, Amit Arora, Dongyoon Hahm, Harsh Trivedi, Huan Sun, Juyong Lee, Tengjun Jin, Yifan Mai, Yifei Zhou, Yuxuan Zhu, Rishi Bommasani, Daniel Kang, Dawn Song, Peter Henderson, Yu Su, Percy Liang, Arvind Narayanan", "date": "2025-10-13", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, meta-evaluation, infrastructure, leaderboard, reproducibility, Princeton, ICLR-2026]", "filename": "hal-holistic-agent-leaderboard.md"}, {"source_type": "arxiv", "title": "PaperArena: An Evaluation Benchmark for Tool-Augmented Agentic Reasoning on Scientific Literature", "url": "https://arxiv.org/abs/2510.10909", "author": "Daoyu Wang, Mingyue Cheng, Shuo Yu, Zirui Liu, Ze Guo, Xin Li, Qi Liu", "date": "2025-10-13", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, scientific-reasoning, tool-use, multi-paper, research]", "filename": "paperarena.md"}, {"source_type": "arxiv", "title": "WARC-Bench: Web Archive Based Benchmark for GUI Subtask Executions", "url": "https://arxiv.org/abs/2510.09872", "author": "Sanjari Srivastava et al.", "date": "2025-10-10", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, web-navigation, GUI, computer-use, subtask, web-archive, RLVR, fine-tuning, multimodal]", "filename": "warc-bench.md"}, {"source_type": "arxiv", "title": "FURINA: A Fully Customizable Role-Playing Benchmark via Scalable Multi-Agent Collaboration Pipeline", "url": "https://arxiv.org/abs/2510.06800", "author": "Haotian Wu et al.", "date": "2025-10-09", "retrieved": "2026-03-31", "tags": "[benchmark, evaluation, role-playing, multi-agent, llm-judge, dialogue, hallucination, bilingual, character-evaluation]", "filename": "furina.md"}, {"source_type": "arxiv", "title": "BrowserArena: Evaluating LLM Agents on Real-World Web Navigation Tasks", "url": "https://arxiv.org/abs/2510.02418", "author": "Sagnik Anupam, Davis Brown, Shuo Li, Eric Wong, Hamed Hassani, Osbert Bastani", "date": "2025-10-02", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, web-navigation, live-evaluation, browser, arena]", "filename": "browserarena-live.md"}, {"source_type": "announcement", "title": "GDPval: Measuring AI on Real-World Economically Valuable Tasks", "url": "https://openai.com/index/gdpval/", "author": "OpenAI", "date": "2025-10-01", "retrieved": "2026-05-03", "tags": "[agentic, benchmark, evaluation, reasoning, enterprise, economic-value, knowledge-work, occupations, long-horizon]", "filename": "gdpval.md"}, {"source_type": "announcement", "title": "Introducing Recovery-Bench: Evaluating LLMs' Ability to Recover from Mistakes", "url": "https://www.letta.com/blog/recovery-bench", "author": "Letta", 
"date": "2025-10-01", "retrieved": "2026-03-28", "tags": "[benchmark, agentic, error-recovery, context-pollution, terminal-use, resilience, continual-learning]", "filename": "recovery_bench.md"}, {"source_type": "announcement", "title": "Remote Labor Index: Measuring AI Automation of Remote Work", "url": "https://www.remotelabor.ai/", "author": "Scale AI and Center for AI Safety (CAIS)", "date": "2025-10 (arxiv 2510.26787)", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, remote-work, freelance, economic-value, automation, multi-sector, Upwork]", "filename": "remote_labor_index.md"}, {"source_type": "announcement", "title": "MEMTRACK: Evaluating Long-Term Memory and State Tracking in Multi-Platform Dynamic Agent Environments", "url": "https://www.patronus.ai/blog/memtrack", "author": "Patronus AI", "date": "2025-10 (arxiv 2510.01353)", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, memory, state-tracking, multi-platform, agent-evaluation, long-horizon]", "filename": "patronus_memtrack.md"}, {"source_type": "arxiv", "title": "FaithCoT-Bench: Benchmarking Instance-Level Faithfulness of Chain-of-Thought Reasoning", "url": "https://arxiv.org/abs/2510.04040", "author": "(pending full author list)", "date": "2025-10", "retrieved": "2026-04-19", "tags": "[benchmark, reasoning, chain-of-thought, faithfulness, evaluation, meta-evaluation, interpretability]", "filename": "2510.04040-faithcot-bench.md"}, {"source_type": "arxiv", "title": "DEBATE: A Large-Scale Benchmark for Evaluating Opinion Dynamics in Role-Playing LLM Agents", "url": "https://arxiv.org/abs/2510.25110", "author": "Yun-Shiuan Chuang et al.", "date": "2025-10", "retrieved": "2026-05-01", "tags": "[benchmark, evaluation, multi-agent, role-playing, opinion-dynamics, social-simulation, LLM-agents, debate, conversation]", "filename": "debate_opinion_dynamics_roleplaying_llm_agents.md"}, {"source_type": "arxiv", "title": "Dr. Bench: A Multidimensional Evaluation for Deep Research Agents, from Answers to Reports", "url": "https://arxiv.org/abs/2510.02190", "author": "Yang Yao et al.", "date": "2025-10", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, deep-research, report-generation, retrieval, reasoning, long-form-generation]", "filename": "dr_bench.md"}, {"source_type": "arxiv", "title": "Can LLMs Help You at Work? 
A Sandbox for Evaluating LLM Agents in Enterprise Environments", "url": "https://arxiv.org/abs/2510.27287", "author": "Harsh Vishwakarma et al.", "date": "2025-10", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, enterprise, tool-use, planning, function-calling, multi-agent]", "filename": "enterprisebench.md"}, {"source_type": "arxiv", "title": "GDPval: Evaluating AI Model Performance on Real-World Economically Valuable Tasks", "url": "https://arxiv.org/abs/2510.04374", "author": "Tejal Patwardhan et al.", "date": "2025-10", "retrieved": "2026-04-01", "tags": "[benchmark, evaluation, leaderboard, dataset, reasoning, multi-agent]", "filename": "gdpval_economic_tasks.md"}, {"source_type": "arxiv", "title": "Balancing Specialization and Centralization: A Multi-Agent Reinforcement Learning Benchmark for Sequential Industrial Control", "url": "https://arxiv.org/abs/2510.20408", "author": "Tom Maus et al.", "date": "2025-10", "retrieved": "2026-05-01", "tags": "[benchmark, evaluation, multi-agent, reinforcement-learning, industrial-control, action-masking, MARL, sequential-decision-making, real-world-rl]", "filename": "industrial_control_marl_benchmark.md"}, {"source_type": "arxiv", "title": "Towards a Standard, Enterprise-Relevant Agentic AI Benchmark: Lessons from 5.5 billion tokens' worth of agentic AI evaluations", "url": "https://arxiv.org/abs/2511.08042", "author": "JV Roig (Kamiwaza AI)", "date": "2025-10", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, enterprise, tool-use, evaluation, contamination-resistance, reproducibility]", "filename": "kami.md"}, {"source_type": "arxiv", "title": "MCP Security Bench (MSB): Benchmarking Attacks Against Model Context Protocol in LLM Agents", "url": "https://arxiv.org/abs/2510.15994", "author": "Dongsen Zhang, Zekun Li, Xu Luo, Xuannan Liu, Peipei Li, Wenjun Xu", "date": "2025-10", "retrieved": "2026-03-22", "tags": "[agentic, benchmark, safety, security, mcp, tool-use]", "filename": "mcp_security_bench_msb.md"}, {"source_type": "arxiv", "title": "Benchmarking is Broken - Don't Let AI be its Own Judge", "url": "https://arxiv.org/abs/2510.07575", "author": "(Multiple authors, affiliation details in paper)", "date": "2025-10", "retrieved": "2026-03-29", "tags": "[benchmark, meta-evaluation, evaluation-methodology, contamination, data-quality, governance, position-paper, peerbench]", "filename": "peerbench.md"}, {"source_type": "arxiv", "title": "Automatically Benchmarking LLM Code Agents through Agent-driven Annotation and Evaluation", "url": "https://arxiv.org/abs/2510.24358", "author": "Lingyue Fu, Bolun Zhang, Hao Guan, Yaoming Zhu, Lin Qiu, Weiwen Liu, Xuezhi Cao, Xunliang Cai, Weinan Zhang, Yong Yu", "date": "2025-10", "retrieved": "2026-03-27", "tags": "[benchmark, code-agents, project-level, agent-as-judge, fine-tuned-judge, PRD, Python, AAMAS-2026, SJTU, Meituan]", "filename": "prdbench.md"}, {"source_type": "arxiv", "title": "SecureWebArena: A Holistic Security Evaluation Benchmark for LVLM-based Web Agents", "url": "https://arxiv.org/abs/2510.10073", "author": "Zonghao Ying et al.", "date": "2025-10", "retrieved": "2026-04-15", "tags": "[agentic, benchmark, evaluation, web-navigation, security, adversarial, prompt-injection, LVLM, attack-vectors, multi-layer-evaluation]", "filename": "securewebarena.md"}, {"source_type": "arxiv", "title": "TRAJECT-Bench: A Trajectory-Aware Benchmark for Evaluating Agentic Tool Use", "url": "https://arxiv.org/abs/2510.04550", "author": "Pengfei He et al.", "date": "2025-10", 
"retrieved": "2026-04-03", "tags": "[agentic, benchmark, tool-use, function-calling, evaluation, planning, reasoning]", "filename": "traject-bench.md"}, {"source_type": "announcement", "title": "Introducing Aardvark: OpenAI's agentic security researcher", "url": "https://openai.com/index/introducing-aardvark/", "author": "OpenAI", "date": "2025-10", "retrieved": "2026-04-21", "tags": "[openai, security-researcher, agentic, vulnerability-discovery, gpt-5, cve, product-release]", "filename": "aardvark.md"}, {"source_type": "announcement", "title": "Introducing Aardvark: OpenAI's agentic security researcher", "url": "https://openai.com/index/introducing-aardvark/", "author": "OpenAI (no individual author attributed; Ian Brelinsky cited as Codex Security team member)", "date": "2025-10", "retrieved": "2026-04-23", "tags": "[agentic, security, cybersecurity, vulnerability-detection, code-analysis, patching, GPT-5, tool-use, sandboxed-execution, agentic-system]", "filename": "openai_aardvark.md"}, {"source_type": "arxiv", "title": "UltraHorizon: Benchmarking Agent Capabilities in Ultra Long-Horizon Scenarios", "url": "https://arxiv.org/abs/2509.21766", "author": "Haotian Luo, Huaisong Zhang, Xuelin Zhang, Haoyu Wang, Zeyu Qin, Wenjie Lu, Guozheng Ma, Haiying He, Yingsha Xie, Qiyang Zhou, Zixuan Hu, Hongze Mi, Yibo Wang, Naiqiang Tan, Hong Chen, Yi R. Fung, Chun Yuan, Li Shen", "date": "2025-09-26", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, long-horizon, planning, memory, tool-use, reasoning]", "filename": "ultrahorizon.md"}, {"source_type": "arxiv", "title": "D-REX: A Benchmark for Detecting Deceptive Reasoning in Large Language Models", "url": "https://arxiv.org/abs/2509.17938", "author": "Satyapriya Krishna, Andy Zou, Rahul Gupta, Eliot Krzysztof Jones, Nick Winter, Dan Hendrycks, J. 
Zico Kolter, Matt Fredrikson, Spyros Matsoukas", "date": "2025-09-22", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, safety, alignment, deceptive-reasoning, red-teaming, chain-of-thought, prompt-injection]", "filename": "drex_deceptive_reasoning.md"}, {"source_type": "announcement", "title": "Gaia2 and ARE: Empowering the community to study agents", "url": "https://huggingface.co/blog/gaia2", "author": "Meta FAIR / Hugging Face (Romain Froger, Pierre Andrews, Matteo Bettini, Amar Budhiraja, Ricardo Silveira Cabral, Virginie Do, Emilien Garreau, Jean-Baptiste Gaya, Hugo Laurençon, Maxime Lecanu, Kunal Malkan, Dheeraj Mekala, Pierre Ménard, Gerard Moreno-Torres Bertran, Ulyana Piterbarg, Mikhail Plekhanov, Mathieu Rita, Andrey Rusakov, Vladislav Vorotilov, Mengjue Wang, Ian Yu, Amine Benhalloum, Grégoire Mialon, Thomas Scialom; Clémentine Fourrier — Hugging Face)", "date": "2025-09-22", "retrieved": "2026-05-03", "tags": "[agentic, benchmark, evaluation, planning, reasoning, multi-agent, tool-use, dynamic-environments, temporal-reasoning, ambiguity, noise-robustness]", "filename": "gaia2.md"}, {"source_type": "arxiv", "title": "SWE-Bench Pro: Can AI Agents Solve Long-Horizon Software Engineering Tasks?", "url": "https://arxiv.org/abs/2509.16941", "author": "Xiang Deng, Jeff Da et al.", "date": "2025-09-21", "retrieved": "2026-04-25", "tags": "[agentic, benchmark, code-generation, software-engineering, evaluation, long-horizon, contamination-resistance, enterprise, multi-file, agent]", "filename": "2509.16941-swe-bench-pro.md"}, {"source_type": "arxiv", "title": "A.S.E: A Repository-Level Benchmark for Evaluating Security in AI-Generated Code", "url": "https://arxiv.org/abs/2508.18106", "author": "Keke Lian, Bing Wang, Lei Zhang, Libo Chen, Junjie Wang, Ziming Zhao, Yujiu Yang, Miaoqian Lin, Haotong Duan, Haoran Zhao, Shuang Liao, Mingda Guo, Jiazheng Quan, Yilu Zhong, Chenhao He, Zichuan Chen, Jie Wu, Haoling Li, Zhaoxuan Li, Jiongchi Yu, Hui Li, Dong Zhang (Tencent, Peking University, Fudan University, Shanghai Jiao Tong University, Tsinghua University, Zhejiang University, Chinese Academy of Sciences, Singapore Management University)", "date": "2025-09-18", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, security, code-generation, repository-level, vulnerability, CWE, SAST, web-security, AI-coding-assistants]", "filename": "ase-security.md"}, {"source_type": "twitter", "title": "MCPMark — Stress-Testing LLM Agents on Real MCP Tool Use", "url": "https://x.com/rohanpaul_ai/status/1974353540077511156", "author": "@rohanpaul_ai", "date": "2025-09-16", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, MCP, tool-use, function-calling, Notion, GitHub, PostgreSQL]", "filename": "thread_mcpmark_benchmark.md"}, {"source_type": "arxiv", "title": "MCP-AgentBench: Evaluating Real-World Language Agent Performance with MCP-Mediated Tools", "url": "https://arxiv.org/abs/2509.09734", "author": "Zikang Guo, Benfeng Xu, Chiwei Zhu, Wentao Hong, Xiaorui Wang, Zhendong Mao (University of Science and Technology of China; Metastone Technology, Beijing)", "date": "2025-09-10", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, MCP, tool-use, evaluation, model-context-protocol, agent-tool-interaction, ReAct, tool-calling]", "filename": "mcp-agentbench-v2.md"}, {"source_type": "arxiv", "title": "SafeToolBench: Pioneering a Prospective Benchmark to Evaluating Tool Utilization Safety in LLMs", "url": "https://arxiv.org/abs/2509.07315", "author": "Hongfei Xia et al.", "date": 
"2025-09-09", "retrieved": "2026-04-27", "tags": "[agentic, benchmark, tool-use, safety, evaluation, function-calling]", "filename": "safe_tool_bench.md"}, {"source_type": "arxiv", "title": "MAS-Bench: A Unified Benchmark for Shortcut-Augmented Hybrid Mobile GUI Agents", "url": "https://arxiv.org/abs/2509.06477", "author": "Pengxiang Zhao, Guangyi Liu, Yaozhen Liang, Weiqing He, Zhengxi Lu, Yuehao Huang, Yaxuan Guo, Kexin Zhang, Hao Wang, Liang Liu, Yong Liu", "date": "2025-09-08", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, mobile, gui, shortcuts, hybrid-agents, api, deep-links, android, efficiency]", "filename": "mas-bench.md"}, {"source_type": "twitter", "title": "GDPval — Measuring AI on Real-World Economically Valuable Tasks", "url": "https://x.com/OpenAI/status/1971249374077518226", "author": "@OpenAI", "date": "2025-09-04", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, economic-value, white-collar, knowledge-work, OpenAI, occupations]", "filename": "thread_gdpval_openai.md"}, {"source_type": "substack", "title": "The AI Agent Evaluation Crisis and How to Fix It", "url": "https://labs.adaline.ai/p/the-ai-agent-evaluation-", "author": "Nilesh Barla (Adaline Labs)", "date": "2025-09-03", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, crisis, methodology, planning, tool-use, memory, safety, alignment]", "filename": "adaline_agent_evaluation_crisis.md"}, {"source_type": "arxiv", "title": "BeyondBench: Contamination-Resistant Evaluation of Reasoning in Language Models", "url": "https://arxiv.org/abs/2509.24210", "author": "Unknown et al.", "date": "2025-09", "retrieved": "2026-04-29", "tags": "[benchmark, evaluation, reasoning, contamination, algorithmic-reasoning, leaderboard]", "filename": "2509.24210-beyondbench.md"}, {"source_type": "arxiv", "title": "AgentArch: A Benchmark for Evaluating Agent Architectures in Enterprise Workflows", "url": "https://arxiv.org/abs/2509.10769", "author": "Tara Bogavelli, Hari Subramani, Roshnee Sharma", "date": "2025-09", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, evaluation, tool-use, multi-agent, planning, reasoning, function-calling, orchestration, memory, enterprise, ServiceNow]", "filename": "agentarch.md"}, {"source_type": "arxiv", "title": "FDABench: A Benchmark for Data Agents on Analytical Queries over Heterogeneous Data", "url": "https://arxiv.org/abs/2509.02473", "author": "Ziting Wang et al.", "date": "2025-09", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, tool-use, reasoning, planning, multi-agent, dataset]", "filename": "fdabench.md"}, {"source_type": "arxiv", "title": "Towards Reliable Benchmarking: A Contamination Free, Controllable Evaluation Framework for Multi-step LLM Function Calling", "url": "https://arxiv.org/abs/2509.26553", "author": "Seiji Maekawa, Jackson Hassell, Pouya Pezeshkpour, Tom Mitchell, Estevam Hruschka", "date": "2025-09", "retrieved": "2026-03-29", "tags": "[benchmark, function-calling, tool-use, contamination-free, synthetic, multi-step, agentic, controllable-evaluation, ICLR2026]", "filename": "funcbenchgen.md"}, {"source_type": "arxiv", "title": "Instruction-Following Evaluation in Function Calling for Large Language Models", "url": "https://arxiv.org/abs/2509.18420", "author": "Nikolai Skripko", "date": "2025-09", "retrieved": "2026-03-29", "tags": "[benchmark, function-calling, instruction-following, tool-use, format-constraints, agentic, LLM-evaluation]", "filename": "ifeval-fc.md"}, {"source_type": "arxiv", "title": 
"MASLegalBench: Benchmarking Multi-Agent Systems in Deductive Legal Reasoning", "url": "https://arxiv.org/abs/2509.24922", "author": "Huihao Jing et al.", "date": "2025-09", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, evaluation, multi-agent, legal, reasoning, GDPR, RAG, deductive-reasoning, knowledge-base, tool-use, QA]", "filename": "maslegalbench.md"}, {"source_type": "arxiv", "title": "SecureAgentBench: Benchmarking Secure Code Generation under Realistic Vulnerability Scenarios", "url": "https://arxiv.org/abs/2509.22097", "author": "Junkai Chen et al.", "date": "2025-09", "retrieved": "2026-03-23", "tags": "[agentic, benchmark, evaluation, safety, code-generation, security, software-engineering, repository-level]", "filename": "secure_agent_bench.md"}, {"source_type": "arxiv", "title": "WideSearch: Benchmarking Agentic Broad Info-Seeking", "url": "https://arxiv.org/abs/2508.07999", "author": "Ryan Wong, Jiawei Wang, Junjie Zhao, Li Chen, Yan Gao, Long Zhang, Xuan Zhou, Zuo Wang, Kai Xiang, Ge Zhang, Wenhao Huang, Yang Wang, Ke Wang (ByteDance Seed)", "date": "2025-08-28", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, information-retrieval, web-search, multi-agent, broad-search, bilingual, structured-output]", "filename": "widesearch.md"}, {"source_type": "arxiv", "title": "MCPVerse: An Expansive, Real-World Benchmark for Agentic Tool Use", "url": "https://arxiv.org/abs/2508.16260", "author": "Fei Lei, Yibo Yang, Wenxiu Sun, Dahua Lin", "date": "2025-08-21", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, tool-use, MCP, function-calling, real-world, large-scale]", "filename": "mcpverse.md"}, {"source_type": "announcement", "title": "ARC-AGI-3: 30-Day Preview Learnings", "url": "https://arcprize.org/blog/arc-agi-3-preview-30-day-learnings", "author": "Greg Kamradt, ARC Prize Foundation", "date": "2025-08-19", "retrieved": "2026-03-27", "tags": "[benchmark, evaluation, agentic, reasoning, planning, memory]", "filename": "summary_arc_agi_3.md"}, {"source_type": "arxiv", "title": "HeroBench: A Benchmark for Long-Horizon Planning and Structured Reasoning in Virtual Worlds", "url": "https://arxiv.org/abs/2508.12782", "author": "Petr Anokhin, Roman Khalikov, Stefan Rebrikov, Viktor Volkov, Artyom Sorokin, Vincent Bissonnette", "date": "2025-08-18", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, long-horizon-planning, structured-reasoning, RPG, virtual-worlds, crafting]", "filename": "herobench.md"}, {"source_type": "arxiv", "title": "FutureX: An Advanced Live Benchmark for LLM Agents in Future Prediction", "url": "https://arxiv.org/abs/2508.11987", "author": "Zhiyuan Zeng, Jiashuo Liu, Siyuan Chen, Tianci He, Yali Liao, Yixiao Tian, Jinpeng Wang, Zaiyuan Wang, Yang Yang, Lingyue Yin, Mingren Yin, Zhenwei Zhu, Tianle Cai, Zehui Chen, Jiecao Chen, Yantao Du, Xiang Gao, Jiacheng Guo, Liang Hu, Jianpeng Jiao, Xiangsheng Li, Jingkai Liu, Shuang Ni, Zhoufutu Wen, Ge Zhang, Kaiyuan Zhang, Xin Zhou, Jose Blanchet, Xipeng Qiu, Mengdi Wang, Wenhao Huang", "date": "2025-08-16", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, prediction, live-evaluation, contamination-free, reasoning, search]", "filename": "futurex.md"}, {"source_type": "arxiv", "title": "MM-BrowseComp: A Comprehensive Benchmark for Multimodal Browsing Agents", "url": "https://arxiv.org/abs/2508.13186", "author": "Shilong Li, Xingyuan Bu, Wenjie Wang, Jiaheng Liu, Jun Dong, and 19 additional collaborators", "date": "2025-08-14", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, 
multimodal, web-browsing, retrieval, reasoning, vision]", "filename": "mm-browsecomp.md"}, {"source_type": "twitter", "title": "MedAgentBench — Evaluating LLM Agent Capabilities in Clinical EHR Environments", "url": "https://x.com/NEJM_AI/status/1958537091685716479", "author": "@NEJM_AI", "date": "2025-08-14", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, medical, clinical, EHR, healthcare, domain-specific]", "filename": "thread_medagentbench_clinical_nejm.md"}, {"source_type": "announcement", "title": "OpenCUA: Open Foundations for Computer-Use Agents & AgentNetBench", "url": "https://opencua.xlang.ai/", "author": "XLANG Lab (HKU) / Moonshot AI / Stanford / Waterloo / CMU", "date": "2025-08-13", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, computer-use, gui-agent, multimodal, offline-eval, dataset, open-source]", "filename": "summary_opencua_agentnetbench.md"}, {"source_type": "arxiv", "title": "FineState-Bench: A Comprehensive Benchmark for Fine-Grained State Control in GUI Agents", "url": "https://arxiv.org/abs/2508.09241", "author": "Fengxian Ji, Jingpu Yang, Zirui Song, Yuanxi Wang, Zhexuan Cui, Yuke Li, Qian Jiang, Miao Fang, Xiuying Chen", "date": "2025-08-12", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, gui, fine-grained-control, multi-platform, visual-grounding, desktop, web, mobile]", "filename": "finestate-bench.md"}, {"source_type": "arxiv", "title": "MCPToolBench++: A Large Scale AI Agent Model Context Protocol MCP Tool Use Benchmark", "url": "https://arxiv.org/abs/2508.07575", "author": "Shiqing Fan, Xichen Ding, Liang Zhang, Linjian Mo (Ant Group)", "date": "2025-08-11", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, tool-use, MCP, function-calling, multi-step, multi-domain, multilingual]", "filename": "mcptoolbench-plus.md"}, {"source_type": "arxiv", "title": "DatasetResearch: Benchmarking Agent Systems for Demand-Driven Dataset Discovery", "url": "https://arxiv.org/abs/2508.06960", "author": "Keyu Li, Mohan Jiang, Dayuan Fu, Yunze Wu, Xiangkun Hu, Dequan Wang, Pengfei Liu (Shanghai Jiao Tong University, SII, GAIR)", "date": "2025-08-09", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, dataset-discovery, dataset-synthesis, NLP, deep-research, search-agents]", "filename": "datasetresearch.md"}, {"source_type": "arxiv", "title": "OmniEAR: Benchmarking Agent Reasoning in Embodied Tasks", "url": "https://arxiv.org/abs/2508.05614", "author": "Zixuan Wang, Dingming Li, Hongxing Li, Shuo Chen, Yuchen Yan, Wenqi Zhang, Yongliang Shen, Weiming Lu, Jun Xiao, Yueting Zhuang", "date": "2025-08-07", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, embodied, reasoning, tool-use, multi-agent, collaboration, physical-reasoning]", "filename": "omniear.md"}, {"source_type": "arxiv", "title": "NaturalGAIA: Pushing the Frontiers of GUI Agents with a Challenging Benchmark and High-Quality Trajectory Dataset", "url": "https://arxiv.org/abs/2508.01330", "author": "Anonymous et al.", "date": "2025-08-02", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, GUI-agent, trajectory-dataset, long-horizon, multi-platform, verifiable-eval, OS-interaction]", "filename": "naturalgaia.md"}, {"source_type": "arxiv", "title": "WebDS: An End-to-End Benchmark for Web-based Data Science", "url": "https://arxiv.org/abs/2508.01222", "author": "(first author unknown — Stanford/UC Berkeley team)", "date": "2025-08-02", "retrieved": "2026-04-15", "tags": "[agentic, benchmark, evaluation, data-science, web-agent, tool-use, code-generation, 
end-to-end, multi-step, multi-modal]", "filename": "webds.md"}, {"source_type": "substack", "title": "The Future of AI Agent Evaluation", "url": "https://research.ibm.com/blog/AI-agent-benchmarks", "author": "IBM Research (with Hebrew University and Yale collaborators)", "date": "2025-08-01", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, survey, planning, tool-calling, memory, reflection, methodology]", "filename": "ibm_research_future_agent_evaluation.md"}, {"source_type": "arxiv", "title": "A Functionality-Grounded Benchmark for Evaluating Web Agents in E-commerce Domains", "url": "https://arxiv.org/abs/2508.15832", "author": "Xianren Zhang et al.", "date": "2025-08", "retrieved": "2026-04-15", "tags": "[agentic, benchmark, evaluation, web-navigation, e-commerce, amazon, safety, functionality, web-agent, account-management]", "filename": "amazon_bench_ecommerce.md"}, {"source_type": "arxiv", "title": "Breaking Agent Backbones: Evaluating the Security of Backbone LLMs in AI Agents", "url": "https://arxiv.org/abs/2510.22620", "author": "Bazinska et al.", "date": "2025-08", "retrieved": "2026-03-23", "tags": "[agentic, benchmark, evaluation, safety, security, adversarial, red-teaming, prompt-injection]", "filename": "breaking_agent_backbones_b3.md"}, {"source_type": "arxiv", "title": "GitTaskBench: A Benchmark for Code Agents Solving Real-World Tasks Through Code Repository Leveraging", "url": "https://arxiv.org/abs/2508.18993", "author": "Ziyi Ni, Huacan Wang, Shuo Zhang et al. (UCAS, CASIA, StepFun, HKUST, PKU, NUS)", "date": "2025-08", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, code-generation, tool-use, reasoning, planning]", "filename": "gittaskbench.md"}, {"source_type": "arxiv", "title": "MCP-Bench: Benchmarking Tool-Using LLM Agents with Complex Real-World Tasks via MCP Servers", "url": "https://arxiv.org/abs/2508.20453", "author": "Zhenting Wang et al.", "date": "2025-08", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, tool-use, function-calling, planning, reasoning, mcp]", "filename": "mcp_bench_accenture.md"}, {"source_type": "arxiv", "title": "MCPSecBench: A Systematic Security Benchmark and Playground for Testing Model Context Protocols", "url": "https://arxiv.org/abs/2508.13220", "author": "Yixuan Yang, Cuifeng Gao, Daoyuan Wu, Yufan Chen, Yingjiu Li, Shuai Wang", "date": "2025-08", "retrieved": "2026-03-22", "tags": "[agentic, benchmark, safety, security, mcp, tool-use]", "filename": "mcpsecbench.md"}, {"source_type": "arxiv", "title": "OdysseyBench: Evaluating LLM Agents on Long-Horizon Complex Office Application Workflows", "url": "https://arxiv.org/abs/2508.09124", "author": "Weixuan Wang, Dongge Han, Daniel Madrigal Diaz", "date": "2025-08", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, enterprise, long-horizon, office-applications, word, excel, pdf, email, calendar]", "filename": "odysseybench.md"}, {"source_type": "arxiv", "title": "ReportBench: Evaluating Deep Research Agents via Academic Survey Tasks", "url": "https://arxiv.org/abs/2508.15804", "author": "Minghao Li et al.", "date": "2025-08", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, deep-research, report-generation, citation, fact-checking, survey, academic]", "filename": "reportbench.md"}, {"source_type": "arxiv", "title": "ShoppingBench: A Real-World Intent-Grounded Shopping Benchmark for LLM-based Agents", "url": "https://arxiv.org/abs/2508.04266", "author": "Jiangyuan Wang et al.", "date": "2025-08", 
"retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, tool-use, reasoning, planning]", "filename": "shoppingbench.md"}, {"source_type": "arxiv", "title": "WebMall - A Multi-Shop Benchmark for Evaluating Web Agents", "url": "https://arxiv.org/abs/2508.13024", "author": "Peeters et al.", "date": "2025-08", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, web-navigation, reasoning, planning, memory]", "filename": "webmall.md"}, {"source_type": "arxiv", "title": "Evaluation and Benchmarking of LLM Agents: A Survey", "url": "https://arxiv.org/abs/2507.21504", "author": "Mahmoud Mohammadi, Yipeng Li, Jane Lo, Wendy Yip (SAP Labs)", "date": "2025-07-29", "retrieved": "2026-03-25", "tags": "[survey, agentic, benchmark, evaluation, taxonomy, tool-use, planning, reasoning, safety, enterprise, multi-agent, reliability, KDD-2025]", "filename": "eval_benchmarking_llm_agents_survey_kdd2025.md"}, {"source_type": "twitter", "title": "OSWorld-Verified — Improved Computer Use Agent Benchmark", "url": "https://x.com/taoyds/status/1954964905007911157", "author": "@taoyds", "date": "2025-07-28", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, computer-use, OS-interaction, multimodal, desktop, web, NeurIPS]", "filename": "thread_osworld_verified_taoyds.md"}, {"source_type": "arxiv", "title": "MMBench-GUI: Hierarchical Multi-Platform Evaluation Framework for GUI Agents", "url": "https://arxiv.org/abs/2507.19478", "author": "Xuehui Wang, Zhenyu Wu, JingJing Xie, Zichen Ding, Bowen Yang, and 23 additional collaborators", "date": "2025-07-25", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, gui, multi-platform, hierarchical-evaluation, desktop, web, mobile, cross-platform, efficiency]", "filename": "mmbench-gui.md"}, {"source_type": "substack", "title": "SWE-bench Verified is Flawed Despite Expert Review: UTBoost Exposes Gaps in Test Coverage", "url": "https://ddkang.substack.com/p/swe-bench-verified-is-flawed-despite", "author": "Daniel Kang", "date": "2025-07-22", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, SWE-bench, evaluation, test-coverage, coding, software-engineering, criticism]", "filename": "kang_swebench_flawed.md"}, {"source_type": "arxiv", "title": "MCPEval: Automatic MCP-based Deep Evaluation for AI Agent Models", "url": "https://arxiv.org/abs/2507.12806", "author": "Zhiwei Liu, Jielin Qiu, Shiyu Wang, Jianguo Zhang, Zuxin Liu, Roshan Ram, Haolin Chen, Weiran Yao, Shelby Heinecke, Silvio Savarese, Huan Wang, Caiming Xiong (Salesforce AI Research)", "date": "2025-07-17", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, evaluation, MCP, model-context-protocol, tool-use, synthetic-data, LLM-judge, multi-domain, Salesforce]", "filename": "mcpeval.md"}, {"source_type": "twitter", "title": "Agentic Benchmark Best Practices — Checklist for Rigorous Agent Evaluation", "url": "https://x.com/rohanpaul_ai/status/1941809851577106492", "author": "@rohanpaul_ai", "date": "2025-07-16", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, methodology, best-practices, evaluation-rigor, benchmark-gaming]", "filename": "thread_agentic_benchmark_best_practices_rohanpaul.md"}, {"source_type": "arxiv", "title": "Establishing Best Practices for Building Rigorous Agentic Benchmarks", "url": "https://arxiv.org/abs/2507.02825", "author": "Yuxuan Zhu, Tengjun Jin, Yada Pruksachatkun, Andy Zhang, Shu Liu, Sasha Cui, Sayash Kapoor, Shayne Longpre, Kevin Meng, Rebecca Weiss, Fazl Barez, Rahul Gupta, Jwala Dhamala, Jacob Merizian, Mario Giulianelli, Harry 
Coppock, Cozmin Ududec, Jasjeet Sekhon, Jacob Steinhardt, Antony Kellermann, Sarah Schwettmann, Matei Zaharia, Ion Stoica, Percy Liang, Daniel Kang", "date": "2025-07-03", "retrieved": "2026-03-25", "tags": "[agentic, benchmark, evaluation, methodology, checklist, best-practices, validity, task-design, outcome-design, swe-bench, tau-bench, webarena, osworld, mle-bench, gaia, kernelbench, swe-lancer, cybench, bird]", "filename": "agentic-benchmark-checklist-abc.md"}, {"source_type": "substack", "title": "AI Agent Benchmarks are Broken", "url": "https://ddkang.substack.com/p/ai-agent-benchmarks-are-broken", "author": "Daniel Kang (Stanford / UIUC)", "date": "2025-07-01", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, methodology, broken, gamability, WebArena, SWE-bench, checklist, criticism]", "filename": "kang_agent_benchmarks_broken.md"}, {"source_type": "announcement", "title": "Introducing OSWorld-Verified", "url": "https://xlang.ai/blog/osworld-verified", "author": "XLANG Lab (Tao Yu, Shuyan Zhou, Boyu Gou et al.)", "date": "2025-07 (original OSWorld: NeurIPS 2024)", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, OS-interaction, computer-use, desktop, multimodal, web, GUI]", "filename": "xlang_osworld_verified.md"}, {"source_type": "arxiv", "title": "AgentsNet: Coordination and Collaborative Reasoning in Multi-Agent LLMs", "url": "https://arxiv.org/abs/2507.08616", "author": "Florian Grötschla, Luis Müller, Jan Tönshoff, Mikhail Galkin, Bryan Perozzi", "date": "2025-07", "retrieved": "2026-04-17", "tags": "[multi-agent, coordination, benchmark, distributed-systems, graph-theory, collaboration]", "filename": "agentsnet.md"}, {"source_type": "arxiv", "title": "Evaluating Memory in LLM Agents via Incremental Multi-Turn Interactions", "url": "https://arxiv.org/abs/2507.05257", "author": "Yuanzhe Hu, Yu Wang, Julian McAuley (UC San Diego)", "date": "2025-07", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, memory, dataset, reasoning]", "filename": "memoryagentbench.md"}, {"source_type": "arxiv", "title": "OpenAgentSafety: A Comprehensive Framework for Evaluating Real-World AI Agent Safety", "url": "https://arxiv.org/abs/2507.06134", "author": "Sanidhya Vijayvargiya, Aditya Bharat Soni, Xuhui Zhou, Zora Zhiruo Wang, Nouha Dziri, Graham Neubig, Maarten Sap", "date": "2025-07", "retrieved": "2026-03-22", "tags": "[agentic, benchmark, safety, evaluation, multi-turn, tool-use, red-teaming, alignment]", "filename": "openagentsafety.md"}, {"source_type": "arxiv", "title": "OSWorld-Human: Benchmarking the Efficiency of Computer-Use Agents", "url": "https://arxiv.org/abs/2506.16042", "author": "Reyna Abhyankar, Qi Qi, Yiying Zhang", "date": "2025-06-19", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, computer-use, efficiency, GUI, desktop, OS-interaction, gold-trajectories, MLSys-2026]", "filename": "osworld_gold.md"}, {"source_type": "arxiv", "title": "SHADE-Arena: Evaluating Sabotage and Monitoring in LLM Agents", "url": "https://arxiv.org/abs/2506.15740", "author": "Jonathan Kutasov, Yuqi Sun, Paul Colognese, Teun van der Weij, Linda Petrini, Chen Bo Calvin Zhang, John Hughes, Xiang Deng, Henry Sleight, Tyler Tracy, Buck Shlegeris, Joe Benton", "date": "2025-06-17", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, safety, sabotage, monitoring, alignment, deception, AI-safety]", "filename": "shade-arena.md"}, {"source_type": "substack", "title": "AI Agent Benchmark Compendium", "url": 
"https://www.philschmid.de/benchmark-compedium", "author": "Philipp Schmid (Google DeepMind, formerly Hugging Face)", "date": "2025-06-15", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, compendium, taxonomy, function-calling, tool-use, coding, computer-interaction, survey]", "filename": "schmid_benchmark_compendium.md"}, {"source_type": "substack", "title": "What does SWE-bench Verified actually measure?", "url": "https://epochai.substack.com/p/what-skills-does-swe-bench-verified-evaluate", "author": "Epoch AI (Florian Brand, JS Denain et al.)", "date": "2025-06-13", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, SWE-bench, evaluation, coding, software-engineering, analysis]", "filename": "epoch_ai_swebench_skills.md"}, {"source_type": "arxiv", "title": "τ²-Bench: Evaluating Conversational Agents in a Dual-Control Environment", "url": "https://arxiv.org/abs/2506.07982", "author": "Victor Barres, Honghua Dong, Soham Ray, Xujie Si, Karthik Narasimhan", "date": "2025-06-09", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, conversational-agents, dual-control, Dec-POMDP, telecom, customer-service, Sierra]", "filename": "tau2-bench.md"}, {"source_type": "arxiv", "title": "macOSWorld: A Multilingual Interactive Benchmark for GUI Agents", "url": "https://arxiv.org/abs/2506.04135", "author": "Pei Yang et al.", "date": "2025-06-04", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, GUI-agent, OS-interaction, multilingual, safety, computer-use, NeurIPS-2025]", "filename": "macosworld.md"}, {"source_type": "arxiv", "title": "CyberGym: Evaluating AI Agents' Real-World Cybersecurity Capabilities at Scale", "url": "https://arxiv.org/abs/2506.02548", "author": "Zhun Wang et al.", "date": "2025-06-03", "retrieved": "2026-03-23", "tags": "[agentic, benchmark, evaluation, cybersecurity, code-generation, tool-use, reasoning, vulnerability-reproduction, proof-of-concept, zero-day]", "filename": "cybergym.md"}, {"source_type": "arxiv", "title": "EconWebArena: Benchmarking Autonomous Agents on Economic Tasks in Realistic Web Environments", "url": "https://arxiv.org/abs/2506.08136", "author": "Zefang Liu et al.", "date": "2025-06-01", "retrieved": "2026-03-31", "tags": "[agentic, benchmark, evaluation, web-navigation, economics, multimodal, tool-use, real-world-web]", "filename": "econwebarena.md"}, {"source_type": "arxiv", "title": "FORTRESS: Frontier Risk Evaluation for National Security and Public Safety", "url": "https://arxiv.org/abs/2506.14922", "author": "Christina Q. 
Knight et al.", "date": "2025-06-01", "retrieved": "2026-04-03", "tags": "[benchmark, evaluation, safety, adversarial, jailbreak, national-security, CBRNE, red-teaming, LLM-safety, over-refusal, dual-use, Scale-AI]", "filename": "fortress_bench.md"}, {"source_type": "substack", "title": "Benchmarks evaluating LLM agents for software development", "url": "https://symflower.com/en/company/blog/2025/benchmarks-llm-agents/", "author": "Symflower", "date": "2025-06-01", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, software-development, coding, tool-use, planning, agents]", "filename": "symflower_benchmarks_llm_agents_swe.md"}, {"source_type": "arxiv", "title": "AgentMisalignment: Measuring the Propensity for Misaligned Behaviour in LLM-Based Agents", "url": "https://arxiv.org/abs/2506.04018", "author": "Akshat Naik et al.", "date": "2025-06", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, evaluation, safety, misalignment, alignment, power-seeking, sandbagging, shutdown-resistance, oversight-avoidance, persona, frontier-models, InspectAI]", "filename": "agent_misalignment.md"}, {"source_type": "arxiv", "title": "AGENTSAFE: Benchmarking the Safety of Embodied Agents on Hazardous Instructions", "url": "https://arxiv.org/abs/2506.14697", "author": "Zonghao Ying, Le Wang, Yisong Xiao, Jiakai Wang, Yuqing Ma, Jinyang Guo, Zhenfei Yin, Mingchuan Zhang, Aishan Liu, Xianglong Liu", "date": "2025-06", "retrieved": "2026-03-22", "tags": "[agentic, benchmark, safety, embodied, evaluation, VLM, jailbreak, simulation]", "filename": "agentsafe_embodied.md"}, {"source_type": "arxiv", "title": "DABstep: Data Agent Benchmark for Multi-step Reasoning", "url": "https://arxiv.org/abs/2506.23719", "author": "Alex Egg et al.", "date": "2025-06", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, reasoning, planning, tool-use, code-generation, dataset]", "filename": "dabstep.md"}, {"source_type": "arxiv", "title": "DeepResearch Bench: A Comprehensive Benchmark for Deep Research Agents", "url": "https://arxiv.org/abs/2506.11763", "author": "Mingxuan Du et al. (University of Science and Technology of China / MetastoneTechnology)", "date": "2025-06", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, research, reasoning, tool-use]", "filename": "deepresearch_bench.md"}, {"source_type": "arxiv", "title": "MCPWorld: A Unified Benchmarking Testbed for API, GUI, and Hybrid Computer Use Agents", "url": "https://arxiv.org/abs/2506.07672", "author": "Yan et al. 
(Beijing University of Posts and Telecommunications)", "date": "2025-06", "retrieved": "2026-03-28", "tags": "[benchmark, agentic, evaluation, tool-use, function-calling, os-interaction]", "filename": "mcpworld.md"}, {"source_type": "arxiv", "title": "MEAL: A Benchmark for Continual Multi-Agent Reinforcement Learning", "url": "https://arxiv.org/abs/2506.14990", "author": "Tristan Tomilin et al.", "date": "2025-06", "retrieved": "2026-05-01", "tags": "[benchmark, evaluation, multi-agent, reinforcement-learning, continual-learning, cooperative-ai, marl, catastrophic-forgetting, overcooked, jax]", "filename": "meal_continual_marl_benchmark.md"}, {"source_type": "arxiv", "title": "Mind2Web 2: Evaluating Agentic Search with Agent-as-a-Judge", "url": "https://arxiv.org/abs/2506.21506", "author": "Boyu Gou, Zanming Huang, Yuting Ning, Yu Gu, Michael Lin, Weijian Qi, Andrei Kopanev, Botao Yu, Bernal Jimenez Gutierrez, Yiheng Shu, Chan Hee Song, Jiaman Wu, Shijie Chen, Hanane Nour Moussa, Tianshu Zhang, Jian Xie, Yifei Li, Tianci Xue, Zeyi Liao, Kai Zhang, Boyuan Zheng, Zhaowei Cai, Viktor Rozgic, Morteza Ziyadi, Huan Sun, Yu Su", "date": "2025-06", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, web-search, agentic-search, agent-as-judge, deep-research, NeurIPS-2025, Ohio-State, Amazon]", "filename": "mind2web-2.md"}, {"source_type": "arxiv", "title": "OSWorld-Human: Benchmarking the Efficiency of Computer-Use Agents", "url": "https://arxiv.org/abs/2506.16042", "author": "Reyna Abhyankar, Qi Qi, Yiying Zhang", "date": "2025-06", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, agentic, os-interaction, efficiency, latency, computer-use]", "filename": "osworld_human.md"}, {"source_type": "arxiv", "title": "Sequential Diagnosis with Language Models", "url": "https://arxiv.org/abs/2506.22405", "author": "Harsha Nori, Mayank Daswani, Christopher Kelly, Scott Lundberg, Marco Tulio Ribeiro, Marc Wilson, Xiaoxuan Liu, Viknesh Sounderajah, Jonathan M Carlson, Matthew P Lungren, Bay Gross, Peter Hames, Mustafa Suleyman, Dominic King, Eric Horvitz", "date": "2025-06", "retrieved": "2026-03-27", "tags": "[agentic, benchmark, medical-ai, clinical-reasoning, sequential-diagnosis, interactive, multi-turn, cost-aware, microsoft, tool-use]", "filename": "sdbench.md"}, {"source_type": "arxiv", "title": "Search Arena: Analyzing Search-Augmented LLMs", "url": "https://arxiv.org/abs/2506.05334", "author": "Mihran Miroyan, Tsung-Han Wu et al.", "date": "2025-06", "retrieved": "2026-03-29", "tags": "[benchmark, evaluation, search, leaderboard, dataset, reasoning, web-navigation]", "filename": "search_arena.md"}, {"source_type": "arxiv", "title": "StoryBench: A Dynamic Benchmark for Evaluating Long-Term Memory with Multi Turns", "url": "https://arxiv.org/abs/2506.13356", "author": "Luanbo Wan, Weizhi Ma", "date": "2025-06", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, memory, long-term-memory, multi-turn, reasoning, interactive-fiction, planning]", "filename": "storybench_ltm.md"}, {"source_type": "arxiv", "title": "TurnBench-MS: A Benchmark for Evaluating Multi-Turn, Multi-Step Reasoning in Large Language Models", "url": "https://arxiv.org/abs/2506.01341", "author": "Yiran Zhang, Mo Wang, Xiaoyang Li, Kaixuan Ren, Chencheng Zhu, Usman Naseem", "date": "2025-06", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, reasoning, multi-turn, multi-step, game-based, planning]", "filename": "turnbench_ms.md"}, {"source_type": "arxiv", "title": "VPI-Bench: Visual Prompt Injection 
Attacks for Computer-Use Agents", "url": "https://arxiv.org/abs/2506.02456", "author": "Tri Cao, Bennett Lim, Yue Liu, Yuan Sui, Yuexin Li, Shumin Deng, Lin Lu, Nay Oo, Shuicheng Yan, Bryan Hooi", "date": "2025-06", "retrieved": "2026-03-22", "tags": "[agentic, benchmark, safety, security, computer-use, prompt-injection]", "filename": "vpi_bench.md"}, {"source_type": "arxiv", "title": "WebChoreArena: Evaluating Web Browsing Agents on Realistic Tedious Web Tasks", "url": "https://arxiv.org/abs/2506.01952", "author": "Atsuyuki Miyai, Zaiying Zhao, Kazuki Egashira et al. (The University of Tokyo)", "date": "2025-06", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, web-navigation, memory, reasoning]", "filename": "webchorearena.md"}, {"source_type": "arxiv", "title": "xbench: Tracking Agents Productivity Scaling with Profession-Aligned Real-World Evaluations", "url": "https://arxiv.org/abs/2506.13651", "author": "Kaiyuan Chen, Yixin Ren, Yang Liu et al.", "date": "2025-06", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, agentic, reasoning, tool-use, research, dataset, enterprise]", "filename": "xbench.md"}, {"source_type": "announcement", "title": "AssetOpsBench: Bridging the Gap Between AI Agent Benchmarks and Industrial Reality", "url": "https://huggingface.co/blog/ibm-research/assetopsbench-playground-on-hugging-face", "author": "IBM Research", "date": "2025-06", "retrieved": "2026-04-17", "tags": "[multi-agent, industrial, benchmark, enterprise, asset-management, tool-use]", "filename": "assetopsbench.md"}, {"source_type": "announcement", "title": "SWE-bench-Live: Contamination-Free Live Software Engineering Benchmark", "url": "https://swe-bench-live.github.io/", "author": "Microsoft (Linghao Zhang, Shilin He, Chaoyun Zhang, Yu Kang, Bowen Li, Chengxing Xie et al.)", "date": "2025-06", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, software-engineering, coding, SWE-bench-Live, contamination-free, microsoft, multi-language]", "filename": "summary_swe_bench_live.md"}, {"source_type": "arxiv", "title": "SWE-rebench: An Automated Pipeline for Task Collection and Decontaminated Evaluation of Software Engineering Agents", "url": "https://arxiv.org/abs/2505.20411", "author": "Ibragim Badertdinov et al.", "date": "2025-05-26", "retrieved": "2026-04-25", "tags": "[agentic, benchmark, code-generation, evaluation, dataset, software-engineering, debugging, decontamination, reinforcement-learning, github]", "filename": "2505.20411-swe-rebench.md"}, {"source_type": "arxiv", "title": "AGENTIF: Benchmarking Instruction Following of Large Language Models in Agentic Scenarios", "url": "https://arxiv.org/abs/2505.16944", "author": "Yunjia Qi, Hao Peng, Xiaozhi Wang, Amy Xin, Youfeng Liu, Bin Xu, Lei Hou, Juanzi Li", "date": "2025-05-22", "retrieved": "2026-05-05", "tags": "[benchmark, evaluation, agentic, instruction-following, tool-use, function-calling, constraint-following, NeurIPS-2025]", "filename": "agentif_instruction_following.md"}, {"source_type": "arxiv", "title": "MCP-Radar: A Multi-Dimensional Benchmark for Evaluating Tool Use Capabilities in Large Language Models", "url": "https://arxiv.org/abs/2505.16700", "author": "Xuanqi Gao, Siyi Xie, Juan Zhai, Shiqing Ma, Chao Shen", "date": "2025-05-22", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, tool-use, MCP, model-context-protocol, function-calling, multi-domain, LLM-evaluation]", "filename": "mcp-radar.md"}, {"source_type": "announcement", "title": "CUB: The Computer Use Benchmark", "url": 
"https://thetasoftware.com/blog/introducing-cub/", "author": "Theta Software Inc.", "date": "2025-05-15", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, computer-use, browser-use, cross-industry, desktop, enterprise, GUI]", "filename": "cub.md"}, {"source_type": "twitter", "title": "The Ultimate LLM Benchmark List — Comprehensive Benchmark Directory", "url": "https://x.com/scaling01/status/1919092778648408363", "author": "@scaling01", "date": "2025-05-04", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, meta-list, benchmark-saturation, leaderboard, community-curation]", "filename": "thread_ultimate_llm_benchmark_list_scaling01.md"}, {"source_type": "announcement", "title": "MedAgentBoard: A Comprehensive Benchmark for Medical Multi-Agent Collaboration", "url": "https://medagentboard.netlify.app/", "author": "Yinghao Zhu, Liantao Ma, Lequan Yu et al. (Peking University, University of Hong Kong, ETH Zurich, University of Edinburgh)", "date": "2025-05-01", "retrieved": "2026-03-28", "tags": "[benchmark, medical, multi-agent, ehr, clinical, visual-qa, summarization, agent-collaboration]", "filename": "summary_medagentboard.md"}, {"source_type": "substack", "title": "SWE-Bench Deep Dive: Unmasking the Limitations of a Popular Benchmark", "url": "https://runloop.ai/blog/swe-bench-deep-dive-unmasking-the-limitations-of-a-popular-benchmark", "author": "Runloop", "date": "2025-05-01", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, SWE-bench, evaluation, limitations, test-coverage, solution-leakage, coding]", "filename": "runloop_swebench_deep_dive.md"}, {"source_type": "substack", "title": "Why We Think", "url": "https://lilianweng.github.io/posts/2025-05-01-thinking/", "author": "Lilian Weng (OpenAI)", "date": "2025-05-01", "retrieved": "2026-03-07", "tags": "[agentic, reasoning, test-time-compute, chain-of-thought, agent-reasoning, thinking, scaling]", "filename": "weng_why_we_think.md"}, {"source_type": "arxiv", "title": "BioProBench: Comprehensive Dataset and Benchmark in Biological Protocol Understanding and Reasoning", "url": "https://arxiv.org/abs/2505.07889", "author": "Liu et al.", "date": "2025-05", "retrieved": "2026-04-29", "tags": "[benchmark, evaluation, reasoning, biology, scientific-reasoning, dataset, procedural-reasoning]", "filename": "2505.07889-bioproberch.md"}, {"source_type": "arxiv", "title": "FieldWorkArena: A Benchmark for Agentic AI in Field Work Environments", "url": "https://arxiv.org/abs/2505.19662", "author": "Fujitsu & CMU (multiple authors)", "date": "2025-05", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, planning, reasoning, tool-use]", "filename": "fieldworkarena.md"}, {"source_type": "arxiv", "title": "MedAgentBoard: Benchmarking Multi-Agent Collaboration with Conventional Methods for Diverse Medical Tasks", "url": "https://arxiv.org/abs/2505.12371", "author": "Yuhao Zhu et al.", "date": "2025-05", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, evaluation, multi-agent, medical, healthcare, tool-use, reasoning, function-calling, dataset, leaderboard]", "filename": "medagentboard.md"}, {"source_type": "arxiv", "title": "Benchmarking LLMs' Swarm Intelligence", "url": "https://arxiv.org/abs/2505.04364", "author": "RUC-GSAI (Renmin University of China)", "date": "2025-05", "retrieved": "2026-04-17", "tags": "[multi-agent, swarm-intelligence, benchmark, decentralized, coordination, embodied]", "filename": "swarmbench.md"}, {"source_type": "arxiv", "title": "SWE-rebench: An Automated Pipeline for Task 
Collection and Decontaminated Evaluation of Software Engineering Agents", "url": "https://arxiv.org/abs/2505.20411", "author": "Badertdinov et al. (Nebius)", "date": "2025-05", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, code-generation, agentic, dataset, debugging, software-engineering, decontamination]", "filename": "swe_rebench.md"}, {"source_type": "announcement", "title": "Introducing TRAIL: A Benchmark for Agentic Evaluation", "url": "https://www.patronus.ai/blog/introducing-trail-a-benchmark-for-agentic-evaluation", "author": "Darshan Deshpande, Varun Gangal, Hersh Mehta, Jitin Krishnan, Anand Kannappan, Rebecca Qian (Patronus AI)", "date": "2025-05", "retrieved": "2026-04-23", "tags": "[agentic, benchmark, evaluation, multi-agent, trace-evaluation, error-localization, debugging, long-context, agent-monitoring, tool-use, reasoning, planning]", "filename": "patronus_trail.md"}, {"source_type": "arxiv", "title": "SocialGrid: A Benchmark for Planning and Social Reasoning in Embodied Multi-Agent Systems", "url": "https://arxiv.org/abs/2604.16022", "author": "Hikaru Shindo, Hanzhao Lin, Lukas Helff, Patrick Schramowski, Kristian Kersting", "date": "2025-04-22", "retrieved": "2026-05-03", "tags": "[agentic, benchmark, evaluation, multi-agent, planning, reasoning, social-reasoning, embodied, deception, gridworld]", "filename": "social_grid.md"}, {"source_type": "arxiv", "title": "PLANET: A Collection of Benchmarks for Evaluating LLMs' Planning Capabilities", "url": "https://arxiv.org/abs/2504.14773", "author": "Haoming Li, Zhaoliang Chen, Jonathan Zhang, Fei Liu", "date": "2025-04-21", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, planning, survey, taxonomy, embodied, web-navigation, scheduling, games]", "filename": "planet.md"}, {"source_type": "arxiv", "title": "BrowseComp: A Simple Yet Challenging Benchmark for Browsing Agents", "url": "https://arxiv.org/abs/2504.12516", "author": "Jason Wei, Zhiqing Sun, Spencer Papay, Scott McKinney, Jeffrey Han, Isa Fulford, Hyung Won Chung, Alex Tachard Passos, William Fedus, Amelia Glaese", "date": "2025-04-16", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, web-browsing, information-retrieval, deep-research, OpenAI]", "filename": "browsecomp.md"}, {"source_type": "arxiv", "title": "RealWebAssist: A Benchmark for Long-Horizon Web Assistance with Real-World Users", "url": "https://arxiv.org/abs/2504.10445", "author": "Suyu Ye et al.", "date": "2025-04-14", "retrieved": "2026-03-31", "tags": "[agentic, benchmark, evaluation, web-navigation, gui-grounding, sequential-instruction-following, long-horizon, real-world-users, speech, visual-language-models]", "filename": "realwebassist.md"}, {"source_type": "arxiv", "title": "AgentRewardBench: Evaluating Automatic Evaluations of Web Agent Trajectories", "url": "https://arxiv.org/abs/2504.08942", "author": "Xing Han Lù, Amirhossein Kazemnejad, Nicholas Meade, Arkil Patel, Dongchan Shin, Alejandra Zambrano, Karolina Stańczak, Peter Shaw, Christopher J. 
Pal, Siva Reddy", "date": "2025-04-11", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, meta-benchmark, web-agents, evaluation, llm-judge, reward-model]", "filename": "agentrewardbench.md"}, {"source_type": "arxiv", "title": "Benchmarking LLM Tool-Use in the Wild", "url": "https://arxiv.org/abs/2604.06185", "author": "Peijie Yu et al.", "date": "2025-04-08", "retrieved": "2026-04-27", "tags": "[agentic, benchmark, tool-use, function-calling, evaluation, multi-turn, real-world, ICLR-2026]", "filename": "wild_tool_bench.md"}, {"source_type": "twitter", "title": "PaperBench — Evaluating AI's Ability to Replicate State-of-the-Art Research", "url": "https://x.com/OpenAI/status/1907481490457506235", "author": "@OpenAI", "date": "2025-04-03", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, research-replication, ICML, AI-R&D, OpenAI, preparedness]", "filename": "thread_paperbench_openai.md"}, {"source_type": "arxiv", "title": "PaperBench: Evaluating AI's Ability to Replicate AI Research", "url": "https://arxiv.org/abs/2504.01848", "author": "Giulio Starace, Oliver Jaffe, Dane Sherburn, James Aung, Jun Shern Chan, Leon Maksin, Rachel Dias, Evan Mays, Benjamin Kinsella, Wyatt Thompson, Johannes Heidecke, Amelia Glaese, Tejal Patwardhan", "date": "2025-04-02", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, research-replication, coding, ML-engineering, OpenAI, ICML]", "filename": "paperbench.md"}, {"source_type": "substack", "title": "10 AI Agent Benchmarks", "url": "https://www.evidentlyai.com/blog/ai-agent-benchmarks", "author": "Evidently AI", "date": "2025-04-01", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation, survey, tool-use, planning, decision-making, landscape]", "filename": "evidently_ai_agent_benchmarks.md"}, {"source_type": "announcement", "title": "BrowseComp: A Benchmark for Browsing Agents", "url": "https://openai.com/index/browsecomp/", "author": "OpenAI (Jason Wei et al.)", "date": "2025-04 (arxiv 2504.12516)", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, web-browsing, information-retrieval, deep-research, search]", "filename": "openai_browsecomp.md"}, {"source_type": "announcement", "title": "PaperBench: Evaluating AI's Ability to Replicate AI Research", "url": "https://openai.com/index/paperbench/", "author": "OpenAI (Preparedness team, including Tejal Patwardhan)", "date": "2025-04 (arxiv 2504.01848, ICML 2025 Poster)", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, research-replication, AI-R&D, ICML, coding, experiment-execution]", "filename": "openai_paperbench.md"}, {"source_type": "arxiv", "title": "AI Idea Bench 2025: AI Research Idea Generation Benchmark", "url": "https://arxiv.org/abs/2504.14191", "author": "Yansheng Qiu et al.", "date": "2025-04", "retrieved": "2026-04-29", "tags": "[benchmark, evaluation, research, planning, reasoning, idea-generation, creativity, dataset]", "filename": "2504.14191-ai-idea-bench.md"}, {"source_type": "arxiv", "title": "Multi-SWE-bench: A Multilingual Benchmark for Issue Resolving", "url": "https://arxiv.org/abs/2504.02605", "author": "ByteDance Seed (large team; correspondence: zandaoguang@bytedance.com, shen.kai@bytedance.com)", "date": "2025-04", "retrieved": "2026-03-27", "tags": "[agentic, benchmark, coding, software-engineering, multilingual, issue-resolving, repository-level, execution-based, neurips-2025]", "filename": "multi-swe-bench.md"}, {"source_type": "arxiv", "title": "An Illusion of Progress? 
Assessing the Current State of Web Agents", "url": "https://arxiv.org/abs/2504.01382", "author": "Tianci Xue, Weijian Qi, Tianneng Shi, Chan Hee Song, Boyu Gou, Dawn Song, Huan Sun, Yu Su", "date": "2025-04", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, web-navigation, planning, survey]", "filename": "online_mind2web.md"}, {"source_type": "arxiv", "title": "SWE-PolyBench: A Multi-Language Benchmark for Repository Level Coding Agents", "url": "https://arxiv.org/abs/2504.08703", "author": "(Amazon AWS team — full author list in paper)", "date": "2025-04", "retrieved": "2026-03-27", "tags": "[agentic, benchmark, coding, software-engineering, multilingual, repository-level, execution-based, evaluation]", "filename": "swe-polybench.md"}, {"source_type": "arxiv", "title": "SWE-smith: Scaling Data for Software Engineering Agents", "url": "https://arxiv.org/abs/2504.21798", "author": "John Yang, Kilian Lieret, Carlos E. Jimenez et al.", "date": "2025-04", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, dataset, code-generation, debugging, evaluation, tool-use]", "filename": "swe_smith.md"}, {"source_type": "arxiv", "title": "WASP: Benchmarking Web Agent Security Against Prompt Injection Attacks", "url": "https://arxiv.org/abs/2504.18575", "author": "Ivan Evtimov et al.", "date": "2025-04", "retrieved": "2026-04-15", "tags": "[agentic, benchmark, evaluation, web-navigation, security, prompt-injection, adversarial, NeurIPS-2025, facebook-research]", "filename": "wasp.md"}, {"source_type": "announcement", "title": "Buyout Game Benchmark: Multi-Agent Bargaining, Transfers, and Hostile Takeovers", "url": "https://github.com/lechmazur/buyout_game", "author": "lechmazur", "date": "2025-03-31", "retrieved": "2026-03-31", "tags": "[agentic, benchmark, evaluation, multi-agent, reasoning, negotiation, economic-coordination, coalition, game-theory, strategic-reasoning]", "filename": "summary_buyout_game.md"}, {"source_type": "arxiv", "title": "GeoBenchX: Benchmarking LLMs in Agent Solving Multistep Geospatial Tasks", "url": "https://arxiv.org/abs/2503.18129", "author": "Krechetova et al.", "date": "2025-03-23", "retrieved": "2026-04-27", "tags": "[agentic, benchmark, tool-use, evaluation, spatial-analysis, geospatial, LLM-as-judge, multi-step, function-calling, GIS]", "filename": "geo_benchx.md"}, {"source_type": "arxiv", "title": "CVE-Bench: A Benchmark for AI Agents' Ability to Exploit Real-World Web Application Vulnerabilities", "url": "https://arxiv.org/abs/2503.17332", "author": "Yuxuan Zhu, Antony Kellermann, Dylan Bowman, Philip Li, Akul Gupta, Adarsh Danda, Richard Fang, Conner Jensen, Eric Ihli, Jason Benn, Jet Geronimo, Avi Dhir, Sudhit Rao, Kaicheng Yu, Twm Stone, Daniel Kang", "date": "2025-03-21", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, cybersecurity, vulnerability-exploitation, CVE, zero-day, safety, UIUC]", "filename": "cve-bench.md"}, {"source_type": "arxiv", "title": "HCAST: Human-Calibrated Autonomy Software Tasks", "url": "https://arxiv.org/abs/2503.17354", "author": "David Rein, Joel Becker, Amy Deng, Seraphina Nix, Chris Canal, Daniel O'Connel, Pip Arnott, Ryan Bloom, Thomas Broadley, Katharyn Garcia, Brian Goodrich, Max Hasin, Sami Jawhar, Megan Kinniment, Thomas Kwa, Aron Lajko, Nate Rush, Lucas Jun Koba Sato, Sydney Von Arx, Ben West, Lawrence Chan, Elizabeth Barnes", "date": "2025-03-21", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, autonomy, software-engineering, cybersecurity, ML-engineering, human-calibrated, METR, 
safety]", "filename": "hcast.md"}, {"source_type": "arxiv", "title": "ColBench: Collaborative Agent Benchmark (from SWEET-RL)", "url": "https://arxiv.org/abs/2503.15478", "author": "Yifei Zhou, Song Jiang, Yuandong Tian, Jason Weston, Sergey Levine, Sainbayar Sukhbaatar, Xian Li", "date": "2025-03-19", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, collaborative-coding, multi-turn, reinforcement-learning, frontend-design, backend-programming]", "filename": "colbench.md"}, {"source_type": "twitter", "title": "METR Time Horizons — Exponential Growth in AI Agent Task Completion", "url": "https://x.com/METR_Evals/status/1902384502191595869", "author": "@METR_Evals", "date": "2025-03-19", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, METR, HCAST, time-horizons, autonomy, software-engineering, exponential-growth]", "filename": "thread_metr_time_horizons_METR_Evals.md"}, {"source_type": "arxiv", "title": "RefactorBench: Evaluating Stateful Reasoning in Language Agents Through Code", "url": "https://arxiv.org/abs/2503.07832", "author": "Dhruv Gautam et al.", "date": "2025-03-10", "retrieved": "2026-03-08", "tags": "[agentic, benchmark, evaluation, code-generation, reasoning, planning, memory, debugging]", "filename": "refactorbench.md"}, {"source_type": "twitter", "title": "SWE-bench Criticism and Defense — Limitations and the Path Forward", "url": "https://x.com/OfirPress/status/1966227423252595056", "author": "@OfirPress", "date": "2025-03-10", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, SWE-bench, criticism, limitations, benchmark-gaming, software-engineering]", "filename": "thread_swebench_criticism_limitations.md"}, {"source_type": "announcement", "title": "Task-Completion Time Horizons of Frontier AI Models", "url": "https://metr.org/time-horizons/", "author": "METR (Model Evaluation & Threat Research)", "date": "2025-03 (initial publication), updated through 2026-01 (Time Horizon 1.1)", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, autonomy, time-horizons, AI-safety, research-engineering, cybersecurity, software-engineering]", "filename": "metr_time_horizons.md"}, {"source_type": "announcement", "title": "BLUR: Browsing Lost Unformed Recollections - A Benchmark for Tip-of-the-Tongue Search and Reasoning", "url": "https://www.patronus.ai/blog/the-blur-benchmark-browsing-lost-unformed-recollections", "author": "Patronus AI", "date": "2025-03 (arxiv 2503.19193, ACL 2025)", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, search, multilingual, multimodal, tip-of-the-tongue, information-retrieval, tool-use]", "filename": "patronus_blur.md"}, {"source_type": "announcement", "title": "Factorio Learning Environment: Game-Based Agent Planning Benchmark", "url": "https://github.com/JackHopkins/factorio-learning-environment", "author": "Jack Hopkins et al.", "date": "2025-03 (arxiv 2503.09617)", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, game-based, planning, code-synthesis, resource-management, open-ended, unsaturable]", "filename": "factorio_learning_environment.md"}, {"source_type": "arxiv", "title": "AILuminate: Introducing v1.0 of the AI Risk and Reliability Benchmark from MLCommons", "url": "https://arxiv.org/abs/2503.05731", "author": "Shaona Ghosh et al.", "date": "2025-03", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, safety, taxonomy, dataset]", "filename": "ailuminate.md"}, {"source_type": "arxiv", "title": "MultiAgentBench: Evaluating the Collaboration and Competition of LLM agents", "url": 
"https://arxiv.org/abs/2503.01935", "author": "Kunlun Zhu, Hongyi Du, Zhaochen Hong et al.", "date": "2025-03", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, multi-agent, planning, reasoning, coordination, competition]", "filename": "multiagentbench.md"}, {"source_type": "arxiv", "title": "An Agentic Evaluation Framework for AI-Generated Scientific Code in PETSc", "url": "https://arxiv.org/abs/2603.15976", "author": "Hong Zhang et al.", "date": "2025-03", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, code-generation, tool-use]", "filename": "petscagent_bench.md"}, {"source_type": "arxiv", "title": "ProjectEval: A Benchmark for Programming Agents Automated Evaluation on Project-Level Code Generation", "url": "https://arxiv.org/abs/2503.07010", "author": "Kaiyuan Liu, Youcheng Pan, Yang Xiang, Daojing He, Jing Li, Yexing Du, Tianrun Gao", "date": "2025-03", "retrieved": "2026-03-27", "tags": "[benchmark, code-generation, project-level, programming-agents, user-interaction, automated-evaluation, HIT, Pengcheng-Laboratory]", "filename": "projecteval.md"}, {"source_type": "arxiv", "title": "Survey on Evaluation of LLM-based Agents", "url": "https://arxiv.org/abs/2503.16416", "author": "Asaf Yehudai et al.", "date": "2025-03", "retrieved": "2026-04-01", "tags": "[survey, agentic, benchmark, evaluation, taxonomy, reasoning, planning, memory, tool-use]", "filename": "survey_llm_agent_eval_2503.md"}, {"source_type": "arxiv", "title": "tau-Knowledge: Evaluating Conversational Agents over Unstructured Knowledge", "url": "https://arxiv.org/abs/2603.04370", "author": "Quan Shi, Alexandra Zytek, Pedram Razavi, Karthik Narasimhan, Victor Barres", "date": "2025-03", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, agentic, tool-use, reasoning, memory, function-calling, planning]", "filename": "tau_knowledge.md"}, {"source_type": "twitter", "title": "HAL — The Holistic Agent Leaderboard for Standardized Agent Evaluation", "url": "https://x.com/benediktstroebl/status/1895148129655365779", "author": "@benediktstroebl", "date": "2025-02-27", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, leaderboard, HAL, Princeton, standardized-evaluation, cost-aware, meta-benchmark]", "filename": "thread_hal_holistic_agent_leaderboard_benediktstroebl.md"}, {"source_type": "arxiv", "title": "Can Large Language Models Detect Errors in Long Chain-of-Thought Reasoning? (DeltaBench)", "url": "https://arxiv.org/abs/2502.19361", "author": "(LivingFutureLab / OpenStellarTeam)", "date": "2025-02-26", "retrieved": "2026-04-19", "tags": "[benchmark, reasoning, chain-of-thought, error-detection, process-reward-models, evaluation, meta-evaluation]", "filename": "2502.19361-deltabench.md"}, {"source_type": "arxiv", "title": "REALM-Bench: A Benchmark for Evaluating Multi-Agent Systems on Real-world, Dynamic Planning and Scheduling Tasks", "url": "https://arxiv.org/abs/2502.18836", "author": "Longling Geng, Edward Y. Chang", "date": "2025-02-26", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, multi-agent, planning, scheduling, logistics, dynamic, real-world]", "filename": "realm-bench.md"}, {"source_type": "arxiv", "title": "WebGames: Challenging General-Purpose Web-Browsing AI Agents", "url": "https://arxiv.org/abs/2502.18356", "author": "George Thomas, Alex J. 
Chan, Jikun Kang et al.", "date": "2025-02-25", "retrieved": "2026-04-16", "tags": "[agentic, benchmark, evaluation, web-navigation, tool-use, planning, reasoning, dataset]", "filename": "webgames.md"}, {"source_type": "twitter", "title": "Evaluating AI Agents — Andrew Ng's Short Course on Agent Evals", "url": "https://x.com/AndrewYNg/status/1892258190546653392", "author": "@AndrewYNg", "date": "2025-02-19", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, evaluation-methodology, course, LLM-as-judge, code-based-evals, education]", "filename": "thread_andrew_ng_evaluating_agents.md"}, {"source_type": "twitter", "title": "SWE-Lancer — Testing AI on $1 Million Worth of Freelance Coding Tasks", "url": "https://x.com/OpenAI/status/1891911123517018521", "author": "@OpenAI", "date": "2025-02-18", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, coding, software-engineering, freelance, economic-value, OpenAI]", "filename": "thread_swe_lancer_openai.md"}, {"source_type": "announcement", "title": "SWE-Lancer: Can Frontier LLMs Earn $1 Million from Real-World Freelance Software Engineering?", "url": "https://openai.com/index/swe-lancer/", "author": "OpenAI", "date": "2025-02-17 (arxiv 2502.12115)", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, software-engineering, coding, freelance, economic-value, Upwork]", "filename": "openai_swe_lancer.md"}, {"source_type": "arxiv", "title": "BaxBench: Can LLMs Generate Correct and Secure Backends?", "url": "https://arxiv.org/abs/2502.11844", "author": "Mark Vero, Niels Mündler, Victor Chibotaru, Veselin Raychev, Maximilian Baader, Nikola Jovanović, Jingxuan He, Martin Vechev", "date": "2025-02-17", "retrieved": "2026-05-05", "tags": "[benchmark, code-generation, security, backend, agentic, tool-use, correctness, vulnerability, LLM-evaluation, ICML-2025]", "filename": "2502.11844-baxbench.md"}, {"source_type": "arxiv", "title": "SWE-Lancer: Can Frontier LLMs Earn $1 Million from Real-World Freelance Software Engineering?", "url": "https://arxiv.org/abs/2502.12115", "author": "Samuel Miserendino, Michele Wang, Tejal Patwardhan, Johannes Heidecke", "date": "2025-02-17", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, software-engineering, freelance, coding, real-world, OpenAI, economic-value]", "filename": "swe-lancer.md"}, {"source_type": "arxiv", "title": "WorldGUI: An Interactive Benchmark for Desktop GUI Automation from Any Starting Point", "url": "https://arxiv.org/abs/2502.08047", "author": "Henry Hengyuan Zhao, Kaiming Yang, Wendi Yu, Difei Gao, Mike Zheng Shou", "date": "2025-02-12", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, GUI-automation, desktop, robustness, planning, computer-interaction]", "filename": "worldgui.md"}, {"source_type": "twitter", "title": "Benchmark Saturation — Are We Running Out of Hard Tests?", "url": "https://timkellogg.me/blog/2025/02/12/recursive-improvement", "author": "Various (community discussion)", "date": "2025-02-12", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, saturation, limitations, criticism, MMLU, GPQA, SWE-bench, RE-Bench]", "filename": "thread_benchmark_saturation_timkellogg.md"}, {"source_type": "announcement", "title": "HackerRank-ASTRA: Evaluating Correctness & Consistency of Large Language Models on Cross-Domain Multi-File Project Problems", "url": "https://www.hackerrank.com/ai/astra-reports", "author": "Jun Xing, Mayur Bhatia, Sahil Phulwani, Darshan Suresh, Rafik Matta (HackerRank)", "date": "2025-02-11 (arxiv 2502.00226, submitted 2025-01-31)", 
"retrieved": "2026-03-09", "tags": "[agentic, benchmark, coding, evaluation, LLM, software-engineering, frontend, multi-file, project-based, consistency, code-generation, SDLC]", "filename": "hackerrank_astra.md"}, {"source_type": "announcement", "title": "Galileo Agent Leaderboard", "url": "https://huggingface.co/spaces/galileo-ai/agent-leaderboard", "author": "Galileo Labs (Pratik Bhavsar, Conor Bronsdon)", "date": "2025-02-11", "retrieved": "2026-03-29", "tags": "[leaderboard, benchmark, evaluation, function-calling, tool-use, agentic]", "filename": "galileo_agent_leaderboard.md"}, {"source_type": "twitter", "title": "Benchmarking Single Agent Performance — When Do Agents Break Down?", "url": "https://x.com/LangChainAI/status/1889006836294074607", "author": "@LangChainAI", "date": "2025-02-10", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, single-agent, tool-use, context-window, performance-degradation, multi-agent]", "filename": "thread_benchmarking_single_agent_langchain.md"}, {"source_type": "arxiv", "title": "AgentDyn: A Dynamic Open-Ended Benchmark for Evaluating Prompt Injection Attacks of Real-World Agent Security System", "url": "https://arxiv.org/abs/2602.03117", "author": "Hao Li et al.", "date": "2025-02", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, tool-use, security, prompt-injection, planning]", "filename": "agentdyn.md"}, {"source_type": "arxiv", "title": "AMA-Bench: Evaluating Long-Horizon Memory for Agentic Applications", "url": "https://arxiv.org/abs/2602.22769", "author": "Yujie Zhao et al.", "date": "2025-02", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, memory, reasoning, planning, dataset]", "filename": "ama_bench.md"}, {"source_type": "arxiv", "title": "Collab-Overcooked: Benchmarking and Evaluating Large Language Models as Collaborative Agents", "url": "https://arxiv.org/abs/2502.20073", "author": "Haochen Sun et al.", "date": "2025-02", "retrieved": "2026-05-01", "tags": "[benchmark, evaluation, multi-agent, collaboration, LLM-MAS, natural-language-communication, cooperative-AI, game-environment, process-oriented-evaluation]", "filename": "collab_overcooked_llm_collaborative_agents.md"}, {"source_type": "arxiv", "title": "DataSciBench: An LLM Agent Benchmark for Data Science", "url": "https://arxiv.org/abs/2502.13897", "author": "Dan Zhang et al.", "date": "2025-02", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, code-generation, reasoning, dataset, planning]", "filename": "datascibench.md"}, {"source_type": "arxiv", "title": "An Extended Benchmarking of Multi-Agent Reinforcement Learning Algorithms in Complex Fully Cooperative Tasks", "url": "https://arxiv.org/abs/2502.04773", "author": "George Papadopoulos et al.", "date": "2025-02", "retrieved": "2026-05-01", "tags": "[benchmark, evaluation, multi-agent, reinforcement-learning, cooperative, MARL, robot-cooperation, warehouse, resource-management, image-observations]", "filename": "extended_marl_benchmarking_cooperative.md"}, {"source_type": "arxiv", "title": "Gaia2: Benchmarking LLM Agents on Dynamic and Asynchronous Environments", "url": "https://arxiv.org/abs/2602.11964", "author": "Romain Froger et al. 
(Meta SuperIntelligence Labs)", "date": "2025-02", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, multi-agent, reasoning, planning, tool-use, function-calling, memory, robustness]", "filename": "gaia2_benchmark.md"}, {"source_type": "arxiv", "title": "General Agent Evaluation", "url": "https://arxiv.org/abs/2602.22953", "author": "Elron Bandel, Asaf Yehudai, Lilach Eden et al. (IBM Research)", "date": "2025-02", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, agentic, tool-use, code-generation, web-navigation, planning, taxonomy, leaderboard]", "filename": "general_agent_eval.md"}, {"source_type": "arxiv", "title": "Hybrid-Gym: Training Coding Agents to Generalize Across Tasks", "url": "https://arxiv.org/abs/2602.16819", "author": "Yiqing Xie, Emmy Liu, Gaokai Zhang et al. (Carnegie Mellon University, All Hands AI)", "date": "2025-02", "retrieved": "2026-03-28", "tags": "[agentic, evaluation, code-generation, reasoning, planning, dataset, training]", "filename": "hybrid_gym.md"}, {"source_type": "arxiv", "title": "ProjDevBench: Benchmarking AI Coding Agents on End-to-End Project Development", "url": "https://arxiv.org/abs/2602.01655", "author": "Pengrui Lu*, Shiqi Zhang*, Yunzhong Hou* et al.", "date": "2025-02", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, code-generation, planning, debugging]", "filename": "projdevbench.md"}, {"source_type": "arxiv", "title": "SEA-HELM: Southeast Asian Holistic Evaluation of Language Models", "url": "https://arxiv.org/abs/2502.14301", "author": "Yosephine Susanto, Adithya Venkatadri Hulagadri, Jann Railey Montalan et al.", "date": "2025-02", "retrieved": "2026-03-28", "tags": "[evaluation, benchmark, survey, leaderboard]", "filename": "sea_helm.md"}, {"source_type": "arxiv", "title": "SkillsBench: Benchmarking the Efficacy of Agent Skills Augmentation", "url": "https://arxiv.org/abs/2602.12670", "author": "Merrill et al. (Laude Institute)", "date": "2025-02", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, tool-use, code-generation, planning, reasoning]", "filename": "skillsbench.md"}, {"source_type": "arxiv", "title": "SWE-rebench V2: Language-Agnostic SWE Task Collection at Scale", "url": "https://arxiv.org/abs/2602.23866", "author": "Ibragim Badertdinov et al. 
(Nebius)", "date": "2025-02", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, code-generation, debugging, dataset, tool-use]", "filename": "swe_rebench_v2.md"}, {"source_type": "arxiv", "title": "TRACE: Trajectory-Aware Comprehensive Evaluation for Deep Research Agents", "url": "https://arxiv.org/abs/2602.21230", "author": "Yanyu Chen et al.", "date": "2025-02", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, reasoning, planning, research, taxonomy, dataset]", "filename": "trace_deep_research.md"}, {"source_type": "announcement", "title": "VLAIR: Vals Legal AI Report", "url": "https://www.vals.ai/vlair", "author": "Vals AI / Legaltech Hub", "date": "2025-02", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, legal, evaluation, llm-as-judge, domain-specific, enterprise]", "filename": "summary_vlair.md"}, {"source_type": "announcement", "title": "Humanity's Last Exam", "url": "https://agi.safe.ai/", "author": "Center for AI Safety (CAIS) & Scale AI", "date": "2025-01-28", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, expert-reasoning, multimodal, frontier-evaluation, calibration]", "filename": "summary_humanitys_last_exam.md"}, {"source_type": "arxiv", "title": "KoCo-Bench: Can Large Language Models Leverage Domain Knowledge in Software Development?", "url": "https://arxiv.org/abs/2601.13240", "author": "Xue Jiang, Jiaru Qian, Xianjie Shi, Chenjie Li, Hao Zhu, Ziyu Wang, Jielun Zhang, Zheyu Zhao, Kechi Zhang, Jia Li, Wenpin Jiao, Zhi Jin, Ge Li, Yihong Dong", "date": "2025-01-22", "retrieved": "2026-03-09", "tags": "[benchmark, code-generation, domain-knowledge, agentic, software-engineering, RAG, reinforcement-learning, embodied-AI, agents, evaluation]", "filename": "kocobench.md"}, {"source_type": "arxiv", "title": "ComplexFuncBench: Exploring Multi-Step and Constrained Function Calling under Long-Context Scenario", "url": "https://arxiv.org/abs/2501.10132", "author": "Lucen Zhong et al.", "date": "2025-01-17", "retrieved": "2026-04-27", "tags": "[agentic, benchmark, function-calling, tool-use, evaluation, long-context, multi-step, constraints, parameter-reasoning]", "filename": "complex_func_bench.md"}, {"source_type": "announcement", "title": "DPAB-α: Dria Pythonic Agent Benchmark", "url": "https://huggingface.co/blog/andthattoo/dpab-a", "author": "Atakan Tekparmak, andthattoo (Dria / FirstBatch)", "date": "2025-01-15", "retrieved": "2026-03-23", "tags": "[agentic, benchmark, evaluation, function-calling, tool-use, reasoning]", "filename": "summary_dpab_alpha.md"}, {"source_type": "twitter", "title": "Dria DPAB-alpha — Benchmark for Multi-Step Reasoning and Function Calling", "url": "https://x.com/mervenoyann/status/1879645656576639197", "author": "@mervenoyann", "date": "2025-01-15", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, function-calling, multi-step-reasoning, Dria, creative-tasks]", "filename": "thread_dria_dpab_agentic_benchmark_mervenoyann.md"}, {"source_type": "twitter", "title": "Scale AI SEAL Leaderboards — Comprehensive Agent and Model Evaluation", "url": "https://x.com/scale_AI/status/1909998772631069145", "author": "@scale_AI", "date": "2025-01-09", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, leaderboard, SEAL, Scale-AI, SWE-Bench-Pro, MCP-Atlas, coding, safety]", "filename": "thread_scale_seal_leaderboards.md"}, {"source_type": "arxiv", "title": "Mem-Gallery: Benchmarking Multimodal Long-Term Conversational Memory for MLLM Agents", "url": "https://arxiv.org/abs/2601.03515", "author": 
"Yuanchen Bei, Tianxin Wei, Xuying Ning, Yanjun Zhao, Zhining Liu, Xiao Lin, Yada Zhu, Hendrik Hamann, Jingrui He, Hanghang Tong", "date": "2025-01-07", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, memory, long-term-memory, multimodal, multi-turn, agentic, conversational]", "filename": "mem_gallery.md"}, {"source_type": "arxiv", "title": "CodeElo: Benchmarking Competition-level Code Generation of LLMs with Human-comparable Elo Ratings", "url": "https://arxiv.org/abs/2501.01257", "author": "Shanghaoran Quan et al.", "date": "2025-01-02", "retrieved": "2026-04-25", "tags": "[benchmark, code-generation, competitive-programming, evaluation, leaderboard, elo-rating, codeforces, reasoning, algorithms]", "filename": "2501.01257-codeelo.md"}, {"source_type": "arxiv", "title": "A3: Android Agent Arena for Mobile GUI Agents with Essential-State Procedural Evaluation", "url": "https://arxiv.org/abs/2501.01149", "author": "Yuxiang Chai, Shunye Tang, Han Xiao, Weifeng Lin, Hanhao Li, Jiayu Zhang, Liang Liu, Pengxiang Zhao, Guangyi Liu, Guozhi Wang, Shuai Ren, Rongduo Han, Haining Zhang, Siyuan Huang, Hongsheng Li", "date": "2025-01-02", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, android, mobile, gui, essential-state-evaluation]", "filename": "android-agent-arena.md"}, {"source_type": "arxiv", "title": "SWE-EVO: Benchmarking Coding Agents in Long-Horizon Software Evolution Scenarios", "url": "https://arxiv.org/abs/2512.18470", "author": "Pham et al. (FPT Software AI Center / University of Melbourne)", "date": "2025-01 (v2)", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, code-generation, agentic, software-engineering, planning, reasoning, long-horizon]", "filename": "swe_evo.md"}, {"source_type": "arxiv", "title": "ABC-Bench: Benchmarking Agentic Backend Coding in Real-World Development", "url": "https://arxiv.org/abs/2601.11077", "author": "Yang et al. (Fudan University / Shanghai Qiji Zhifeng)", "date": "2025-01", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, agentic, code-generation, tool-use, software-engineering, deployment, multi-language]", "filename": "abc_bench.md"}, {"source_type": "arxiv", "title": "AgencyBench: A Comprehensive Benchmark for Long-Horizon Real-World Agent Tasks", "url": "https://arxiv.org/abs/2601.11044", "author": "Yinger Zhang et al. (GAIR-NLP)", "date": "2025-01", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, tool-use, code-generation, debugging, research, planning, memory]", "filename": "agencybench.md"}, {"source_type": "arxiv", "title": "AI-NativeBench: An Open-Source White-Box Agentic Benchmark Suite for AI-Native Systems", "url": "https://arxiv.org/abs/2601.09393", "author": "Zirui Wang, Guangba Yu et al.", "date": "2025-01", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, multi-agent, tool-use, mcp, planning, reasoning, taxonomy]", "filename": "ai_nativebench.md"}, {"source_type": "arxiv", "title": "DeepPlanning: Benchmarking Long-Horizon Agentic Planning with Verifiable Constraints", "url": "https://arxiv.org/abs/2601.18137", "author": "Yinger Zhang, Shutong Jiang, Renhao Li et al. 
(Qwen Team, Alibaba Group)", "date": "2025-01", "retrieved": "2026-05-03", "tags": "[agentic, benchmark, evaluation, reasoning, planning]", "filename": "deep_planning.md"}, {"source_type": "arxiv", "title": "DeepSearchQA: Bridging the Comprehensiveness Gap for Deep Research Agents", "url": "https://arxiv.org/abs/2601.20975", "author": "Nikita Gupta, Riju Chatterjee, Lukas Haas, Connie Tao et al. (Google DeepMind / Google Search / Kaggle / Google Research)", "date": "2025-01", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, reasoning, web-navigation, planning, dataset, deep-research]", "filename": "deepsearchqa.md"}, {"source_type": "arxiv", "title": "DevOps-Gym: Benchmarking AI Agents in Software DevOps Cycle", "url": "https://arxiv.org/abs/2601.20882", "author": "Yuheng Tang, Kaijie Zhu, Bonan Ruan et al. (UC Santa Barbara / NUS / UC Berkeley / Google / UCLA)", "date": "2025-01", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, code-generation, tool-use, debugging, planning, os-interaction]", "filename": "devops_gym.md"}, {"source_type": "arxiv", "title": "FrontierScience", "url": "https://arxiv.org/abs/2601.21165", "author": "OpenAI (Miles Turpin et al.)", "date": "2025-01", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, reasoning, research, dataset]", "filename": "frontierscience.md"}, {"source_type": "arxiv", "title": "IDRBench: Interactive Deep Research Benchmark", "url": "https://arxiv.org/abs/2601.06676", "author": "Yingchaojie Feng et al.", "date": "2025-01", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, deep-research, interactive, multi-agent, planning, reasoning, report-generation]", "filename": "idrbench.md"}, {"source_type": "arxiv", "title": "MCPAgentBench: A Real-world Task Benchmark for Evaluating LLM Agent MCP Tool Use", "url": "https://arxiv.org/abs/2512.24565", "author": "Wenrui Liu, Zixiang Liu, Elsie Dai et al.", "date": "2025-01", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, tool-use, function-calling, MCP, planning, efficiency]", "filename": "mcpagentbench_pku.md"}, {"source_type": "arxiv", "title": "MMDeepResearch-Bench: A Benchmark for Multimodal Deep Research Agents", "url": "https://arxiv.org/abs/2601.12346", "author": "Peizhou Huang et al.", "date": "2025-01", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, deep-research, multimodal, report-generation, retrieval, reasoning, citation-grounding]", "filename": "mmdeepresearch_bench.md"}, {"source_type": "arxiv", "title": "From Laboratory to Real-World Applications: Benchmarking Agentic Code Reasoning at the Repository Level", "url": "https://arxiv.org/abs/2601.03731", "author": "Jia Li, Yuxin Su, Michael R. 
Lyu", "date": "2025-01", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, code-generation, reasoning, debugging]", "filename": "reporeason.md"}, {"source_type": "announcement", "title": "Context-Bench: Benchmarking LLMs on Agentic Context Engineering", "url": "https://www.letta.com/blog/context-bench", "author": "Letta", "date": "2025 (ongoing updates)", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, context-engineering, memory, file-operations, multi-step-reasoning, tool-use]", "filename": "letta_context_bench.md"}, {"source_type": "arxiv", "title": "AgentDS Technical Report: Benchmarking the Future of Human-AI Collaboration in Domain-Specific Data Science", "url": "https://arxiv.org/abs/2603.19005", "author": "An Luo, Jin Du, Xun Xian, Robert Specht, Fangqiao Tian, Ganghua Wang, Xuan Bi, Charles Fleming, Ashish Kundu, Jayanth Srinivasa, Mingyi Hong, Rui Zhang, Tianxi Li, Galin Jones, Jie Ding", "date": "2025 (competition October 2025; submitted 2026)", "retrieved": "2026-03-25", "tags": "[agentic, benchmark, data-science, human-ai-collaboration, multimodal, domain-specific, coding, competition]", "filename": "agentds.md"}, {"source_type": "announcement", "title": "Finance Agent Benchmark: Benchmarking LLMs on Financial Analyst Tasks", "url": "https://www.vals.ai/benchmarks/finance_agent", "author": "Vals AI (in collaboration with Stanford researchers and a Global Systemically Important Bank)", "date": "2025 (arxiv 2508.00828)", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, finance, SEC-filings, financial-analysis, tool-use, enterprise]", "filename": "vals_ai_finance_agent.md"}, {"source_type": "announcement", "title": "RIEMANN-BENCH: A Benchmark for Moonshot Mathematics", "url": "https://cdn.prod.website-files.com/68dc970bd6e945ea3fb0f426/69c2d73f5d377a9428089ff7_88b9c61d478380737e8f8dc285adba31_RiemannBench.pdf", "author": "Surge AI Research", "date": "2025", "retrieved": "2026-03-27", "tags": "[benchmark, evaluation, reasoning, research]", "filename": "summary_riemannbench.md"}, {"source_type": "announcement", "title": "Terminal-Bench: Benchmarks for AI Agents in Terminal Environments", "url": "https://www.tbench.ai/", "author": "Stanford x Laude", "date": "2025", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, terminal, cli, system-administration, security, devops, machine-learning, data-science]", "filename": "summary_terminal_bench.md"}, {"source_type": "announcement", "title": "SEAL LLM Leaderboards: Expert-Driven Evaluations", "url": "https://scale.com/leaderboard", "author": "Scale AI", "date": "2024-2025 (ongoing, with 15 new benchmarks introduced in 2025)", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, leaderboard, tool-use, coding, reasoning, safety, multimodal, software-engineering, enterprise]", "filename": "scale_ai_seal_leaderboards.md"}, {"source_type": "arxiv", "title": "Plancraft: an evaluation dataset for planning with LLM agents", "url": "https://arxiv.org/abs/2412.21033", "author": "Gautier Dagan, Frank Keller, Alex Lascarides (University of Edinburgh)", "date": "2024-12-30", "retrieved": "2026-03-09", "tags": "[agentic, benchmark, planning, minecraft, multi-modal, knowledge-base, feasibility, crafting, VLM]", "filename": "plancraft.md"}, {"source_type": "arxiv", "title": "SWE-EVO: Benchmarking Coding Agents in Long-Horizon Software Evolution Scenarios", "url": "https://arxiv.org/abs/2512.18470", "author": "Minh VT Thai et al.", "date": "2024-12-23", "retrieved": "2026-04-25", "tags": "[agentic, benchmark, 
code-generation, evaluation, software-engineering, long-horizon, planning, multi-file, reasoning, tool-use]", "filename": "2512.18470-swe-evo.md"}, {"source_type": "twitter", "title": "TheAgentCompany — Benchmarking AI Agents on Real-World Workplace Tasks", "url": "https://x.com/gneubig/status/1869735196700062089", "author": "@gneubig", "date": "2024-12-19", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, workplace, software-development, project-management, data-science, multi-task]", "filename": "thread_theagentcompany_gneubig.md"}, {"source_type": "arxiv", "title": "SafeAgentBench: A Benchmark for Safe Task Planning of Embodied LLM Agents", "url": "https://arxiv.org/abs/2412.13178", "author": "Sheng Yin et al.", "date": "2024-12-17", "retrieved": "2026-04-03", "tags": "[agentic, benchmark, evaluation, safety, embodied, task-planning, household-robots, LLM-agents, simulation, planning, reasoning]", "filename": "safeagentbench.md"}, {"source_type": "arxiv", "title": "NL2Repo-Bench: Towards Long-Horizon Repository Generation Evaluation of Coding Agents", "url": "https://arxiv.org/abs/2512.12730", "author": "Jingzhe Ding et al.", "date": "2025-12-14", "retrieved": "2026-04-25", "tags": "[agentic, benchmark, code-generation, evaluation, software-engineering, repository-generation, long-horizon, tool-use, python, multi-file, planning]", "filename": "2512.12730-nl2repo-bench.md"}, {"source_type": "arxiv", "title": "Agent-SafetyBench: Evaluating the Safety of LLM Agents", "url": "https://arxiv.org/abs/2412.14470", "author": "Zhexin Zhang et al.", "date": "2024-12", "retrieved": "2026-03-23", "tags": "[agentic, benchmark, evaluation, safety, tool-use, behavioral-safety, failure-modes]", "filename": "agent_safetybench.md"}, {"source_type": "arxiv", "title": "The BrowserGym Ecosystem for Web Agent Research", "url": "https://arxiv.org/abs/2412.05467", "author": "Thibault Le Sellier De Chezelles, Maxime Gasse, Alexandre Lacoste et al.", "date": "2024-12", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, web-navigation, agentic, tool-use, planning, reasoning, leaderboard]", "filename": "browsergym.md"}, {"source_type": "arxiv", "title": "HammerBench: Fine-Grained Function-Calling Evaluation in Real Mobile Assistant Scenarios", "url": "https://arxiv.org/abs/2412.16516", "author": "Jun Wang, Jiamu Zhou, Muning Wen, Xiaoyun Mo, Haoyu Zhang, Qiqiang Lin, Cheng Jin, Xihuai Wang, Weinan Zhang, Qiuying Peng, Jun Wang", "date": "2024-12", "retrieved": "2026-03-29", "tags": "[benchmark, function-calling, tool-use, multi-turn, mobile-assistant, slot-filling, intent-shift]", "filename": "hammerbench.md"}, {"source_type": "arxiv", "title": "RE-Bench: Evaluating Frontier AI R&D Capabilities of Language Model Agents Against Human Experts", "url": "https://arxiv.org/abs/2411.15114", "author": "Hjalmar Wijk, Tao Lin, Joel Becker et al. 
(METR)", "date": "2024-11-20", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, research, code-generation, planning, debugging, reasoning]", "filename": "re_bench.md"}, {"source_type": "arxiv", "title": "ToolScan: A Benchmark for Characterizing Errors in Tool-Use LLMs", "url": "https://arxiv.org/abs/2411.13547", "author": "Shirley Kokane et al.", "date": "2024-11-20", "retrieved": "2026-04-13", "tags": "[agentic, benchmark, evaluation, tool-use, error-analysis, function-calling, error-taxonomy, diagnostic]", "filename": "toolscan.md"}, {"source_type": "arxiv", "title": "CRMArena: Understanding the Capacity of LLM Agents to Perform Professional CRM Tasks in Realistic Environments", "url": "https://arxiv.org/abs/2411.02305", "author": "Kung-Hsiang Huang et al.", "date": "2024-11", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, tool-use, function-calling, enterprise, crm]", "filename": "crmarena.md"}, {"source_type": "arxiv", "title": "PARTNR: A Benchmark for Planning and Reasoning in Embodied Multi-agent Tasks", "url": "https://arxiv.org/abs/2411.00081", "author": "Matthew Chang et al.", "date": "2024-11", "retrieved": "2026-04-01", "tags": "[agentic, benchmark, evaluation, multi-agent, planning, reasoning, tool-use, dataset]", "filename": "partnr_embodied_multiagent.md"}, {"source_type": "arxiv", "title": "Spider 2.0: Evaluating Language Models on Real-World Enterprise Text-to-SQL Workflows (DBT Component)", "url": "https://arxiv.org/abs/2411.07763", "author": "Fangyu Lei, Jixuan Chen et al. (XLANG Lab, Tao Yu)", "date": "2024-11", "retrieved": "2026-03-29", "tags": "[agentic, benchmark, evaluation, code-generation, database, tool-use, reasoning]", "filename": "spider_2_0_dbt.md"}, {"source_type": "twitter", "title": "How Anthropic Topped SWE-bench Verified — Prompt, Tools, and Agent Design", "url": "https://x.com/ErikSchluntz/status/1851690352714867074", "author": "@ErikSchluntz", "date": "2024-10-30", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, SWE-bench, coding, software-engineering, Anthropic, Claude, agent-design]", "filename": "thread_swebench_anthropic_erikschluntz.md"}, {"source_type": "announcement", "title": "Spider 2.0: Evaluating Language Models on Real-World Enterprise Text-to-SQL Workflows", "url": "https://spider2-sql.github.io/", "author": "XLANG Lab (Fangyu Lei, Jixuan Chen, Ruisheng Cao, Yuxiao Ye, Tao Yu et al.)", "date": "2024-10-29", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, text-to-sql, enterprise, database, spider, XLANG]", "filename": "summary_spider_2.md"}, {"source_type": "arxiv", "title": "AgentHarm: A Benchmark for Measuring Harmfulness of LLM Agents", "url": "https://arxiv.org/abs/2410.09024", "author": "Maksym Andriushchenko, Alexandra Souly, Mateusz Dziemian, Derek Duenas, Maxwell Lin, Justin Wang, Dan Hendrycks, Andy Zou, Zico Kolter, Matt Fredrikson, Eric Winsor, Jerome Wynne, Yarin Gal, Xander Davies", "date": "2024-10-11", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, safety, harmful-agents, jailbreak, adversarial, ICLR-2025, cybersecurity, fraud]", "filename": "agentharm.md"}, {"source_type": "arxiv", "title": "COMMA: A Communicative Multimodal Multi-Agent Benchmark", "url": "https://arxiv.org/abs/2410.07553", "author": "Timothy Ossowski, Danyal Maqbool, Jixuan Chen, Zefan Cai, Tyler Bradshaw, Junjie Hu", "date": "2024-10-10", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, multi-agent, multimodal, communication, collaboration, reasoning]", "filename": "comma.md"}, 
{"source_type": "arxiv", "title": "VoiceAgentBench: Are Voice Assistants ready for agentic tasks?", "url": "https://arxiv.org/abs/2510.07978", "author": "Dhruv Jain et al.", "date": "2024-10-10", "retrieved": "2026-04-27", "tags": "[agentic, benchmark, tool-use, evaluation, voice, speech, function-calling, multilingual, safety, indic-languages]", "filename": "voice_agent_bench.md"}, {"source_type": "twitter", "title": "MLE-bench — Evaluating ML Agents on Kaggle Engineering Tasks", "url": "https://x.com/OpenAI/status/1844429536353714427", "author": "@OpenAI", "date": "2024-10-10", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, machine-learning, Kaggle, ML-engineering, OpenAI]", "filename": "thread_mle_bench_openai.md"}, {"source_type": "arxiv", "title": "ST-WebAgentBench: A Benchmark for Evaluating Safety and Trustworthiness in Web Agents", "url": "https://arxiv.org/abs/2410.06703", "author": "Ido Levy, Ben Wiesel, Sami Marreed, Alon Oved, Avi Yaeli, Segev Shlomov", "date": "2024-10-09", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, web-agents, safety, trustworthiness, enterprise, policy-compliance]", "filename": "st-webagentbench.md"}, {"source_type": "arxiv", "title": "SWE-bench Multimodal: Do AI Systems Generalize to Visual Software Domains?", "url": "https://arxiv.org/abs/2410.03859", "author": "John Yang, Carlos E. Jimenez, Alex L. Zhang, Kilian Lieret, Joyce Yang, Xindi Wu, Ori Press, Niklas Muennighoff, Gabriel Synnaeve, Karthik R. Narasimhan, Diyi Yang, Sida I. Wang, Ofir Press", "date": "2024-10-04", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, software-engineering, multimodal, visual, JavaScript, front-end, bug-fixing]", "filename": "swe-bench-multimodal.md"}, {"source_type": "arxiv", "title": "Agent Security Bench (ASB): Formalizing and Benchmarking Attacks and Defenses in LLM-based Agents", "url": "https://arxiv.org/abs/2410.02644", "author": "Hanrong Zhang et al.", "date": "2024-10-03", "retrieved": "2026-04-03", "tags": "[agentic, benchmark, evaluation, security, adversarial, prompt-injection, backdoor, memory-poisoning, tool-use, safety]", "filename": "agent_security_bench.md"}, {"source_type": "arxiv", "title": "ScienceAgentBench: Toward Rigorous Assessment of Language Agents for Data-Driven Scientific Discovery", "url": "https://arxiv.org/abs/2410.05080", "author": "Ziru Chen, Shijie Chen, Yuting Ning et al.", "date": "2024-10", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, code-generation, research, reasoning, tool-use, dataset]", "filename": "scienceagentbench.md"}, {"source_type": "arxiv", "title": "VideoWebArena: Evaluating Long Context Multimodal Agents with Video Understanding Web Tasks", "url": "https://arxiv.org/abs/2410.19100", "author": "Lawrence Jang, Yinheng Li, Dan Zhao, Charles Ding, Justin Lin, Paul Pu Liang, Rogerio Bonatti, Kazuhito Koishida", "date": "2024-10", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, web-navigation, reasoning, planning, memory]", "filename": "videowebarena.md"}, {"source_type": "announcement", "title": "ForecastBench: A Dynamic Benchmark of AI Forecasting Capabilities", "url": "https://www.forecastbench.org/", "author": "Forecasting Research Institute", "date": "2024-09-30", "retrieved": "2026-04-03", "tags": "[benchmark, evaluation, forecasting, reasoning, probabilistic-prediction, llm, human-comparison, dynamic-benchmark, contamination-free]", "filename": "summary_forecastbench.md"}, {"source_type": "arxiv", "title": "CORE-Bench: Fostering the 
Credibility of Published Research Through a Computational Reproducibility Agent Benchmark", "url": "https://arxiv.org/abs/2409.11363", "author": "Zachary S. Siegel, Sayash Kapoor, Nitya Nadgir, Benedikt Stroebl, Arvind Narayanan", "date": "2024-09-17", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, research, code-generation, debugging, tool-use]", "filename": "core_bench.md"}, {"source_type": "twitter", "title": "How Cognition Evaluates Coding Agents — The cognition-golden Benchmark", "url": "https://x.com/cognition_labs/status/1834292727464488966", "author": "@cognition_labs", "date": "2024-09-12", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, coding, Devin, internal-benchmark, economically-valuable, evaluation-methodology]", "filename": "thread_cognition_devin_evaluation_methodology.md"}, {"source_type": "arxiv", "title": "EnIGMA: Interactive Tools Substantially Assist LM Agents in Finding Security Vulnerabilities", "url": "https://arxiv.org/abs/2409.16165", "author": "Talor Abramovich et al.", "date": "2024-09", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, cybersecurity, tool-use, debugging, reasoning, planning]", "filename": "enigma_ctf.md"}, {"source_type": "arxiv", "title": "AppWorld: A Controllable World of Apps and People for Benchmarking Interactive Coding Agents", "url": "https://arxiv.org/abs/2407.18901", "author": "Harsh Trivedi et al.", "date": "2024-07", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, tool-use, code-generation, function-calling, planning, reasoning, memory]", "filename": "appworld.md"}, {"source_type": "arxiv", "title": "AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?", "url": "https://arxiv.org/abs/2407.15711", "author": "Ori Yoran et al.", "date": "2024-07", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, web-navigation, planning, reasoning, dataset]", "filename": "assistantbench.md"}, {"source_type": "arxiv", "title": "OfficeBench: Benchmarking Language Agents across Multiple Applications for Office Automation", "url": "https://arxiv.org/abs/2407.19056", "author": "Zilong Wang, Yuedong Cui, Li Zhong et al.", "date": "2024-07", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, tool-use, planning, reasoning]", "filename": "officebench.md"}, {"source_type": "arxiv", "title": "POGEMA: A Benchmark Platform for Cooperative Multi-Agent Navigation", "url": "https://arxiv.org/abs/2407.14931", "author": "Skrynnik et al.", "date": "2024-07", "retrieved": "2026-05-01", "tags": "[benchmark, evaluation, multi-agent, pathfinding, navigation, cooperative, reinforcement-learning, marl, grid-world, decentralized]", "filename": "pogema_cooperative_pathfinding_benchmark.md"}, {"source_type": "arxiv", "title": "SciCode: A Research Coding Benchmark Curated by Scientists", "url": "https://arxiv.org/abs/2407.13168", "author": "Minyang Tian, Luyu Gao, Shizhuo Dylan Zhang et al.", "date": "2024-07", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, code-generation, reasoning, research, dataset]", "filename": "scicode.md"}, {"source_type": "twitter", "title": "tau-bench — Benchmark for Tool-Agent-User Interaction in Real-World Domains", "url": "https://x.com/karthik_r_n/status/1803846916800942292", "author": "@karthik_r_n", "date": "2024-06-20", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, tool-use, customer-service, reliability, pass-k, Sierra-AI]", "filename": "thread_tau_bench_karthik_r_n.md"}, {"source_type": 
"arxiv", "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents", "url": "https://arxiv.org/abs/2406.13352", "author": "Edoardo Debenedetti et al.", "date": "2024-06", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, tool-use, function-calling, security, prompt-injection]", "filename": "agentdojo.md"}, {"source_type": "announcement", "title": "Nexus Function Calling Evaluation (NexusFCEval)", "url": "https://huggingface.co/datasets/Nexusflow/NexusFCEval", "author": "Nexusflow", "date": "2024-05", "retrieved": "2026-03-29", "tags": "[benchmark, evaluation, function-calling, tool-use, agentic]", "filename": "nexus_function_calling.md"}, {"source_type": "arxiv", "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents", "url": "https://arxiv.org/abs/2403.02691", "author": "Qiusi Zhan, Zhixiang Liang, Zifan Ying, Daniel Kang", "date": "2024-03-05", "retrieved": "2026-03-22", "tags": "[agentic, benchmark, safety, security, tool-use, prompt-injection]", "filename": "injecagent.md"}, {"source_type": "announcement", "title": "Introducing the WorkArena Benchmark", "url": "https://www.servicenow.com/blogs/2024/introducing-workarena-benchmark", "author": "ServiceNow Research", "date": "2024-03 (arxiv 2403.07718)", "retrieved": "2026-03-07", "tags": "[agentic, benchmark, web-agents, knowledge-work, enterprise, browser-automation, ServiceNow]", "filename": "servicenow_workarena.md"}, {"source_type": "arxiv", "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code", "url": "https://arxiv.org/abs/2403.07974", "author": "Naman Jain, King Han, Alex Gu, Wen-Ding Li, Fanjia Yan, Tianjun Zhang, Sida I. Wang, Armando Solar-Lezama, Koushik Sen, Ion Stoica", "date": "2024-03", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, code-generation, debugging, reasoning, contamination, leaderboard]", "filename": "livecodebench.md"}, {"source_type": "arxiv", "title": "Benchmarking Data Science Agents", "url": "https://arxiv.org/abs/2402.17168", "author": "Yuge Zhang et al.", "date": "2024-02-27", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, agentic, code-generation, dataset, reasoning]", "filename": "dsbench.md"}, {"source_type": "arxiv", "title": "API-BLEND: A Comprehensive Corpora for Training and Benchmarking API LLMs", "url": "https://arxiv.org/abs/2402.15491", "author": "Kinjal Basu, Ibrahim Abdelaziz et al.", "date": "2024-02-23", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, function-calling, tool-use, dataset]", "filename": "api_blend.md"}, {"source_type": "arxiv", "title": "Executable Code Actions Elicit Better LLM Agents", "url": "https://arxiv.org/abs/2402.01030", "author": "Wang et al. 
(UIUC / Apple)", "date": "2024-02", "retrieved": "2026-03-28", "tags": "[agentic, tool-use, code-generation, benchmark, evaluation, function-calling, reasoning, planning]", "filename": "codeact.md"}, {"source_type": "arxiv", "title": "VisualWebArena: Evaluating Multimodal Agents on Realistic Visually Grounded Web Tasks", "url": "https://arxiv.org/abs/2401.13649", "author": "Jing Yu Koh, Robert Lo, Lawrence Jang et al.", "date": "2024-01-24", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, web-navigation, multimodal, planning, reasoning]", "filename": "visualwebarena.md"}, {"source_type": "arxiv", "title": "WebVoyager: Building an End-to-End Web Agent with Large Multimodal Models", "url": "https://arxiv.org/abs/2401.13919", "author": "Hongliang He, Wenlin Yao, Kaixin Ma, Wenhao Yu, Yong Dai, Hongming Zhang, Zhenzhong Lan, Dong Yu", "date": "2024-01", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, web-navigation, planning]", "filename": "webvoyager.md"}, {"source_type": "announcement", "title": "MINT: Evaluating LLMs in Multi-turn Interaction with Tools and Language Feedback", "url": "https://xwang.dev/mint-bench/", "author": "Xingyao Wang, Zihan Wang, Jiateng Liu, Yangyi Chen, Lifan Yuan, Hao Peng, Heng Ji (UIUC & Renmin University)", "date": "2024 (ICLR 2024)", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, tool-use, multi-turn, feedback, reasoning, coding, decision-making]", "filename": "mint_benchmark.md"}, {"source_type": "announcement", "title": "SWE-bench: Resolving Real-World GitHub Issues (Lite, Verified, Multilingual, Multimodal)", "url": "https://www.swebench.com/", "author": "Princeton NLP (Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, Karthik Narasimhan)", "date": "2024", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, software-engineering, coding, SWE-bench, github-issues, python]", "filename": "summary_swe_bench_lite.md"}, {"source_type": "announcement", "title": "SWT-Bench: Testing and Validating Real-World Bug-Fixes with Code Agents", "url": "https://swtbench.com/", "author": "LogicStar AI / Secure, Reliable, and Intelligent Systems Lab (SRI), ETH Zurich", "date": "2024", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, software-testing, test-generation, bug-reproduction, code-agents, NeurIPS-2024]", "filename": "summary_swt_bench.md"}, {"source_type": "arxiv", "title": "GAIA: A Benchmark for General AI Assistants", "url": "https://arxiv.org/abs/2311.12983", "author": "Grégoire Mialon, Clémentine Fourrier et al.", "date": "2023-11-21", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, reasoning, tool-use, multimodal, web-navigation]", "filename": "gaia_benchmark.md"}, {"source_type": "arxiv", "title": "GPQA: A Graduate-Level Google-Proof Q&A Benchmark", "url": "https://arxiv.org/abs/2311.12022", "author": "David Rein et al.", "date": "2023-11-20", "retrieved": "2026-03-28", "tags": "[benchmark, evaluation, reasoning, dataset, scalable-oversight]", "filename": "gpqa_diamond.md"}, {"source_type": "arxiv", "title": "LLM-Coordination: Evaluating and Analyzing Multi-agent Coordination Abilities in Large Language Models", "url": "https://arxiv.org/abs/2310.03903", "author": "Saaket Agashe, Yue Fan, Anthony Reyna, Xin Eric Wang", "date": "2023-10-05", "retrieved": "2026-03-12", "tags": "[agentic, benchmark, multi-agent, coordination, theory-of-mind, game-playing, zero-shot]", "filename": "llm-coordination.md"}, {"source_type": "arxiv", "title": "WebArena: A 
Realistic Web Environment for Building Autonomous Agents", "url": "https://arxiv.org/abs/2307.13854", "author": "Shuyan Zhou, Frank F. Xu et al.", "date": "2023-07-25", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, web-navigation, planning, tool-use, reasoning]", "filename": "webarena.md"}, {"source_type": "arxiv", "title": "InterCode: Standardizing and Benchmarking Interactive Coding with Execution Feedback", "url": "https://arxiv.org/abs/2306.14898", "author": "John Yang, Akshara Prabhakar, Karthik Narasimhan, Shunyu Yao", "date": "2023-06", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, code-generation, debugging, tool-use, planning, reasoning]", "filename": "intercode.md"}, {"source_type": "arxiv", "title": "On the Tool Manipulation Capability of Open-source Large Language Models", "url": "https://arxiv.org/abs/2305.16504", "author": "Qiantong Xu, Fenglu Hong, Bo Li, Changran Hu, Zhengyu Chen, Jian Zhang", "date": "2023-05-26", "retrieved": "2026-03-28", "tags": "[benchmark, tool-use, function-calling, evaluation, agentic, code-generation]", "filename": "toolbench_sambanova.md"}, {"source_type": "announcement", "title": "Chatbot Arena & MT-Bench", "url": "https://lmsys.org/", "author": "LMSYS (Wei-Lin Chiang, Lianmin Zheng, Ying Sheng et al., UC Berkeley)", "date": "2023-05-03", "retrieved": "2026-03-28", "tags": "[benchmark, human-preference, elo-rating, pairwise-comparison, conversation-quality, llm-as-judge]", "filename": "summary_chatbot_arena.md"}, {"source_type": "announcement", "title": "HELM: Holistic Evaluation of Language Models", "url": "https://crfm.stanford.edu/helm/", "author": "Stanford Center for Research on Foundation Models (CRFM)", "date": "2023-01-01", "retrieved": "2026-03-28", "tags": "[benchmark, holistic-evaluation, language-models, safety, capabilities, multimodal, framework]", "filename": "summary_helm.md"}, {"source_type": "arxiv", "title": "WebShop: Towards Scalable Real-World Web Interaction with Grounded Language Agents", "url": "https://arxiv.org/abs/2207.01206", "author": "Shunyu Yao, Howard Chen, John Yang, Karthik Narasimhan", "date": "2022-07", "retrieved": "2026-03-28", "tags": "[agentic, benchmark, evaluation, web-navigation, reasoning, planning, memory, grounding]", "filename": "webshop.md"}]}