BioML-bench: Evaluation of AI Agents for End-to-End Biomedical ML

doi:10.1101/2025.09.01.673319

BioML-bench: Evaluation of AI Agents for End-to-End Biomedical ML

2025 · doi:10.1101/2025.09.01.673319

preprint OA: closed CC-BY-4.0

📄 Open PDF Full text JSON View at publisher

Full text 36,394 characters · extracted from preprint-html · click to expand

BioML-bench: Evaluation of AI Agents for End-to-End Biomedical ML | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results BioML-bench: Evaluation of AI Agents for End-to-End Biomedical ML View ORCID Profile Henry E. Miller , View ORCID Profile Matthew Greenig , Benjamin Tenmann , View ORCID Profile Bo Wang doi: https://doi.org/10.1101/2025.09.01.673319 Henry E. Miller 1 Shift Bioscience , Toronto, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Henry E. 
Miller For correspondence: henrymiller2024{at}gmail.com Matthew Greenig 2 University of Cambridge , ScienceMachine, Cambridge, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Matthew Greenig Benjamin Tenmann 3 ScienceMachine , London, UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Bo Wang 4 University of Toronto, Vector Institute , Toronto, Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Bo Wang Abstract Full Text Info/History Metrics Preview PDF Abstract Large language model (LLM) agents hold promise for accelerating biomedical research and development (R&D). Several biomedical agents have recently been proposed, but their evaluation has largely been restricted to question answering (e.g., LAB-Bench) or narrow bioinformatics tasks. Presently, there remains a lack of benchmarks evaluating agent capability in multi-step data analysis workflows or in solving the machine learning (ML) challenges central to AI-driven therapeutics development, such as perturbation response modeling or drug toxicity prediction. We introduce BioML-bench , the first benchmarking suite for evaluating AI agents on end-to-end biomedical ML tasks. BioML-bench spans four domains (protein engineering, single-cell omics, biomedical imaging, and drug discovery) with tasks that require agents to parse a task description, build a pipeline, implement models, and submit predictions graded by established metrics (e.g., AUROC, Spearman). We evaluate four open-source agents: two biomedical specialists (STELLA, Biomni) and two generalists (AIDE, MLAgentBench). On average, agents underperform relative to human baselines, and biomedical specialization does not confer a consistent advantage. 
We also found that agents which employed more diverse ML strategies more often tended to score highest, suggesting that architecture and scaffolding may be stronger determinants of performance. These findings underscore both the potential and current limits of agentic systems for biomedical ML, and highlight the need for systematic, reproducible evaluations. BioML-bench is provided open-source at github.com/science-machine/biomlbench . 1 Introduction Large language model (LLM) agents are systems built around LLMs that can plan and take actions autonomously via tools and code execution. They hold promise for transforming biomedical research and development (R&D) by automating time-consuming tasks such as literature search ( Skarlinski et al. [2024 ]) and by enabling in silico experimentation, as exemplified by a recently-described virtual lab for identifying nanobodies against SARS-CoV-2 ( Swanson et al. [2025 ]). A key opportunity is the reliable automation of biomedical machine learning (ML) workflows (e.g., perturbation-response modeling, image analysis, and drug property prediction) which demand competence across data preprocessing, model design/selection, and hyperparameter optimization. However, most evaluations of biomedical agents to date focus on question answering or narrow bioinformatics tasks ( Jin et al. [2025 ], Huang et al. [2025 ]), leaving open the question of whether agents can complete multi-step, end-to-end ML pipelines that reflect real-world use cases. We introduce BioML-bench , a benchmarking suite for evaluating AI agents on end-to-end biomedical ML tasks ( Figure 1 ). BioML-bench comprises biomedical-specific tasks and metrics across four domains (protein engineering, single-cell omics, biomedical imaging, and drug discovery). To complete these tasks, agents must parse a task description, build data pipelines, implement models, and submit predictions, which are then graded by task-specific metrics (e.g., AUROC, Spearman). 
We evaluate two biomedical specialist agents (STELLA, Biomni) and two general-purpose ML agents (AIDE, MLAgentBench). While performance varies across agents, all agents on average underperform human baselines; moreover, we observe no consistent advantage for biomedical-specialized over generalist agents, suggesting that agent architecture and scaffolding may be the primary drivers of capability at present. In particular, we find that successful agents use more diverse ML strategies (e.g., feature engineering, model stacking) more often than less successful agents. Together, these findings clarify both the current reach and the limits of agentic systems for biomedical ML and motivate systematic, reproducible evaluation going forward. To meet this need, BioML-bench is released as a pip-installable package (with tutorials and a documentation website) in order to lower barriers for benchmarking and to accelerate reproducible progress in agentic biomedical ML: github.com/science-machine/biomlbench . Download figure Open in new tab Figure 1. BioML-bench overview. The benchmark spans four biomedical ML domains ( protein engineering, single-cell omics, biomedical imaging, drug discovery ). Based upon MLE-bench, each task is packaged as a task capsule with a task description, training data, test features, and an example submission file. Agents run in a Docker container with packages provided via a pre-built conda environment. To complete a task, agents ingest the task description, design and fit ML models, and emit predictions that are automatically validated and graded by task-specific metrics (e.g., AUROC, Spearman) and ranked against human leaderboards which are largely expert-populated. Created with BioRender.com . 2 Related Work Benchmarks for Biomedical ML Numerous biomedical benchmarks exist for evaluating ML models, including the Therapeutics Data Commons (drug discovery) ( Huang et al. [2021 ]), ProteinGym (protein fitness prediction) ( Notin et al. 
[2023 ]), BBBC (bioimage analysis) ( Ljosa et al. [2012 ]), and OpenProblems (single-cell omics) ( Luecken et al. [2025 ]). These provide valuable task definitions and human baselines, but they assess isolated model performance rather than whether autonomous agents can design and execute complete ML workflows. Biomedical Agents Several biomedical agents have recently been proposed, including Biomni ( Huang et al. [2025 ]), STELLA (Jin et al. [2025 ]), Virtual Lab ( Swanson et al. [2025 ]), Perturbo-Agent ( Hao et al. [2025 ]), CellForge ( Tang et al. [2025 ]), and Cell Voyager ( Alber et al. [2025 ]). These systems highlight the potential of agentic AI for computational biology, spanning designs that emphasize self-improvement and orchestration of external tools. However, evaluations to date have typically been restricted to question answering, literature reasoning, or narrow domain-specific analyses such as perturbation panel design. For this work, we focused on STELLA and Biomni as the only biomedical agents sufficiently general to be evaluated across the variety of tasks in BioML-bench. Agent Benchmarks Existing agent benchmarks are complementary, but major gaps remain. MLE-bench demonstrated that executable, end-to-end evaluation of ML agents is feasible at scale ( Chan et al. [2025 ]). However, its scope is restricted to Kaggle-derived tasks, whose leaderboards are populated by crowd-sourced submissions rather than domain experts. BixBench provided a broad evaluation of bioinformatics agents with expert human baselines ( Mitchener et al. [2025 ]), yet it relies mainly on text-based responses and bioinformatics tasks that do not involve supervised learning, leaving open whether agents can design and run complete biomedical ML workflows. Together, these efforts underscore the need for a benchmark that combines executable evaluation with biomedical domain-specific tasks and expert-populated leaderboards. 
Gap To our knowledge, no existing benchmark evaluates whether LLM agents can complete end-to-end ML tasks across diverse biomedical domains. BioML-bench fills this gap by combining graded biomedical ML tasks with human baselines drawn mostly from expert-populated leaderboards. By reporting not only capability but also success and failure modes, it provides a more realistic measure of agent readiness for biomedical R&D. 3 Methods 3.1 Benchmark Setup BioML-bench evaluates whether agents can design and execute full ML pipelines whose predictions are graded against a held-out ground truth. The framework is adapted from MLE-bench, but with key modifications to support biomedical contexts, including support for multiple biomedical task domains, new biomedical agents, tasks, and evaluation metrics, and support for bioinformatics-specific file formats (e.g., h5ad). It was also extended with multiple cloud deployment configurations and full mkdocs documentation. In BioML-bench, task capsules provide agents with a structured task description, training data, test features, and a sample submission in valid format. Agents must design, implement, and run ML models to generate test-set predictions, and emit outputs. Submissions are automatically validated and scored. 3.2 Task Suite The benchmark spans four biomedical ML domains central to R&D: drug discovery, biomedical imaging, protein engineering, and single-cell omics. Tasks were selected for both practical relevance to biotechnology and the availability of populated human leaderboards to provide performance baselines. Sources include PolarisHub (drug discovery), Kaggle (biomedical imaging), ProteinGym (protein engineering), and OpenProblems (single-cell) ( Table 1 , with detailed task descriptions in Table 3 ). View this table: View inline View popup Download powerpoint Table 1: Summary of task domains and sources. Representative tasks are shown; full task lists appear in the appendix. 
3.3 Agents We evaluated four agent frameworks: two biomedical research agents (Biomni ( Huang et al. [2025 ]) and STELLA ( Jin et al. [2025 ])) and two general-purpose ML agents (AIDE ( Jiang et al. [2025 ]) and MLAgentBench ( Huang et al. [2024 ])). Biomni (specifically, v0.0.5) and STELLA were chosen as the only biomedical agents sufficiently general to operate across multiple task domains, while AIDE (v6.3.3) and MLAgentBench represent strong baselines from MLE-bench. At evaluation time, we used the latest compatible LLM backends: GPT-4.1 for MLAgentBench and AIDE, Claude Sonnet 4 for Biomni, and the OpenRouter default for STELLA. A simple “Dummy” agent was also included, producing uninformative predictions (random or zero depending on the task). 3.4 Evaluation Protocol and Compute Budget Each task provides a structured task card describing the objective and dataset. Agents must load data, build pipelines, train models, and output predictions to a file in a standardised format. Submissions are validated and graded using the task’s original metric. For each agent–task combination, four independent replicates were run. To quantify both performance and reliability we report: Leaderboard percentile: agent score mapped to the public leaderboard percentile for the task ( higher is better ). Runs with no valid submission either receive a percentile of zero (penalized scoring) or are not considered (non-penalized scoring). Mean rank: average relative rank of an agent (compared to other agents) across tasks, based on leaderboard percentile ( lower is better ). This allows aggregation across domains with varying leaderboard distributions. Above-median (%): proportion of tasks where the agent’s mean score exceeds the median human submission. Any-medal (%): proportion of tasks where the agent achieves a public leaderboard medal threshold, following the MLE-bench definition ( Chan et al. [2025 ]). 
Completion rate (%): proportion of tasks that yield a valid, gradable submission. All runs were executed under matched resource budgets to ensure comparability. Imaging tasks used a single NVIDIA L4 GPU (16 vCPUs, 64 GB RAM) with a 16-hour wall-clock limit. All other tasks used CPU-only machines with identical specifications and an 8-hour limit. 4 Results 4.1 Overall performance Table 2 reports aggregate agent performance across all tasks. In the “penalized” setting, failed runs (runs in which an agent fails to produce a valid submission file) are assigned a leaderboard percentile of zero, reflecting the view that a failed run corresponds to the poorest possible performance. Under this analysis, Biomni (which had no failed tasks) achieves the strongest overall results, with the best leaderboard percentile (34.45 ± 5.36) and mean rank (1.88 ± 0.18). In the “non-penalized” setting, failed runs are excluded from percentile calculations, reflecting the idea that many failures stem from scaffolding issues rather than fundamental capability and thus shouldn’t penalize benchmarking performance. Here, STELLA is highly competitive with Biomni, outperforming it on most metrics. View this table: View inline View popup Download powerpoint Table 2: Overall BioML-bench performance. Metrics defined in Section 3.4 . In the penalized setting, failed runs are scored as zero; in the non-penalized setting, failed runs are omitted. Biomni achieves the highest penalized performance while STELLA achieves the highest in the non-penalized regime. However, the performance difference between AIDE, STELLA, and Biomni is generally within the reported SEM. 4.2 Domain-level analysis of performance Figure 2 and Table 4 summarize agent performance by domain with penalties applied. In single-cell omics, Biomni is strongest, closely followed by STELLA and AIDE. In Biomedical Imaging, MLAgentBench yields the best mean percentile, while in Protein Engineering and Drug Discovery Biomni leads. 
Removing the failed-run penalty changes the picture: AIDE and STELLA appear more robust, with rankings shifting across domains. Specifically, STELLA leads Biomedical Imaging by a wide margin, AIDE leads Protein Engineering, and Biomni leads Drug Discovery. As with overall results, domain-level analyses indicate that once failure penalties are removed, Biomni no longer shows a consistent performance lead compared to the other agents. Download figure Open in new tab Figure 2. Domain-level agent performance (penalized). Per-task leaderboard percentiles by agent within each domain. Points are individual tasks; bars and whiskers summarize the mean ± SEM. Failed run penalties are applied such that a run without a successful answer submission is graded as having a leaderboard percentile of zero. 4.3 Successful Agent Strategies To better understand factors driving performance, we manually examined logs from runs where agents exceeded the human leaderboard median (percentile >50%). Across 94 such runs, we cataloged the prevalence of different ML strategies ( Figure 3 ). The strongest agents overall (AIDE, STELLA, Download figure Open in new tab Figure 3. ML strategies used by agents in above-median runs. Heatmap showing the proportion of above-median runs (runs in which agents perform above 50% of the human leaderboard) in which an agent used any particular ML strategy. and Biomni) employed a wider variety of strategies more often, frequently testing multiple feature engineering approaches and model types. Detailed examination of particularly successful runs sometimes revealed sophisticated agent strategies leveraging clear domain understanding, even for generalist agents. For example, AIDE achieved a 100th percentile leaderboard position on the Tsuboyama 2023 ProteinGym benchmark ( Notin et al. [2023 ], Tsuboyama et al. 
[2023 ]) using advanced feature engineering approaches that involved calculating a wide range of evolutionary and biophysical quantities from raw protein sequences, including BLOSUM substitution scores, sequence conservation indices, and specific biochemical characteristics of amino acids such as charge, volume, and hydrogen bond propensities. Interestingly, these strategies were not always consistently applied, even in replicate runs with the same agent on the same task; in fact, on a different run for the Tsuboyama task, AIDE achieved only a 54th percentile leaderboard position using an approach with a significantly less-extensive protein feature engineering pipeline. Substantial variability was also observed in the ML architectures chosen by agents ( Figure 5 ): Biomni and MLAgentBench relied on random forests in roughly 50% of cases; STELLA alternated among random forests, ridge regression, and XGBoost; AIDE often used stacked or gradient boosting models (e.g., LightGBM). Strikingly, deep learning (DL) was rarely attempted and almost never selected as the final model, even on image-based tasks. This stands in contrast to human leaderboards, which are dominated by DL approaches. 4.4 Agent failure modes We also analyzed 41 failed runs to characterize agent failure modes ( Figure 6 ). AIDE often failed due to resource exhaustion, particularly when copying large image files. MLAgentBench and STELLA frequently exited early without producing outputs, often due to scripts that executed but failed to save predictions (e.g., silent errors from try–except fallbacks) or agents attempting unnecessary environment creation that stalled on package installation. AIDE and STELLA were affected by uncaught exceptions in their frameworks. Finally, some runs exceeded the time limit without producing results. Importantly, these failure modes did not reflect fundamental architectural flaws but rather are likely addressable issues in scaffolding and execution. 
5 Discussion Our results show that both specialized biomedical agents and general-purpose ML agents can solve end-to-end tasks in biology-heavy domains such as single-cell omics and protein engineering. While Biomni achieved the strongest average performance, STELLA overtook it once failed-run penalties were removed, suggesting that scaffolding issues (e.g., resource exhaustion, uncaught exceptions) rather than fundamental agent limitations are an important factor in benchmarking performance. Moreover, while Biomni and STELLA demonstrated strong benchmarking performance, their advantage over one non-specialist agent, AIDE, was typically within the SEM range. These findings indicate that biomedical specialization alone does not guarantee superior performance on biomedical ML tasks. More broadly, all agents struggled to consistently exceed median human leaderboard performance, with Biomni, STELLA, and AIDE averaging only in the 34–39th percentile range. Analysis of successful runs revealed that top-performing agents primarily relied on classical ML strategies such as feature engineering, feature selection, and hyperparameter tuning. Strikingly, these approaches achieved strong leaderboard placements despite the fact that human submissions were often dominated by deep learning models. Failure analysis was equally revealing: most failed runs were attributable to scaffolding issues (e.g., silent errors in agent-written code or uncaught framework exceptions) rather than inherent weaknesses of the underlying agent architectures. Taken together, these results underscore both the reach and the current limits of agentic systems for biomedical ML. Agents are capable of assembling and executing workable pipelines across diverse biomedical tasks, but they generally underperform human domain experts, and their reliability is heavily constrained by framework issues. 
While BioML-bench reuses the scaffold of MLE-bench (e.g., task capsules, automated grading), it diverges in scope and substance: tasks are drawn from ProteinGym, OpenProblems, and PolarisHub rather than just from Kaggle, extended to biomedical data formats, and benchmarked against expert-populated leaderboards. These differences shift the evaluation from ranking well in generic ML competitions to solving scientific ML problems end-to-end, better-reflecting the task domains and constraints of real-world biomedical R&D. Overall, BioML-bench provides the community with a reproducible and extensible framework for evaluating agentic systems on real biomedical ML problems. By grounding evaluation in verifiable outputs rather than textual answers, it enables systematic progress toward agents that can automate meaningful components of computational biomedical research. Future work will expand the task suite, incorporate additional agents, and investigate strategies for improving both capability and robustness, with the long-term goal of advancing trustworthy agentic systems for biomedical R&D. 5.1 Limitations and Future Work Some limitations of our study should be noted. First, the choice of LLMs for agents introduces potential confounds: stronger base LLMs may yield stronger agent performance. To mitigate this, each agent was paired with its recommended or default LLM (using the latest-generation compatible version). Still, future work should systematically vary LLMs to more cleanly disentangle agent design from LLM capability. Additionally, the interpretation of human performance is itself imperfect. Public leaderboards inevitably reflect a mix of participant expertise, effort, and compute budgets. A key advantage of the biomedical benchmarks used in BioML-bench is that their leaderboards are populated largely by domain experts, providing a more meaningful point of reference than generic crowd-sourced competitions. 
Still, our use of these leaderboards remains pragmatic: they offer an external, reproducible baseline for situating agent performance, consistent with MLE-bench. Future work may consider complementary baselines, such as automated ML optimization systems like H2O AutoML ( LeDell and Poirier [2020 ]) and Lazy Predict, which are useful for evaluating whether LLM agents that employ classical ML approaches provide an advantage over automated pipelines. It will also be valuable to explore agent scaffolding that better supports deep learning architectures, as current agents did not seem willing to use these approaches. Finally, it will be interesting to evaluate the impact of tool-usage (e.g., web search, literature review), memory/self-evolving agents, and multi-agent system designs on agent performance. Competing Interests H. M. is an employee of Shift Bioscience. M. G. is a contract employee of ScienceMachine. B. T. is the CTO of ScienceMachine. B. W. is an SVP at Xaira Therapeutics. A Supplementary Material View this table: View inline View popup Download powerpoint Table 3: Comprehensive task descriptions for the BioML-bench suite. Each row specifies the domain, task identifier, and biomedical objective, with dataset sources noted in the text. This table provides the full task set underlying BioML-bench; a high-level summary of domains and representative problems is given in Table 1 . Download figure Open in new tab Figure 4. Domain-level agent performance in the non-penalized setting. Each point shows the leaderboard percentile for a single task; bars and whiskers indicate mean ± SEM across tasks within a domain. Unlike Figure 2 , failed runs are omitted rather than assigned a score of zero. Download figure Open in new tab Figure 5. Model architectures used by agents in above-median runs (i.e., runs scoring above 50% of the human leaderboard). Bars show the proportion of runs using each architecture. 
“Stacked Models” denotes any metalearner that combines multiple base models. View this table: View inline View popup Download powerpoint Table 4: BioML-bench results by domain. Metrics are defined in Section 3.4 . Results are reported under penalized (failed runs scored as zero) and non-penalized (failed runs omitted) settings. Download figure Open in new tab Figure 6. Failure mode analysis. Stacked bar chart showing the proportion of failed runs for each agent attributable to specific failure modes. Acknowledgments Gabriel Mejia provided extensive and valuable feedback on this manuscript. Additionally, ScienceMachine provided the compute and LLM API resources for this work. Footnotes Based on critical updates to the STELLA agent, benchmarking was rerun and new figures were generated. References ↵ Samuel Alber , Bowen Chen , Eric Sun , Alina Isakova , Aaron J. Wilk , and James Zou . Cellvoyager: Ai compbio agent generates new insights by autonomously analyzing biological data . bioRxiv , 2025 . doi: 10.1101/2025.06.03.657517 . URL https://www.biorxiv.org/content/early/2025/06/04/2025.06.03.657517 . OpenUrl Abstract / FREE Full Text ↵ Jun Shern Chan , Neil Chowdhury , Oliver Jaffe , James Aung , Dane Sherburn , Evan Mays , Giulio Starace , Kevin Liu , Leon Maksin , Tejal Patwardhan , Lilian Weng , and Aleksander Mądry . Mle-bench: Evaluating machine learning agents on machine learning engineering , 2025 . URL https://arxiv.org/abs/2410.07095 . ↵ Minsheng Hao , Yongju Lee , Hanchen Wang , Gabriele Scalia , and Aviv Regev . Perturboagent: A self-planning agent for boosting sequential perturb-seq experiments . bioRxiv , 2025 . doi: 10.1101/2025.05.25.656020 . URL https://www.biorxiv.org/content/early/2025/05/27/2025.05.25.656020 . OpenUrl Abstract / FREE Full Text ↵ Kexin Huang , Tianfan Fu , Wenhao Gao , Yue Zhao , Yusuf Roohani , Jure Leskovec , Connor W. Coley , Cao Xiao , Jimeng Sun , and Marinka Zitnik . 
Therapeutics data commons: Machine learning datasets and tasks for drug discovery and development , 2021 . URL https://arxiv.org/abs/2102.09548 . ↵ Kexin Huang , Serena Zhang , Hanchen Wang , Yuanhao Qu , Yingzhou Lu , Yusuf Roohani , Ryan Li , Lin Qiu , Gavin Li , Junze Zhang , D. Yin , Shruti Marwaha , Jennefer N. Carter , Xin Zhou , Matthew Wheeler , Jonathan A. Bernstein , Mengdi Wang , Peng He , Jingtian Zhou , Michael Snyder , L. Cong , Aviv Regev , and Jure Leskovec . Biomni: A general-purpose biomedical ai agent . bioRxiv , 2025 . doi: 10.1101/2025.05.30.656746 . URL https://www.biorxiv.org/content/early/2025/06/02/2025.05.30.656746 . OpenUrl Abstract / FREE Full Text ↵ Qian Huang , Jian Vora , Percy Liang , and Jure Leskovec . Mlagentbench: Evaluating language agents on machine learning experimentation , 2024 . URL https://arxiv.org/abs/2310.03302 . ↵ Zhengyao Jiang , Dominik Schmidt , Dhruv Srikanth , Dixing Xu , Ian Kaplan , Deniss Jacenko , and Yuxiang Wu . Aide: Ai-driven exploration in the space of code , 2025 . URL https://arxiv.org/abs/2502.13138 . ↵ Ruofan Jin , Zaixi Zhang , Mengdi Wang , and Le Cong . Stella: Self-evolving llm agent for biomedical research , 2025 . URL https://arxiv.org/abs/2507.02004 . ↵ Erin LeDell and Sebastien Poirier . H2o automl: Scalable automatic machine learning . In 7th ICML Workshop on Automated Machine Learning , 2020 . URL http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html . ↵ Vebjorn Ljosa , Katherine L. Sokolnicki , and Anne E. Carpenter. Annotated high-throughput microscopy image sets for validation . Nature Methods , 9 ( 7 ): 637 – 637 , Jul 2012 . ISSN 1548-7105 . doi: 10.1038/nmeth.2083 . URL https://doi.org/10.1038/nmeth.2083. OpenUrl CrossRef PubMed Web of Science ↵ Malte D. Luecken , Scott Gigante , Daniel B. Burkhardt , Robrecht Cannoodt , Daniel C. Strobl , Nikolay S. Markov , Luke Zappia , Giovanni Palla , Wesley Lewis , Daniel Dimitrov , Michael E. Vinyard , D. S. Magruder , Michaela F. 
Mueller , Alma Andersson , Emma Dann , Qian Qin , Dominik J. Otto , Michal Klein , Olga Borisovna Botvinnik , Louise Deconinck , Kai Waldrant , Sai Nirmayi Yasa , Artur Szałata , Andrew Benz , Zhijian Li , Bastian Rieck , Constantin Ahlmann-Eltze , Eduardo da Veiga Beltrame , Carmen Bravo González-Blas , Ann T. Chen , Benjamin DeMeo , Can Ergen , Swann Floc’hlay , Adam Gayoso , Stephanie Hicks , Yuge Ji , Vitalii Kleshchevnikov , Gioele La Manno , Maximilian G. Lombardo , Romain Lopez , Dario Righelli , Hirak Sarkar , Valentine Svensson , Alexander Tong , Galen Xing , Chenling Xu , Jonathan M. Bloom , Angela Oliveira Pisco , Julio Saez-Rodriguez , Drausin Wulsin , Luca Pinello , Yvan Saeys , Fabian J. Theis , Smita Krishnaswamy , and Open Problems Jamboree Members . Defining and benchmarking open problems in single-cell analysis . Nature Biotechnology , 43 ( 7 ): 1035 – 1040 , Jul 2025 . ISSN 1546-1696 . doi: 10.1038/s41587-025-02694-w . URL https://doi.org/10.1038/s41587-025-02694-w. OpenUrl CrossRef PubMed ↵ Ludovico Mitchener , Jon M Laurent , Benjamin Tenmann , Siddharth Narayanan , Geemi P Wellawatte , Andrew White , Lorenzo Sani , and Samuel G Rodriques . Bixbench: a comprehensive benchmark for llm-based agents in computational biology , 2025 . URL https://arxiv.org/abs/2503.00096 . ↵ A. Oh , T. Naumann , A. Globerson , K. Saenko , M. Hardt , and S. Levine Pascal Notin , Aaron Kollasch , Daniel Ritter , Lood van Niekerk , Steffanie Paul , Han Spinner , Nathan Rollins , Ada Shaw , Rose Orenbuch , Ruben Weitzman , Jonathan Frazer , Mafalda Dias , Dinko Franceschi , Yarin Gal , and Debora Marks . Proteingym: Large-scale benchmarks for protein fitness prediction and design . In A. Oh , T. Naumann , A. Globerson , K. Saenko , M. Hardt , and S. Levine , editors, Advances in Neural Information Processing Systems , volume 36 , pages 64331 – 64379 . Curran Associates, Inc ., 2023 . 
URL https://proceedings.neurips.cc/paper_files/paper/2023/file/cac723e5ff29f65e3fcbb0739ae91bee-Paper-Datasets_and_Benchmarks.pdf . OpenUrl ↵ Michael D. Skarlinski , Sam Cox , Jon M. Laurent , James D. Braza , Michaela Hinks , Michael J. Hammerling , Manvitha Ponnapati , Samuel G. Rodriques , and Andrew D. White . Language agents achieve superhuman synthesis of scientific knowledge , 2024 . URL https://arxiv.org/abs/2409.13740 . ↵ Kyle Swanson , Wesley Wu , Nash L. Bulaong , John E. Pak , and James Zou . The virtual lab of ai agents designs new sars-cov-2 nanobodies . Nature , Jul 2025 . ISSN 1476-4687 . doi: 10.1038/s41586-025-09442-9 . URL https://doi.org/10.1038/s41586-025-09442-9. OpenUrl CrossRef ↵ Xiangru Tang , Zhuoyun Yu , Jiapeng Chen , Yan Cui , Daniel Shao , Weixu Wang , Fang Wu , Yuchen Zhuang , Wenqi Shi , Zhi Huang , Arman Cohan , Xihong Lin , Fabian Theis , Smita Krishnaswamy , and Mark Gerstein . Cellforge: Agentic design of virtual cell models , 2025 . URL https://arxiv.org/abs/2508.02276 . ↵ Kotaro Tsuboyama , Justas Dauparas , Jonathan Chen , Elodie Laine , Yasser Mohseni Behbahani , Jonathan J. Weinstein , Niall M. Mangan , Sergey Ovchinnikov , and Gabriel J. Rocklin . Mega-scale experimental analysis of protein folding stability in biology and design . Nature , 620 ( 7973 ): 434 – 444 , July 2023 . ISSN 1476-4687 . doi: 10.1038/s41586-023-06328-6 . URL http://dx.doi.org/10.1038/s41586-023-06328-6. OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted September 28, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. 
You are going to email the following BioML-bench: Evaluation of AI Agents for End-to-End Biomedical ML Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share BioML-bench: Evaluation of AI Agents for End-to-End Biomedical ML Henry E. Miller , Matthew Greenig , Benjamin Tenmann , Bo Wang bioRxiv 2025.09.01.673319; doi: https://doi.org/10.1101/2025.09.01.673319 Share This Article: Copy Citation Tools BioML-bench: Evaluation of AI Agents for End-to-End Biomedical ML Henry E. Miller , Matthew Greenig , Benjamin Tenmann , Bo Wang bioRxiv 2025.09.01.673319; doi: https://doi.org/10.1101/2025.09.01.673319 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7636) Biochemistry (17704) Bioengineering (13898) Bioinformatics (41967) Biophysics (21460) Cancer Biology (18599) Cell Biology (25525) Clinical Trials (138) Developmental Biology (13384) Ecology (19909) Epidemiology (2067) Evolutionary Biology (24326) Genetics (15613) Genomics (22512) Immunology (17740) Microbiology (40423) Molecular Biology (17191) Neuroscience (88645) Paleontology (667) Pathology (2835) Pharmacology and Toxicology (4825) Physiology (7646) Plant Biology (15158) Scientific Communication and Education (2046) Synthetic Biology (4302) Systems Biology (9825) Zoology (2271)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall: last seen: 2026-05-24T02:00:01.246996+00:00

License: CC-BY-4.0