Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data

preprint OA: closed
Full text JSON View at publisher
Full text 287,370 characters · extracted from preprint-html · click to expand
Computational Study Protocol: Leveraging... | F1000Research "use strict";function _typeof(t){return(_typeof="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(t){return typeof t}:function(t){return t&&"function"==typeof Symbol&&t.constructor===Symbol&&t!==Symbol.prototype?"symbol":typeof t})(t)}!function(){var t=function(){var t,e,o=[],n=window,r=n;for(;r;){try{if(r.frames.__tcfapiLocator){t=r;break}}catch(t){}if(r===n.top)break;r=r.parent}t||(!function t(){var e=n.document,o=!!n.frames.__tcfapiLocator;if(!o)if(e.body){var r=e.createElement("iframe");r.style.cssText="display:none",r.name="__tcfapiLocator",e.body.appendChild(r)}else setTimeout(t,5);return!o}(),n.__tcfapi=function(){for(var t=arguments.length,n=new Array(t),r=0;r 3&&2===parseInt(n[1],10)&&"boolean"==typeof n[3]&&(e=n[3],"function"==typeof n[2]&&n[2]("set",!0)):"ping"===n[0]?"function"==typeof n[2]&&n[2]({gdprApplies:e,cmpLoaded:!1,cmpStatus:"stub"}):o.push(n)},n.addEventListener("message",(function(t){var e="string"==typeof t.data,o={};if(e)try{o=JSON.parse(t.data)}catch(t){}else o=t.data;var n="object"===_typeof(o)&&null!==o?o.__tcfapiCall:null;n&&window.__tcfapi(n.command,n.version,(function(o,r){var a={__tcfapiReturn:{returnValue:o,success:r,callId:n.callId}};t&&t.source&&t.source.postMessage&&t.source.postMessage(e?JSON.stringify(a):a,"*")}),n.parameter)}),!1))};"undefined"!=typeof module?module.exports=t:t()}(); dataLayer = dataLayer || []; // Standard GTM initialization - Google Consent Mode handles consent automatically (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= 'https://www.googletagmanager.com/gtm.js?id='+i+dl+ '>m_auth=hzk0Vc3qFsQYhCrIoHz68A>m_preview=env-1>m_cookies_win=x';f.parentNode.insertBefore(j,f); })(window,document,'script','dataLayer','GTM-MWFK8L5J'); 
;window.NREUM||(NREUM={});NREUM.init={distributed_tracing:{enabled:true},privacy:{cookies_enabled:true},ajax:{deny_list:["bam.nr-data.net"]}}; ;NREUM.loader_config={accountID:"438030",trustKey:"438030",agentID:"772317073",licenseKey:"97f8f67f26",applicationID:"772317073"} ;NREUM.info={beacon:"bam.nr-data.net",errorBeacon:"bam.nr-data.net",licenseKey:"97f8f67f26",applicationID:"772317073",sa:1} ;/*! For license information please see nr-loader-spa-1.236.0.min.js.LICENSE.txt */ (()=>{"use strict";var e,t,r={5763:(e,t,r)=>{r.d(t,{P_:()=>l,Mt:()=>g,C5:()=>s,DL:()=>v,OP:()=>T,lF:()=>D,Yu:()=>y,Dg:()=>h,CX:()=>c,GE:()=>b,sU:()=>_});var n=r(8632),i=r(9567);const o={beacon:n.ce.beacon,errorBeacon:n.ce.errorBeacon,licenseKey:void 0,applicationID:void 0,sa:void 0,queueTime:void 0,applicationTime:void 0,ttGuid:void 0,user:void 0,account:void 0,product:void 0,extra:void 0,jsAttributes:{},userAttributes:void 0,atts:void 0,transactionName:void 0,tNamePlain:void 0},a={};function s(e){if(!e)throw new Error("All info objects require an agent identifier!");if(!a[e])throw new Error("Info for ".concat(e," was never set"));return a[e]}function c(e,t){if(!e)throw new Error("All info objects require an agent identifier!");a[e]=(0,i.D)(t,o),(0,n.Qy)(e,a[e],"info")}var u=r(7056);const d=()=>{const e={blockSelector:"[data-nr-block]",maskInputOptions:{password:!0}};return{allow_bfcache:!0,privacy:{cookies_enabled:!0},ajax:{deny_list:void 0,enabled:!0,harvestTimeSeconds:10},distributed_tracing:{enabled:void 0,exclude_newrelic_header:void 0,cors_use_newrelic_header:void 0,cors_use_tracecontext_headers:void 0,allowed_origins:void 0},session:{domain:void 0,expiresMs:u.oD,inactiveMs:u.Hb},ssl:void 0,obfuscate:void 
0,jserrors:{enabled:!0,harvestTimeSeconds:10},metrics:{enabled:!0},page_action:{enabled:!0,harvestTimeSeconds:30},page_view_event:{enabled:!0},page_view_timing:{enabled:!0,harvestTimeSeconds:30,long_task:!1},session_trace:{enabled:!0,harvestTimeSeconds:10},harvest:{tooManyRequestsDelay:60},session_replay:{enabled:!1,harvestTimeSeconds:60,sampleRate:.1,errorSampleRate:.1,maskTextSelector:"*",maskAllInputs:!0,get blockClass(){return"nr-block"},get ignoreClass(){return"nr-ignore"},get maskTextClass(){return"nr-mask"},get blockSelector(){return e.blockSelector},set blockSelector(t){e.blockSelector+=",".concat(t)},get maskInputOptions(){return e.maskInputOptions},set maskInputOptions(t){e.maskInputOptions={...t,password:!0}}},spa:{enabled:!0,harvestTimeSeconds:10}}},f={};function l(e){if(!e)throw new Error("All configuration objects require an agent identifier!");if(!f[e])throw new Error("Configuration for ".concat(e," was never set"));return f[e]}function h(e,t){if(!e)throw new Error("All configuration objects require an agent identifier!");f[e]=(0,i.D)(t,d()),(0,n.Qy)(e,f[e],"config")}function g(e,t){if(!e)throw new Error("All configuration objects require an agent identifier!");var r=l(e);if(r){for(var n=t.split("."),i=0;i {r.d(t,{D:()=>i});var n=r(50);function i(e,t){try{if(!e||"object"!=typeof e)return(0,n.Z)("Setting a Configurable requires an object as input");if(!t||"object"!=typeof t)return(0,n.Z)("Setting a Configurable requires a model to set its initial properties");const r=Object.create(Object.getPrototypeOf(t),Object.getOwnPropertyDescriptors(t)),o=0===Object.keys(r).length?e:r;for(let a in o)if(void 0!==e[a])try{"object"==typeof e[a]&&"object"==typeof t[a]?r[a]=i(e[a],t[a]):r[a]=e[a]}catch(e){(0,n.Z)("An error occurred while setting a property of a Configurable",e)}return r}catch(e){(0,n.Z)("An error occured while setting a Configurable",e)}}},6818:(e,t,r)=>{r.d(t,{Re:()=>i,gF:()=>o,q4:()=>n});const 
n="1.236.0",i="PROD",o="CDN"},385:(e,t,r)=>{r.d(t,{FN:()=>a,IF:()=>u,Nk:()=>f,Tt:()=>s,_A:()=>o,il:()=>n,pL:()=>c,v6:()=>i,w1:()=>d});const n="undefined"!=typeof window&&!!window.document,i="undefined"!=typeof WorkerGlobalScope&&("undefined"!=typeof self&&self instanceof WorkerGlobalScope&&self.navigator instanceof WorkerNavigator||"undefined"!=typeof globalThis&&globalThis instanceof WorkerGlobalScope&&globalThis.navigator instanceof WorkerNavigator),o=n?window:"undefined"!=typeof WorkerGlobalScope&&("undefined"!=typeof self&&self instanceof WorkerGlobalScope&&self||"undefined"!=typeof globalThis&&globalThis instanceof WorkerGlobalScope&&globalThis),a=""+o?.location,s=/iPad|iPhone|iPod/.test(navigator.userAgent),c=s&&"undefined"==typeof SharedWorker,u=(()=>{const e=navigator.userAgent.match(/Firefox[/\s](\d+\.\d+)/);return Array.isArray(e)&&e.length>=2?+e[1]:0})(),d=Boolean(n&&window.document.documentMode),f=!!navigator.sendBeacon},1117:(e,t,r)=>{r.d(t,{w:()=>o});var n=r(50);const i={agentIdentifier:"",ee:void 0};class o{constructor(e){try{if("object"!=typeof e)return(0,n.Z)("shared context requires an object as input");this.sharedContext={},Object.assign(this.sharedContext,i),Object.entries(e).forEach((e=>{let[t,r]=e;Object.keys(i).includes(t)&&(this.sharedContext[t]=r)}))}catch(e){(0,n.Z)("An error occured while setting SharedContext",e)}}}},8e3:(e,t,r)=>{r.d(t,{L:()=>d,R:()=>c});var n=r(2177),i=r(1284),o=r(4322),a=r(3325);const s={};function c(e,t){const r={staged:!1,priority:a.p[t]||0};u(e),s[e].get(t)||s[e].set(t,r)}function u(e){e&&(s[e]||(s[e]=new Map))}function d(){let e=arguments.length>0&&void 0!==arguments[0]?arguments[0]:"",t=arguments.length>1&&void 0!==arguments[1]?arguments[1]:"feature";if(u(e),!e||!s[e].get(t))return a(t);s[e].get(t).staged=!0;const r=[...s[e]];function a(t){const r=e?n.ee.get(e):n.ee,a=o.X.handlers;if(r.backlog&&a){var s=r.backlog[t],c=a[t];if(c){for(var u=0;s&&u {let[t,r]=e;return 
r.staged}))&&(r.sort(((e,t)=>e[1].priority-t[1].priority)),r.forEach((e=>{let[t]=e;a(t)})))}function f(e,t){var r=e[1];(0,i.D)(t[r],(function(t,r){var n=e[0];if(r[0]===n){var i=r[1],o=e[3],a=e[2];i.apply(o,a)}}))}},2177:(e,t,r)=>{r.d(t,{c:()=>f,ee:()=>u});var n=r(8632),i=r(2210),o=r(1284),a=r(5763),s="nr@context";let c=(0,n.fP)();var u;function d(){}function f(e){return(0,i.X)(e,s,l)}function l(){return new d}function h(){u.aborted=!0,u.backlog={}}c.ee?u=c.ee:(u=function e(t,r){var n={},c={},f={},g=!1;try{g=16===r.length&&(0,a.OP)(r).isolatedBacklog}catch(e){}var p={on:b,addEventListener:b,removeEventListener:y,emit:v,get:x,listeners:w,context:m,buffer:A,abort:h,aborted:!1,isBuffering:E,debugId:r,backlog:g?{}:t&&"object"==typeof t.backlog?t.backlog:{}};return p;function m(e){return e&&e instanceof d?e:e?(0,i.X)(e,s,l):l()}function v(e,r,n,i,o){if(!1!==o&&(o=!0),!u.aborted||i){t&&o&&t.emit(e,r,n);for(var a=m(n),s=w(e),d=s.length,f=0;fn,p:()=>i});var n=r(2177).ee.get("handle");function i(e,t,r,i,o){o?(o.buffer([e],i),o.emit(e,t,r)):(n.buffer([e],i),n.emit(e,t,r))}},4322:(e,t,r)=>{r.d(t,{X:()=>o});var n=r(5546);o.on=a;var i=o.handlers={};function o(e,t,r,o){a(o||n.E,i,e,t,r)}function a(e,t,r,i,o){o||(o="feature"),e||(e=n.E);var a=t[o]=t[o]||{};(a[r]=a[r]||[]).push([e,i])}},3239:(e,t,r)=>{r.d(t,{bP:()=>s,iz:()=>c,m$:()=>a});var n=r(385);let i=!1,o=!1;try{const e={get passive(){return i=!0,!1},get signal(){return o=!0,!1}};n._A.addEventListener("test",null,e),n._A.removeEventListener("test",null,e)}catch(e){}function a(e,t){return i||o?{capture:!!e,passive:i,signal:t}:!!e}function s(e,t){let r=arguments.length>2&&void 0!==arguments[2]&&arguments[2],n=arguments.length>3?arguments[3]:void 0;window.addEventListener(e,t,a(r,n))}function c(e,t){let r=arguments.length>2&&void 0!==arguments[2]&&arguments[2],n=arguments.length>3?arguments[3]:void 0;document.addEventListener(e,t,a(r,n))}},4402:(e,t,r)=>{r.d(t,{Ht:()=>u,M:()=>c,Rl:()=>a,ky:()=>s});var n=r(385);const 
i="xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx";function o(e,t){return e?15&e[t]:16*Math.random()|0}function a(){const e=n._A?.crypto||n._A?.msCrypto;let t,r=0;return e&&e.getRandomValues&&(t=e.getRandomValues(new Uint8Array(31))),i.split("").map((e=>"x"===e?o(t,++r).toString(16):"y"===e?(3&o()|8).toString(16):e)).join("")}function s(e){const t=n._A?.crypto||n._A?.msCrypto;let r,i=0;t&&t.getRandomValues&&(r=t.getRandomValues(new Uint8Array(31)));const a=[];for(var s=0;s {r.d(t,{Bq:()=>n,Hb:()=>o,oD:()=>i});const n="NRBA",i=144e5,o=18e5},7894:(e,t,r)=>{function n(){return Math.round(performance.now())}r.d(t,{z:()=>n})},7243:(e,t,r)=>{r.d(t,{e:()=>o});var n=r(385),i={};function o(e){if(e in i)return i[e];if(0===(e||"").indexOf("data:"))return{protocol:"data"};let t;var r=n._A?.location,o={};if(n.il)t=document.createElement("a"),t.href=e;else try{t=new URL(e,r.href)}catch(e){return o}o.port=t.port;var a=t.href.split("://");!o.port&&a[1]&&(o.port=a[1].split("/")[0].split("@").pop().split(":")[1]),o.port&&"0"!==o.port||(o.port="https"===a[0]?"443":"80"),o.hostname=t.hostname||r.hostname,o.pathname=t.pathname,o.protocol=a[0],"/"!==o.pathname.charAt(0)&&(o.pathname="/"+o.pathname);var s=!t.protocol||":"===t.protocol||t.protocol===r.protocol,c=t.hostname===r.hostname&&t.port===r.port;return o.sameOrigin=s&&(!t.hostname||c),"/"===o.pathname&&(i[e]=o),o}},50:(e,t,r)=>{function n(e,t){"function"==typeof console.warn&&(console.warn("New Relic: ".concat(e)),t&&console.warn(t))}r.d(t,{Z:()=>n})},2587:(e,t,r)=>{r.d(t,{N:()=>c,T:()=>u});var n=r(2177),i=r(5546),o=r(8e3),a=r(3325);const s={stn:[a.D.sessionTrace],err:[a.D.jserrors,a.D.metrics],ins:[a.D.pageAction],spa:[a.D.spa],sr:[a.D.sessionReplay,a.D.sessionTrace]};function c(e,t){const r=n.ee.get(t);e&&"object"==typeof e&&(Object.entries(e).forEach((e=>{let[t,n]=e;void 0===u[t]&&(s[t]?s[t].forEach((e=>{n?(0,i.p)("feat-"+t,[],void 0,e,r):(0,i.p)("block-"+t,[],void 0,e,r),(0,i.p)("rumresp-"+t,[Boolean(n)],void 
0,e,r)})):n&&(0,i.p)("feat-"+t,[],void 0,void 0,r),u[t]=Boolean(n))})),Object.keys(s).forEach((e=>{void 0===u[e]&&(s[e]?.forEach((t=>(0,i.p)("rumresp-"+e,[!1],void 0,t,r))),u[e]=!1)})),(0,o.L)(t,a.D.pageViewEvent))}const u={}},2210:(e,t,r)=>{r.d(t,{X:()=>i});var n=Object.prototype.hasOwnProperty;function i(e,t,r){if(n.call(e,t))return e[t];var i=r();if(Object.defineProperty&&Object.keys)try{return Object.defineProperty(e,t,{value:i,writable:!0,enumerable:!1}),i}catch(e){}return e[t]=i,i}},1284:(e,t,r)=>{r.d(t,{D:()=>n});const n=(e,t)=>Object.entries(e||{}).map((e=>{let[r,n]=e;return t(r,n)}))},4351:(e,t,r)=>{r.d(t,{P:()=>o});var n=r(2177);const i=()=>{const e=new WeakSet;return(t,r)=>{if("object"==typeof r&&null!==r){if(e.has(r))return;e.add(r)}return r}};function o(e){try{return JSON.stringify(e,i())}catch(e){try{n.ee.emit("internal-error",[e])}catch(e){}}}},3960:(e,t,r)=>{r.d(t,{K:()=>a,b:()=>o});var n=r(3239);function i(){return"undefined"==typeof document||"complete"===document.readyState}function o(e,t){if(i())return e();(0,n.bP)("load",e,t)}function a(e){if(i())return e();(0,n.iz)("DOMContentLoaded",e)}},8632:(e,t,r)=>{r.d(t,{EZ:()=>u,Qy:()=>c,ce:()=>o,fP:()=>a,gG:()=>d,mF:()=>s});var n=r(7894),i=r(385);const o={beacon:"bam.nr-data.net",errorBeacon:"bam.nr-data.net"};function a(){return i._A.NREUM||(i._A.NREUM={}),void 0===i._A.newrelic&&(i._A.newrelic=i._A.NREUM),i._A.NREUM}function s(){let e=a();return e.o||(e.o={ST:i._A.setTimeout,SI:i._A.setImmediate,CT:i._A.clearTimeout,XHR:i._A.XMLHttpRequest,REQ:i._A.Request,EV:i._A.Event,PR:i._A.Promise,MO:i._A.MutationObserver,FETCH:i._A.fetch}),e}function c(e,t,r){let i=a();const o=i.initializedAgents||{},s=o[e]||{};return Object.keys(s).length||(s.initializedAt={ms:(0,n.z)(),date:new Date}),i.initializedAgents={...o,[e]:{...s,[r]:t}},i}function u(e,t){a()[e]=t}function d(){return function(){let e=a();const t=e.info||{};e.info={beacon:o.beacon,errorBeacon:o.errorBeacon,...t}}(),function(){let e=a();const 
t=e.init||{};e.init={...t}}(),s(),function(){let e=a();const t=e.loader_config||{};e.loader_config={...t}}(),a()}},7956:(e,t,r)=>{r.d(t,{N:()=>i});var n=r(3239);function i(e){let t=arguments.length>1&&void 0!==arguments[1]&&arguments[1],r=arguments.length>2?arguments[2]:void 0,i=arguments.length>3?arguments[3]:void 0;return void(0,n.iz)("visibilitychange",(function(){if(t)return void("hidden"==document.visibilityState&&e());e(document.visibilityState)}),r,i)}},1214:(e,t,r)=>{r.d(t,{em:()=>v,u5:()=>N,QU:()=>S,_L:()=>I,Gm:()=>L,Lg:()=>M,gy:()=>U,BV:()=>Q,Kf:()=>ee});var n=r(2177);const i="nr@original";var o=Object.prototype.hasOwnProperty,a=!1;function s(e,t){return e||(e=n.ee),r.inPlace=function(e,t,n,i,o){n||(n="");var a,s,c,u="-"===n.charAt(0);for(c=0;c 2?n-2:0),o=2;o {r(A[T],e,w),r(E[T],e,w)})),r(l._A,"fetch",y),t.on(y+"end",(function(e,r){var n=this;if(r){var i=r.headers.get("content-length");null!==i&&(n.rxSize=i),t.emit(y+"done",[null,r],n)}else t.emit(y+"done",[e],n)})),t}const O={},j=["pushState","replaceState"];function S(e){const t=function(e){return(e||n.ee).get("history")}(e);return!l.il||O[t.debugId]++||(O[t.debugId]=1,s(t).inPlace(window.history,j,"-")),t}var P=r(3239);const C={},R=["appendChild","insertBefore","replaceChild"];function I(e){const t=function(e){return(e||n.ee).get("jsonp")}(e);if(!l.il||C[t.debugId])return t;C[t.debugId]=!0;var r=s(t),i=/[?&](?:callback|cb)=([^&#]+)/,o=/(.*)\.([^.]+)/,a=/^(\w+)(\.|$)(.*)$/;function c(e,t){var r=e.match(a),n=r[1],i=r[3];return i?c(i,t[n]):t[n]}return r.inPlace(Node.prototype,R,"dom-"),t.on("dom-start",(function(e){!function(e){if(!e||"string"!=typeof e.nodeName||"script"!==e.nodeName.toLowerCase())return;if("function"!=typeof e.addEventListener)return;var n=(a=e.src,s=a.match(i),s?s[1]:null);var a,s;if(!n)return;var u=function(e){var t=e.match(o);if(t&&t.length>=3)return{key:t[2],parent:c(t[1],window)};return{key:e,parent:window}}(n);if("function"!=typeof u.parent[u.key])return;var d={};function 
f(){t.emit("jsonp-end",[],d),e.removeEventListener("load",f,(0,P.m$)(!1)),e.removeEventListener("error",l,(0,P.m$)(!1))}function l(){t.emit("jsonp-error",[],d),t.emit("jsonp-end",[],d),e.removeEventListener("load",f,(0,P.m$)(!1)),e.removeEventListener("error",l,(0,P.m$)(!1))}r.inPlace(u.parent,[u.key],"cb-",d),e.addEventListener("load",f,(0,P.m$)(!1)),e.addEventListener("error",l,(0,P.m$)(!1)),t.emit("new-jsonp",[e.src],d)}(e[0])})),t}var k=r(5763);const H={};function L(e){const t=function(e){return(e||n.ee).get("mutation")}(e);if(!l.il||H[t.debugId])return t;H[t.debugId]=!0;var r=s(t),i=k.Yu.MO;return i&&(window.MutationObserver=function(e){return this instanceof i?new i(r(e,"fn-")):i.apply(this,arguments)},MutationObserver.prototype=i.prototype),t}const z={};function M(e){const t=function(e){return(e||n.ee).get("promise")}(e);if(z[t.debugId])return t;z[t.debugId]=!0;var r=n.c,o=s(t),a=k.Yu.PR;return a&&function(){function e(r){var n=t.context(),i=o(r,"executor-",n,null,!1);const s=Reflect.construct(a,[i],e);return t.context(s).getCtx=function(){return n},s}l._A.Promise=e,Object.defineProperty(e,"name",{value:"Promise"}),e.toString=function(){return a.toString()},Object.setPrototypeOf(e,a),["all","race"].forEach((function(r){const n=a[r];e[r]=function(e){let i=!1;[...e||[]].forEach((e=>{this.resolve(e).then(a("all"===r),a(!1))}));const o=n.apply(this,arguments);return o;function a(e){return function(){t.emit("propagate",[null,!i],o,!1,!1),i=i||!e}}}})),["resolve","reject"].forEach((function(r){const n=a[r];e[r]=function(e){const r=n.apply(this,arguments);return e!==r&&t.emit("propagate",[e,!0],r,!1,!1),r}})),e.prototype=a.prototype;const n=a.prototype.then;a.prototype.then=function(){var e=this,i=r(e);i.promise=e;for(var a=arguments.length,s=new Array(a),c=0;c e())),t};function m(e,t){i.inPlace(t,["onreadystatechange"],"fn-",E)}function b(){var 
e=this,t=r.context(e);e.readyState>3&&!t.resolved&&(t.resolved=!0,r.emit("xhr-resolved",[],e)),i.inPlace(e,f,"fn-",E)}if(function(e,t){for(var r in e)t[r]=e[r]}(o,p),p.prototype=o.prototype,i.inPlace(p.prototype,J,"-xhr-",E),r.on("send-xhr-start",(function(e,t){m(e,t),function(e){h.push(e),a&&(y?y.then(A):u?u(A):(w=-w,x.data=w))}(t)})),r.on("open-xhr-start",m),a){var y=c&&c.resolve();if(!u&&!c){var w=1,x=document.createTextNode(w);new a(A).observe(x,{characterData:!0})}}else t.on("fn-end",(function(e){e[0]&&e[0].type===d||A()}));function A(){for(var e=0;e {r.d(t,{t:()=>n});const n=r(3325).D.ajax},6660:(e,t,r)=>{r.d(t,{A:()=>i,t:()=>n});const n=r(3325).D.jserrors,i="nr@seenError"},3081:(e,t,r)=>{r.d(t,{gF:()=>o,mY:()=>i,t9:()=>n,vz:()=>s,xS:()=>a});const n=r(3325).D.metrics,i="sm",o="cm",a="storeSupportabilityMetrics",s="storeEventMetrics"},4649:(e,t,r)=>{r.d(t,{t:()=>n});const n=r(3325).D.pageAction},7633:(e,t,r)=>{r.d(t,{Dz:()=>i,OJ:()=>a,qw:()=>o,t9:()=>n});const n=r(3325).D.pageViewEvent,i="firstbyte",o="domcontent",a="windowload"},9251:(e,t,r)=>{r.d(t,{t:()=>n});const n=r(3325).D.pageViewTiming},3614:(e,t,r)=>{r.d(t,{BST_RESOURCE:()=>i,END:()=>s,FEATURE_NAME:()=>n,FN_END:()=>u,FN_START:()=>c,PUSH_STATE:()=>d,RESOURCE:()=>o,START:()=>a});const n=r(3325).D.sessionTrace,i="bstResource",o="resource",a="-start",s="-end",c="fn"+a,u="fn"+s,d="pushState"},7836:(e,t,r)=>{r.d(t,{BODY:()=>A,CB_END:()=>E,CB_START:()=>u,END:()=>x,FEATURE_NAME:()=>i,FETCH:()=>_,FETCH_BODY:()=>v,FETCH_DONE:()=>m,FETCH_START:()=>p,FN_END:()=>c,FN_START:()=>s,INTERACTION:()=>l,INTERACTION_API:()=>d,INTERACTION_EVENTS:()=>o,JSONP_END:()=>b,JSONP_NODE:()=>g,JS_TIME:()=>T,MAX_TIMER_BUDGET:()=>a,REMAINING:()=>f,SPA_NODE:()=>h,START:()=>w,originalSetTimeout:()=>y});var n=r(5763);const 
i=r(3325).D.spa,o=["click","submit","keypress","keydown","keyup","change"],a=999,s="fn-start",c="fn-end",u="cb-start",d="api-ixn-",f="remaining",l="interaction",h="spaNode",g="jsonpNode",p="fetch-start",m="fetch-done",v="fetch-body-",b="jsonp-end",y=n.Yu.ST,w="-start",x="-end",A="-body",E="cb"+x,T="jsTime",_="fetch"},5938:(e,t,r)=>{r.d(t,{W:()=>o});var n=r(5763),i=r(2177);class o{constructor(e,t,r){this.agentIdentifier=e,this.aggregator=t,this.ee=i.ee.get(e,(0,n.OP)(this.agentIdentifier).isolatedBacklog),this.featureName=r,this.blocked=!1}}},9144:(e,t,r)=>{r.d(t,{j:()=>m});var n=r(3325),i=r(5763),o=r(5546),a=r(2177),s=r(7894),c=r(8e3),u=r(3960),d=r(385),f=r(50),l=r(3081),h=r(8632);function g(){const e=(0,h.gG)();["setErrorHandler","finished","addToTrace","inlineHit","addRelease","addPageAction","setCurrentRouteName","setPageViewName","setCustomAttribute","interaction","noticeError","setUserId"].forEach((t=>{e[t]=function(){for(var r=arguments.length,n=new Array(r),i=0;i 1?r-1:0),i=1;i {e.exposed&&e.api[t]&&o.push(e.api[t](...n))})),o.length>1?o:o[0]}(t,...n)}}))}var p=r(2587);function m(e){let t=arguments.length>1&&void 0!==arguments[1]?arguments[1]:{},m=arguments.length>2?arguments[2]:void 0,v=arguments.length>3?arguments[3]:void 0,{init:b,info:y,loader_config:w,runtime:x={loaderType:m},exposed:A=!0}=t;const E=(0,h.gG)();y||(b=E.init,y=E.info,w=E.loader_config),(0,i.Dg)(e,b||{}),(0,i.GE)(e,w||{}),(0,i.sU)(e,x),y.jsAttributes??={},d.v6&&(y.jsAttributes.isWorker=!0),(0,i.CX)(e,y),g();const T=function(e,t){t||(0,c.R)(e,"api");const h={};var g=a.ee.get(e),p=g.get("tracer"),m="api-",v=m+"ixn-";function b(t,r,n,o){const a=(0,i.C5)(e);return null===r?delete a.jsAttributes[t]:(0,i.CX)(e,{...a,jsAttributes:{...a.jsAttributes,[t]:r}}),x(m,n,!0,o||null===r?"session":void 0)(t,r)}function 
y(){}["setErrorHandler","finished","addToTrace","inlineHit","addRelease"].forEach((e=>h[e]=x(m,e,!0,"api"))),h.addPageAction=x(m,"addPageAction",!0,n.D.pageAction),h.setCurrentRouteName=x(m,"routeName",!0,n.D.spa),h.setPageViewName=function(t,r){if("string"==typeof t)return"/"!==t.charAt(0)&&(t="/"+t),(0,i.OP)(e).customTransaction=(r||"http://custom.transaction")+t,x(m,"setPageViewName",!0)()},h.setCustomAttribute=function(e,t){let r=arguments.length>2&&void 0!==arguments[2]&&arguments[2];if("string"==typeof e){if(["string","number"].includes(typeof t)||null===t)return b(e,t,"setCustomAttribute",r);(0,f.Z)("Failed to execute setCustomAttribute.\nNon-null value must be a string or number type, but a type of was provided."))}else(0,f.Z)("Failed to execute setCustomAttribute.\nName must be a string type, but a type of was provided."))},h.setUserId=function(e){if("string"==typeof e||null===e)return b("enduser.id",e,"setUserId",!0);(0,f.Z)("Failed to execute setUserId.\nNon-null value must be a string type, but a type of was provided."))},h.interaction=function(){return(new y).get()};var w=y.prototype={createTracer:function(e,t){var r={},i=this,a="function"==typeof t;return(0,o.p)(v+"tracer",[(0,s.z)(),e,r],i,n.D.spa,g),function(){if(p.emit((a?"":"no-")+"fn-start",[(0,s.z)(),i,a],r),a)try{return t.apply(this,arguments)}catch(e){throw p.emit("fn-err",[arguments,this,"string"==typeof e?new Error(e):e],r),e}finally{p.emit("fn-end",[(0,s.z)()],r)}}}};function x(e,t,r,i){return function(){return(0,o.p)(l.xS,["API/"+t+"/called"],void 0,n.D.metrics,g),i&&(0,o.p)(e+t,[(0,s.z)(),...arguments],r?null:this,i,g),r?void 0:this}}function A(){r.e(439).then(r.bind(r,7438)).then((t=>{let{setAPI:r}=t;r(e),(0,c.L)(e,"api")})).catch((()=>(0,f.Z)("Downloading runtime APIs failed...")))}return["actionText","setName","setAttribute","save","ignore","onEnd","getContext","end","get"].forEach((e=>{w[e]=x(v,e,void 0,n.D.spa)})),h.noticeError=function(e,t){"string"==typeof e&&(e=new 
Error(e)),(0,o.p)(l.xS,["API/noticeError/called"],void 0,n.D.metrics,g),(0,o.p)("err",[e,(0,s.z)(),!1,t],void 0,n.D.jserrors,g)},d.il?(0,u.b)((()=>A()),!0):A(),h}(e,v);return(0,h.Qy)(e,T,"api"),(0,h.Qy)(e,A,"exposed"),(0,h.EZ)("activatedFeatures",p.T),T}},3325:(e,t,r)=>{r.d(t,{D:()=>n,p:()=>i});const n={ajax:"ajax",jserrors:"jserrors",metrics:"metrics",pageAction:"page_action",pageViewEvent:"page_view_event",pageViewTiming:"page_view_timing",sessionReplay:"session_replay",sessionTrace:"session_trace",spa:"spa"},i={[n.pageViewEvent]:1,[n.pageViewTiming]:2,[n.metrics]:3,[n.jserrors]:4,[n.ajax]:5,[n.sessionTrace]:6,[n.pageAction]:7,[n.spa]:8,[n.sessionReplay]:9}}},n={};function i(e){var t=n[e];if(void 0!==t)return t.exports;var o=n[e]={exports:{}};return r[e](o,o.exports,i),o.exports}i.m=r,i.d=(e,t)=>{for(var r in t)i.o(t,r)&&!i.o(e,r)&&Object.defineProperty(e,r,{enumerable:!0,get:t[r]})},i.f={},i.e=e=>Promise.all(Object.keys(i.f).reduce(((t,r)=>(i.f[r](e,t),t)),[])),i.u=e=>(({78:"page_action-aggregate",147:"metrics-aggregate",242:"session-manager",317:"jserrors-aggregate",348:"page_view_timing-aggregate",412:"lazy-feature-loader",439:"async-api",538:"recorder",590:"session_replay-aggregate",675:"compressor",733:"session_trace-aggregate",786:"page_view_event-aggregate",873:"spa-aggregate",898:"ajax-aggregate"}[e]||e)+"."+{78:"ac76d497",147:"3dc53903",148:"1a20d5fe",242:"2a64278a",317:"49e41428",348:"bd6de33a",412:"2f55ce66",439:"30bd804e",538:"1b18459f",590:"cf0efb30",675:"ae9f91a8",733:"83105561",786:"06482edd",860:"03a8b7a5",873:"e6b09d52",898:"998ef92b"}[e]+"-1.236.0.min.js"),i.o=(e,t)=>Object.prototype.hasOwnProperty.call(e,t),e={},t="NRBA:",i.l=(r,n,o,a)=>{if(e[r])e[r].push(n);else{var s,c;if(void 0!==o)for(var u=document.getElementsByTagName("script"),d=0;d {s.onerror=s.onload=null,clearTimeout(h);var i=e[r];if(delete e[r],s.parentNode&&s.parentNode.removeChild(s),i&&i.forEach((e=>e(n))),t)return t(n)},h=setTimeout(l.bind(null,void 
0,{type:"timeout",target:s}),12e4);s.onerror=l.bind(null,s.onerror),s.onload=l.bind(null,s.onload),c&&document.head.appendChild(s)}},i.r=e=>{"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},i.j=364,i.p="https://js-agent.newrelic.com/",(()=>{var e={364:0,953:0};i.f.j=(t,r)=>{var n=i.o(e,t)?e[t]:void 0;if(0!==n)if(n)r.push(n[2]);else{var o=new Promise(((r,i)=>n=e[t]=[r,i]));r.push(n[2]=o);var a=i.p+i.u(t),s=new Error;i.l(a,(r=>{if(i.o(e,t)&&(0!==(n=e[t])&&(e[t]=void 0),n)){var o=r&&("load"===r.type?"missing":r.type),a=r&&r.target&&r.target.src;s.message="Loading chunk "+t+" failed.\n("+o+": "+a+")",s.name="ChunkLoadError",s.type=o,s.request=a,n[1](s)}}),"chunk-"+t,t)}};var t=(t,r)=>{var n,o,[a,s,c]=r,u=0;if(a.some((t=>0!==e[t]))){for(n in s)i.o(s,n)&&(i.m[n]=s[n]);if(c)c(i)}for(t&&t(r);u {i.r(o);var e=i(3325),t=i(5763);const r=Object.values(e.D);function n(e){const n={};return r.forEach((r=>{n[r]=function(e,r){return!1!==(0,t.Mt)(r,"".concat(e,".enabled"))}(r,e)})),n}var a=i(9144);var s=i(5546),c=i(385),u=i(8e3),d=i(5938),f=i(3960),l=i(50);class h extends d.W{constructor(e,t,r){let n=!(arguments.length>3&&void 0!==arguments[3])||arguments[3];super(e,t,r),this.auto=n,this.abortHandler,this.featAggregate,this.onAggregateImported,n&&(0,u.R)(e,r)}importAggregator(){let e=arguments.length>0&&void 0!==arguments[0]?arguments[0]:{};if(this.featAggregate||!this.auto)return;const r=c.il&&!0===(0,t.Mt)(this.agentIdentifier,"privacy.cookies_enabled");let n;this.onAggregateImported=new Promise((e=>{n=e}));const o=async()=>{let t;try{if(r){const{setupAgentSession:e}=await Promise.all([i.e(860),i.e(242)]).then(i.bind(i,3228));t=e(this.agentIdentifier)}}catch(e){(0,l.Z)("A problem occurred when starting up session manager. 
This page will not start or extend any session.",e)}try{if(!this.shouldImportAgg(this.featureName,t))return void(0,u.L)(this.agentIdentifier,this.featureName);const{lazyFeatureLoader:r}=await i.e(412).then(i.bind(i,8582)),{Aggregate:o}=await r(this.featureName,"aggregate");this.featAggregate=new o(this.agentIdentifier,this.aggregator,e),n(!0)}catch(e){(0,l.Z)("Downloading and initializing ".concat(this.featureName," failed..."),e),this.abortHandler?.(),n(!1)}};c.il?(0,f.b)((()=>o()),!0):o()}shouldImportAgg(r,n){return r!==e.D.sessionReplay||!1!==(0,t.Mt)(this.agentIdentifier,"session_trace.enabled")&&(!!n?.isNew||!!n?.state.sessionReplay)}}var g=i(7633),p=i(7894);class m extends h{static featureName=g.t9;constructor(r,n){let i=!(arguments.length>2&&void 0!==arguments[2])||arguments[2];if(super(r,n,g.t9,i),("undefined"==typeof PerformanceNavigationTiming||c.Tt)&&"undefined"!=typeof PerformanceTiming){const n=(0,t.OP)(r);n[g.Dz]=Math.max(Date.now()-n.offset,0),(0,f.K)((()=>n[g.qw]=Math.max((0,p.z)()-n[g.Dz],0))),(0,f.b)((()=>{const t=(0,p.z)();n[g.OJ]=Math.max(t-n[g.Dz],0),(0,s.p)("timing",["load",t],void 0,e.D.pageViewTiming,this.ee)}))}this.importAggregator()}}var v=i(1117),b=i(1284);class y extends v.w{constructor(e){super(e),this.aggregatedData={}}store(e,t,r,n,i){var o=this.getBucket(e,t,r,i);return o.metrics=function(e,t){t||(t={count:0});return t.count+=1,(0,b.D)(e,(function(e,r){t[e]=w(r,t[e])})),t}(n,o.metrics),o}merge(e,t,r,n,i){var o=this.getBucket(e,t,n,i);if(o.metrics){var a=o.metrics;a.count+=r.count,(0,b.D)(r,(function(e,t){if("count"!==e){var n=a[e],i=r[e];i&&!i.c?a[e]=w(i.t,n):a[e]=function(e,t){if(!t)return e;t.c||(t=x(t.t));return t.min=Math.min(e.min,t.min),t.max=Math.max(e.max,t.max),t.t+=e.t,t.sos+=e.sos,t.c+=e.c,t}(i,a[e])}}))}else o.metrics=r}storeMetric(e,t,r,n){var i=this.getBucket(e,t,r);return i.stats=w(n,i.stats),i}getBucket(e,t,r,n){this.aggregatedData[e]||(this.aggregatedData[e]={});var i=this.aggregatedData[e][t];return 
i||(i=this.aggregatedData[e][t]={params:r||{}},n&&(i.custom=n)),i}get(e,t){return t?this.aggregatedData[e]&&this.aggregatedData[e][t]:this.aggregatedData[e]}take(e){for(var t={},r="",n=!1,i=0;i t.max&&(t.max=e),e 2&&void 0!==arguments[2])||arguments[2];super(e,r,j.t,n),c.il&&((0,t.OP)(e).initHidden=Boolean("hidden"===document.visibilityState),(0,N.N)((()=>(0,s.p)("docHidden",[(0,p.z)()],void 0,j.t,this.ee)),!0),(0,O.bP)("pagehide",(()=>(0,s.p)("winPagehide",[(0,p.z)()],void 0,j.t,this.ee))),this.importAggregator())}}var P=i(3081);class C extends h{static featureName=P.t9;constructor(e,t){let r=!(arguments.length>2&&void 0!==arguments[2])||arguments[2];super(e,t,P.t9,r),this.importAggregator()}}var R,I=i(2210),k=i(1214),H=i(2177),L={};try{R=localStorage.getItem("__nr_flags").split(","),console&&"function"==typeof console.log&&(L.console=!0,-1!==R.indexOf("dev")&&(L.dev=!0),-1!==R.indexOf("nr_dev")&&(L.nrDev=!0))}catch(e){}function z(e){try{L.console&&z(e)}catch(e){}}L.nrDev&&H.ee.on("internal-error",(function(e){z(e.stack)})),L.dev&&H.ee.on("fn-err",(function(e,t,r){z(r.stack)})),L.dev&&(z("NR AGENT IN DEVELOPMENT MODE"),z("flags: "+(0,b.D)(L,(function(e,t){return e})).join(", ")));var M=i(6660);class B extends h{static featureName=M.t;constructor(r,n){let i=!(arguments.length>2&&void 0!==arguments[2])||arguments[2];super(r,n,M.t,i),this.skipNext=0;try{this.removeOnAbort=new AbortController}catch(e){}const o=this;o.ee.on("fn-start",(function(e,t,r){o.abortHandler&&(o.skipNext+=1)})),o.ee.on("fn-err",(function(t,r,n){o.abortHandler&&!n[M.A]&&((0,I.X)(n,M.A,(function(){return!0})),this.thrown=!0,(0,s.p)("err",[n,(0,p.z)()],void 0,e.D.jserrors,o.ee))})),o.ee.on("fn-end",(function(){o.abortHandler&&!this.thrown&&o.skipNext>0&&(o.skipNext-=1)})),o.ee.on("internal-error",(function(t){(0,s.p)("ierr",[t,(0,p.z)(),!0],void 0,e.D.jserrors,o.ee)})),this.origOnerror=c._A.onerror,c._A.onerror=this.onerrorHandler.bind(this),c._A.addEventListener("unhandledrejection",(t=>{const 
r=function(e){let t="Unhandled Promise Rejection: ";if(e instanceof Error)try{return e.message=t+e.message,e}catch(t){return e}if(void 0===e)return new Error(t);try{return new Error(t+(0,D.P)(e))}catch(e){return new Error(t)}}(t.reason);(0,s.p)("err",[r,(0,p.z)(),!1,{unhandledPromiseRejection:1}],void 0,e.D.jserrors,this.ee)}),(0,O.m$)(!1,this.removeOnAbort?.signal)),(0,k.gy)(this.ee),(0,k.BV)(this.ee),(0,k.em)(this.ee),(0,t.OP)(r).xhrWrappable&&(0,k.Kf)(this.ee),this.abortHandler=this.#e,this.importAggregator()}#e(){this.removeOnAbort?.abort(),this.abortHandler=void 0}onerrorHandler(t,r,n,i,o){"function"==typeof this.origOnerror&&this.origOnerror(...arguments);try{this.skipNext?this.skipNext-=1:(0,s.p)("err",[o||new F(t,r,n),(0,p.z)()],void 0,e.D.jserrors,this.ee)}catch(t){try{(0,s.p)("ierr",[t,(0,p.z)(),!0],void 0,e.D.jserrors,this.ee)}catch(e){}}return!1}}function F(e,t,r){this.message=e||"Uncaught error with no additional information",this.sourceURL=t,this.line=r}let U=1;const q="nr@id";function G(e){const t=typeof e;return!e||"object"!==t&&"function"!==t?-1:e===c._A?0:(0,I.X)(e,q,(function(){return U++}))}function V(e){if("string"==typeof e&&e.length)return e.length;if("object"==typeof e){if("undefined"!=typeof ArrayBuffer&&e instanceof ArrayBuffer&&e.byteLength)return e.byteLength;if("undefined"!=typeof Blob&&e instanceof Blob&&e.size)return e.size;if(!("undefined"!=typeof FormData&&e instanceof FormData))try{return(0,D.P)(e).length}catch(e){return}}}var X=i(7243);class W{constructor(e){this.agentIdentifier=e,this.generateTracePayload=this.generateTracePayload.bind(this),this.shouldGenerateTrace=this.shouldGenerateTrace.bind(this)}generateTracePayload(e){if(!this.shouldGenerateTrace(e))return null;var r=(0,t.DL)(this.agentIdentifier);if(!r)return null;var n=(r.accountID||"").toString()||null,i=(r.agentID||"").toString()||null,o=(r.trustKey||"").toString()||null;if(!n||!i)return null;var 
a=(0,_.M)(),s=(0,_.Ht)(),c=Date.now(),u={spanId:a,traceId:s,timestamp:c};return(e.sameOrigin||this.isAllowedOrigin(e)&&this.useTraceContextHeadersForCors())&&(u.traceContextParentHeader=this.generateTraceContextParentHeader(a,s),u.traceContextStateHeader=this.generateTraceContextStateHeader(a,c,n,i,o)),(e.sameOrigin&&!this.excludeNewrelicHeader()||!e.sameOrigin&&this.isAllowedOrigin(e)&&this.useNewrelicHeaderForCors())&&(u.newrelicHeader=this.generateTraceHeader(a,s,c,n,i,o)),u}generateTraceContextParentHeader(e,t){return"00-"+t+"-"+e+"-01"}generateTraceContextStateHeader(e,t,r,n,i){return i+"@nr=0-1-"+r+"-"+n+"-"+e+"----"+t}generateTraceHeader(e,t,r,n,i,o){if(!("function"==typeof c._A?.btoa))return null;var a={v:[0,1],d:{ty:"Browser",ac:n,ap:i,id:e,tr:t,ti:r}};return o&&n!==o&&(a.d.tk=o),btoa((0,D.P)(a))}shouldGenerateTrace(e){return this.isDtEnabled()&&this.isAllowedOrigin(e)}isAllowedOrigin(e){var r=!1,n={};if((0,t.Mt)(this.agentIdentifier,"distributed_tracing")&&(n=(0,t.P_)(this.agentIdentifier).distributed_tracing),e.sameOrigin)r=!0;else if(n.allowed_origins instanceof Array)for(var i=0;i 2&&void 0!==arguments[2])||arguments[2];super(r,n,Z.t,i),(0,t.OP)(r).xhrWrappable&&(this.dt=new W(r),this.handler=(e,t,r,n)=>(0,s.p)(e,t,r,n,this.ee),(0,k.u5)(this.ee),(0,k.Kf)(this.ee),function(r,n,i,o){function a(e){var t=this;t.totalCbs=0,t.called=0,t.cbTime=0,t.end=E,t.ended=!1,t.xhrGuids={},t.lastSize=null,t.loadCaptureCalled=!1,t.params=this.params||{},t.metrics=this.metrics||{},e.addEventListener("load",(function(r){_(t,e)}),(0,O.m$)(!1)),c.IF||e.addEventListener("progress",(function(e){t.lastSize=e.loaded}),(0,O.m$)(!1))}function s(e){this.params={method:e[0]},T(this,e[1]),this.metrics={}}function u(e,n){var i=(0,t.DL)(r);i.xpid&&this.sameOrigin&&n.setRequestHeader("X-NewRelic-ID",i.xpid);var a=o.generateTracePayload(this.parsedOrigin);if(a){var 
s=!1;a.newrelicHeader&&(n.setRequestHeader("newrelic",a.newrelicHeader),s=!0),a.traceContextParentHeader&&(n.setRequestHeader("traceparent",a.traceContextParentHeader),a.traceContextStateHeader&&n.setRequestHeader("tracestate",a.traceContextStateHeader),s=!0),s&&(this.dt=a)}}function d(e,t){var r=this.metrics,i=e[0],o=this;if(r&&i){var a=V(i);a&&(r.txSize=a)}this.startTime=(0,p.z)(),this.listener=function(e){try{"abort"!==e.type||o.loadCaptureCalled||(o.params.aborted=!0),("load"!==e.type||o.called===o.totalCbs&&(o.onloadCalled||"function"!=typeof t.onload)&&"function"==typeof o.end)&&o.end(t)}catch(e){try{n.emit("internal-error",[e])}catch(e){}}};for(var s=0;s 1?e[1]=i:e.push(i)}else e[0]&&e[0].headers&&s(e[0].headers,n)&&(this.dt=n);function s(e,t){var r=!1;return t.newrelicHeader&&(e.set("newrelic",t.newrelicHeader),r=!0),t.traceContextParentHeader&&(e.set("traceparent",t.traceContextParentHeader),t.traceContextStateHeader&&e.set("tracestate",t.traceContextStateHeader),r=!0),r}}function x(e,t){this.params={},this.metrics={},this.startTime=(0,p.z)(),this.dt=t,e.length>=1&&(this.target=e[0]),e.length>=2&&(this.opts=e[1]);var r,n=this.opts||{},i=this.target;"string"==typeof i?r=i:"object"==typeof i&&i instanceof Y?r=i.url:c._A?.URL&&"object"==typeof i&&i instanceof URL&&(r=i.href),T(this,r);var o=(""+(i&&i instanceof Y&&i.method||n.method||"GET")).toUpperCase();this.params.method=o,this.txSize=V(n.body)||0}function A(t,r){var n;this.endTime=(0,p.z)(),this.params||(this.params={}),this.params.status=r?r.status:0,"string"==typeof this.rxSize&&this.rxSize.length>0&&(n=+this.rxSize);var o={txSize:this.txSize,rxSize:n,duration:(0,p.z)()-this.startTime};i("xhr",[this.params,o,this.startTime,this.endTime,"fetch"],this,e.D.ajax)}function E(t){var r=this.params,n=this.metrics;if(!this.ended){this.ended=!0;for(var o=0;o 2&&void 0!==arguments[2])||arguments[2];super(e,t,we.t,r),this.importAggregator()}}new class{constructor(e){let t=arguments.length>1&&void 
0!==arguments[1]?arguments[1]:(0,_.ky)(16);c._A?(this.agentIdentifier=t,this.sharedAggregator=new y({agentIdentifier:this.agentIdentifier}),this.features={},this.desiredFeatures=new Set(e.features||[]),this.desiredFeatures.add(m),Object.assign(this,(0,a.j)(this.agentIdentifier,e,e.loaderType||"agent")),this.start()):(0,l.Z)("Failed to initial the agent. Could not determine the runtime environment.")}get config(){return{info:(0,t.C5)(this.agentIdentifier),init:(0,t.P_)(this.agentIdentifier),loader_config:(0,t.DL)(this.agentIdentifier),runtime:(0,t.OP)(this.agentIdentifier)}}start(){const t="features";try{const r=n(this.agentIdentifier),i=[...this.desiredFeatures];i.sort(((t,r)=>e.p[t.featureName]-e.p[r.featureName])),i.forEach((t=>{if(r[t.featureName]||t.featureName===e.D.pageViewEvent){const n=function(t){switch(t){case e.D.ajax:return[e.D.jserrors];case e.D.sessionTrace:return[e.D.ajax,e.D.pageViewEvent];case e.D.sessionReplay:return[e.D.sessionTrace];case e.D.pageViewTiming:return[e.D.pageViewEvent];default:return[]}}(t.featureName);n.every((e=>r[e]))||(0,l.Z)("".concat(t.featureName," is enabled but one or more dependent features has been disabled (").concat((0,D.P)(n),"). 
This may cause unintended consequences or missing data...")),this.features[t.featureName]=new t(this.agentIdentifier,this.sharedAggregator)}})),(0,T.Qy)(this.agentIdentifier,this.features,t)}catch(e){(0,l.Z)("Failed to initialize all enabled instrument classes (agent aborted) -",e);for(const e in this.features)this.features[e].abortHandler?.();const r=(0,T.fP)();return delete r.initializedAgents[this.agentIdentifier]?.api,delete r.initializedAgents[this.agentIdentifier]?.[t],delete this.sharedAggregator,r.ee?.abort(),delete r.ee?.get(this.agentIdentifier),!1}}}({features:[J,m,S,class extends h{static featureName=oe;constructor(t,r){if(super(t,r,oe,!(arguments.length>2&&void 0!==arguments[2])||arguments[2]),!c.il)return;const n=this.ee;let i;(0,k.QU)(n),this.eventsEE=(0,k.em)(n),this.eventsEE.on(se,(function(e,t){this.bstStart=(0,p.z)()})),this.eventsEE.on(ae,(function(t,r){(0,s.p)("bst",[t[0],r,this.bstStart,(0,p.z)()],void 0,e.D.sessionTrace,n)})),n.on(ce+ne,(function(e){this.time=(0,p.z)(),this.startPath=location.pathname+location.hash})),n.on(ce+ie,(function(t){(0,s.p)("bstHist",[location.pathname+location.hash,this.startPath,this.time],void 0,e.D.sessionTrace,n)}));try{i=new PerformanceObserver((t=>{const r=t.getEntries();(0,s.p)(te,[r],void 0,e.D.sessionTrace,n)})),i.observe({type:re,buffered:!0})}catch(e){}this.importAggregator({resourceObserver:i})}},C,xe,B,class extends h{static featureName=de;constructor(e,r){if(super(e,r,de,!(arguments.length>2&&void 0!==arguments[2])||arguments[2]),!c.il)return;if(!(0,t.OP)(e).xhrWrappable)return;try{this.removeOnAbort=new AbortController}catch(e){}let n,i=0;const o=this.ee.get("tracer"),a=(0,k._L)(this.ee),s=(0,k.Lg)(this.ee),u=(0,k.BV)(this.ee),d=(0,k.Kf)(this.ee),f=this.ee.get("events"),l=(0,k.u5)(this.ee),h=(0,k.QU)(this.ee),g=(0,k.Gm)(this.ee);function m(e,t){h.emit("newURL",[""+window.location,t])}function v(){i++,n=window.location.hash,this[ve]=(0,p.z)()}function b(){i--,window.location.hash!==n&&m(0,!0);var 
e=(0,p.z)();this[pe]=~~this[pe]+e-this[ve],this[ye]=e}function y(e,t){e.on(t,(function(){this[t]=(0,p.z)()}))}this.ee.on(ve,v),s.on(be,v),a.on(be,v),this.ee.on(ye,b),s.on(ge,b),a.on(ge,b),this.ee.buffer([ve,ye,"xhr-resolved"],this.featureName),f.buffer([ve],this.featureName),u.buffer(["setTimeout"+le,"clearTimeout"+fe,ve],this.featureName),d.buffer([ve,"new-xhr","send-xhr"+fe],this.featureName),l.buffer([me+fe,me+"-done",me+he+fe,me+he+le],this.featureName),h.buffer(["newURL"],this.featureName),g.buffer([ve],this.featureName),s.buffer(["propagate",be,ge,"executor-err","resolve"+fe],this.featureName),o.buffer([ve,"no-"+ve],this.featureName),a.buffer(["new-jsonp","cb-start","jsonp-error","jsonp-end"],this.featureName),y(l,me+fe),y(l,me+"-done"),y(a,"new-jsonp"),y(a,"jsonp-end"),y(a,"cb-start"),h.on("pushState-end",m),h.on("replaceState-end",m),window.addEventListener("hashchange",m,(0,O.m$)(!0,this.removeOnAbort?.signal)),window.addEventListener("load",m,(0,O.m$)(!0,this.removeOnAbort?.signal)),window.addEventListener("popstate",(function(){m(0,i>1)}),(0,O.m$)(!0,this.removeOnAbort?.signal)),this.abortHandler=this.#e,this.importAggregator()}#e(){this.removeOnAbort?.abort(),this.abortHandler=void 0}}],loaderType:"spa"})})(),window.NRBA=o})(); window.jQuery || document.write(' ') CKEDITOR_BASEPATH='https://f1000research.com/js/vendor/ckeditor/' window.reactTheme = 'research'; window.MathJax = { CommonHTML: { linebreaks: { automatic: true } }, 'HTML-CSS': { linebreaks: { automatic: true } }, SVG: { linebreaks: { automatic: true } }, AuthorInit: function() { MathJax.Hub.Register.MessageHook('End Process', function () { let timeout = false; // holder for timeout id const delay = 250; // delay after event is "complete" to run callback const reflowMath = function() { const dispFormulas = document.querySelectorAll('.disp-formula.panel'); if (!dispFormulas) { return; } for (const dispFormula of dispFormulas) { const child = 
dispFormula.querySelector('.MathJax_Preview').nextSibling.firstChild; const isMultiline = MathJax.Hub.getAllJax(dispFormula)[0].root.isMultiline; if (dispFormula.offsetWidth < child.offsetWidth || isMultiline) { MathJax.Hub.Queue(['Rerender', MathJax.Hub, dispFormula]); } } }; window.addEventListener('resize', function() { clearTimeout(timeout); // clear the timeout timeout = setTimeout(reflowMath, delay); // start timing for event "completion" }); }); }, }; if (window.location.hash == '#_=_'){ window.location = window.location.href.split('#')[0] } !function(f,b,e,v,n,t,s){if(f.fbq)return;n=f.fbq=function() {n.callMethod? n.callMethod.apply(n,arguments):n.queue.push(arguments)} ;if(!f._fbq)f._fbq=n; n.push=n;n.loaded=!0;n.version='2.0';n.queue=[];t=b.createElement(e);t.async=!0; t.src=v;s=b.getElementsByTagName(e)[0];s.parentNode.insertBefore(t,s)}(window, document,'script','https://connect.facebook.net/en_US/fbevents.js'); fbq('init', '1641728616063202'); fbq('track', "PixelInitialized", {}); (function(h,o,t,j,a,r){ h.hj=h.hj||function(){(h.hj.q=h.hj.q||[]).push(arguments)}; h._hjSettings={hjid:2318163,hjsv:6}; a=o.getElementsByTagName('head')[0]; r=o.createElement('script');r.async=1; r.src=t+h._hjSettings.hjid+j+h._hjSettings.hjsv; a.appendChild(r); })(window,document,'https://static.hotjar.com/c/hotjar-','.js?sv='); search file_upload Submit your research search menu close search Browse Gateways & Collections How to Publish Submit your Research My Submissions Article Guidelines Article Guidelines (New Versions) Open Data, Software and Code Guidelines Open Data and Accessible Source Materials Guidelines (HSS) Open Data, Software and Code Guidelines (PSE) Prepublication Checks Production Process Posters and Slides Guidelines Document Guidelines Article Processing Charges Peer Review Finding Article Reviewers About How it Works For Reviewers Our Advisors Policies Glossary FAQs For Developers Newsroom Contact My Research Submissions Content and Tracking Alerts My 
Details Sign In file_upload Submit your research { "@context": "https://schema.org", "@type": "ScholarlyArticle", "mainEntityOfPage": { "@type": "WebPage", "@id": "https://f1000research.com/articles/13-1180" }, "headline": "Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential...", "datePublished": "2024-10-09T10:21:47", "dateModified": "2025-01-02T15:37:52", "author": [ { "@type": "Person", "name": "Eva Kohnert" }, { "@type": "Person", "name": "Clemens Kreutz" } ], "publisher": { "@type": "Organization", "name": "F1000Research", "logo": { "@type": "ImageObject", "url": "https://f1000research.com/img/AMP/F1000Research_image.png", "height": 480, "width": 60 } }, "image": { "@type": "ImageObject", "url": "https://f1000research.com/img/AMP/F1000Research_image.png", "height": 1200, "width": 150 }, "description": " Background Synthetic data’s utility in benchmark studies depends on its ability to closely mimic real-world conditions and reproduce results obtained from experimental data. Building on Nearing et al.’s study (1), who assessed 14 differential abundance tests using 38 experimental 16S rRNA datasets in a case-control design, we are generating synthetic datasets that mimic the experimental data to verify their findings. We will employ statistical tests to rigorously assess the similarity between synthetic and experimental data and to validate the conclusions on the performance of these tests drawn by Nearing et al. (1). This protocol adheres to the SPIRIT guidelines, demonstrating how established reporting frameworks can support robust, transparent, and unbiased study planning. Methods We replicate Nearing et al.’s (1) methodology, incorporating synthetic data simulated using two distinct tools, mirroring the 38 experimental datasets. 
Equivalence tests will be conducted on a non-redundant subset of 46 data characteristics comparing synthetic and experimental data, complemented by principal component analysis for overall similarity assessment. The 14 differential abundance tests will be applied to synthetic and experimental datasets, evaluating the consistency of significant feature identification and the number of significant features per tool. Correlation analysis and multiple regression will explore how differences between synthetic and experimental data characteristics may affect the results. Conclusions Synthetic data enables the validation of findings through controlled experiments. We assess how well synthetic data replicates experimental data, try to validate previous findings with the most recent versions of the DA methods and delineate the strengths and limitations of synthetic data in benchmark studies. Moreover, to our knowledge this is the first computational benchmark study to systematically incorporate synthetic data for validating differential abundance methods while strictly adhering to a pre-specified study protocol following SPIRIT guidelines, contributing to transparency, reproducibility, and unbiased research. " } { "@context": "http://schema.org", "@type": "BreadcrumbList", "itemListElement": [ { "@type": "ListItem", "position": "1", "item": { "@id": "https://f1000research.com/", "name": "Home" } }, { "@type": "ListItem", "position": "2", "item": { "@id": "https://f1000research.com/browse/articles", "name": "Browse" } }, { "@type": "ListItem", "position": "3", "item": { "@id": "https://f1000research.com/articles/13-1180/v2", "name": "Computational Study Protocol: Leveraging Synthetic Data to Validate..." } } ] } Home Browse Computational Study Protocol: Leveraging Synthetic Data to Validate... ALL Metrics - Views Downloads Get PDF Get XML Cite How to cite this article Kohnert E and Kreutz C. 
Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data [version 2; peer review: 2 approved] . F1000Research 2025, 13 :1180 ( https://doi.org/10.12688/f1000research.155230.2 ) NOTE: If applicable, it is important to ensure the information in square brackets after the title is included in all citations of this article. Close Copy Citation Details Export Export Citation Sciwheel EndNote Ref. Manager Bibtex ProCite Sente EXPORT Select a format first Track Share ▬ ✚ Study Protocol Revised Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data [version 2; peer review: 2 approved] Eva Kohnert https://orcid.org/0009-0007-9976-2441 1 , Clemens Kreutz 1 Eva Kohnert https://orcid.org/0009-0007-9976-2441 1 , Clemens Kreutz 1 PUBLISHED 02 Jan 2025 Author details Author details 1 Institute of Medical Biometry and Statistics, Faculty of Medicine and Medical Center, University of Freiburg, Baden-Württemberg, Germany Eva Kohnert Roles: Writing – Original Draft Preparation, Writing – Review & Editing Clemens Kreutz Roles: Writing – Original Draft Preparation, Writing – Review & Editing OPEN PEER REVIEW DETAILS REVIEWER STATUS This article is included in the Cell & Molecular Biology gateway. Abstract Background Synthetic data’s utility in benchmark studies depends on its ability to closely mimic real-world conditions and reproduce results obtained from experimental data. Building on Nearing et al.’s study (1), who assessed 14 differential abundance tests using 38 experimental 16S rRNA datasets in a case-control design, we are generating synthetic datasets that mimic the experimental data to verify their findings. 
We will employ statistical tests to rigorously assess the similarity between synthetic and experimental data and to validate the conclusions on the performance of these tests drawn by Nearing et al. (1). This protocol adheres to the SPIRIT guidelines, demonstrating how established reporting frameworks can support robust, transparent, and unbiased study planning. Methods We replicate Nearing et al.’s (1) methodology, incorporating synthetic data simulated using two distinct tools, mirroring the 38 experimental datasets. Equivalence tests will be conducted on a non-redundant subset of 46 data characteristics comparing synthetic and experimental data, complemented by principal component analysis for overall similarity assessment. The 14 differential abundance tests will be applied to synthetic and experimental datasets, evaluating the consistency of significant feature identification and the number of significant features per tool. Correlation analysis and multiple regression will explore how differences between synthetic and experimental data characteristics may affect the results. Conclusions Synthetic data enables the validation of findings through controlled experiments. We assess how well synthetic data replicates experimental data, try to validate previous findings with the most recent versions of the DA methods and delineate the strengths and limitations of synthetic data in benchmark studies. Moreover, to our knowledge this is the first computational benchmark study to systematically incorporate synthetic data for validating differential abundance methods while strictly adhering to a pre-specified study protocol following SPIRIT guidelines, contributing to transparency, reproducibility, and unbiased research. 
READ ALL READ LESS Keywords 16S, microbiome, differential abundance, simulation, synthetic data, benchmarking Corresponding Author(s) Eva Kohnert ( [email protected] ) Close Corresponding author: Eva Kohnert Competing interests: No competing interests were disclosed. Grant information: The author(s) declared that no grants were involved in supporting this work. Copyright: © 2025 Kohnert E and Kreutz C. This is an open access article distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. How to cite: Kohnert E and Kreutz C. Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data [version 2; peer review: 2 approved] . F1000Research 2025, 13 :1180 ( https://doi.org/10.12688/f1000research.155230.2 ) First published: 09 Oct 2024, 13 :1180 ( https://doi.org/10.12688/f1000research.155230.1 ) Latest published: 02 Jan 2025, 13 :1180 ( https://doi.org/10.12688/f1000research.155230.2 ) Revised Amendments from Version 1 The abstract and the study setting were updated to more clearly convey the scope of the study. In ‘intervention description {11a}’ more information about the data generating mechanism was added. A new strategy to remove highly correlated data characteristics was added in ‘choice of comparators {6b}’. For some hypotheses in aim 2 it was clarified which statistical test is used. Hypothesis 9 for aim 2b was changed. The discussion was extended and three new data characteristics were added to table 5. For the statistical tests ANCOM-BC was changed to ANCOM-II to be in line with the reference study. In multiple locations the text has been updated to be more precise. The abstract and the study setting were updated to more clearly convey the scope of the study. 
In ‘intervention description {11a}’ more information about the data generating mechanism was added. A new strategy to remove highly correlated data characteristics was added in ‘choice of comparators {6b}’. For some hypotheses in aim 2 it was clarified which statistical test is used. Hypothesis 9 for aim 2b was changed. The discussion was extended and three new data characteristics were added to table 5. For the statistical tests ANCOM-BC was changed to ANCOM-II to be in line with the reference study. In multiple locations the text has been updated to be more precise. See the authors' detailed response to the review by Leo Lahti and Juho Pelto See the authors' detailed response to the review by Jacob T. Nearing READ REVIEWER RESPONSES Protocol Table of Contents Introduction Background and rationale {6a} Objectives {7} Trial design {8} Summary Table Methods Study population/participants Study setting {9} Eligibility criteria {10} Who will take informed consent? {26a} Additional consent provisions for collection and use of participant data and biological specimens {26b} Interventions Explanation for the choice of comparators {6b} Intervention description {11a} Criteria for discontinuing or modifying allocated interventions {11b} Strategies to improve adherence to interventions {11c} Relevant concomitant care permitted or prohibited during the trial {11d} Provisions for post-trial care {30} Outcomes {12} Participant timeline {13} Sample size {14} Recruitment {15} Assignment of interventions: allocation Sequence generation {16a} Concealment mechanism {16b} Implementation {16c} Assignment of interventions: Blinding Who will be blinded {17a} Procedure for unblinding if needed {17b} Data collection and management Plans for assessment and collection of outcomes {18a} Plans to promote participant retention and complete follow-up {18b} Data management {19} Confidentiality {27} Plans for collection, laboratory evaluation and storage of biological specimens for genetic or 
molecular analysis in this trial/future use {33} Statistical methods {20} Data monitoring committee {21a} Statistical methods for primary and secondary outcomes {20a} Interim analyses {21b} Methods for additional analyses (e.g. subgroup analyses) {20b} Methods in analysis to handle protocol non-adherence and any statistical methods to handle missing data {20c} Plans to give access to the full protocol, participant level-data and statistical code {31c} Timeline Oversight and monitoring Composition of the coordinating centre and trial steering committee {5d} Composition of the data monitoring committee, its role and reporting structure {21a} Adverse event reporting and harms {22} Ancillary and post-trial care {30} Frequency and plans for auditing trial conduct {23} Plans for communicating important protocol amendments to relevant parties (e.g. trial participants, ethical committees) {25} Dissemination policy {31a} Discussion Abbreviations Declarations Acknowledgements Authors’ contributions {31b} Availability of data and materials {29} Ethics and consent {24} Consent for publication {32} Authors’ information (optional) Data availability References Study status Note: To achieve a rigorous methodology, this protocol adheres to an established standard and checklist for Standard Protocol Items: Recommendations for Interventional Trials (SPIRIT). 2 The numbers in curly brackets in this protocol refer to SPIRIT checklist item numbers. The order of the items has been modified to group similar items. Since using a standardized terminology for study designs is essential, we also formulate our protocol using standard terminology, i.e. terms such as study population, comparator, intervention, outcome, modification, inclusion and exclusion. Table 1. Administrative information summary. Title {1} Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data. Trial registration {2a and 2b}. 
Currently, there exists no registry tailored specifically to computational benchmark studies. This study does not involve interventions on humans or animals; rather, it exclusively incorporates publicly accessible sequencing data. Protocol version {3} December 17, 2024, Version 4 Grant information (Funding {4}) The author(s) declared that no third-party grants were involved in supporting this work. Author details {5a} Eva Kohnert: Institute of Medical Biometry and Statistics, Faculty of Medicine and Medical Center – University of Freiburg, Germany Clemens Kreutz: Institute of Medical Biometry and Statistics, Faculty of Medicine and Medical Center – University of Freiburg, Germany. Name and contact information for the trial sponsor {5b} n/a: There is no sponsor. Role of sponsor {5c} n/a: There is no sponsor. Introduction Background and rationale {6a} Differential abundance (DA) analysis of metagenomic microbiome data has emerged as a pivotal tool in understanding the complex dynamics of microbial communities across various environments and host organisms. 3 – 5 Microbiome studies are crucial for identifying specific microorganisms that differ significantly in abundance between different conditions, such as health and disease states, different environmental conditions, or before and after a treatment. The insights we gain from analyzing the differential abundance of microorganisms are critical to understanding the role that microbial communities play in environmental adaptations, disease development and health of the host. 6 Refining statistical methods for the identification of changes in microbial abundance is essential for understanding how these communities influence disease progression and other interactions with the host, which then enables new strategies for therapeutic interventions and diagnostic analyses. 7 The statistical interpretation of microbiome data is notably challenged by its inherent sparsity and compositional nature. 
Sparsity refers to the disproportionately high proportion of zeros in metagenomic sequencing data and requires tailored statistical methods, 8 , 9 e.g. to account for so-called structural zeros that originate from technical limitations rather than from real absence. 10 Additionally, due to the compositional aspect of microbiome data, regulation of highly abundant microbes can lead to biased quantification of low-abundant organisms. 11 Such bias might be erroneously interpreted as genuine regulation, although it arises mainly from the compositional character of the data. Such characteristics of microbiome data have a notable impact on the performance of common statistical approaches for DA analysis, limit their applicability to microbiome data and pose challenges for the optimal selection of DA tests. A number of benchmark studies have been conducted to evaluate the performance of DA tests in the analysis of microbiome data. 12 – 15 However, the results show a very heterogeneous picture, and clear guidelines or rules for the appropriate selection of DA tests have yet to be established. In order to assess and contextualize the findings of those studies, additional benchmarking efforts using a rigorous methodology, 16 , 17 as well as further experimental and synthetic benchmark data sets are essential. Synthetic data is frequently utilized to evaluate the performance of computational methods because for such simulated data the ‘correct’ or ‘true’ answer is known and can be used to assess whether a specific method can recover this known truth. 16 Moreover, characteristics of the data can be changed to explore the relationship between data characteristics such as effect size, variability or sample size and the performance of the considered methods. Several simulation tools have been introduced for generating synthetic microbiome data. 18 – 23 They cover a broad range of functionality. 
For example, MB-GAN 22 leverages generative adversarial networks to capture complex patterns and interactions present in the data, while metaSPARSim, 18 sparseDOSSA2 19 or nuMetaSim 24 employ different statistical models to generate microbiome data. Introducing a new simulation tool typically involves demonstrating its capacity to replicate key data characteristics. Nonetheless, an ongoing question persists regarding the feasibility of validating findings derived from experimental data when synthetic data, generated to embody the characteristics of the experimental data, is used in its place. Here we refer to the recent high-impact benchmark study of Nearing et al. 1 in which the performance of a comprehensive set of 14 DA tests applied to 38 experimental 16S microbiome data sets was compared. This 16S microbiome sequencing data is used to study communities in various environments, here from human gut, soil, wastewater, freshwater, plastisphere, marine and built environments. The data sets are presented in a two group design for which DA tools are applied to identify variations in species abundances between the groups. In this validation study we replicate the primary analysis conducted in the reference study by substituting the actual datasets with corresponding synthetic counterparts and using the most recent version of the DA methods. The objective is to explore the validity of the main findings from the reference benchmark study when the analysis workflow is repeated with an independent implementation and with synthetic data, generated to recapitulate the characteristics of the original real data. Objectives {7} Aim 1: Synthetic data, simulated based on an experimental template, overall reflect main data characteristics. Aim 2: Study results from Nearing et al. can be validated using synthetic data, simulated based on corresponding experimental data. 
Further objective(s): A further key aim of this study is to provide the first attempt to define a structured study protocol for methodological computational research. Trial design {8} Aim 1: Exploratory comparative study Aim 2: Confirmatory benchmark study Summary Table Table 2. Summary of hypotheses and the statistical analyses used for evaluation. Research question Hypothesis Statistical analyses Confirmation criteria Aim 1: Can state-of-the-art simulation tools for 16S rRNA sequencing data realistically generate synthetic data across a broad range of simulation templates? Main data characteristics calculated from synthetic data are equivalent to those of the experimental templates. Equivalence tests, i.e. two one-sided one-sample t-tests for each data characteristic as implemented in the TOSTER R-package. PCA of all data characteristics and equivalence test for Euclidean distances in 2D. We interpret a p-value < 0.05 for rejecting the null hypothesis “non-equivalence” as significant and then conclude that the respective data characteristic is equivalent. Aim 2: Can conclusions based on performance outcomes (proportion of significant taxa and overlap across DA tests) from 16S microbiome sequencing data be validated with synthetic data, simulated after calibration based on the experimental data used? Hypothesis 1: 13 extracted outcomes from (1) concerning the overlap of significant features across experimental data sets and DA tests can be confirmed based on their corresponding simulations. Hypothesis 2: 14 extracted outcomes from (1) concerning the proportion of significant features identified across multiple DA tools can be confirmed based on their corresponding simulations. For 23/27 hypotheses: Estimating the proportion P where the hypotheses are fulfilled by counting, and calculation of 95% confidence intervals a) for independent observations based on the SE formula b) for dependent observations using bootstrap. 
For 2/27 hypotheses: 2-way ANOVA For 1/27 hypothesis: mean of Kolmogorov-Smirnov test statistic For 1/27 hypothesis: visualization by histograms. For each hypothesis, we specified individual confirmation thresholds. In 18/27 cases, we use a 95% threshold as the criterion for the estimated proportion of cases in which the hypothesis is valid. We check whether these criteria are fulfilled by considering the 95% CI. For ANOVA and Kolmogorov-Smirnov tests, we also specify individual confirmation criteria. Methods Study population/participants In the context of our benchmark study, the study population is given by the experimental data sets from the reference study. 1 Study setting {9} While designing a benchmark study for assessing sensitivities and specificities of DA methods using simulated data, we recognized the need to first assess the feasibility of generating synthetic data that realistically resemble all characteristics of experimental data, to ensure the validity of conclusions drawn from simulated data. This led us to develop the validation study presented in this protocol, whose primary goal is to compare the results based on synthetic data to those from the reference study. This study is going to be followed by a subsequent benchmark study, in which the known truth in the simulated data sets will be used for performance testing, the dependence on characteristics such as effect size, sample size etc. will be systematically evaluated, and all recently published DA methods will be considered. Where possible, this study is conducted analogously to the benchmark study conducted by Nearing et al., 1 e.g. the same data and primary outcomes will be used. All data sets as provided by Nearing et al. 1 will be included in the study. We employ two published simulation tools, metaSPARSim 18 and sparseDOSSA2, 19 which have been developed for simulating microbial abundance profiles as they are generated by 16S sequencing. 
We also apply the same DA tests as in Ref. 1 , using their implementations in the R statistical programming language. In order to provide the most valuable results for the bioinformatics community, the latest versions of these implementations will be used. Eligibility criteria {10} Inclusion criteria We will include the same experimental data sets and DA tests as in Ref. 1 . Exclusion criteria There are no exclusion criteria for the data sets. Who will take informed consent? {26a} n/a: Data is publicly available, there is no need to obtain consent. Additional consent provisions for collection and use of participant data and biological specimens {26b} n/a: Data is publicly available, there is no need to obtain consent. Interventions Explanation for the choice of comparators {6b} For aim 1, the comparators are 46 data characteristics calculated from the 38 experimental data sets. These characteristics are chosen such that they provide a comprehensive description of count matrices and enable an unbiased comparison between experimental and synthetic data sets. They cover for example information about the sparsity in a data set, mean-variance trends of features (taxa), or effect sizes between groups of samples. Tables 4 and 5 provide a detailed summary of all data characteristics and how they are calculated. To prevent overrepresentation of specific distributional aspects in assessing similarity between experimental and simulated data, we exclude redundant data characteristics. This is achieved by calculating rank correlations for all data characteristic pairs across all data sets (experimental and simulated) and iteratively eliminating characteristics with a rank correlation ≥ 0.95. Table 3. Study status. 
Version Date Changes made Reason for changes 1 February 13, 2024 Initial submission as registered report to PLOS Biology and PLOS ONE (not accepted) Initial version Link to initial version: https://nxc-fredato.imbi.uni-freiburg.de/s/o6TsmZBMdngtamp 2 August 09, 2024 Clarify data used for the hypotheses; some minor text changes (no methodological changes) Hypotheses align to conclusions in Nearing et al.; make some sentences more precise Link to version with edits: https://nxc-fredato.imbi.uni-freiburg.de/s/jSRNoQxYzk5E6LW 3 August 13, 2024 Change order and naming of sections Naming of protocol sections needs to align with F1000 requirements Link to version with edits: https://nxc-fredato.imbi.uni-freiburg.de/s/j5ibzbXMwW3Ssj9 4 December 17, 2024 Change abstract and update study setting, Add more information about data generating mechanism in intervention description {11a}, In explanation for the choice of comparators {6b} add procedure to eliminate highly correlated DC, Clarify hypotheses for aim2, Extend discussion In table 5 add three data characteristics Address comments in revision Link to version with edits: https://nxc-fredato.imbi.uni-freiburg.de/s/wD34Ge6mYtkxWrB Table 4. Calculation of data characteristics in R. 
Name of data characteristic Name in matrix (data.prop) summarizing all data characteristics Calculation in R Dimension dat.cpm Counts per million normalized and log transformed data edgeR::cpm (dat, log=TRUE, prior.count = 1) mxn Feature sparsity data.prop$P0_feature apply (dat==0,1,sum)/ncol (dat) m Sample sparsity data.prop$P0_sample apply (dat==0,2,sum)/nrow (dat) n Feature mean abundance data.prop$mean_log2cpm apply (dat.cpm, 1,mean,na.rm=T) m Feature median abundance data.prop$median_log2cpm apply (dat.cpm,1, median,na.rm=T) m Feature variance data.prop$var_log2cpm apply (dat.cpm, 1, var) m Library size data.prop$lib_size colSums (dat) n Sample means data.prop$sample_means apply (dat,2,mean) n Sample correlation data.prop$corr_sample cor (dat, dat, method="spearman",use="na.or.complete") nxn Feature correlation data.prop$corr_feature cor(t (dat), t (dat), method="spearman",use="na.or.complete") mxm Table 5. Final integer values data characteristic and their calculation in R. Name of data characteristic Calculation in R Number of features nrow (dat) Number of samples ncol (dat) Sparsity of data set sum (dat==0)/length (dat) Median of data set median (dat,na.rm=TRUE) 95th Quantile quantile (dat,probs=.95) 99th Quantile quantile (dat,probs=.99) Mean library size mean (colSums (dat),na.rm = T) Median library size median (colSums (dat),na.rm = T) Standard deviation library size sd (colSums (dat),na.rm = T) Coefficient of variation of library size sd (colSums (dat),na.rm = T)/mean (colSums (dat),na.rm = T)*100 Maximum library size max (colSums (dat),na.rm = T) Minimum library size min (colSums (dat),na.rm = T) Read depth range between samples diff (range (colSums (dat),na.rm = T)) Mean sample richness mean (colSums (dat>0), na.rm=T) Spearman correlation library size with P0*(sample) cor (data.prop$P0_sample, data.prop$lib_size, method=“spearman”) Bimodality of feature correlations bimodalIndex (matrix (data.prop$corr_feature,nrow=1))$BI Bimodality of sample 
correlations bimodalIndex (matrix (data.prop$corr_sample,nrow=1))$BI Mean of all feature means mean (data.prop$mean_log2cpm,na.rm=T) SD of all feature means sd (data.prop$mean_log2cpm,na.rm=T) Median of all feature means median (data.prop$median_log2cpm,na.rm=T) SD of all feature medians sd (data.prop$median_log2cpm,na.rm=T) Mean of all feature variances mean (data.prop$var_log2cpm,na.rm=T) SD of all feature variances sd (data.prop$var_log2cpm,na.rm=T) Mean of all sample means mean (data.prop$sample_means,na.rm=T) SD of all sample means sd (data.prop$sample_means,na.rm=T) Mean of sample correlation matrix mean (data.prop$corr_sample,na.rm=T) SD of sample correlation matrix sd (data.prop$corr_sample,na.rm=T) Mean of feature correlation matrix mean (data.prop$corr_feature,na.rm=T) SD of feature correlation matrix sd (data.prop$corr_feature,na.rm=T) Mean-Variance relation: Linear component res <-lm(y~x+I(x^2),data=data.frame(y=data.prop$var_log2cpm,x=data.prop$mean_log2cpm)) res$coefficients[2] Mean-Variance relation: Quadratic component res=lm(y~x+I(x^2),data=data.frame(y=data.prop$var_log2cpm,x=data.prop$mean_log2cpm)) res$coefficients[3] Slope feature sparsity vs. 
feature mean res=lm(y~slope,data=data.frame (slope=data.prop$P0_feature-1,y=data.prop$mean_log2cpm)) res$coefficients[2] Clustering of features coef.hclust (hcluster (dat.tmp)) Clustering of samples coef.hclust (hcluster(t (dat.tmp))) Sample sparsity apply (dat==0,2,sum)/nrow (dat) Library sizes colSums (dat) Mean read depths apply (dat,2,mean) Feature sparsity apply (dat==0,1,sum)/ncol (dat) Feature mean intensity apply (dat.cpm, 1,mean) Feature median intensity apply (dat.cpm,1, median) Feature variances apply (dat.cpm, 1, var) Sample correlations cor (dat, dat, method=“spearman”) Feature correlations calc_feature_corr(dat) Mean inverse Simpson diversity mean (vegan::diversity (dat, index = “invsimpson”),na.rm=T) Mean Pilou evenness shannon_div <- vegan::diversity (count.data, index = “shannon”) richness 0,na.rm = T)) pilou <- shannon_div/log (richness) mean (pilou (dat),na.rm=T) Mean Bray-Curtis dissimilarity mean (vegan::vegdist (dat, method = “bray”),na.rm=T) For aim 2, 14 differential abundance (DA) tests are applied to the experimental data (ALDEx2, ANCOM-II, corncob, DESeq2, edgeR, LEfSe, limma voom (TMM), limma voom (TMMwsp), MaAsLin2, MaSsLin2 (rare), metagenomeSeq, t-test (rare), Wilcoxon test (CLR), Wilcoxon test (rare)), i.e. the outcomes (number of significant features) calculated from the experimental data sets will serve as comparator. As in Ref. 1 , we analyzed unfiltered data as well as data filtered with respect to features with a sufficient number of non-zero counts. Intervention description {11a} The intervention consists of using synthetic data instead of experimental data. When defining the interventions, we had to balance the complexity and runtime of the study, with the requirements of conducting the data simulation as realistic as possible and with a sufficient sample size. 
Due to the complexity of our study, the huge expected computational demands (see section 11b), and the fact that one key aim of this study is also to introduce and emphasize the importance of formulating a study protocol specifically for computational research, we decided to restrict our study to two published simulation approaches for 16S rRNA sequencing data. For each of the 38 experimental data sets, synthetic data will be simulated using metaSPARSim 18 version 1.1.2 and sparseDOSSA2 19 version 0.99.2 as simulation tools. Simulation parameters are calibrated using the experimental data, such that the simulated data reflect the experimental data template. Both simulation approaches offer such a calibration functionality. Multiple (N=10) data realizations will be generated for each experimental data template to assess the impact of different realizations of simulation noise and to test for significant differences between interventions and the comparator. The major calibration and simulation functions are called with the following options: # metaSPARSim calibration: params <- metaSPARSim::estimate_parameter_from_data(raw_data=counts, norm_data=norm_data, conditions=conditions, intensity_func = "mean", keep_zeros = TRUE) # metaSPARSim simulation: simData <- metaSPARSim (params) # SparseDOSSA2 calibration: fit <- SparseDOSSA2::fit_SparseDOSSA2(data=counts, control=list (verbose = TRUE, debug_dir = "./")) # SparseDOSSA2 simulation: simResult <- SparseDOSSA2::SparseDOSSA2(template=fit,new_features=F, n_sample=n_sample, n_feature=n_feature) To account for the two-group design of the data, calibration and simulation are conducted for each group independently. Specifically, the data are first split into two groups of samples according to the metadata. Then, calibration and simulation are performed for each group, and finally, the two simulated count matrices are merged. 
Paragraph “Modification by adjusting the proportions of zeros and effect sizes” in the following section 11b describes how this procedure is changed to adapt the effect size. For aim 1, the data characteristics will be computed for each of the synthetic and experimental data sets. For aim 2, 14 DA tests will be applied to the synthetic data generated in aim 1. Criteria for discontinuing or modifying allocated interventions {11b} For assessing the similarity of the synthetic data templates, we apply equivalence tests based on two one-sided t-tests as implemented in the TOSTER R-package with a 5% significance level. The scales of different DCs are inherently incomparable; for instance, the proportion of zeros ranges between 0 and 1, while the number of features varies from 327 to 59,736. We use the SD of the respective values from all experimental data templates as lower and upper margins. Figure 1 illustrates the equivalence testing procedure for the proportion of zeros in the whole data set as an exemplary data characteristic. For equivalence testing, the combined null hypothesis that the tested values are below the lower margin or above the upper margin has to be rejected to conclude equivalence. This only occurs when the average data characteristic of synthetic data is inside both margins and not too close to those two bounds, i.e. the whole 95% CI of the estimated mean has to be between both margins. Figure 1. Illustration of assessing similarity based on an equivalence test. The black dots indicate a data characteristic computed for experimental data sets (here the proportion of zeros). Equivalence tests require an interval that is considered as equivalent, given by lower and upper margin bounds (dashed lines). We use the SD over all values from experimental templates to define these margins. 
The values computed from the synthetic data for a template are considered as equivalent if values below the lower margin and above the upper margin can be rejected according to the prespecified significance level. Depending on the variation of the characteristic for the synthetic data (here indicated by the boxplot), the average characteristic has to be inside a region (brown region) that is smaller than the interval between both margins. Modification by adjusting the proportions of zeros and effect sizes If equivalence tests fail, i.e. the synthetic data turns out to be partly unrealistic, we try to reduce the number of failed tests by adjusting two important characteristics, the proportion of zeros in the synthetic data sets, and the effect size, i.e., magnitude of differences between the two groups of samples. Modifying the proportion of zeros will be performed by the following procedure for all synthetic data sets: 1. If the number of rows or columns of the experimental data template does not coincide with that of the synthetic data set, randomly add or delete columns and rows in the template. 2. Count the number of zeros that have to be added (or removed) for a simulated data set to obtain the same number as in the template. 3. If the simulation method does not generate data with matching order of features (i.e. rows), sort all rows of both count matrices according to row means. 4. Copy and replace an appropriate number of zeros (or non-zeros) one-by-one (i.e. with the same row and column indices) from the template to the synthetic data by randomly drawing those positions. 5. Reorder the rows to get the original ordering. 6. Check, whether the total number of failed equivalence tests across all data templates is reduced. Since we calibrate the simulation tools for both groups separately, all simulation parameters controlling the count distribution will be different in both groups. Therefore, we anticipate that the differences between both groups might be overestimated. 
We therefore try to make the simulation more realistic by modifying the effect size by the following procedure for all synthetic data sets: 1. Estimate the proportion of unregulated features from the results of all DA methods applied to the experimental data templates. This is done by the pi0est function in the qvalue R-package. 2. In addition to calibrations within the two groups of samples, the simulation tool is calibrated by using all samples from both groups (then there is no difference between both groups of samples) and a synthetic data set without considering the assignment of samples to groups is generated. This data set then has no differences between both groups. 3. Replace an appropriate number of rows in the original synthetic data with group differences by rows from the group-independent simulation that lack such differences. To ensure that rows with significant regulation are less frequently replaced, we randomly select the rows to be replaced by taking the FDR into account. Specifically, we use the FDR as a proxy for the probability that a taxon is unregulated, i.e. differentially abundant taxa are drawn via isDiffAbundant = runif(n=length (pvalues)) > p.adjust(pvalues, method="BH") to ensure that taxa with smaller p-values are more likely assigned to be differentially abundant. In addition to both individual modifications, we also apply both modifications in combination. For the following analyses, we then use the synthetic data where most data characteristics are equivalent. Exclusion criteria In our study, we use experimental data as templates for generating synthetic data which are then analyzed by DA methods. At both levels, generation of synthetic data and applying DA methods, we define exclusion or modification criteria in order to handle exceeding runtimes, computation errors, and unrealistic data simulation. Figure 2 shows an overview of these exclusion and modification steps. Figure 2. 
Overview about the analysis workflow and the exclusion/modification strategy. These criteria are applied to handle runtime issues, computation errors and unrealistic synthetic data. Exclusion of simulation for a specific data template based on simulation performance A simulation tool will be excluded for a specific data template, if calibration of the simulation parameters is not feasible. We define feasibility by the following criteria: 1) Calibration succeeds without error message 2) The runtime of the calibration procedure is below 7 days (168 hours) for one data template 3) The runtime of simulating a synthetic data set is below 1 hour for one synthetic data set All computations in this study will be performed on a Linux Debian x86_64 compute server with 64 AMD EPYC 7452 32-Core Processor CPUs. Although, we will run parts of the analyses in parallel mode, the specified computation times refer to runtimes on a single core. Exclusion of simulations for a specific data template based on deviating data properties For aim 2, we exclude synthetic data sets that are not similar enough to the experimental data sets used as templates. The goal of the following exclusion criterion is to exclude synthetic data sets that are overall strongly dissimilar from the experimental data template, without being too stringent since the simulation tools cannot perfectly resemble all data characteristics and therefore a slight or medium amount of dissimilarity has to be accepted. In general, dissimilarities are exploited to study the impact of those characteristics by investigating the association of such deviations with dissimilarity in outcomes. For assessing similarity, the data characteristics described before and specified in detail in Table 5 are used. We expect that a few data characteristics are very sensitive in discriminating experimental and synthetic data. 
To prevent loss of too many data sets, such characteristics that are non-equivalent for the majority (>50%) of templates (highlighted in gray in Figure 2 ) are only considered for the investigation of association between mismatch in outcome and mismatch in data characteristics but not for exclusion. Unrealistic synthetic data will be excluded for the primary analyses of the study using the remaining data characteristics. We define the exclusion criteria due to dissimilarity from its template by one of the following criteria: 1) The equivalence test based on Euclidean distance in the 2-dimensional PCA plot failed to indicate equivalence with the respective data templates. For equivalence testing, we use +/- 1 SD of the Euclidean distance over all exp. data templates as upper and lower margins. 2) We apply equivalence tests for the non-redundant subset of the 46 data characteristics individually. We then only consider data characteristics that are not highly discriminative. When counting non-equivalence of the remaining characteristics (highlighted in brown color in Figure 2 ) for a template, the synthetic data of those templates that appear as outliers will be removed (see example in Figure 2 ). We use the common outlier definition from boxplots, i.e. all values with distance to the 1st quartile (Q1) or 3rd quartile (Q3) larger than 1.5 x the inter-quartile range Q3-Q1 are considered as outliers. For evaluating the sensitivity with respect to exclusion, we perform an additional, secondary analysis on all synthetic data sets, regardless of similarity to the templates. Modification of differential abundance (DA) tests Inflated runtime Data sets with a large number of features could lead to inflating runtimes for some statistical tests. If the runtime threshold for an individual test is exceeded for a specific data set, we split the data set, apply the test again on the subsets and afterwards merge the results. 
This split and merge procedure is repeated until the test runtime is below the threshold. Here, we define the runtime threshold to be max. 1 hour per test. Then, in a worst case scenario, for each simulation tool the 14 tests for the 10+1 data sets for each of the 38 templates and taking unfiltered and filtered data into account (11,704 combinations) would need 488 days on a single core. Since we can conduct the tests on up to 64 cores, such a worst case scenario would still be manageable. Test failure If a DA test throws an error we omit the number of significant features and the overlap of significant features and report them as NA (not available) as it would occur in practice. Strategies to improve adherence to interventions {11c} n/a Relevant concomitant care permitted or prohibited during the trial {11d} n/a Provisions for post-trial care {30} n/a Outcomes {12} Aim 1: For each data set (experimental template and synthetic data) 46 data characteristics are calculated as described in Table 5 . The difference of a data characteristic between a synthetic data set and the corresponding data template is calculated as outcome. For each feature which is closer to a normal distribution on the log-scale according to p-values of the Shapiro-Wilk test, we apply a log2-transformation to the respective characteristics prior to all analyses. Principal component analysis (PCA) is then performed on the scaled data characteristics and a two-dimensional PCA plot is generated to summarize similarity of experimental and simulated data on the level of the computed data characteristics. An additional outcome is the Euclidean distance of a synthetic data set to its template in the first two principal component coordinates. An equivalence test will be conducted on the synthetic data sets for each template to check whether data properties are maintained in synthetic data on a summary level for all data characteristics. 
Next, boxplots are generated, visualizing for each data characteristic how it varies between templates, between all simulation realizations, and how templates deviate from the corresponding synthetic data sets. Here, we again perform equivalence tests and also report median distances of a data characteristic between simulated and experimental data. Aim 2: As primary outcome (aim 2a), for each experimental data template the average proportion of shared significant features across all synthetic data is calculated for each DA tool. For each data template, a barplot is generated as in Nearing et al. 1 to visually summarize how many of the 14 DA tools identified the same feature as significantly changed. Moreover, we try to validate the conclusions from Nearing et al. 1 made on this primary outcome. Overall, we extracted 13 conclusions and formulated the respective hypotheses as shown in Box 1 . Box 1. Hypotheses investigating the overlap of identified features as aim 2a extracted as conclusions from Ref. 1 including the statistical analysis to be applied. We term the statistical analysis that estimates the proportion P of cases (e.g. the proportion of synthetic data sets) where the hypothesis is fulfilled as “Counting”. Depending on the stringency of the formulation in Ref. 1 , we always define a question-specific threshold for confirmation and indicate the respective number of cases n for this evaluation in brackets. The asterisk * indicates that this number of cases might be reduced if exclusion criteria apply. Hypothesis 1 : For unfiltered data, the proportion of features jointly found as significant by limma voom TMM and limma voom TMMwsp but by less than 50% of the other methods, is larger than the overlap with more than 50% of the other methods. Analysis: Counting (n=380*) with 95% threshold, i.e. the hypothesis is validated if the 95% CI is > 95%. 
Hypothesis 2 : For unfiltered data, the overlap of features jointly found as significant by limma voom TMM and limma voom TMMwsp with features found by Wilcoxon CLR is larger than the overlap with all other DA methods. Analysis: Counting (n=380*) with 95% threshold. Hypothesis 3 : For unfiltered data, the Kolmogorov-Smirnov test statistic D when comparing the profile for Wilcoxon CLR and Wilcoxon rare is larger than for other pairs of methods on average. Analysis: Counting (n=380*) with 95% threshold. Hypothesis 4 : For unfiltered data, MaAsLin2 and MaAsLin2-rare have a more similar profile (larger test statistic D) than a randomly selected pair of methods. Analysis: Counting (n=380*) with 95% threshold. Hypothesis 5 : For unfiltered data, ALDEx2 and ANCOM-II identify more features that were also identified by all except 3 (i.e. 10 out of 13) other methods. Analysis: Counting (n=380*) with 95% threshold. Hypothesis 6 : For unfiltered data, edgeR and LEfSe identify a larger percentage of features that are not identified by any other tool, compared to the same percentage for all other methods. Analysis: Counting (n=380*) with 95% threshold. Hypothesis 7 : For unfiltered data, for corncob, metagenomeSeq, and DESeq2, there are always multiple other methods (i.e. at least 2 out of 10 other DA methods) that have a more extreme consistency profile. Analysis: Counting (n=380*) with 95% threshold. Mean consistency is used to assess the location of the consistency profile. Hypothesis 8 : The shape of the overlap profiles for all methods except both limma voom approaches is mainly determined by the exp. data set and the DA method, and only to a small extent by whether the data have been filtered. Analysis: qq-Plots of the cumulative overlap profile filtered vs. non-filtered are closer to the diagonal than comparison of different DA methods and comparison of different exp. data templates. Quantification using the mean of the Kolmogorov-Smirnov test statistic, i.e. 
the average of the maximal absolute distance of the empirical cumulative density functions. Hypothesis 9 : For filtered data, for both limma voom approaches the proportion of identified features that are also identified by the majority of other tests is larger than for un-filtered data Analysis: Counting (n=380*) with 95% threshold. Hypothesis 10 : For filtered data, the overlap profile of Wilcoxon CLR is bimodal. Analysis: Counting (n=380*) with 95% threshold using bimodality index (R function BimodalIndex::bimodalIndex). Only datasets with at least 10 significant features when using Wilcoxon CLR will be considered. Hypothesis 11 : The proportion of features identified by all except one DA method is larger for prevalence-filtered data. Analysis: Counting (n=380*) with 95% threshold. Hypothesis 12 : For filtered data, the consistency profiles of corncob, metagenomeSeq, and DESeq2 are more similar to the more extreme methods (as for Hypothesis 9 defined as the most extreme 2 profiles of other DA methods) than for unfiltered data. Analysis: Counting (n=380*) with 95% threshold using the Kolmogorov-Smirnov test statistic of the two 2 nd most extreme left- and right shifted other profiles. Hypothesis 13 : For filtered data, ALDEx2 and ANCOM-II identify more features that were also identified by all except 3 (i.e. 10 out of 13) other methods. Analysis: Counting (n=380*) with 95% threshold. As secondary outcome (aim 2b), the numbers and proportions of significant features across tools and data sets are considered. This outcome is reported and visualized in a heatmap with rows representing the data templates and columns the DA tools as in Nearing et al. 1 In total, two heatmaps, one for the experimental data and a second one for the simulated data, are generated. For the synthetic data sets, mean values from N=10 simulation realizations are calculated and plotted. Moreover, we try to validate the outcomes from Nearing et al. 1 made on this secondary outcome. 
Overall, we extracted 14 conclusions about the number of identified features and formulated the respective hypotheses as summarized in Box 2 . Box 2. Hypotheses investigating the proportion of identified features as aim 2b extracted as conclusions from Ref. 1 including the statistical analysis to be applied. We term the statistical analysis that estimates the proportion P of cases (e.g. the proportion of synthetic data sets) where the hypothesis is fulfilled as “Counting”. Depending on the stringency of the formulation in Ref. 1 , we always define a question-specific threshold for confirmation and indicate the respective number of cases n for this evaluation in brackets. The asterisk * indicates that this number of cases might be reduced if exclusion criteria apply. Hypothesis 1 : For filtered and unfiltered data, the percentage of significant features identified by each DA method varies widely across data sets. Analysis: Histograms of the range (= max – min) of the percentage values 1) for all data sets, and 2) for all DA methods when applied to data from each data template Hypothesis 2 : For filtered and unfiltered data, rankings of the DA methods with respect to the proportion of identified features depend on the data template. Analysis: 2-way ANOVA of the rank-transformed proportions using DA method and template as grouping variables indicate significant interaction effects between both variables. Hypothesis 3 : Rankings of the DA methods with respect to the proportion of identified features depend stronger on the data template in unfiltered data than in filtered data sets. Analysis: 2-way ANOVA of the ranks using DA method and template as grouping variables indicate a more significant interaction effect between both variables in unfiltered data (compared to the respective analysis for filtered data, see preceding hypothesis). 
Hypothesis 4 : In unfiltered data, either limma voom TMMwsp, limma voom TMM, Wilcoxon CLR, or LEfSe identify the largest proportion of significant features. Analysis: Counting ( n=380* ). Since in the original analysis, the observation was seen in all analysis, we classify the hypothesis as confirmed, if true in >95% of cases. Hypothesis 5 : For unfiltered data, there are data sets, where edgeR identifies the largest proportion of significant features. Analysis: Counting (n=380*) with >0 threshold. Hypothesis 6 : For unfiltered data, Limma voom TMMwsp identifies the largest proportion of features in the Human-HIV data set. Analysis: Counting (n=380*) in all synthetic data sets generated for this template with >95% threshold. Hypothesis 7 : For unfiltered data, there are data sets, where both limma voom methods identify more than 99% of features as significant. Analysis: Counting (n=380*) with >0 threshold. Hypothesis 8 : For unfiltered data, there are data sets, where Wilcoxon CLR identifies more than 90% of features as significant. Analysis: Counting (n=380*) with >0 threshold. Hypothesis 9 : For unfiltered data, there are data sets, where LEfSe identifies more taxa as significant compared with all other tools. Analysis: Counting (n=380*) with >0 threshold. Hypothesis 10 : In unfiltered data, either ALDEx2 or ANCOM-BC identify the fewest significant features. Analysis: Counting (n=380*) with >95% threshold. Hypothesis 11 : In unfiltered data, ALDEx2, ANCOM-II and corncob do not identify significantly more features than the most conservative tests. Analysis: Counting (n=380*) whether DA method is within the three most conservative ones (with >95% threshold). Hypothesis 12 : No tool (except ALDEx2) identifies a smaller number of features for unfiltered data (compared to filtered data). Analysis: Counting with >95% threshold. Hypothesis 13 : For filtered and unfiltered data, ANCOM-BC identifies the least significant features in total, i.e. 
when summing ranks of DA methods over all 38 templates. Analysis: Counting (n=10) whether the ranksum over 38* single synthetic data simulations (randomly drawn from N=10) of all other DA methods satisfies the hypothesis (>95% threshold without confidence intervals). Hypothesis 14 : For filtered data, there is no method other than edgeR, LEfSe, limma voom TMMwsp, limma voom TMM, or Wilcoxon CLR that identifies the largest number of significant features in total, i.e. when considering ranks of DA methods over all 38 templates. Analysis: Counting (n=10) whether the ranksum over 38* single synthetic data simulations (randomly drawn from N=10) of all other DA methods satisfies the hypothesis (<5% threshold without confidence intervals). Participant timeline {13} n/a Sample size {14} The used 38 experimental data sets and DA tests are defined by the reference study that is being validated. In our study, we can therefore only choose the number of synthetic data sets per data template reasonably. Since there is no pilot study for the outcomes obtained from synthetic data that can be used for sample size calculations, and because a large number of outcomes are considered, the number of simulated data sets for one template was chosen to be a) feasible in terms of run time and b) should be large enough to enable valid conclusions. Based on both aspects, we decided to simulate N=10 synthetic data sets for each experimental data template, i.e., 380 synthetic data sets for each simulation tool. For the one-sample equivalence tests with significance level 5% conducted for aim 1 , N=10 synthetic data sets have a power of 89.75% to reject both null hypotheses, i.e. that the data characteristic is below -1 SD or above +1 SD, and thereby favor the alternative hypothesis that the characteristic is equivalent to the reference value computed for the experimental data template. For this computation, we presumed that the expected mean (i.e. 
the bias of a characteristic in simulated data) is 0.5 SD and the standard deviation within synthetic data is 0.5 SD. Here, all specifications were made in units of SD, where SD refers to the standard deviation over all exp. templates. Assuming a smaller bias or a smaller variability for the synthetic data increases the expected power. This calculation has been conducted with Nquery version 8.7.2.0, equivalence test for one mean (MOE4-1). For the proportions P of fulfilled hypotheses that were estimated for addressing most hypotheses in aim 2, 95% Clopper-Pearson confidence intervals (R-package DescTools) are calculated. As an example, n=380 independent samples lead to P = 0.026, 95%-CI = [0.013, 0.047] if for 10 out of 380 synthetic data sets the tested hypothesis is violated. This sample size is available for 20/27 hypotheses. However, it should be noted that such sample size considerations are limited by the fact that the data characteristics and results for synthetic data sets from a template are likely to be similar to each other and it is therefore not permissible to consider all samples as independent. Recruitment {15} n/a Assignment of interventions: allocation Sequence generation {16a} The intervention and the comparator can be evaluated for all data sets and the order of the computations has no impact. Therefore, random allocation and sequence generation is not required. Concealment mechanism {16b} n/a Implementation {16c} n/a Assignment of interventions: Blinding Who will be blinded {17a} No blinding procedure will be applied because we see no risk of bias by a non-blinded calculation of data characteristics and statistical tests. Procedure for unblinding if needed {17b} n/a: No blinding performed. Data collection and management Plans for assessment and collection of outcomes {18a} The analyses will be conducted by two experienced Statisticians/Bioinformaticians (E.K., C.K.) both with > 5yrs experience with differential abundance analyses. 
We use the latest implementations of the DA methods but all statistical tests will adhere to the methodology outlined by Nearing et al. 1 To do so we chose the same data processing steps and configuration parameters of the statistical methods as implemented in the code from Nearing et al. 1 for running the statistical tests, provided on Github ( https://github.com/nearinj/Comparison_of_DA_microbiome_methods ). This strategy balances the tradeoff between assessing the most recent versions and maximizing comparability between the outcomes of the reference study and our validation study. Plans to promote participant retention and complete follow-up {18b} n/a Data management {19} The 38 experimental data sets were downloaded from https://figshare.com/articles/dataset/16S_rRNA_Microbiome_Datasets/14531724 on February 9, 2024. There, Nearing et al. 1 made the data sets from their study available, therefore we incorporate the exact same data sets. We keep a local copy of this data in our Fredato research data management system https://nxc-fredato.imbi.uni-freiburg.de until 31.12.2030 and make it available if the original data is not available in the current version any more and if this does not violate legal, data protection, or copyright regulations. Generated data, analysis scripts, results and supplemental information to this study will also be stored in the Fredato research data management once the study is conducted and undergoing peer review. In case of unexpected technical limitations, we will make data, analysis scripts and supplemental information available via https://figshare.com . Confidentiality {27} n/a: The experimental data is publicly available. The data generated in this project is not subject to data protection regulations. 
Plans for collection, laboratory evaluation and storage of biological specimens for genetic or molecular analysis in this trial/future use {33} n/a Statistical methods {20} Data monitoring committee {21a} n/a: The data has already been collected and is publicly available. Statistical methods for primary and secondary outcomes {20a} Aim 1 : For each data set (experimental and synthetic) a set of 46 data characteristics is calculated and a non-redundant subset is iteratively selected as specified in section 6b. All data characteristics are defined as a single number. These calculations are described in more detail in Table 5 . As N=10 simulation realizations are generated, there will be 10 values for each data characteristic per experimental data template. For the primary outcome a PCA plot based on the scaled data characteristics is generated. Moreover, equivalence tests are applied to the non-redundant subset of the 46 data characteristics as well as to the Euclidean distance in the two principal component coordinates as described above (section “Interventions”). Based on these equivalence tests, we test a single hypothesis (equivalence of synthetic and experimental data) which is fulfilled in strict terms only when all equivalence tests are significant. We therefore do not have to control the probability of a single false positive test (i.e. the so-called family-wise error rate). Therefore, multiple testing aspects do not apply to these tests. Aim 2 : All DA methods will be applied to experimental and synthetic data adhering to the methodology in Nearing et al. 1 as described in the methods section of the paper. Significant features will be identified using a 0.05 threshold for the multiple testing adjusted p-values (Benjamini-Hochberg). For the primary outcome, it is determined how many tests jointly identify features to be significant for each data set. 
After visualization, the 13 hypotheses extracted from 1 will be investigated using the statistical analyses summarized in Box 1 . Estimates of the target values and the respective 95% confidence intervals are used to validate the hypotheses because these values are easier to interpret than p-values and because the significance of the p-values is strongly determined by the number of cases. For the secondary outcome, the number and proportion of significant features is extracted for each data set and test individually. After visualization, the 14 hypotheses extracted from 1 will be investigated using the statistical analyses summarized in Box 2 . Confidence intervals for the estimated proportions of cases where the hypothesis is fulfilled will be calculated as exact intervals, i.e. using Clopper-Pearson intervals (R-package DescTools). If the analyzed cases are statistically dependent, we compute hierarchical bootstrap confidence intervals by first drawing templates with replacement, and then synthetic data sets within each template. These analyses are conducted for unfiltered data and for filtered data. As in 1 , filtered means that features found in fewer than 10% of samples are removed. Moreover, to analyze the sensitivity of our outcomes with respect to our exclusion criteria, filtered and unfiltered data are also analyzed without applying criteria that exclude non-realistic simulations. In case we find different results for the simulated data for some hypotheses, we will analyze the association of the mismatch in the outcome with the mismatch of data characteristics to identify data characteristics that could be responsible for the disagreement. To ensure independence of the scales, we will perform these analyses after rank transformations. We will use univariate analyses (i.e. Spearman correlation) as well as a forward selection with a 5% cut-off criterion for p-values. Interim analyses {21b} n/a: There will be no interim analyses in this study. 
Methods for additional analyses (e.g. subgroup analyses) {20b} n/a: In this study there will be no subgroup analyses. Methods in analysis to handle protocol non-adherence and any statistical methods to handle missing data {20c} n/a: In this study there is no missing data. Plans to give access to the full protocol, participant level-data and statistical code {31c} Generated data, analysis scripts and supplemental information to this study will also be stored in the Fredato research data management. In case of unexpected technical limitations, we will make data, analysis scripts and supplemental information available via https://figshare.com . Timeline There is no timeline for data collection, as the study is based on existing data. Conducting the study does not depend on any clinical parameters. The anticipated timeline for completing the study is 3 to 4 months. Oversight and monitoring Composition of the coordinating centre and trial steering committee {5d} n/a Composition of the data monitoring committee, its role and reporting structure {21a} n/a Adverse event reporting and harms {22} n/a Ancillary and post-trial care {30} n/a Frequency and plans for auditing trial conduct {23} n/a Plans for communicating important protocol amendments to relevant parties (e.g. trial participants, ethical committees) {25} n/a Dissemination policy {31a} Public access of the generated data, analysis scripts, results and supplemental information is granted as indicated above on our Fredato research data management system https://nxc-fredato.imbi.uni-freiburg.de . The results will be published in a peer-reviewed scientific journal, preferably in the same journal as this protocol. Discussion Synthetic data are a valuable tool in benchmark studies as they allow for controlled manipulation of data properties and evaluations based on the underlying known truths. We utilize this potential to validate previous findings derived on experimental data exclusively. 
However, statistical tests are always based on assumptions about the distribution of the data. Since real data sets are inherently different from synthetic data, the results may differ significantly between synthetic and experimental datasets, potentially resulting in misleading conclusions about the methods’ performances. Therefore, first it is critical to determine the extent to which synthetic data can faithfully reflect real experimental data. It may necessitate adjustments to simulation tools to ensure their capability to accurately mimic experimental conditions. Consequently, this study not only investigates the potential limitations of synthetic data in validating experimental outcomes but also sheds light on the general capacity of simulation tools to faithfully represent real-world data. Since unsuccessful validation indicates that the previously published findings do not generalize to synthetic data, we try to identify responsible inadequacies in the simulation tools. However, this aspect is limited to the data characteristics of our study and there might be additional data properties that require consideration to explain a possible mismatch. Finally, this benchmark study employs rigorous statistical methodology and, for the first time, publishes a study protocol in advance, adhering to established protocol guidelines. Moreover, it is the first benchmark study that primarily focuses on validation of the results of a previous study. 
Abbreviations ANCOM Analysis of compositions of microbiomes with bias correction, a DA method ANOVA analysis of variance DA differential abundance Fredato Freiburg research data management tool LEfSe Linear discriminant analysis Effect Size, a DA method MaAsLin Microbiome Multivariable Association with Linear Models, a DA method metaSPARSim acronym for “a sparse count matrix simulator intended for usage in development of 16S rDNA-seq metagenomic data processing pipelines” 18 p KS p-values of a Kolmogorov-Smirnov test PCA principal component analysis R a script-based statistical programming language sparseDOSSA2 abbreviation for “Sparse Data Observations for Simulating Synthetic Abundance”, a simulation tool for microbial count data based on a hierarchical model 19 TMM trimmed mean of M values, an approach for data normalization TMMwsp TMM with singleton pairing, an approach for data normalization Declarations Authors’ contributions {31b} E.K. and C.K. equally contributed to the design and development of the study. Ethics and consent {24} Ethical approval and consent were not required Consent for publication {32} n/a Authors’ information (optional) E.K.: Statistical data analyst and PhD student in the lab of Dr. Clemens Kreutz. Research topics cover robust data analysis of high throughput data with a specification for microbiome sequencing data. C.K.: Group Leader of the lab for Methods in Systems Biomedicine at the Institute for Medical Biometry and Statistics of University Medical Center of Freiburg. Research topics cover mathematical modelling for systems biomedicine and neutral benchmark studies. Both authors have neither been involved in the development of any of the applied simulation and DA methods nor in the reference study. 1 Study status Data availability {29} No data are associated with this article. Acknowledgements n/a References 1. Nearing JT, Douglas GM, Hayes MG, et al. 
: Microbiome differential abundance methods produce different results across 38 datasets. Nat. Commun. 2022; 13 (1). Publisher Full Text 2. Chan AW, Tetzlaff JM, Gøtzsche PC, et al. : SPIRIT 2013 explanation and elaboration: guidance for protocols of clinical trials. BMJ. 2013; 346 : 346. Publisher Full Text 3. Sorboni SG, Moghaddam HS, Jafarzadeh-Esfehani R, et al. : A Comprehensive Review on the Role of the Gut Microbiome in Human Neurological Disorders. Clin. Microbiol. Rev. 2022; 35 (1): e0033820. PubMed Abstract | Publisher Full Text | Free Full Text 4. Hou K, Wu ZX, Chen XY, et al. : Microbiota in health and diseases. Signal Transduct. Target. Ther. 2022; 7 (1). 5. Young VB: The role of the microbiome in human health and disease: An introduction for clinicians. BMJ. 2017; 356 : j831. Publisher Full Text 6. Li Q, Chan H, Liu WX, et al. : Carnobacterium maltaromaticum boosts intestinal vitamin D production to suppress colorectal cancer in female mice. Cancer Cell. 2023; 41 (8): 1450–1465.e8. PubMed Abstract | Publisher Full Text 7. Tang J, Wei Y, Pi C, et al. : The therapeutic value of bifidobacteria in cardiovascular disease. NPJ Biofilms Microbiomes. 2023; 9 (1): 82. PubMed Abstract | Publisher Full Text | Free Full Text 8. Jonsson V, Österlund T, Nerman O, et al. : Modelling of zero-inflation improves inference of metagenomic gene count data. Stat. Methods Med. Res. 2019; 28 (12): 3712–3728. PubMed Abstract | Publisher Full Text 9. Luz Calle M: Statistical analysis of metagenomics data. Genomics Inform. 2019; 17 (1). Publisher Full Text 10. Silverman JD, Roche K, Mukherjee S, et al. : Naught all zeros in sequence count data are the same. Comput. Struct. Biotechnol. J. 2020; 18 : 2789–2798. PubMed Abstract | Publisher Full Text | Free Full Text 11. Gloor GB, Macklaim JM, Pawlowsky-Glahn V, et al. : Microbiome datasets are compositional: And this is not optional. Front. Microbiol. 2017; 8 : 8. Publisher Full Text 12. 
Yang L, Chen J: A comprehensive evaluation of microbial differential abundance analysis methods: current status and potential solutions. Microbiome. 2022; 10 (1): 130. PubMed Abstract | Publisher Full Text | Free Full Text 13. Cappellato M, Baruzzo G, Di CB: Investigating differential abundance methods in microbiome data: A benchmark study. PLoS Comput. Biol. 2022; 18 (9): e1010467. PubMed Abstract | Publisher Full Text | Free Full Text 14. Weiss S, Xu ZZ, Peddada S, et al. : Normalization and microbial differential abundance strategies depend upon data characteristics. Microbiome. 2017; 5 (1): 27. PubMed Abstract | Publisher Full Text | Free Full Text 15. Calgaro M, Romualdi C, Waldron L, et al. : Assessment of statistical methods from single cell, bulk RNA-seq, and metagenomics applied to microbiome data. Genome Biol. 2020; 21 (1): 191. PubMed Abstract | Publisher Full Text | Free Full Text 16. Boulesteix AL, Morris T, Sauerbrei W, et al. : STRengthening Analytical Thinking for Observational Studies (STRATOS): Introducing the Simulation Panel (SP). Biom. Bull. 2020; 37 (2): 11–12. 17. Boulesteix AL, Binder H, Abrahamowicz M, et al. : On the necessity and design of studies comparing statistical methods. Biom. J. 2018; 60 (1): 216–218. Publisher Full Text 18. Patuzzi I, Baruzzo G, Losasso C, et al. : MetaSPARSim: A 16S rRNA gene sequencing count data simulator. BMC Bioinformatics. 2019; 20 : 416. PubMed Abstract | Publisher Full Text | Free Full Text 19. Ma S, Ren B, Mallick H, et al. : A statistical model for describing and simulating microbial community profiles. PLoS Comput. Biol. 2021; 17 (9): e1008913. PubMed Abstract | Publisher Full Text | Free Full Text 20. Richter DC, Ott F, Auch AF, et al. : MetaSim - A sequencing simulator for genomics and metagenomics. PLoS One. 2008; 3 (10): e3373. PubMed Abstract | Publisher Full Text | Free Full Text 21. Fritz A, Hofmann P, Majda S, et al. : CAMISIM: Simulating metagenomes and microbial communities. Microbiome. 
2019; 7 (1): 17. PubMed Abstract | Publisher Full Text | Free Full Text 22. Rong R, Jiang S, Xu L, et al. : MB-GAN: Microbiome Simulation via Generative Adversarial Network. Gigascience. 2021; 10 (2). PubMed Abstract | Publisher Full Text | Free Full Text 23. Williams J, Bravo HC, Tom J, et al. : MicrobiomeDASim: Simulating longitudinal differential abundance for microbiome data. F1000Res. 2020; 8 : 8. Publisher Full Text 24. Liu S, Hua K, Chen S, et al. : Comprehensive simulation of metagenomic sequencing data with non-uniform sampling distribution. Quantitative Biology. 2018; 6 (2): 175–185. Publisher Full Text Comments on this article Comments (0) Version 2 VERSION 2 PUBLISHED 09 Oct 2024 ADD YOUR COMMENT Comment Author details Author details 1 Institute of Medical Biometry and Statistics, Faculty of Medicine and Medical Center, University of Freiburg, Baden-Württemberg, Germany Eva Kohnert Roles: Writing – Original Draft Preparation, Writing – Review & Editing Clemens Kreutz Roles: Writing – Original Draft Preparation, Writing – Review & Editing Competing interests No competing interests were disclosed. Grant information The author(s) declared that no grants were involved in supporting this work. Article Versions (2) version 2 Revised Published: 02 Jan 2025, 13:1180 https://doi.org/10.12688/f1000research.155230.2 version 1 Published: 09 Oct 2024, 13:1180 https://doi.org/10.12688/f1000research.155230.1 Copyright © 2025 Kohnert E and Kreutz C. This is an open access article distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. Download Export To Sciwheel Bibtex EndNote ProCite Ref. Manager (RIS) Sente metrics Views Downloads F1000Research - - PubMed Central info_outline Data from PMC are received and updated monthly. 
- - Citations open_in_new 0 open_in_new 0 open_in_new SEE MORE DETAILS CITE how to cite this article Kohnert E and Kreutz C. Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data [version 2; peer review: 2 approved] . F1000Research 2025, 13 :1180 ( https://doi.org/10.12688/f1000research.155230.2 ) NOTE: If applicable, it is important to ensure the information in square brackets after the title is included in all citations of this article. COPY CITATION DETAILS track receive updates on this article Track an article to receive email alerts on any updates to this article. TRACK THIS ARTICLE Share Open Peer Review Current Reviewer Status: ? Key to Reviewer Statuses VIEW HIDE Approved The paper is scientifically sound in its current form and only minor, if any, improvements are suggested Approved with reservations A number of small changes, sometimes more significant revisions are required to address specific details and improve the papers academic merit. Not approved Fundamental flaws in the paper seriously undermine the findings and conclusions Version 2 VERSION 2 PUBLISHED 02 Jan 2025 Revised Views 0 Cite How to cite this report: Nearing JT. Reviewer Report For: Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data [version 2; peer review: 2 approved] . F1000Research 2025, 13 :1180 ( https://doi.org/10.5256/f1000research.176118.r355372 ) The direct URL for this report is: https://f1000research.com/articles/13-1180/v2#referee-response-355372 NOTE: it is important to ensure the information in square brackets after the title is included in this citation. Close Copy Citation Details Reviewer Report 23 Jan 2025 Jacob T. Nearing , Biostatistics, Harvard T.H. 
Chan School of Public Health, Boston, MA, USA Approved VIEWS 0 https://doi.org/10.5256/f1000research.176118.r355372 The authors have addressed all my comments in their revisions and clarified ... Continue reading READ ALL The authors have addressed all my comments in their revisions and clarified the manuscript's goals and objectives. I have no further comments to provide. Competing Interests: No competing interests were disclosed. Reviewer Expertise: Bioinformatics, microbiome I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard. Close READ LESS CITE CITE HOW TO CITE THIS REPORT Nearing JT. Reviewer Report For: Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data [version 2; peer review: 2 approved] . F1000Research 2025, 13 :1180 ( https://doi.org/10.5256/f1000research.176118.r355372 ) The direct URL for this report is: https://f1000research.com/articles/13-1180/v2#referee-response-355372 NOTE: it is important to ensure the information in square brackets after the title is included in all citations of this article. COPY CITATION DETAILS Report a concern Respond or Comment COMMENT ON THIS REPORT Views 0 Cite How to cite this report: Lahti L and Pelto J. Reviewer Report For: Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data [version 2; peer review: 2 approved] . F1000Research 2025, 13 :1180 ( https://doi.org/10.5256/f1000research.176118.r355371 ) The direct URL for this report is: https://f1000research.com/articles/13-1180/v2#referee-response-355371 NOTE: it is important to ensure the information in square brackets after the title is included in this citation. 
Close Copy Citation Details Reviewer Report 21 Jan 2025 Leo Lahti , University of Turku, Turku, Finland Juho Pelto , Department of Computing, University of Turku, Turku, Finland Approved VIEWS 0 https://doi.org/10.5256/f1000research.176118.r355371 The authors have responded to our comments satisfactorily. Most importantly, the rationale for and the objectives of the study are now more clearly expressed. However, in case that metaSPARSim and sparseDOSSA2 cannot produce (even after adjustments for ... Continue reading READ ALL The authors have responded to our comments satisfactorily. Most importantly, the rationale for and the objectives of the study are now more clearly expressed. However, in case that metaSPARSim and sparseDOSSA2 cannot produce (even after adjustments for sparsity and effect sizes) data sets that resemble the original 38 real data sets closely enough, it remains a limitation of the study that no other simulation tools are being used. While we understand that adding other simulation tools to the study would increase the run-time and complexity of the study, testing other tools could be considered at least conditionally, i.e. in case that metaSPARSim and sparseDOSSA2 cannot produce proper data. Competing Interests: No competing interests were disclosed. Reviewer Expertise: biostatistics We confirm that we have read this submission and believe that we have an appropriate level of expertise to confirm that it is of an acceptable scientific standard. Close READ LESS CITE CITE HOW TO CITE THIS REPORT Lahti L and Pelto J. Reviewer Report For: Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data [version 2; peer review: 2 approved] . 
F1000Research 2025, 13 :1180 ( https://doi.org/10.5256/f1000research.176118.r355371 ) The direct URL for this report is: https://f1000research.com/articles/13-1180/v2#referee-response-355371 NOTE: it is important to ensure the information in square brackets after the title is included in all citations of this article. COPY CITATION DETAILS Report a concern Respond or Comment COMMENT ON THIS REPORT Version 1 VERSION 1 PUBLISHED 09 Oct 2024 Views 0 Cite How to cite this report: Nearing JT. Reviewer Report For: Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data [version 2; peer review: 2 approved] . F1000Research 2025, 13 :1180 ( https://doi.org/10.5256/f1000research.170374.r340427 ) The direct URL for this report is: https://f1000research.com/articles/13-1180/v1#referee-response-340427 NOTE: it is important to ensure the information in square brackets after the title is included in this citation. Close Copy Citation Details Reviewer Report 05 Dec 2024 Jacob T. Nearing , Biostatistics, Harvard T.H. Chan School of Public Health, Boston, MA, USA Approved with Reservations VIEWS 0 https://doi.org/10.5256/f1000research.170374.r340427 In this protocol Kohnert and Kreutz design a study to assess microbiome differential abundance methods by replicating Nearing et al., 2022 using synthetic datasets generated from the original data using metaSPARSim and sparseDOSSA2. They intend to determine the similarity between ... Continue reading READ ALL In this protocol Kohnert and Kreutz design a study to assess microbiome differential abundance methods by replicating Nearing et al., 2022 using synthetic datasets generated from the original data using metaSPARSim and sparseDOSSA2. They intend to determine the similarity between the synthetic data generated and their underlying true datasets using 43 different data characteristics. 
Moreover, they hope to identify whether there are any driving characteristics in synthetic data that would result in discordant conclusions with those achieved by using the real datasets they are based upon. The authors identify that their protocol falls within SPIRIT guidelines and openly report all changes that were made to their report over its initial creation until now. Overall, the study is of interest to microbiome research, however, some areas require clarification, and the overall presentation of the protocol could be improved as major steps in some cases were not presented in order. Major comments: Well I appreciate the authors adherence to the SPIRIT guidelines it made the ordering of the steps in this protocol hard to follow as these guidelines do not fully align with computational benchmarking studies. For example, the under the section “ Modification by adjusting the proportions of zeros and effect sizes” it was mentioned that the groups would be simulated separately, however, the exact simulation procedure (and parameters for metaSPARSim and sparseDOSSA2) was not fully discussed prior to this. Additional information on the code and parameters used in this simulation software (including version # should be included). I suspect the authors may have difficulty fully recapturing all the true effect size differences between groups within the same study using synthetic simulated data. Given this I appreciated the in-depth discussion they gave, and the illustration presented in figure 2. However, I do believe that the study could benefit from treating equivalence as a sliding scale rather than a discreate category for each synthetic dataset based on an equivalence test for each data characteristic. It is possible that being equivalent in one characteristic will result in a much larger range than another and this should be taken into consideration during further data analysis. 
As an aside to this the authors should consider cases where standard deviations may become so large that almost any dataset may pass the equivalence test that it is subjected to (i.e. in figure 1 what happens if the upper and lower bounds cover almost the entire possible range of that data characteristic). In this case it may be reasonable to look at how many times a synthetic dataset from an unrelated template passes an equivalence test. (i.e. how many times does an unrelated dataset that shows very different DA results show equivalence and in what data characteristics does this occur in?). In figure 2 and in the section " Exclusion of simulations for a specific data template based on deviating data properties" it is suggested that some data characteristics may be too sensitive to include during equivalency testing, however, there is no guidelines to the best of my interpretation of the protocol to determine when a data characteristic would be too sensitive that it must be disregarded. Minor comments: Some tools included within this study may have undergone significant updates since the initial publication of Nearing et al., 2022. This should be considered in the analysis as even slightly differences may give rise to discordant results. As an aside to this the authors may want to consider including more recently developed methods in their study such as LOCOM, ANCOM-BC2 or radEmu. This would benefit the microbiome community. In some cases, the hypothesis they define for aims 1 and 2 should be reviewed for clarity. I have listed some examples below. The authors mention ANCOM-BC however, this DA tool was not considering in Nearing et al., 2022. It is possible that this was typo, and the authors were suggesting ANCOM-II, however, it is important to clarify this as ANCOM-II and ANCOM-BC are highly different methods. Hypothesis 8 in aim 1 should be adjusted as the shapes were similar except for the limma voom tools. 
Hypothesis 4 in aim 2 should specify what Wilcoxon method is being referred to. Hypothesis 9 in aim 2 suggests that in the original study LEFSE had results where more than 99% of features in a dataset were significant, however, I do think this was found in Nearing et al., 2022. Although it was identified to be on the more sensitive side of methods tested. Is the rationale for, and objectives of, the study clearly described? Partly Is the study design appropriate for the research question? Partly Are sufficient details of the methods provided to allow replication by others? Partly Are the datasets clearly presented in a useable and accessible format? Yes Competing Interests: No competing interests were disclosed. Reviewer Expertise: Bioinformatics, microbiome, I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however I have significant reservations, as outlined above. Close READ LESS CITE CITE HOW TO CITE THIS REPORT Nearing JT. Reviewer Report For: Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data [version 2; peer review: 2 approved] . F1000Research 2025, 13 :1180 ( https://doi.org/10.5256/f1000research.170374.r340427 ) The direct URL for this report is: https://f1000research.com/articles/13-1180/v1#referee-response-340427 NOTE: it is important to ensure the information in square brackets after the title is included in all citations of this article. COPY CITATION DETAILS Report a concern Author Response 02 Jan 2025 Eva Kohnert , Medical Center-University of Freiburg, Freiburg, Germany 02 Jan 2025 Author Response Major comments: Well I appreciate the authors adherence to the SPIRIT guidelines it made the ordering of the steps in this protocol hard to follow as these guidelines do not ... 
Continue reading Major comments: While I appreciate the authors' adherence to the SPIRIT guidelines, it made the ordering of the steps in this protocol hard to follow as these guidelines do not fully align with computational benchmarking studies. For example, under the section “Modification by adjusting the proportions of zeros and effect sizes” it was mentioned that the groups would be simulated separately, however, the exact simulation procedure (and parameters for metaSPARSim and sparseDOSSA2) was not fully discussed prior to this. Additional information on the code and parameters used in this simulation software (including version numbers) should be included. In the revised version of our study protocol, we describe in more detail how the simulated data is generated. We also added version numbers for the two simulation packages. All package versions will be provided when the study is accomplished. We also added a reference to paragraph “Modification by adjusting the proportions of zeros and effect sizes” in section 11b, where it is described how the simulation procedure is changed to adapt the effect size. I suspect the authors may have difficulty fully recapturing all the true effect size differences between groups within the same study using synthetic simulated data. Given this I appreciated the in-depth discussion they gave, and the illustration presented in figure 2. However, I do believe that the study could benefit from treating equivalence as a sliding scale rather than a discrete category for each synthetic dataset based on an equivalence test for each data characteristic. It is possible that being equivalent in one characteristic will result in a much larger range than another and this should be taken into consideration during further data analysis. As an aside to this the authors should consider cases where standard deviations may become so large that almost any dataset may pass the equivalence test that it is subjected to (i.e. 
in figure 1 what happens if the upper and lower bounds cover almost the entire possible range of that data characteristic). In this case it may be reasonable to look at how many times a synthetic dataset from an unrelated template passes an equivalence test. (i.e. how many times does an unrelated dataset that shows very different DA results show equivalence and in which data characteristics does this occur?). To address these concerns, we first want to emphasize that each data characteristic (DC) is computed independently for both experimental and simulated count matrices. The scales of different DCs are inherently incomparable; for instance, the proportion of zeros ranges between 0 and 1, while the number of features varies from 327 to 59,736. Thus, the vertical axis in Fig. X is in arbitrary units and +/- 1 SD always covers approx. 68% of the points from the exp. templates. Using the SD over the DCs computed from all exp. templates for defining the equivalence region can be seen as a normalization of the DCs to make them comparable. Our current equivalence testing yields a binary outcome (equivalent or not) to identify and exclude unrealistic simulations. We acknowledge the reviewer's suggestion to treat equivalence as a continuous outcome. However, implementing a sliding scale approach does not provide a final decision and presents challenges due to the diverse scales and distributions of DCs, and we would still have to define a strategy for classification beforehand. Deciding based on how many times a synthetic dataset from an unrelated template passes an equivalence test could behave in unintended ways if all synthetic data sets are unrealistic. Thus, we see no clear benefit, and a further cutoff would have to be defined. Nevertheless, if the reviewer prefers such an option, we are open to implementing this (e.g. by defining the equivalence region from the 10% and 90% quantiles of the DC difference to unrelated templates). 
In response to all reviewers’ feedback, we have refined our set of DCs by eliminating redundancies and incorporating measures of alpha and beta diversity. To further address the reviewer's concerns, we will include a visualization by plotting the number of non-equivalent characteristics against the size of the equivalence region for comparing simulated data with the respective templates and comparing with unrelated templates. This will illustrate the relationship between equivalence thresholds and the proportion of DCs deemed equivalent / non-equivalent, offering deeper insight into the sensitivity of our equivalence testing procedure. We hope that these clarifications and proposed modifications adequately address the reviewer's concerns and enhance our study design. In figure 2 and in the section "Exclusion of simulations for a specific data template based on deviating data properties" it is suggested that some data characteristics may be too sensitive to include during equivalency testing, however, there are no guidelines, to the best of my interpretation of the protocol, to determine when a data characteristic would be so sensitive that it must be disregarded. We apologize for not providing a clear definition earlier. We now define data characteristics more precisely: we will only consider those that pass equivalence tests for at least 50% of the templates. Minor comments: Some tools included within this study may have undergone significant updates since the initial publication of Nearing et al., 2022. This should be considered in the analysis as even slight differences may give rise to discordant results. As an aside to this the authors may want to consider including more recently developed methods in their study such as LOCOM, ANCOM-BC2 or radEmu. This would benefit the microbiome community. As written in our study protocol (section 9), we’ll apply the latest versions of the DA methods since this seems most valuable. 
We are aware that updates of the DA methods may be responsible for failed validations. However, we see assessment and confirmation using the most recent tools as more important than preventing the risk that disagreement might originate from improved DA tools. Since a key feature of our study is to perform validations following a strict methodology, we do not want to mix validation with evaluation of other tests. This will be investigated in a subsequent study where we focus on sensitivities and specificities. In the revised version of the protocol, we mention this aspect and better emphasize our overall strategy. We emphasize this strategy in the revised version of the study protocol at several points and use wordings like “validation with the most recent implementations/versions”. In some cases, the hypotheses they define for aims 1 and 2 should be reviewed for clarity. I have listed some examples below. The authors mention ANCOM-BC however, this DA tool was not considered in Nearing et al., 2022. It is possible that this was a typo, and the authors were suggesting ANCOM-II, however, it is important to clarify this as ANCOM-II and ANCOM-BC are highly different methods. We thank the reviewer for this note. Upon reflection, we recognize that we indeed planned to use a different ANCOM method than the one employed in Nearing et al. They utilized a custom script downloaded “on May 11, 2020, from the FrederickHuangLin/ANCOM" repository on GitHub. We will use this script for our analysis to be as consistent as possible with the reference study. Hypothesis 8 in aim 1 should be adjusted as the shapes were similar except for the limma voom tools. Thank you for pointing this out. We changed the hypothesis accordingly. Hypothesis 4 in aim 2 should specify which Wilcoxon method is being referred to. We checked all hypotheses and clearly indicated which method is being referred to. 
Hypothesis 9 in aim 2 suggests that in the original study LEFSE had results where more than 99% of features in a dataset were significant, however, I do think this was found in Nearing et al., 2022. Although it was identified to be on the more sensitive side of methods tested. Thank you for pointing that out. We changed the hypothesis to be in line with the results from the reference paper. Is the rationale for, and objectives of, the study clearly described? Partly We acknowledge that the rationale and objectives of the study may not have been communicated clearly. To clarify this, we changed the abstract. To summarize, this study is essentially a validation study, focusing on evaluating the feasibility of using synthetic data to resemble experimental data and whether we replicate findings from the reference study based on multiple simulated data sets. Additionally, it is a showcase how to incorporate study protocols in computational studies and to rigorously plan a benchmark study before actually conducting it. This study serves as groundwork for a subsequent benchmark study, which will include a broader range of data sets for performance testing. Is the study design appropriate for the research question? Partly We agree that simulated data has limitations in fully validating previous findings due to inherent differences from experimental data. To address this, we calculate a comprehensive set of data characteristics, perform equivalence tests, and select only simulated data that does not show significant disagreement with experimental data. In our view, this approach is the best we can do to minimize the differences and ensure comparability between simulated and experimental datasets. Moreover, applying the presented equivalence testing procedure represents a novel methodological improvement compared to existing approaches. Are sufficient details of the methods provided to allow replication by others? 
Partly In the revised version we better explain the data generation mechanism in particular, by including more details on the simulation process. All data sets, experimental and generated, as well as all source code to run the study will be made available once the paper describing the results undergoes the review process. Are the datasets clearly presented in a useable and accessible format? Yes Major comments: While I appreciate the authors' adherence to the SPIRIT guidelines, it made the ordering of the steps in this protocol hard to follow as these guidelines do not fully align with computational benchmarking studies. For example, under the section “Modification by adjusting the proportions of zeros and effect sizes” it was mentioned that the groups would be simulated separately, however, the exact simulation procedure (and parameters for metaSPARSim and sparseDOSSA2) was not fully discussed prior to this. Additional information on the code and parameters used in this simulation software (including version numbers) should be included. In the revised version of our study protocol, we describe in more detail how the simulated data is generated. We also added version numbers for the two simulation packages. All package versions will be provided when the study is accomplished. We also added a reference to paragraph “Modification by adjusting the proportions of zeros and effect sizes” in section 11b, where it is described how the simulation procedure is changed to adapt the effect size. I suspect the authors may have difficulty fully recapturing all the true effect size differences between groups within the same study using synthetic simulated data. Given this I appreciated the in-depth discussion they gave, and the illustration presented in figure 2. However, I do believe that the study could benefit from treating equivalence as a sliding scale rather than a discrete category for each synthetic dataset based on an equivalence test for each data characteristic. 
It is possible that being equivalent in one characteristic will result in a much larger range than another and this should be taken into consideration during further data analysis. As an aside to this the authors should consider cases where standard deviations may become so large that almost any dataset may pass the equivalence test that it is subjected to (i.e. in figure 1 what happens if the upper and lower bounds cover almost the entire possible range of that data characteristic). In this case it may be reasonable to look at how many times a synthetic dataset from an unrelated template passes an equivalence test. (i.e. how many times does an unrelated dataset that shows very different DA results show equivalence and in what data characteristics does this occur in?). To address these concerns, we first want to emphasize that each data characteristic (DC) is computed independently for both experimental and simulated count matrices. The scales of different DCs are inherently incomparable; for instance, the proportion of zeros ranges between 0 and 1, while the number of features varies from 327 to 59,736. Thus, the vertical axis in Fig. X is in arbitrary units and +/- 1 SD always cover approx. 68% of the points from the exp. templates. Using the SD over the DCs computed from all exp. templates for defining the equivalence region can be seen as a normalization of the DCs to make it comparable. Our current equivalence testing yields a binary outcome (equivalent or not) to identify and exclude unrealistic simulations. We acknowledge the reviewer's suggestion to treat equivalence as a continuous outcome. However, implementing a sliding scale approach does not provide a final decision and presents challenges due to the diverse scales and distributions of DCs and we have to define a strategy for classification beforehand. 
Deciding based on how many times a synthetic dataset from an unrelated template passes an equivalence test could behave unintendedly if all synthetic data sets are unrealistic. Thus, we see no clear benefit and a further cutoff has to be defined. Nevertheless, if the reviewer prefers such an option, we are open to implement this (e.g. by defining the equivalence region from the 10% and 90% quantiles of the DC difference to unrelated templates). In response to all reviewers’ feedback, we have refined our set of DCs by eliminating redundancies and incorporating measures of alpha and beta diversity. To further address the reviewer's concerns, we will include a visualization by plotting the number of non-equivalent characteristics against the size of the equivalence region for comparing simulated data with the respective templates and comparing with unrelated templates. This will illustrate the relationship between equivalence thresholds and the proportion of DCs deemed equivalent / non-equivalent, offering deeper insight into the sensitivity of our equivalence testing procedure. We hope that these clarifications and proposed modifications adequately address the reviewer's concerns and enhance our study design. In figure 2 and in the section "Exclusion of simulations for a specific data template based on deviating data properties" it is suggested that some data characteristics may be too sensitive to include during equivalency testing, however, there is no guidelines to the best of my interpretation of the protocol to determine when a data characteristic would be too sensitive that it must be disregarded. We apologize for not providing a clear definition earlier. We now define data characteristics more precisely: we will only consider those that pass equivalence tests for at least 50% of the templates. Minor comments: Some tools included within this study may have undergone significant updates since the initial publication of Nearing et al., 2022. 
This should be considered in the analysis as even slightly differences may give rise to discordant results. As an aside to this the authors may want to consider including more recently developed methods in their study such as LOCOM, ANCOM-BC2 or radEmu. This would benefit the microbiome community. As written in our study protocol (section 9), we’ll apply the latest versions of the DA methods since this seems mostly valuable. We are aware, that updates of the DA methods may be responsible for failed validations. However, we see assessment and confirmation using the most recent tools as more important than preventing the risk that disagreement might originate from improved DA tools. Since a key feature of our study is to strictly perform validations following a strict methodology, we do not want to mix validation with evaluation of other tests. This will be investigated in a subsequent study where we focus on sensitivities and specificities. In the revised version of the protocol, we mention this aspect and better emphasize our overall strategy. We emphasize this strategy in the revised version of the study protocol at several points and use wordings like “validation with the most recent implementations/versions”. In some cases, the hypothesis they define for aims 1 and 2 should be reviewed for clarity. I have listed some examples below. The authors mention ANCOM-BC however, this DA tool was not considering in Nearing et al., 2022. It is possible that this was typo, and the authors were suggesting ANCOM-II, however, it is important to clarify this as ANCOM-II and ANCOM-BC are highly different methods. We thank the reviewer for this note. Upon reflection, we recognize that we indeed planned to use a different ANCOM method than the one employed in Nearing et al. They utilized a custom script downloaded “on May 11, 2020, from the FrederickHuangLin/ANCOM" repository on GitHub. 
We will use this script for our analysis to be as consistent as possible with the reference study. Hypothesis 8 in aim 1 should be adjusted as the shapes were similar except for the limma voom tools. Thank you for pointing this out. We changed the hypothesis accordingly. Hypothesis 4 in aim 2 should specify what Wilcoxon method is being referred to. We checked all hypothesis and clearly indicated which method is being referred to. Hypothesis 9 in aim 2 suggests that in the original study LEFSE had results where more than 99% of features in a dataset were significant, however, I do think this was found in Nearing et al., 2022. Although it was identified to be on the more sensitive side of methods tested. Thank you for pointing that out. We changed the hypothesis to be in line with the results from the reference paper. Is the rationale for, and objectives of, the study clearly described? Partly We acknowledge that the rationale and objectives of the study may not have been communicated clearly. To clarify this, we changed the abstract. To summarize, this study is essentially a validation study, focusing on evaluating the feasibility of using synthetic data to resemble experimental data and whether we replicate findings from the reference study based on multiple simulated data sets. Additionally, it is a showcase how to incorporate study protocols in computational studies and to rigorously plan a benchmark study before actually conducting it. This study serves as groundwork for a subsequent benchmark study, which will include a broader range of data sets for performance testing. Is the study design appropriate for the research question? Partly We agree that simulated data has limitations in fully validating previous findings due to inherent differences from experimental data. 
To address this, we calculate a comprehensive set of data characteristics, perform equivalence tests, and select only simulated data that does not show significant disagreement with experimental data. In our view, this approach is the best we can do to minimize the differences and ensure comparability between simulated and experimental datasets. Moreover, applying the presented equivalence testing procedure represents a novel methodological improvement compared to existing approaches. Are sufficient details of the methods provided to allow replication by others? Partly In the revised version we better explain especially that data generation mechanism, by including more details for the simulation process. All data sets, experimental and generated, as well as all source code to run the study will be made available once the paper describing the results will undergo review process. Are the datasets clearly presented in a useable and accessible format? Yes Competing Interests: No competing interests were disclosed. Close Report a concern Respond or Comment COMMENTS ON THIS REPORT Author Response 02 Jan 2025 Eva Kohnert , Medical Center-University of Freiburg, Freiburg, Germany 02 Jan 2025 Author Response Major comments: Well I appreciate the authors adherence to the SPIRIT guidelines it made the ordering of the steps in this protocol hard to follow as these guidelines do not ... Continue reading Major comments: Well I appreciate the authors adherence to the SPIRIT guidelines it made the ordering of the steps in this protocol hard to follow as these guidelines do not fully align with computational benchmarking studies. For example, the under the section “Modification by adjusting the proportions of zeros and effect sizes” it was mentioned that the groups would be simulated separately, however, the exact simulation procedure (and parameters for metaSPARSim and sparseDOSSA2) was not fully discussed prior to this. 
Additional information on the code and parameters used in this simulation software (including version # should be included). In the revised version of our study protocol, we describe in more detail, how the simulated data is generated. We also added version numbers for the two simulation packages. All package versions will be provided when the study is accomplished. We also added a reference to paragraph “Modification by adjusting the proportions of zeros and effect sizes” in section 11b where it is described, how the simulation procedure is changed to adapt the effect size. I suspect the authors may have difficulty fully recapturing all the true effect size differences between groups within the same study using synthetic simulated data. Given this I appreciated the in-depth discussion they gave, and the illustration presented in figure 2. However, I do believe that the study could benefit from treating equivalence as a sliding scale rather than a discreate category for each synthetic dataset based on an equivalence test for each data characteristic. It is possible that being equivalent in one characteristic will result in a much larger range than another and this should be taken into consideration during further data analysis. As an aside to this the authors should consider cases where standard deviations may become so large that almost any dataset may pass the equivalence test that it is subjected to (i.e. in figure 1 what happens if the upper and lower bounds cover almost the entire possible range of that data characteristic). In this case it may be reasonable to look at how many times a synthetic dataset from an unrelated template passes an equivalence test. (i.e. how many times does an unrelated dataset that shows very different DA results show equivalence and in what data characteristics does this occur in?). 
To address these concerns, we first want to emphasize that each data characteristic (DC) is computed independently for both experimental and simulated count matrices. The scales of different DCs are inherently incomparable; for instance, the proportion of zeros ranges between 0 and 1, while the number of features varies from 327 to 59,736. Thus, the vertical axis in Fig. X is in arbitrary units and +/- 1 SD always cover approx. 68% of the points from the exp. templates. Using the SD over the DCs computed from all exp. templates for defining the equivalence region can be seen as a normalization of the DCs to make it comparable. Our current equivalence testing yields a binary outcome (equivalent or not) to identify and exclude unrealistic simulations. We acknowledge the reviewer's suggestion to treat equivalence as a continuous outcome. However, implementing a sliding scale approach does not provide a final decision and presents challenges due to the diverse scales and distributions of DCs and we have to define a strategy for classification beforehand. Deciding based on how many times a synthetic dataset from an unrelated template passes an equivalence test could behave unintendedly if all synthetic data sets are unrealistic. Thus, we see no clear benefit and a further cutoff has to be defined. Nevertheless, if the reviewer prefers such an option, we are open to implement this (e.g. by defining the equivalence region from the 10% and 90% quantiles of the DC difference to unrelated templates). In response to all reviewers’ feedback, we have refined our set of DCs by eliminating redundancies and incorporating measures of alpha and beta diversity. To further address the reviewer's concerns, we will include a visualization by plotting the number of non-equivalent characteristics against the size of the equivalence region for comparing simulated data with the respective templates and comparing with unrelated templates. 
This will illustrate the relationship between equivalence thresholds and the proportion of DCs deemed equivalent / non-equivalent, offering deeper insight into the sensitivity of our equivalence testing procedure. We hope that these clarifications and proposed modifications adequately address the reviewer's concerns and enhance our study design. In figure 2 and in the section "Exclusion of simulations for a specific data template based on deviating data properties" it is suggested that some data characteristics may be too sensitive to include during equivalency testing, however, there is no guidelines to the best of my interpretation of the protocol to determine when a data characteristic would be too sensitive that it must be disregarded. We apologize for not providing a clear definition earlier. We now define data characteristics more precisely: we will only consider those that pass equivalence tests for at least 50% of the templates. Minor comments: Some tools included within this study may have undergone significant updates since the initial publication of Nearing et al., 2022. This should be considered in the analysis as even slightly differences may give rise to discordant results. As an aside to this the authors may want to consider including more recently developed methods in their study such as LOCOM, ANCOM-BC2 or radEmu. This would benefit the microbiome community. As written in our study protocol (section 9), we’ll apply the latest versions of the DA methods since this seems mostly valuable. We are aware, that updates of the DA methods may be responsible for failed validations. However, we see assessment and confirmation using the most recent tools as more important than preventing the risk that disagreement might originate from improved DA tools. Since a key feature of our study is to strictly perform validations following a strict methodology, we do not want to mix validation with evaluation of other tests. 
This will be investigated in a subsequent study where we focus on sensitivities and specificities. In the revised version of the protocol, we mention this aspect and better emphasize our overall strategy. We emphasize this strategy in the revised version of the study protocol at several points and use wordings like “validation with the most recent implementations/versions”. In some cases, the hypothesis they define for aims 1 and 2 should be reviewed for clarity. I have listed some examples below. The authors mention ANCOM-BC however, this DA tool was not considering in Nearing et al., 2022. It is possible that this was typo, and the authors were suggesting ANCOM-II, however, it is important to clarify this as ANCOM-II and ANCOM-BC are highly different methods. We thank the reviewer for this note. Upon reflection, we recognize that we indeed planned to use a different ANCOM method than the one employed in Nearing et al. They utilized a custom script downloaded “on May 11, 2020, from the FrederickHuangLin/ANCOM" repository on GitHub. We will use this script for our analysis to be as consistent as possible with the reference study. Hypothesis 8 in aim 1 should be adjusted as the shapes were similar except for the limma voom tools. Thank you for pointing this out. We changed the hypothesis accordingly. Hypothesis 4 in aim 2 should specify what Wilcoxon method is being referred to. We checked all hypothesis and clearly indicated which method is being referred to. Hypothesis 9 in aim 2 suggests that in the original study LEFSE had results where more than 99% of features in a dataset were significant, however, I do think this was found in Nearing et al., 2022. Although it was identified to be on the more sensitive side of methods tested. Thank you for pointing that out. We changed the hypothesis to be in line with the results from the reference paper. Is the rationale for, and objectives of, the study clearly described? 
Partly We acknowledge that the rationale and objectives of the study may not have been communicated clearly. To clarify this, we changed the abstract. To summarize, this study is essentially a validation study, focusing on evaluating the feasibility of using synthetic data to resemble experimental data and whether we replicate findings from the reference study based on multiple simulated data sets. Additionally, it is a showcase how to incorporate study protocols in computational studies and to rigorously plan a benchmark study before actually conducting it. This study serves as groundwork for a subsequent benchmark study, which will include a broader range of data sets for performance testing. Is the study design appropriate for the research question? Partly We agree that simulated data has limitations in fully validating previous findings due to inherent differences from experimental data. To address this, we calculate a comprehensive set of data characteristics, perform equivalence tests, and select only simulated data that does not show significant disagreement with experimental data. In our view, this approach is the best we can do to minimize the differences and ensure comparability between simulated and experimental datasets. Moreover, applying the presented equivalence testing procedure represents a novel methodological improvement compared to existing approaches. Are sufficient details of the methods provided to allow replication by others? Partly In the revised version we better explain especially that data generation mechanism, by including more details for the simulation process. All data sets, experimental and generated, as well as all source code to run the study will be made available once the paper describing the results will undergo review process. Are the datasets clearly presented in a useable and accessible format? 
Yes Major comments: Well I appreciate the authors adherence to the SPIRIT guidelines it made the ordering of the steps in this protocol hard to follow as these guidelines do not fully align with computational benchmarking studies. For example, the under the section “Modification by adjusting the proportions of zeros and effect sizes” it was mentioned that the groups would be simulated separately, however, the exact simulation procedure (and parameters for metaSPARSim and sparseDOSSA2) was not fully discussed prior to this. Additional information on the code and parameters used in this simulation software (including version # should be included). In the revised version of our study protocol, we describe in more detail, how the simulated data is generated. We also added version numbers for the two simulation packages. All package versions will be provided when the study is accomplished. We also added a reference to paragraph “Modification by adjusting the proportions of zeros and effect sizes” in section 11b where it is described, how the simulation procedure is changed to adapt the effect size. I suspect the authors may have difficulty fully recapturing all the true effect size differences between groups within the same study using synthetic simulated data. Given this I appreciated the in-depth discussion they gave, and the illustration presented in figure 2. However, I do believe that the study could benefit from treating equivalence as a sliding scale rather than a discreate category for each synthetic dataset based on an equivalence test for each data characteristic. It is possible that being equivalent in one characteristic will result in a much larger range than another and this should be taken into consideration during further data analysis. As an aside to this the authors should consider cases where standard deviations may become so large that almost any dataset may pass the equivalence test that it is subjected to (i.e. 
in figure 1 what happens if the upper and lower bounds cover almost the entire possible range of that data characteristic). In this case it may be reasonable to look at how many times a synthetic dataset from an unrelated template passes an equivalence test. (i.e. how many times does an unrelated dataset that shows very different DA results show equivalence and in what data characteristics does this occur in?). To address these concerns, we first want to emphasize that each data characteristic (DC) is computed independently for both experimental and simulated count matrices. The scales of different DCs are inherently incomparable; for instance, the proportion of zeros ranges between 0 and 1, while the number of features varies from 327 to 59,736. Thus, the vertical axis in Fig. X is in arbitrary units and +/- 1 SD always cover approx. 68% of the points from the exp. templates. Using the SD over the DCs computed from all exp. templates for defining the equivalence region can be seen as a normalization of the DCs to make it comparable. Our current equivalence testing yields a binary outcome (equivalent or not) to identify and exclude unrealistic simulations. We acknowledge the reviewer's suggestion to treat equivalence as a continuous outcome. However, implementing a sliding scale approach does not provide a final decision and presents challenges due to the diverse scales and distributions of DCs and we have to define a strategy for classification beforehand. Deciding based on how many times a synthetic dataset from an unrelated template passes an equivalence test could behave unintendedly if all synthetic data sets are unrealistic. Thus, we see no clear benefit and a further cutoff has to be defined. Nevertheless, if the reviewer prefers such an option, we are open to implement this (e.g. by defining the equivalence region from the 10% and 90% quantiles of the DC difference to unrelated templates). 
In response to all reviewers’ feedback, we have refined our set of DCs by eliminating redundancies and incorporating measures of alpha and beta diversity. To further address the reviewer's concerns, we will include a visualization by plotting the number of non-equivalent characteristics against the size of the equivalence region for comparing simulated data with the respective templates and comparing with unrelated templates. This will illustrate the relationship between equivalence thresholds and the proportion of DCs deemed equivalent / non-equivalent, offering deeper insight into the sensitivity of our equivalence testing procedure. We hope that these clarifications and proposed modifications adequately address the reviewer's concerns and enhance our study design. In figure 2 and in the section "Exclusion of simulations for a specific data template based on deviating data properties" it is suggested that some data characteristics may be too sensitive to include during equivalency testing, however, there are no guidelines to the best of my interpretation of the protocol to determine when a data characteristic would be too sensitive that it must be disregarded. We apologize for not providing a clear definition earlier. We now define data characteristics more precisely: we will only consider those that pass equivalence tests for at least 50% of the templates. Minor comments: Some tools included within this study may have undergone significant updates since the initial publication of Nearing et al., 2022. This should be considered in the analysis as even slight differences may give rise to discordant results. As an aside to this the authors may want to consider including more recently developed methods in their study such as LOCOM, ANCOM-BC2 or radEmu. This would benefit the microbiome community. As written in our study protocol (section 9), we’ll apply the latest versions of the DA methods since this seems most valuable. 
We are aware that updates of the DA methods may be responsible for failed validations. However, we see assessment and confirmation using the most recent tools as more important than preventing the risk that disagreement might originate from improved DA tools. Since a key feature of our study is to strictly perform validations following a strict methodology, we do not want to mix validation with evaluation of other tests. This will be investigated in a subsequent study where we focus on sensitivities and specificities. In the revised version of the protocol, we mention this aspect and better emphasize our overall strategy. We emphasize this strategy in the revised version of the study protocol at several points and use wordings like “validation with the most recent implementations/versions”. In some cases, the hypotheses they define for aims 1 and 2 should be reviewed for clarity. I have listed some examples below. The authors mention ANCOM-BC however, this DA tool was not considered in Nearing et al., 2022. It is possible that this was a typo, and the authors were suggesting ANCOM-II, however, it is important to clarify this as ANCOM-II and ANCOM-BC are highly different methods. We thank the reviewer for this note. Upon reflection, we recognize that we indeed planned to use a different ANCOM method than the one employed in Nearing et al. They utilized a custom script downloaded “on May 11, 2020, from the FrederickHuangLin/ANCOM” repository on GitHub. We will use this script for our analysis to be as consistent as possible with the reference study. Hypothesis 8 in aim 1 should be adjusted as the shapes were similar except for the limma voom tools. Thank you for pointing this out. We changed the hypothesis accordingly. Hypothesis 4 in aim 2 should specify what Wilcoxon method is being referred to. We checked all hypotheses and clearly indicated which method is being referred to. 
Hypothesis 9 in aim 2 suggests that in the original study LEFSE had results where more than 99% of features in a dataset were significant, however, I do not think this was found in Nearing et al., 2022. Although it was identified to be on the more sensitive side of methods tested. Thank you for pointing that out. We changed the hypothesis to be in line with the results from the reference paper. Is the rationale for, and objectives of, the study clearly described? Partly We acknowledge that the rationale and objectives of the study may not have been communicated clearly. To clarify this, we changed the abstract. To summarize, this study is essentially a validation study, focusing on evaluating the feasibility of using synthetic data to resemble experimental data and whether we replicate findings from the reference study based on multiple simulated data sets. Additionally, it is a showcase how to incorporate study protocols in computational studies and to rigorously plan a benchmark study before actually conducting it. This study serves as groundwork for a subsequent benchmark study, which will include a broader range of data sets for performance testing. Is the study design appropriate for the research question? Partly We agree that simulated data has limitations in fully validating previous findings due to inherent differences from experimental data. To address this, we calculate a comprehensive set of data characteristics, perform equivalence tests, and select only simulated data that does not show significant disagreement with experimental data. In our view, this approach is the best we can do to minimize the differences and ensure comparability between simulated and experimental datasets. Moreover, applying the presented equivalence testing procedure represents a novel methodological improvement compared to existing approaches. Are sufficient details of the methods provided to allow replication by others? 
Partly In the revised version we better explain especially that data generation mechanism, by including more details for the simulation process. All data sets, experimental and generated, as well as all source code to run the study will be made available once the paper describing the results will undergo review process. Are the datasets clearly presented in a useable and accessible format? Yes Competing Interests: No competing interests were disclosed. Close Report a concern COMMENT ON THIS REPORT Views 0 Cite How to cite this report: Lahti L and Pelto J. Reviewer Report For: Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data [version 2; peer review: 2 approved] . F1000Research 2025, 13 :1180 ( https://doi.org/10.5256/f1000research.170374.r340429 ) The direct URL for this report is: https://f1000research.com/articles/13-1180/v1#referee-response-340429 NOTE: it is important to ensure the information in square brackets after the title is included in this citation. Close Copy Citation Details Reviewer Report 27 Nov 2024 Leo Lahti , University of Turku, Turku, Finland Juho Pelto , Department of Computing, University of Turku, Turku, Finland Approved with Reservations VIEWS 0 https://doi.org/10.5256/f1000research.170374.r340429 This manuscript suggests a (computational) protocol for validating differential abundance tests. The main question is whether synthetic data, generated to mimic experimental data, can validate the findings from differential abundance tests for 16S microbiome sequencing data. Specifically, the study aims ... Continue reading READ ALL This manuscript suggests a (computational) protocol for validating differential abundance tests. The main question is whether synthetic data, generated to mimic experimental data, can validate the findings from differential abundance tests for 16S microbiome sequencing data. 
Specifically, the study aims to assess how well synthetic datasets replicate the characteristics of real data and whether the conclusions from real data on the performance of DA tests can be confirmed with synthetic data. In order to evaluate such a protocol, the authors created synthetic data that would resemble the 16S rRNA gene sequencing data in the 38 datasets employed in the study by Nearing et al. (2022). They mimic real data with synthetic data using metaSPARSim and sparseDOSSA2 with specific adjustments. They use 43 metrics to evaluate the correspondence between the synthetic and real data. The study complies with the SPIRIT guidelines for clinical trial protocols. Second, the datasets that resemble the original data sufficiently well were used to replicate results of differential abundance analysis (DAA) method comparison by Nearing et al. The language is clear but the protocol has a vast number of steps, making it hard to separate the key contributions and performance assessment from the routine protocol checkpoints. Major remarks: 1) In addition to using metaSPARSim and sparseDOSSA2, consider including more recent methods such as SimMSeq (Yang and Chen, 2022) and SIMBA (Wirbel et al., 2024). 2) The authors use 43 metrics to assess the correspondence between synthetic and real data. The number of metrics seems quite high, and it may thus be difficult to describe these resemblances. Would it be possible to identify a smaller number of key metrics that would grasp the most essential aspects of the data? 3) The motivation of Aim 2 seems unclear. The authors state in the introduction, synthetic data can be used to evaluate the correctness of DAA results. However, the authors suggest to use synthetic data to replicate the results of Nearing et al. (Aim 2). Why synthetic data is not used to directly evaluate the correctness of the DAA results? 
Replication of Nearing’s results to evaluate the validity of the synthetic data seems a rather indirect means to assess the performance of the DAA methods. 4) The study is about 16S (according to the title and descriptions) but Table 2, Aim 1 discusses metagenome simulation tools. The 16S and metagenome are different approaches for characterizing microbial communities, although there are some overlaps. It would require further clarification why a main Table switches to metagenomics in a 16S paper. 5) Availability of materials: the study is based on an open data set that is available. Availability of the source code for the experiments and results would be critical for this kind of study. I could not locate these from the manuscript. Instead, there is a text "Generated data, analysis scripts, results and supplemental information to this study will also be stored in the Fredato research data management. In case of unexpected technical limitations, we will make data, analysis scripts and supplemental information available via https://figshare.com". These materials should be made available already at the review stage, otherwise a full review is not possible. Minor notes: - Real data sets are inherently different from synthetic ones. If the underlying data distributions differ significantly between synthetic and experimental datasets, the statistical tests may yield misleading results. Some discussion could be added on this and other potential limitations of using synthetic data for methods validation. - Possibly out of scope for the protocol study but in case all the compared data generation methods fail to produce realistic data can the authors consider suggesting their own solution to generate realistic 16S data or expand discussion on this? - Could you consider using other similar measures to PCA, for instance, principal coordinate analysis based on different dissimilarity metrics (e.g. 
Jaccard and Bray-Curtis distances) to evaluate the robustness of the ordination comparison? - Table 2: " stasticial analyses" -> "statistical analyses"? Is the rationale for, and objectives of, the study clearly described? Partly Is the study design appropriate for the research question? Partly Are sufficient details of the methods provided to allow replication by others? Partly Are the datasets clearly presented in a useable and accessible format? Yes References 1. Yang L, Chen J: A comprehensive evaluation of microbial differential abundance analysis methods: current status and potential solutions. Microbiome . 2022; 10 (1): 130 PubMed Abstract | Publisher Full Text 2. Wirbel J, Essex M, Forslund SK, Zeller G: A realistic benchmark for differential abundance testing and confounder adjustment in human microbiome studies. Genome Biol . 2024; 25 (1): 247 PubMed Abstract | Publisher Full Text Competing Interests: No competing interests were disclosed. Reviewer Expertise: bioinformatics, microbiome, data analysis We confirm that we have read this submission and believe that we have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however we have significant reservations, as outlined above. Close READ LESS CITE CITE HOW TO CITE THIS REPORT Lahti L and Pelto J. Reviewer Report For: Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data [version 2; peer review: 2 approved] . F1000Research 2025, 13 :1180 ( https://doi.org/10.5256/f1000research.170374.r340429 ) The direct URL for this report is: https://f1000research.com/articles/13-1180/v1#referee-response-340429 NOTE: it is important to ensure the information in square brackets after the title is included in all citations of this article. 
COPY CITATION DETAILS Report a concern Author Response 02 Jan 2025 Eva Kohnert , Medical Center-University of Freiburg, Freiburg, Germany 02 Jan 2025 Author Response General remark: The language is clear but the protocol has a vast number of steps, making it hard to separate the key contributions and performance assessment from the routine protocol ... Continue reading General remark: The language is clear but the protocol has a vast number of steps, making it hard to separate the key contributions and performance assessment from the routine protocol checkpoints. We appreciate the reviewer’s feedback regarding the structure and clarity of the protocol. Since this is our first attempt at drafting a study protocol for a computational benchmark study, we adhered strictly to the SPIRIT guidelines. This allowed us to explore the applicability of these guidelines, originally designed for clinical research, and how they could be effectively adapted to computational studies in the future. While we acknowledge that this approach may have resulted in a protocol with numerous steps, we believe this provides a comprehensive foundation for reproducibility and transparency in computational benchmarking. For future benchmark studies, we plan to refine the protocol structure, focusing on clearly delineating key contributions and performance assessments from routine protocol checkpoints. Major remarks: 1) In addition to using metaSPARSim and sparseDOSSA2, consider including more recent methods such as SimMSeq (Yang and Chen, 2022) and SIMBA (Wirbel et al., 2024). We appreciate the reviewer’s suggestion to consider additional simulation tools such as SimMSeq and SIMBA. 
While we recognize the potential value of these tools, there are several reasons why we chose not to include additional simulation methods in this study: First, the current study is already highly complex, incorporating two simulation tools, comprehensive characterization of various data sets, multiple inclusion/exclusion criteria and extensive testing. Adding further tools would further increase the study’s complexity, potentially detracting from the clarity of our findings. Secondly, as stated in the protocol the computational effects are huge. Therefore, we defined a runtime threshold of max. 1 hour per differential abundance test. We are calculating 14 differential abundance tests across 10 + 1 (simulation + experimental template) datasets for each of the 38 templates and for unfiltered and filtered data, leading to a total of 11,704 combinations. In a worst case scenario, the runtime on a single core would already require approximately 488 days per simulation tool only for conducting DA analyses. Adding additional simulation tools would further inflate runtime, particularly given the additional time required to calibrate the simulators and generate synthetic datasets. Lastly, one of the key aims of this study is to introduce and emphasize the importance of formulating a study protocol specifically for computational research. This is the first attempt to define a structured protocol for such studies, and while it may not yet represent the most refined version, we think it is an important step towards robust and transparent computational benchmark studies. Developing this protocol required substantial effort, and we believe that establishing and demonstrating the concept of a protocol in computational studies is as important as the study results themselves. In the new version of the protocol, we briefly mention these aspects. 2) The authors use 43 metrics to assess the correspondence between synthetic and real data. 
The number of metrics seems quite high, and it may thus be difficult describe these resemblances. Would it be possible to identify a smaller number of key metrics that would grasp the most essential aspects of the data? We agree with the reviewer that a smaller set of metrics providing independent information would be beneficial. However, we aim to retain characteristics that might highlight discrepancies between simulated and experimental data and help explain potential disagreements in DAA outcomes. Consequently, we decided to eliminate metrics that are nearly redundant by iteratively removing those with a rank correlation ≥ 0.95 and adjusted our study protocol accordingly. 3) The motivation of Aim 2 seems unclear. The authors state in the introduction, synthetic data can be used to evaluate the correctness of DAA results. However, the authors suggest to use synthetic data to replicate the results Nearing et al. (Aim 2). Why synthetic data is not used to directly evaluate the correctness of the DAA results? Replication of Nearing’s results to evaluate the validity of the synthetic data seems a rather indirect means to assess the performance of the DAA methods. Given this feedback, we realized that the motivation of Aim 2 has not been communicated clearly. This study originated as part of the planning process for a more classical benchmark study using the underlying truth of synthetic data for assessing DA methods. While designing the benchmark study, we recognized the need to first assess the feasibility of generating synthetic data that realistically resemble all characteristics of experimental data, to ensure the validity of conclusions drawn from simulated data. This led us to develop the presented validation study with primary goal to compare the results based on synthetic data to those from the reference study. 
This study is going to be followed by a subsequent benchmark study, in which the known truth in the simulated data sets will be used for performance testing, and the dependence on characteristics such as effect size, sample size etc. will be systematically evaluated. We addressed this issue by updating the abstract and the description in section “study setting”, making it clearer that the focus of the presented study is in validating previous findings, and not in trying to estimate sensitivities and specificities which are planned as a subsequent project. The methodological advancement lies not only in incorporating synthetic data for validation but also in the development of a comprehensive study protocol. 4) The study is about 16S (according to the title and descriptions) but Table 2, Aim 1 discusses metagenome simulation tools. The 16S and metagenome are different approaches for characterizing microbial communities, although there are some overlaps. It would require further clarification why a main Table switches to metagenomics in a 16S paper. Thank you for pointing out that we were not precise in our terminology. In this study we only focus on 16S rRNA sequencing data. We have reviewed the manuscript and changed it accordingly. 5) Availability of materials: the study is based on an open data set that is available. Availability of the source code for the experiments and results would be critical for this kind of study. I could not locate these from the manuscript. Instead, there is a text "Generated data, analysis scripts, results and supplemental information to this study will also be stored in the Fredato research data management. In case of unexpected technical limitations, we will make data, analysis scripts and supplemental information available via https://figshare.com". These materials should be made available already at the review stage, otherwise a full review is not possible. 
The source code for this study, generated data and results will be made available at the review stage of the paper summarizing the results. At this point the source code is not complete yet, as we are only conducting the study once the protocol is accepted. Minor notes: - Real data sets are inherently different from synthetic ones. If the underlying data distributions differ significantly between synthetic and experimental datasets, the statistical tests may yield misleading results. Some discussion could be added on this and other potential limitations of using synthetic data for methods validation. According to the reviewer’s suggestion, we added a brief discussion of this aspect to the study protocol (Discussion section). A more detailed discussion will be included in the paper presenting the results of the study. - Possibly out of scope for the protocol study but in case all the compared data generation methods fail to produce realistic data can the authors consider suggesting their own solution to generate realistic 16S data or expand discussion on this? We deliberately chose not to develop our own simulation tool in this study, as we clearly want to separate benchmarking from methods development to prevent bias. However, to maximize the realism of the synthetic data generated by the tools at hand, we already suggested to introduce two adaptations: adding zeros to the data to achieve realistic levels of sparsity and adjusting the number of differentially abundant features to better capture true effect sizes. - Could you consider using other similar measures to PCA, for instance, principal coordinate analysis based on different dissimilarity metrics (e.g. Jaccard and Bray-Curtis distances) to evaluate the robustness of the ordination comparison? The PCA visualization we proposed will not be conducted on the count matrices, but on the characteristics calculated from those count matrices. 
Since it would be unusual to use the Bray-Curtis distance as a similarity measure for data characteristics and visualize it by PCoA, especially across multiple experimental datasets with differing taxonomies, we decided to incorporate the average Bray-Curtis distance between all sample pairs as an additional data property. This approach aids in assessing whether simulated count matrices are realistic. - Table 2: " stasticial analyses" -> "statistical analyses"? Thank you, we have corrected this typo. Is the rationale for, and objectives of, the study clearly described? Partly We acknowledge that the rationale and objectives of the study may not have been communicated clearly. To clarify this, we changed the abstract. To summarize, this study is essentially a validation study, focusing on evaluating the feasibility of using synthetic data to resemble experimental data and whether we replicate findings from the reference study based on multiple simulated data sets. Additionally, it is a showcase how to incorporate study protocols in computational studies and to rigorously plan a benchmark study before actually conducting it. This study serves as groundwork for a subsequent benchmark study, which will include a broader range of data sets for performance testing. Is the study design appropriate for the research question? Partly We agree that simulated data has limitations in fully validating previous findings due to inherent differences from experimental data. To address this, we calculate a comprehensive set of data characteristics, perform equivalence tests, and select only simulated data that does not show significant disagreement with experimental data. In our view, this approach is the best we can do to minimize the differences and ensure comparability between simulated and experimental datasets. Moreover, applying the presented equivalence testing procedure represents a novel methodological improvement compared to existing approaches. 
Are sufficient details of the methods provided to allow replication by others? Partly In the revised version we better explain especially that data generation mechanism, by including more details for the simulation process. All data sets, experimental and generated, as well as all source code to run the study will be made available once the paper describing the results will undergo review process. Are the datasets clearly presented in a useable and accessible format? Yes General remark: The language is clear but the protocol has a vast number of steps, making it hard to separate the key contributions and performance assessment from the routine protocol checkpoints. We appreciate the reviewer’s feedback regarding the structure and clarity of the protocol. Since this is our first attempt at drafting a study protocol for a computational benchmark study, we adhered strictly to the SPIRIT guidelines. This allowed us to explore the applicability of these guidelines, originally designed for clinical research, and how they could be effectively adapted to computational studies in the future. While we acknowledge that this approach may have resulted in a protocol with numerous steps, we believe this provides a comprehensive foundation for reproducibility and transparency in computational benchmarking. For future benchmark studies, we plan to refine the protocol structure, focusing on clearly delineating key contributions and performance assessments from routine protocol checkpoints. Major remarks: 1) In addition to using metaSPARSim and sparseDOSSA2, consider including more recent methods such as SimMSeq (Yang and Chen, 2022) and SIMBA (Wirbel et al., 2024). We appreciate the reviewer’s suggestion to consider additional simulation tools such as SimMSeq and SIMBA. 
While we recognize the potential value of these tools, there are several reasons why we chose not to include additional simulation methods in this study: First, the current study is already highly complex, incorporating two simulation tools, comprehensive characterization of various data sets, multiple inclusion/exclusion criteria and extensive testing. Adding further tools would further increase the study’s complexity, potentially detracting from the clarity of our findings. Secondly, as stated in the protocol the computational effects are huge. Therefore, we defined a runtime threshold of max. 1 hour per differential abundance test. We are calculating 14 differential abundance tests across 10 + 1 (simulation + experimental template) datasets for each of the 38 templates and for unfiltered and filtered data, leading to a total of 11,704 combinations. In a worst case scenario, the runtime on a single core would already require approximately 488 days per simulation tool only for conducting DA analyses. Adding additional simulation tools would further inflate runtime, particularly given the additional time required to calibrate the simulators and generate synthetic datasets. Lastly, one of the key aims of this study is to introduce and emphasize the importance of formulating a study protocol specifically for computational research. This is the first attempt to define a structured protocol for such studies, and while it may not yet represent the most refined version, we think it is an important step towards robust and transparent computational benchmark studies. Developing this protocol required substantial effort, and we believe that establishing and demonstrating the concept of a protocol in computational studies is as important as the study results themselves. In the new version of the protocol, we briefly mention these aspects. 2) The authors use 43 metrics to assess the correspondence between synthetic and real data. 
The number of metrics seems quite high, and it may thus be difficult describe these resemblances. Would it be possible to identify a smaller number of key metrics that would grasp the most essential aspects of the data? We agree with the reviewer that a smaller set of metrics providing independent information would be beneficial. However, we aim to retain characteristics that might highlight discrepancies between simulated and experimental data and help explain potential disagreements in DAA outcomes. Consequently, we decided to eliminate metrics that are nearly redundant by iteratively removing those with a rank correlation ≥ 0.95 and adjusted our study protocol accordingly. 3) The motivation of Aim 2 seems unclear. The authors state in the introduction, synthetic data can be used to evaluate the correctness of DAA results. However, the authors suggest to use synthetic data to replicate the results Nearing et al. (Aim 2). Why synthetic data is not used to directly evaluate the correctness of the DAA results? Replication of Nearing’s results to evaluate the validity of the synthetic data seems a rather indirect means to assess the performance of the DAA methods. Given this feedback, we realized that the motivation of Aim 2 has not been communicated clearly. This study originated as part of the planning process for a more classical benchmark study using the underlying truth of synthetic data for assessing DA methods. While designing the benchmark study, we recognized the need to first assess the feasibility of generating synthetic data that realistically resemble all characteristics of experimental data, to ensure the validity of conclusions drawn from simulated data. This led us to develop the presented validation study with primary goal to compare the results based on synthetic data to those from the reference study. 
This study is going to be followed by a subsequent benchmark study, in which the known truth in the simulated data sets will be used for performance testing, and the dependence on characteristics such as effect size, sample size etc. will be systematically evaluated. We addressed this issue by updating the abstract and the description in section “study setting”, making it clearer that the focus of the presented study is in validating previous findings, and not in trying to estimate sensitivities and specificities which are planned as a subsequent project. The methodological advancement lies not only in incorporating synthetic data for validation but also in the development of a comprehensive study protocol. 4) The study is about 16S (according to the title and descriptions) but Table 2, Aim 1 discusses metagenome simulation tools. The 16S and metagenome are different approaches for characterizing microbial communities, although there are some overlaps. It would require further clarification why a main Table switches to metagenomics in a 16S paper. Thank you for pointing out that we were not precise in our terminology. In this study we only focus on 16S rRNA sequencing data. We have reviewed the manuscript and changed it accordingly. 5) Availability of materials: the study is based on an open data set that is available. Availability of the source code for the experiments and results would be critical for this kind of study. I could not locate these from the manuscript. Instead, there is a text "Generated data, analysis scripts, results and supplemental information to this study will also be stored in the Fredato research data management. In case of unexpected technical limitations, we will make data, analysis scripts and supplemental information available via https://figshare.com". These materials should be made available already at the review stage, otherwise a full review is not possible. 
The source code for this study, generated data and results will be made available at the review stage of the paper summarizing the results. At this point the source code is not complete yet, as we are only conducting the study once the protocol is accepted. Minor notes: - Real data sets are inherently different from synthetic ones. If the underlying data distributions differ significantly between synthetic and experimental datasets, the statistical tests may yield misleading results. Some discussion could be added on this and other potential limitations of using synthetic data for methods validation. According to the reviewer’s suggestion, we added a brief discussion of this aspect to the study protocol (Discussion section). A more detailed discussion will be included in the paper presenting the results of the study. - Possibly out of scope for the protocol study but in case all the compared data generation methods fail to produce realistic data can the authors consider suggesting their own solution to generate realistic 16S data or expand discussion on this? We deliberately chose not to develop our own simulation tool in this study, as we clearly want to separate benchmarking from methods development to prevent bias. However, to maximize the realism of the synthetic data generated by the tools at hand, we already suggested to introduce two adaptations: adding zeros to the data to achieve realistic levels of sparsity and adjusting the number of differentially abundant features to better capture true effect sizes. - Could you consider using other similar measures to PCA, for instance, principal coordinate analysis based on different dissimilarity metrics (e.g. Jaccard and Bray-Curtis distances) to evaluate the robustness of the ordination comparison? The PCA visualization we proposed will not be conducted on the count matrices, but on the characteristics calculated from those count matrices. 
Since it would be unusual to use the Bray-Curtis distance as a similarity measure for data characteristics and visualize it by PCoA, especially across multiple experimental datasets with differing taxonomies, we decided to incorporate the average Bray-Curtis distance between all sample pairs as an additional data property. This approach aids in assessing whether simulated count matrices are realistic. - Table 2: " stasticial analyses" -> "statistical analyses"? Thank you, we have corrected this typo. Is the rationale for, and objectives of, the study clearly described? Partly We acknowledge that the rationale and objectives of the study may not have been communicated clearly. To clarify this, we changed the abstract. To summarize, this study is essentially a validation study, focusing on evaluating the feasibility of using synthetic data to resemble experimental data and whether we replicate findings from the reference study based on multiple simulated data sets. Additionally, it is a showcase of how to incorporate study protocols in computational studies and to rigorously plan a benchmark study before actually conducting it. This study serves as groundwork for a subsequent benchmark study, which will include a broader range of data sets for performance testing. Is the study design appropriate for the research question? Partly We agree that simulated data has limitations in fully validating previous findings due to inherent differences from experimental data. To address this, we calculate a comprehensive set of data characteristics, perform equivalence tests, and select only simulated data that does not show significant disagreement with experimental data. In our view, this approach is the best we can do to minimize the differences and ensure comparability between simulated and experimental datasets. Moreover, applying the presented equivalence testing procedure represents a novel methodological improvement compared to existing approaches. 
Are sufficient details of the methods provided to allow replication by others? Partly In the revised version we better explain especially the data generation mechanism, by including more details for the simulation process. All data sets, experimental and generated, as well as all source code to run the study will be made available once the paper describing the results undergoes the review process. Are the datasets clearly presented in a useable and accessible format? Yes Competing Interests: No competing interests were disclosed. Close Report a concern Respond or Comment COMMENTS ON THIS REPORT Author Response 02 Jan 2025 Eva Kohnert , Medical Center-University of Freiburg, Freiburg, Germany 02 Jan 2025 Author Response General remark: The language is clear but the protocol has a vast number of steps, making it hard to separate the key contributions and performance assessment from the routine protocol ... Continue reading General remark: The language is clear but the protocol has a vast number of steps, making it hard to separate the key contributions and performance assessment from the routine protocol checkpoints. We appreciate the reviewer’s feedback regarding the structure and clarity of the protocol. Since this is our first attempt at drafting a study protocol for a computational benchmark study, we adhered strictly to the SPIRIT guidelines. This allowed us to explore the applicability of these guidelines, originally designed for clinical research, and how they could be effectively adapted to computational studies in the future. While we acknowledge that this approach may have resulted in a protocol with numerous steps, we believe this provides a comprehensive foundation for reproducibility and transparency in computational benchmarking. For future benchmark studies, we plan to refine the protocol structure, focusing on clearly delineating key contributions and performance assessments from routine protocol checkpoints. 
Major remarks: 1) In addition to using metaSPARSim and sparseDOSSA2, consider including more recent methods such as SimMSeq (Yang and Chen, 2022) and SIMBA (Wirbel et al., 2024). We appreciate the reviewer’s suggestion to consider additional simulation tools such as SimMSeq and SIMBA. While we recognize the potential value of these tools, there are several reasons why we chose not to include additional simulation methods in this study: First, the current study is already highly complex, incorporating two simulation tools, comprehensive characterization of various data sets, multiple inclusion/exclusion criteria and extensive testing. Adding further tools would further increase the study’s complexity, potentially detracting from the clarity of our findings. Secondly, as stated in the protocol, the computational effort is huge. Therefore, we defined a runtime threshold of max. 1 hour per differential abundance test. We are calculating 14 differential abundance tests across 10 + 1 (simulation + experimental template) datasets for each of the 38 templates and for unfiltered and filtered data, leading to a total of 11,704 combinations. In a worst-case scenario, the runtime on a single core would already require approximately 488 days per simulation tool only for conducting DA analyses. Adding additional simulation tools would further inflate runtime, particularly given the additional time required to calibrate the simulators and generate synthetic datasets. Lastly, one of the key aims of this study is to introduce and emphasize the importance of formulating a study protocol specifically for computational research. This is the first attempt to define a structured protocol for such studies, and while it may not yet represent the most refined version, we think it is an important step towards robust and transparent computational benchmark studies. 
Developing this protocol required substantial effort, and we believe that establishing and demonstrating the concept of a protocol in computational studies is as important as the study results themselves. In the new version of the protocol, we briefly mention these aspects. 2) The authors use 43 metrics to assess the correspondence between synthetic and real data. The number of metrics seems quite high, and it may thus be difficult to describe these resemblances. Would it be possible to identify a smaller number of key metrics that would grasp the most essential aspects of the data? We agree with the reviewer that a smaller set of metrics providing independent information would be beneficial. However, we aim to retain characteristics that might highlight discrepancies between simulated and experimental data and help explain potential disagreements in DAA outcomes. Consequently, we decided to eliminate metrics that are nearly redundant by iteratively removing those with a rank correlation ≥ 0.95 and adjusted our study protocol accordingly. 3) The motivation of Aim 2 seems unclear. The authors state in the introduction, synthetic data can be used to evaluate the correctness of DAA results. However, the authors suggest to use synthetic data to replicate the results of Nearing et al. (Aim 2). Why is synthetic data not used to directly evaluate the correctness of the DAA results? Replication of Nearing’s results to evaluate the validity of the synthetic data seems a rather indirect means to assess the performance of the DAA methods. Given this feedback, we realized that the motivation of Aim 2 has not been communicated clearly. This study originated as part of the planning process for a more classical benchmark study using the underlying truth of synthetic data for assessing DA methods. 
While designing the benchmark study, we recognized the need to first assess the feasibility of generating synthetic data that realistically resemble all characteristics of experimental data, to ensure the validity of conclusions drawn from simulated data. This led us to develop the presented validation study with the primary goal of comparing the results based on synthetic data to those from the reference study. This study is going to be followed by a subsequent benchmark study, in which the known truth in the simulated data sets will be used for performance testing, and the dependence on characteristics such as effect size, sample size etc. will be systematically evaluated. We addressed this issue by updating the abstract and the description in section “study setting”, making it clearer that the focus of the presented study is in validating previous findings, and not in trying to estimate sensitivities and specificities which are planned as a subsequent project. The methodological advancement lies not only in incorporating synthetic data for validation but also in the development of a comprehensive study protocol. 4) The study is about 16S (according to the title and descriptions) but Table 2, Aim 1 discusses metagenome simulation tools. The 16S and metagenome are different approaches for characterizing microbial communities, although there are some overlaps. It would require further clarification why a main Table switches to metagenomics in a 16S paper. Thank you for pointing out that we were not precise in our terminology. In this study we only focus on 16S rRNA sequencing data. We have reviewed the manuscript and changed it accordingly. 5) Availability of materials: the study is based on an open data set that is available. Availability of the source code for the experiments and results would be critical for this kind of study. I could not locate these from the manuscript. 
Instead, there is a text "Generated data, analysis scripts, results and supplemental information to this study will also be stored in the Fredato research data management. In case of unexpected technical limitations, we will make data, analysis scripts and supplemental information available via https://figshare.com". These materials should be made available already at the review stage, otherwise a full review is not possible. The source code for this study, generated data and results will be made available at the review stage of the paper summarizing the results. At this point the source code is not complete yet, as we are only conducting the study once the protocol is accepted. Minor notes: - Real data sets are inherently different from synthetic ones. If the underlying data distributions differ significantly between synthetic and experimental datasets, the statistical tests may yield misleading results. Some discussion could be added on this and other potential limitations of using synthetic data for methods validation. According to the reviewer’s suggestion, we added a brief discussion of this aspect to the study protocol (Discussion section). A more detailed discussion will be included in the paper presenting the results of the study. - Possibly out of scope for the protocol study but in case all the compared data generation methods fail to produce realistic data can the authors consider suggesting their own solution to generate realistic 16S data or expand discussion on this? We deliberately chose not to develop our own simulation tool in this study, as we clearly want to separate benchmarking from methods development to prevent bias. However, to maximize the realism of the synthetic data generated by the tools at hand, we already suggested to introduce two adaptations: adding zeros to the data to achieve realistic levels of sparsity and adjusting the number of differentially abundant features to better capture true effect sizes. 
- Could you consider using other similar measures to PCA, for instance, principal coordinate analysis based on different dissimilarity metrics (e.g. Jaccard and Bray-Curtis distances) to evaluate the robustness of the ordination comparison? The PCA visualization we proposed will not be conducted on the count matrices, but on the characteristics calculated from those count matrices. Since it would be unusual to use the Bray-Curtis distance as a similarity measure for data characteristics and visualize it by PCoA, especially across multiple experimental datasets with differing taxonomies, we decided to incorporate the average Bray-Curtis distance between all sample pairs as an additional data property. This approach aids in assessing whether simulated count matrices are realistic. - Table 2: " stasticial analyses" -> "statistical analyses"? Thank you, we have corrected this typo. Is the rationale for, and objectives of, the study clearly described? Partly We acknowledge that the rationale and objectives of the study may not have been communicated clearly. To clarify this, we changed the abstract. To summarize, this study is essentially a validation study, focusing on evaluating the feasibility of using synthetic data to resemble experimental data and whether we replicate findings from the reference study based on multiple simulated data sets. Additionally, it is a showcase of how to incorporate study protocols in computational studies and to rigorously plan a benchmark study before actually conducting it. This study serves as groundwork for a subsequent benchmark study, which will include a broader range of data sets for performance testing. Is the study design appropriate for the research question? Partly We agree that simulated data has limitations in fully validating previous findings due to inherent differences from experimental data. 
To address this, we calculate a comprehensive set of data characteristics, perform equivalence tests, and select only simulated data that does not show significant disagreement with experimental data. In our view, this approach is the best we can do to minimize the differences and ensure comparability between simulated and experimental datasets. Moreover, applying the presented equivalence testing procedure represents a novel methodological improvement compared to existing approaches. Are sufficient details of the methods provided to allow replication by others? Partly In the revised version we better explain especially the data generation mechanism, by including more details for the simulation process. All data sets, experimental and generated, as well as all source code to run the study will be made available once the paper describing the results undergoes the review process. Are the datasets clearly presented in a useable and accessible format? Yes General remark: The language is clear but the protocol has a vast number of steps, making it hard to separate the key contributions and performance assessment from the routine protocol checkpoints. We appreciate the reviewer’s feedback regarding the structure and clarity of the protocol. Since this is our first attempt at drafting a study protocol for a computational benchmark study, we adhered strictly to the SPIRIT guidelines. This allowed us to explore the applicability of these guidelines, originally designed for clinical research, and how they could be effectively adapted to computational studies in the future. While we acknowledge that this approach may have resulted in a protocol with numerous steps, we believe this provides a comprehensive foundation for reproducibility and transparency in computational benchmarking. For future benchmark studies, we plan to refine the protocol structure, focusing on clearly delineating key contributions and performance assessments from routine protocol checkpoints. 
Major remarks: 1) In addition to using metaSPARSim and sparseDOSSA2, consider including more recent methods such as SimMSeq (Yang and Chen, 2022) and SIMBA (Wirbel et al., 2024). We appreciate the reviewer’s suggestion to consider additional simulation tools such as SimMSeq and SIMBA. While we recognize the potential value of these tools, there are several reasons why we chose not to include additional simulation methods in this study: First, the current study is already highly complex, incorporating two simulation tools, comprehensive characterization of various data sets, multiple inclusion/exclusion criteria and extensive testing. Adding further tools would further increase the study’s complexity, potentially detracting from the clarity of our findings. Secondly, as stated in the protocol, the computational effort is huge. Therefore, we defined a runtime threshold of max. 1 hour per differential abundance test. We are calculating 14 differential abundance tests across 10 + 1 (simulation + experimental template) datasets for each of the 38 templates and for unfiltered and filtered data, leading to a total of 11,704 combinations. In a worst-case scenario, the runtime on a single core would already require approximately 488 days per simulation tool only for conducting DA analyses. Adding additional simulation tools would further inflate runtime, particularly given the additional time required to calibrate the simulators and generate synthetic datasets. Lastly, one of the key aims of this study is to introduce and emphasize the importance of formulating a study protocol specifically for computational research. This is the first attempt to define a structured protocol for such studies, and while it may not yet represent the most refined version, we think it is an important step towards robust and transparent computational benchmark studies. 
Developing this protocol required substantial effort, and we believe that establishing and demonstrating the concept of a protocol in computational studies is as important as the study results themselves. In the new version of the protocol, we briefly mention these aspects. 2) The authors use 43 metrics to assess the correspondence between synthetic and real data. The number of metrics seems quite high, and it may thus be difficult to describe these resemblances. Would it be possible to identify a smaller number of key metrics that would grasp the most essential aspects of the data? We agree with the reviewer that a smaller set of metrics providing independent information would be beneficial. However, we aim to retain characteristics that might highlight discrepancies between simulated and experimental data and help explain potential disagreements in DAA outcomes. Consequently, we decided to eliminate metrics that are nearly redundant by iteratively removing those with a rank correlation ≥ 0.95 and adjusted our study protocol accordingly. 3) The motivation of Aim 2 seems unclear. The authors state in the introduction, synthetic data can be used to evaluate the correctness of DAA results. However, the authors suggest to use synthetic data to replicate the results of Nearing et al. (Aim 2). Why is synthetic data not used to directly evaluate the correctness of the DAA results? Replication of Nearing’s results to evaluate the validity of the synthetic data seems a rather indirect means to assess the performance of the DAA methods. Given this feedback, we realized that the motivation of Aim 2 has not been communicated clearly. This study originated as part of the planning process for a more classical benchmark study using the underlying truth of synthetic data for assessing DA methods. 
While designing the benchmark study, we recognized the need to first assess the feasibility of generating synthetic data that realistically resemble all characteristics of experimental data, to ensure the validity of conclusions drawn from simulated data. This led us to develop the presented validation study with the primary goal of comparing the results based on synthetic data to those from the reference study. This study is going to be followed by a subsequent benchmark study, in which the known truth in the simulated data sets will be used for performance testing, and the dependence on characteristics such as effect size, sample size etc. will be systematically evaluated. We addressed this issue by updating the abstract and the description in section “study setting”, making it clearer that the focus of the presented study is in validating previous findings, and not in trying to estimate sensitivities and specificities which are planned as a subsequent project. The methodological advancement lies not only in incorporating synthetic data for validation but also in the development of a comprehensive study protocol. 4) The study is about 16S (according to the title and descriptions) but Table 2, Aim 1 discusses metagenome simulation tools. The 16S and metagenome are different approaches for characterizing microbial communities, although there are some overlaps. It would require further clarification why a main Table switches to metagenomics in a 16S paper. Thank you for pointing out that we were not precise in our terminology. In this study we only focus on 16S rRNA sequencing data. We have reviewed the manuscript and changed it accordingly. 5) Availability of materials: the study is based on an open data set that is available. Availability of the source code for the experiments and results would be critical for this kind of study. I could not locate these from the manuscript. 
Instead, there is a text "Generated data, analysis scripts, results and supplemental information to this study will also be stored in the Fredato research data management. In case of unexpected technical limitations, we will make data, analysis scripts and supplemental information available via https://figshare.com". These materials should be made available already at the review stage, otherwise a full review is not possible. The source code for this study, generated data and results will be made available at the review stage of the paper summarizing the results. At this point the source code is not complete yet, as we are only conducting the study once the protocol is accepted. Minor notes: - Real data sets are inherently different from synthetic ones. If the underlying data distributions differ significantly between synthetic and experimental datasets, the statistical tests may yield misleading results. Some discussion could be added on this and other potential limitations of using synthetic data for methods validation. According to the reviewer’s suggestion, we added a brief discussion of this aspect to the study protocol (Discussion section). A more detailed discussion will be included in the paper presenting the results of the study. - Possibly out of scope for the protocol study but in case all the compared data generation methods fail to produce realistic data can the authors consider suggesting their own solution to generate realistic 16S data or expand discussion on this? We deliberately chose not to develop our own simulation tool in this study, as we clearly want to separate benchmarking from methods development to prevent bias. However, to maximize the realism of the synthetic data generated by the tools at hand, we already suggested to introduce two adaptations: adding zeros to the data to achieve realistic levels of sparsity and adjusting the number of differentially abundant features to better capture true effect sizes. 
- Could you consider using other similar measures to PCA, for instance, principal coordinate analysis based on different dissimilarity metrics (e.g. Jaccard and Bray-Curtis distances) to evaluate the robustness of the ordination comparison? The PCA visualization we proposed will not be conducted on the count matrices, but on the characteristics calculated from those count matrices. Since it would be unusual to use the Bray-Curtis distance as a similarity measure for data characteristics and visualize it by PCoA, especially across multiple experimental datasets with differing taxonomies, we decided to incorporate the average Bray-Curtis distance between all sample pairs as an additional data property. This approach aids in assessing whether simulated count matrices are realistic. - Table 2: " stasticial analyses" -> "statistical analyses"? Thank you, we have corrected this typo. Is the rationale for, and objectives of, the study clearly described? Partly We acknowledge that the rationale and objectives of the study may not have been communicated clearly. To clarify this, we changed the abstract. To summarize, this study is essentially a validation study, focusing on evaluating the feasibility of using synthetic data to resemble experimental data and whether we replicate findings from the reference study based on multiple simulated data sets. Additionally, it is a showcase of how to incorporate study protocols in computational studies and to rigorously plan a benchmark study before actually conducting it. This study serves as groundwork for a subsequent benchmark study, which will include a broader range of data sets for performance testing. Is the study design appropriate for the research question? Partly We agree that simulated data has limitations in fully validating previous findings due to inherent differences from experimental data. 
To address this, we calculate a comprehensive set of data characteristics, perform equivalence tests, and select only simulated data that does not show significant disagreement with experimental data. In our view, this approach is the best we can do to minimize the differences and ensure comparability between simulated and experimental datasets. Moreover, applying the presented equivalence testing procedure represents a novel methodological improvement compared to existing approaches. Are sufficient details of the methods provided to allow replication by others? Partly In the revised version we better explain especially the data generation mechanism, by including more details for the simulation process. All data sets, experimental and generated, as well as all source code to run the study will be made available once the paper describing the results undergoes the review process. Are the datasets clearly presented in a useable and accessible format? Yes Competing Interests: No competing interests were disclosed. Close Report a concern COMMENT ON THIS REPORT Comments on this article Comments (0) Version 2 VERSION 2 PUBLISHED 09 Oct 2024 ADD YOUR COMMENT Comment keyboard_arrow_left keyboard_arrow_right Open Peer Review Reviewer Status info_outline Alongside their report, reviewers assign a status to the article: Approved The paper is scientifically sound in its current form and only minor, if any, improvements are suggested Approved with reservations A number of small changes, sometimes more significant revisions are required to address specific details and improve the papers academic merit. Not approved Fundamental flaws in the paper seriously undermine the findings and conclusions Reviewer Reports Invited Reviewers 1 2 Version 2 (revision) 02 Jan 25 read read Version 1 09 Oct 24 read read Leo Lahti , University of Turku, Turku, Finland Juho Pelto , University of Turku, Turku, Finland Jacob T. Nearing , Harvard T.H. 
Chan School of Public Health, Boston, USA Comments on this article All Comments (0) Add a comment Sign up for content alerts Sign Up You are now signed up to receive this alert Browse by related subjects keyboard_arrow_left Back to all reports Reviewer Report 0 Views copyright © 2025 Nearing J. This is an open access peer review report distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. 23 Jan 2025 | for Version 2 Jacob T. Nearing , Biostatistics, Harvard T.H. Chan School of Public Health, Boston, MA, USA 0 Views copyright © 2025 Nearing J. This is an open access peer review report distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. format_quote Cite this report speaker_notes Responses (0) Approved info_outline Alongside their report, reviewers assign a status to the article: Approved The paper is scientifically sound in its current form and only minor, if any, improvements are suggested Approved with reservations A number of small changes, sometimes more significant revisions are required to address specific details and improve the papers academic merit. Not approved Fundamental flaws in the paper seriously undermine the findings and conclusions The authors have addressed all my comments in their revisions and clarified the manuscript's goals and objectives. I have no further comments to provide. Competing Interests No competing interests were disclosed. Reviewer Expertise Bioinformatics, microbiome I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard. reply Respond to this report Responses (0) Nearing JT. 
Peer Review Report For: Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data [version 2; peer review: 2 approved] . F1000Research 2025, 13 :1180 ( https://doi.org/10.5256/f1000research.176118.r355372) NOTE: it is important to ensure the information in square brackets after the title is included in this citation. The direct URL for this report is: https://f1000research.com/articles/13-1180/v2#referee-response-355372 keyboard_arrow_left Back to all reports Reviewer Report 0 Views copyright © 2025 Lahti L et al. This is an open access peer review report distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. 21 Jan 2025 | for Version 2 Leo Lahti , University of Turku, Turku, Finland Juho Pelto , Department of Computing, University of Turku, Turku, Finland 0 Views copyright © 2025 Lahti L et al. This is an open access peer review report distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. format_quote Cite this report speaker_notes Responses (0) Approved info_outline Alongside their report, reviewers assign a status to the article: Approved The paper is scientifically sound in its current form and only minor, if any, improvements are suggested Approved with reservations A number of small changes, sometimes more significant revisions are required to address specific details and improve the papers academic merit. Not approved Fundamental flaws in the paper seriously undermine the findings and conclusions The authors have responded to our comments satisfactorily. Most importantly, the rationale for and the objectives of the study are now more clearly expressed. 
However, in case that metaSPARSim and sparseDOSSA2 cannot produce (even after adjustments for sparsity and effect sizes) data sets that resemble the original 38 real data sets closely enough, it remains a limitation of the study that no other simulation tools are being used. While we understand that adding other simulation tools to the study would increase the run-time and complexity of the study, testing other tools could be considered at least conditionally, i.e. in case that metaSPARSim and sparseDOSSA2 cannot produce proper data. Competing Interests No competing interests were disclosed. Reviewer Expertise biostatistics We confirm that we have read this submission and believe that we have an appropriate level of expertise to confirm that it is of an acceptable scientific standard. reply Respond to this report Responses (0) Lahti L and Pelto J. Peer Review Report For: Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data [version 2; peer review: 2 approved] . F1000Research 2025, 13 :1180 ( https://doi.org/10.5256/f1000research.176118.r355371) NOTE: it is important to ensure the information in square brackets after the title is included in this citation. The direct URL for this report is: https://f1000research.com/articles/13-1180/v2#referee-response-355371 keyboard_arrow_left Back to all reports Reviewer Report 0 Views copyright © 2024 Nearing J. This is an open access peer review report distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. 05 Dec 2024 | for Version 1 Jacob T. Nearing , Biostatistics, Harvard T.H. Chan School of Public Health, Boston, MA, USA 0 Views copyright © 2024 Nearing J. 
This is an open access peer review report distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. format_quote Cite this report speaker_notes Responses (1) Approved With Reservations info_outline Alongside their report, reviewers assign a status to the article: Approved The paper is scientifically sound in its current form and only minor, if any, improvements are suggested Approved with reservations A number of small changes, sometimes more significant revisions are required to address specific details and improve the papers academic merit. Not approved Fundamental flaws in the paper seriously undermine the findings and conclusions In this protocol Kohnert and Kreutz design a study to assess microbiome differential abundance methods by replicating Nearing et al., 2022 using synthetic datasets generated from the original data using metaSPARSim and sparseDOSSA2. They intend to determine the similarity between the synthetic data generated and their underlying true datasets using 43 different data characteristics. Moreover, they hope to identify whether there are any driving characteristics in synthetic data that would result in discordant conclusions with those achieved by using the real datasets they are based upon. The authors identify that their protocol falls within SPIRIT guidelines and openly report all changes that were made to their report over its initial creation until now. Overall, the study is of interest to microbiome research, however, some areas require clarification, and the overall presentation of the protocol could be improved as major steps in some cases were not presented in order. Major comments: While I appreciate the authors' adherence to the SPIRIT guidelines, it made the ordering of the steps in this protocol hard to follow as these guidelines do not fully align with computational benchmarking studies. 
For example, under the section “Modification by adjusting the proportions of zeros and effect sizes” it was mentioned that the groups would be simulated separately, however, the exact simulation procedure (and parameters for metaSPARSim and sparseDOSSA2) was not fully discussed prior to this. Additional information on the code and parameters used in this simulation software (including version #) should be included. I suspect the authors may have difficulty fully recapturing all the true effect size differences between groups within the same study using synthetic simulated data. Given this I appreciated the in-depth discussion they gave, and the illustration presented in figure 2. However, I do believe that the study could benefit from treating equivalence as a sliding scale rather than a discrete category for each synthetic dataset based on an equivalence test for each data characteristic. It is possible that being equivalent in one characteristic will result in a much larger range than another and this should be taken into consideration during further data analysis. As an aside to this the authors should consider cases where standard deviations may become so large that almost any dataset may pass the equivalence test that it is subjected to (i.e. in figure 1 what happens if the upper and lower bounds cover almost the entire possible range of that data characteristic). In this case it may be reasonable to look at how many times a synthetic dataset from an unrelated template passes an equivalence test. (i.e. how many times does an unrelated dataset that shows very different DA results show equivalence and in what data characteristics does this occur?). 
In figure 2 and in the section "Exclusion of simulations for a specific data template based on deviating data properties" it is suggested that some data characteristics may be too sensitive to include during equivalency testing, however, there are no guidelines, to the best of my interpretation of the protocol, to determine when a data characteristic would be so sensitive that it must be disregarded. Minor comments: Some tools included within this study may have undergone significant updates since the initial publication of Nearing et al., 2022. This should be considered in the analysis as even slight differences may give rise to discordant results. As an aside to this the authors may want to consider including more recently developed methods in their study such as LOCOM, ANCOM-BC2 or radEmu. This would benefit the microbiome community. In some cases, the hypothesis they define for aims 1 and 2 should be reviewed for clarity. I have listed some examples below. The authors mention ANCOM-BC however, this DA tool was not considered in Nearing et al., 2022. It is possible that this was a typo, and the authors were suggesting ANCOM-II, however, it is important to clarify this as ANCOM-II and ANCOM-BC are highly different methods. Hypothesis 8 in aim 1 should be adjusted as the shapes were similar except for the limma voom tools. Hypothesis 4 in aim 2 should specify what Wilcoxon method is being referred to. Hypothesis 9 in aim 2 suggests that in the original study LEFSE had results where more than 99% of features in a dataset were significant, however, I do not think this was found in Nearing et al., 2022. Although it was identified to be on the more sensitive side of methods tested. Is the rationale for, and objectives of, the study clearly described? Partly Is the study design appropriate for the research question? Partly Are sufficient details of the methods provided to allow replication by others? 
Partly Are the datasets clearly presented in a useable and accessible format? Yes Competing Interests No competing interests were disclosed. Reviewer Expertise Bioinformatics, microbiome, I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however I have significant reservations, as outlined above. reply Respond to this report Responses (1) Author Response 02 Jan 2025 Eva Kohnert, Medical Center-University of Freiburg, Freiburg, Germany Major comments: While I appreciate the authors' adherence to the SPIRIT guidelines, it made the ordering of the steps in this protocol hard to follow as these guidelines do not fully align with computational benchmarking studies. For example, under the section “Modification by adjusting the proportions of zeros and effect sizes” it was mentioned that the groups would be simulated separately, however, the exact simulation procedure (and parameters for metaSPARSim and sparseDOSSA2) was not fully discussed prior to this. Additional information on the code and parameters used in this simulation software (including version #) should be included. In the revised version of our study protocol, we describe in more detail how the simulated data is generated. We also added version numbers for the two simulation packages. All package versions will be provided when the study is accomplished. We also added a reference to paragraph “Modification by adjusting the proportions of zeros and effect sizes” in section 11b where it is described how the simulation procedure is changed to adapt the effect size. I suspect the authors may have difficulty fully recapturing all the true effect size differences between groups within the same study using synthetic simulated data. Given this I appreciated the in-depth discussion they gave, and the illustration presented in figure 2. 
However, I do believe that the study could benefit from treating equivalence as a sliding scale rather than a discrete category for each synthetic dataset based on an equivalence test for each data characteristic. It is possible that being equivalent in one characteristic will result in a much larger range than another and this should be taken into consideration during further data analysis. As an aside to this the authors should consider cases where standard deviations may become so large that almost any dataset may pass the equivalence test that it is subjected to (i.e. in figure 1 what happens if the upper and lower bounds cover almost the entire possible range of that data characteristic). In this case it may be reasonable to look at how many times a synthetic dataset from an unrelated template passes an equivalence test. (i.e. how many times does an unrelated dataset that shows very different DA results show equivalence and in what data characteristics does this occur?). To address these concerns, we first want to emphasize that each data characteristic (DC) is computed independently for both experimental and simulated count matrices. The scales of different DCs are inherently incomparable; for instance, the proportion of zeros ranges between 0 and 1, while the number of features varies from 327 to 59,736. Thus, the vertical axis in Fig. X is in arbitrary units and +/- 1 SD always covers approx. 68% of the points from the exp. templates. Using the SD over the DCs computed from all exp. templates for defining the equivalence region can be seen as a normalization of the DCs to make them comparable. Our current equivalence testing yields a binary outcome (equivalent or not) to identify and exclude unrealistic simulations. We acknowledge the reviewer's suggestion to treat equivalence as a continuous outcome. 
However, implementing a sliding scale approach does not provide a final decision and presents challenges due to the diverse scales and distributions of DCs and we have to define a strategy for classification beforehand. Deciding based on how many times a synthetic dataset from an unrelated template passes an equivalence test could behave unintendedly if all synthetic data sets are unrealistic. Thus, we see no clear benefit and a further cutoff has to be defined. Nevertheless, if the reviewer prefers such an option, we are open to implement this (e.g. by defining the equivalence region from the 10% and 90% quantiles of the DC difference to unrelated templates). In response to all reviewers’ feedback, we have refined our set of DCs by eliminating redundancies and incorporating measures of alpha and beta diversity. To further address the reviewer's concerns, we will include a visualization by plotting the number of non-equivalent characteristics against the size of the equivalence region for comparing simulated data with the respective templates and comparing with unrelated templates. This will illustrate the relationship between equivalence thresholds and the proportion of DCs deemed equivalent / non-equivalent, offering deeper insight into the sensitivity of our equivalence testing procedure. We hope that these clarifications and proposed modifications adequately address the reviewer's concerns and enhance our study design. In figure 2 and in the section "Exclusion of simulations for a specific data template based on deviating data properties" it is suggested that some data characteristics may be too sensitive to include during equivalency testing, however, there is no guidelines to the best of my interpretation of the protocol to determine when a data characteristic would be too sensitive that it must be disregarded. We apologize for not providing a clear definition earlier. 
We now define data characteristics more precisely: we will only consider those that pass equivalence tests for at least 50% of the templates. Minor comments: Some tools included within this study may have undergone significant updates since the initial publication of Nearing et al., 2022. This should be considered in the analysis as even slight differences may give rise to discordant results. As an aside to this the authors may want to consider including more recently developed methods in their study such as LOCOM, ANCOM-BC2 or radEmu. This would benefit the microbiome community. As written in our study protocol (section 9), we’ll apply the latest versions of the DA methods since this seems most valuable. We are aware that updates of the DA methods may be responsible for failed validations. However, we see assessment and confirmation using the most recent tools as more important than preventing the risk that disagreement might originate from improved DA tools. Since a key feature of our study is to strictly perform validations following a strict methodology, we do not want to mix validation with evaluation of other tests. This will be investigated in a subsequent study where we focus on sensitivities and specificities. In the revised version of the protocol, we mention this aspect and better emphasize our overall strategy. We emphasize this strategy in the revised version of the study protocol at several points and use wordings like “validation with the most recent implementations/versions”. In some cases, the hypothesis they define for aims 1 and 2 should be reviewed for clarity. I have listed some examples below. The authors mention ANCOM-BC however, this DA tool was not considered in Nearing et al., 2022. It is possible that this was a typo, and the authors were suggesting ANCOM-II, however, it is important to clarify this as ANCOM-II and ANCOM-BC are highly different methods. We thank the reviewer for this note. 
Upon reflection, we recognize that we indeed planned to use a different ANCOM method than the one employed in Nearing et al. They utilized a custom script downloaded “on May 11, 2020, from the FrederickHuangLin/ANCOM” repository on GitHub. We will use this script for our analysis to be as consistent as possible with the reference study. Hypothesis 8 in aim 1 should be adjusted as the shapes were similar except for the limma voom tools. Thank you for pointing this out. We changed the hypothesis accordingly. Hypothesis 4 in aim 2 should specify what Wilcoxon method is being referred to. We checked all hypotheses and clearly indicated which method is being referred to. Hypothesis 9 in aim 2 suggests that in the original study LEFSE had results where more than 99% of features in a dataset were significant, however, I do not think this was found in Nearing et al., 2022. Although it was identified to be on the more sensitive side of methods tested. Thank you for pointing that out. We changed the hypothesis to be in line with the results from the reference paper. Is the rationale for, and objectives of, the study clearly described? Partly We acknowledge that the rationale and objectives of the study may not have been communicated clearly. To clarify this, we changed the abstract. To summarize, this study is essentially a validation study, focusing on evaluating the feasibility of using synthetic data to resemble experimental data and whether we replicate findings from the reference study based on multiple simulated data sets. Additionally, it is a showcase of how to incorporate study protocols in computational studies and to rigorously plan a benchmark study before actually conducting it. This study serves as groundwork for a subsequent benchmark study, which will include a broader range of data sets for performance testing. Is the study design appropriate for the research question? 
Partly We agree that simulated data has limitations in fully validating previous findings due to inherent differences from experimental data. To address this, we calculate a comprehensive set of data characteristics, perform equivalence tests, and select only simulated data that does not show significant disagreement with experimental data. In our view, this approach is the best we can do to minimize the differences and ensure comparability between simulated and experimental datasets. Moreover, applying the presented equivalence testing procedure represents a novel methodological improvement compared to existing approaches. Are sufficient details of the methods provided to allow replication by others? Partly In the revised version we better explain, in particular, the data generation mechanism by including more details on the simulation process. All data sets, experimental and generated, as well as all source code to run the study will be made available once the paper describing the results undergoes the review process. Are the datasets clearly presented in a useable and accessible format? Yes View more View less Competing Interests No competing interests were disclosed. reply Respond Report a concern Nearing JT. Peer Review Report For: Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data [version 2; peer review: 2 approved] . F1000Research 2025, 13 :1180 ( https://doi.org/10.5256/f1000research.170374.r340427) NOTE: it is important to ensure the information in square brackets after the title is included in this citation. The direct URL for this report is: https://f1000research.com/articles/13-1180/v1#referee-response-340427 keyboard_arrow_left Back to all reports Reviewer Report 0 Views copyright © 2024 Lahti L et al. 
This is an open access peer review report distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. 27 Nov 2024 | for Version 1 Leo Lahti , University of Turku, Turku, Finland Juho Pelto , Department of Computing, University of Turku, Turku, Finland 0 Views copyright © 2024 Lahti L et al. This is an open access peer review report distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. format_quote Cite this report speaker_notes Responses (1) Approved With Reservations info_outline Alongside their report, reviewers assign a status to the article: Approved The paper is scientifically sound in its current form and only minor, if any, improvements are suggested Approved with reservations A number of small changes, sometimes more significant revisions are required to address specific details and improve the papers academic merit. Not approved Fundamental flaws in the paper seriously undermine the findings and conclusions This manuscript suggests a (computational) protocol for validating differential abundance tests. The main question is whether synthetic data, generated to mimic experimental data, can validate the findings from differential abundance tests for 16S microbiome sequencing data. Specifically, the study aims to assess how well synthetic datasets replicate the characteristics of real data and whether the conclusions from real data on the performance of DA tests can be confirmed with synthetic data. In order to evaluate such protocol, the authors created synthetic data that would resemble the 16S rRNA gene sequencing data in the 38 datasets employed in the study by Nearing et al. (2022). 
They mimic real data with synthetic data using metaSPARSim and sparseDOSSA2 with specific adjustments. They use 43 metrics to evaluate the correspondence between the synthetic and real data. The study complies with the SPIRIT guidelines for clinical trial protocols. Second, the datasets that resemble the original data sufficiently well were used to replicate results of differential abundance analysis (DAA) method comparison by Nearing et al. The language is clear but the protocol has a vast number of steps, making it hard to separate the key contributions and performance assessment from the routine protocol checkpoints. Major remarks: 1) In addition to using metaSPARSim and sparseDOSSA2, consider including more recent methods such as SimMSeq (Yang and Chen, 2022) and SIMBA (Wirbel et al., 2024). 2) The authors use 43 metrics to assess the correspondence between synthetic and real data. The number of metrics seems quite high, and it may thus be difficult to describe these resemblances. Would it be possible to identify a smaller number of key metrics that would grasp the most essential aspects of the data? 3) The motivation of Aim 2 seems unclear. The authors state in the introduction that synthetic data can be used to evaluate the correctness of DAA results. However, the authors suggest using synthetic data to replicate the results of Nearing et al. (Aim 2). Why is synthetic data not used to directly evaluate the correctness of the DAA results? Replication of Nearing’s results to evaluate the validity of the synthetic data seems a rather indirect means to assess the performance of the DAA methods. 4) The study is about 16S (according to the title and descriptions) but Table 2, Aim 1 discusses metagenome simulation tools. The 16S and metagenome are different approaches for characterizing microbial communities, although there are some overlaps. It would require further clarification why a main Table switches to metagenomics in a 16S paper. 
5) Availability of materials: the study is based on an open data set that is available. Availability of the source code for the experiments and results would be critical for this kind of study. I could not locate these from the manuscript. Instead, there is a text "Generated data, analysis scripts, results and supplemental information to this study will also be stored in the Fredato research data management. In case of unexpected technical limitations, we will make data, analysis scripts and supplemental information available via https://figshare.com". These materials should be made available already at the review stage, otherwise a full review is not possible. Minor notes: - Real data sets are inherently different from synthetic ones. If the underlying data distributions differ significantly between synthetic and experimental datasets, the statistical tests may yield misleading results. Some discussion could be added on this and other potential limitations of using synthetic data for methods validation. - Possibly out of scope for the protocol study but in case all the compared data generation methods fail to produce realistic data can the authors consider suggesting their own solution to generate realistic 16S data or expand discussion on this? - Could you consider using other similar measures to PCA, for instance, principal coordinate analysis based on different dissimilarity metrics (e.g. Jaccard and Bray-Curtis distances) to evaluate the robustness of the ordination comparison? - Table 2: " stasticial analyses" -> "statistical analyses"? Is the rationale for, and objectives of, the study clearly described? Partly Is the study design appropriate for the research question? Partly Are sufficient details of the methods provided to allow replication by others? Partly Are the datasets clearly presented in a useable and accessible format? Yes References 1. 
Yang L, Chen J: A comprehensive evaluation of microbial differential abundance analysis methods: current status and potential solutions. Microbiome . 2022; 10 (1): 130 PubMed Abstract | Publisher Full Text 2. Wirbel J, Essex M, Forslund SK, Zeller G: A realistic benchmark for differential abundance testing and confounder adjustment in human microbiome studies. Genome Biol . 2024; 25 (1): 247 PubMed Abstract | Publisher Full Text Competing Interests No competing interests were disclosed. Reviewer Expertise bioinformatics, microbiome, data analysis We confirm that we have read this submission and believe that we have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however we have significant reservations, as outlined above. reply Respond to this report Responses (1) Author Response 02 Jan 2025 Eva Kohnert, Medical Center-University of Freiburg, Freiburg, Germany General remark: The language is clear but the protocol has a vast number of steps, making it hard to separate the key contributions and performance assessment from the routine protocol checkpoints. We appreciate the reviewer’s feedback regarding the structure and clarity of the protocol. Since this is our first attempt at drafting a study protocol for a computational benchmark study, we adhered strictly to the SPIRIT guidelines. This allowed us to explore the applicability of these guidelines, originally designed for clinical research, and how they could be effectively adapted to computational studies in the future. While we acknowledge that this approach may have resulted in a protocol with numerous steps, we believe this provides a comprehensive foundation for reproducibility and transparency in computational benchmarking. For future benchmark studies, we plan to refine the protocol structure, focusing on clearly delineating key contributions and performance assessments from routine protocol checkpoints. 
Major remarks: 1) In addition to using metaSPARSim and sparseDOSSA2, consider including more recent methods such as SimMSeq (Yang and Chen, 2022) and SIMBA (Wirbel et al., 2024). We appreciate the reviewer’s suggestion to consider additional simulation tools such as SimMSeq and SIMBA. While we recognize the potential value of these tools, there are several reasons why we chose not to include additional simulation methods in this study: First, the current study is already highly complex, incorporating two simulation tools, comprehensive characterization of various data sets, multiple inclusion/exclusion criteria and extensive testing. Adding further tools would further increase the study’s complexity, potentially detracting from the clarity of our findings. Secondly, as stated in the protocol the computational effects are huge. Therefore, we defined a runtime threshold of max. 1 hour per differential abundance test. We are calculating 14 differential abundance tests across 10 + 1 (simulation + experimental template) datasets for each of the 38 templates and for unfiltered and filtered data, leading to a total of 11,704 combinations. In a worst case scenario, the runtime on a single core would already require approximately 488 days per simulation tool only for conducting DA analyses. Adding additional simulation tools would further inflate runtime, particularly given the additional time required to calibrate the simulators and generate synthetic datasets. Lastly, one of the key aims of this study is to introduce and emphasize the importance of formulating a study protocol specifically for computational research. This is the first attempt to define a structured protocol for such studies, and while it may not yet represent the most refined version, we think it is an important step towards robust and transparent computational benchmark studies. 
Developing this protocol required substantial effort, and we believe that establishing and demonstrating the concept of a protocol in computational studies is as important as the study results themselves. In the new version of the protocol, we briefly mention these aspects. 2) The authors use 43 metrics to assess the correspondence between synthetic and real data. The number of metrics seems quite high, and it may thus be difficult describe these resemblances. Would it be possible to identify a smaller number of key metrics that would grasp the most essential aspects of the data? We agree with the reviewer that a smaller set of metrics providing independent information would be beneficial. However, we aim to retain characteristics that might highlight discrepancies between simulated and experimental data and help explain potential disagreements in DAA outcomes. Consequently, we decided to eliminate metrics that are nearly redundant by iteratively removing those with a rank correlation ≥ 0.95 and adjusted our study protocol accordingly. 3) The motivation of Aim 2 seems unclear. The authors state in the introduction, synthetic data can be used to evaluate the correctness of DAA results. However, the authors suggest to use synthetic data to replicate the results Nearing et al. (Aim 2). Why synthetic data is not used to directly evaluate the correctness of the DAA results? Replication of Nearing’s results to evaluate the validity of the synthetic data seems a rather indirect means to assess the performance of the DAA methods. Given this feedback, we realized that the motivation of Aim 2 has not been communicated clearly. This study originated as part of the planning process for a more classical benchmark study using the underlying truth of synthetic data for assessing DA methods. 
While designing the benchmark study, we recognized the need to first assess the feasibility of generating synthetic data that realistically resemble all characteristics of experimental data, to ensure the validity of conclusions drawn from simulated data. This led us to develop the presented validation study with the primary goal of comparing the results based on synthetic data to those from the reference study. This study is going to be followed by a subsequent benchmark study, in which the known truth in the simulated data sets will be used for performance testing, and the dependence on characteristics such as effect size, sample size etc. will be systematically evaluated. We addressed this issue by updating the abstract and the description in section “study setting”, making it clearer that the focus of the presented study is on validating previous findings, and not on trying to estimate sensitivities and specificities, which is planned as a subsequent project. The methodological advancement lies not only in incorporating synthetic data for validation but also in the development of a comprehensive study protocol. 4) The study is about 16S (according to the title and descriptions) but Table 2, Aim 1 discusses metagenome simulation tools. The 16S and metagenome are different approaches for characterizing microbial communities, although there are some overlaps. It would require further clarification why a main Table switches to metagenomics in a 16S paper. Thank you for pointing out that we were not precise in our terminology. In this study we only focus on 16S rRNA sequencing data. We have reviewed the manuscript and changed it accordingly. 5) Availability of materials: the study is based on an open data set that is available. Availability of the source code for the experiments and results would be critical for this kind of study. I could not locate these from the manuscript. 
Instead, there is a text "Generated data, analysis scripts, results and supplemental information to this study will also be stored in the Fredato research data management. In case of unexpected technical limitations, we will make data, analysis scripts and supplemental information available via https://figshare.com". These materials should be made available already at the review stage, otherwise a full review is not possible. The source code for this study, generated data and results will be made available at the review stage of the paper summarizing the results. At this point the source code is not complete yet, as we are only conducting the study once the protocol is accepted. Minor notes: - Real data sets are inherently different from synthetic ones. If the underlying data distributions differ significantly between synthetic and experimental datasets, the statistical tests may yield misleading results. Some discussion could be added on this and other potential limitations of using synthetic data for methods validation. According to the reviewer’s suggestion, we added a brief discussion of this aspect to the study protocol (Discussion section). A more detailed discussion will be included in the paper presenting the results of the study. - Possibly out of scope for the protocol study but in case all the compared data generation methods fail to produce realistic data can the authors consider suggesting their own solution to generate realistic 16S data or expand discussion on this? We deliberately chose not to develop our own simulation tool in this study, as we clearly want to separate benchmarking from methods development to prevent bias. However, to maximize the realism of the synthetic data generated by the tools at hand, we already suggested to introduce two adaptations: adding zeros to the data to achieve realistic levels of sparsity and adjusting the number of differentially abundant features to better capture true effect sizes. 
- Could you consider using other similar measures to PCA, for instance, principal coordinate analysis based on different dissimilarity metrics (e.g. Jaccard and Bray-Curtis distances) to evaluate the robustness of the ordination comparison? The PCA visualization we proposed will not be conducted on the count matrices, but on the characteristics calculated from those count matrices. Since it would be unusual to use the Bray-Curtis distance as a similarity measure for data characteristics and visualize it by PCoA, especially across multiple experimental datasets with differing taxonomies, we decided to incorporate the average Bray-Curtis distance between all sample pairs as an additional data property. This approach aids in assessing whether simulated count matrices are realistic. - Table 2: " stasticial analyses" -> "statistical analyses"? Thank you, we have corrected this typo. Is the rationale for, and objectives of, the study clearly described? Partly We acknowledge that the rationale and objectives of the study may not have been communicated clearly. To clarify this, we changed the abstract. To summarize, this study is essentially a validation study, focusing on evaluating the feasibility of using synthetic data to resemble experimental data and whether we can replicate findings from the reference study based on multiple simulated data sets. Additionally, it is a showcase of how to incorporate study protocols in computational studies and to rigorously plan a benchmark study before actually conducting it. This study serves as groundwork for a subsequent benchmark study, which will include a broader range of data sets for performance testing. Is the study design appropriate for the research question? Partly We agree that simulated data has limitations in fully validating previous findings due to inherent differences from experimental data. 
To address this, we calculate a comprehensive set of data characteristics, perform equivalence tests, and select only simulated data that does not show significant disagreement with experimental data. In our view, this approach is the best we can do to minimize the differences and ensure comparability between simulated and experimental datasets. Moreover, applying the presented equivalence testing procedure represents a novel methodological improvement compared to existing approaches. Are sufficient details of the methods provided to allow replication by others? Partly In the revised version we better explain especially the data generation mechanism, by including more details for the simulation process. All data sets, experimental and generated, as well as all source code to run the study will be made available once the paper describing the results undergoes the review process. Are the datasets clearly presented in a useable and accessible format? Yes View more View less Competing Interests No competing interests were disclosed. reply Respond Report a concern Lahti L and Pelto J. Peer Review Report For: Computational Study Protocol: Leveraging Synthetic Data to Validate a Benchmark Study for Differential Abundance Tests for 16S Microbiome Sequencing Data [version 2; peer review: 2 approved] . F1000Research 2025, 13 :1180 ( https://doi.org/10.5256/f1000research.170374.r340429) NOTE: it is important to ensure the information in square brackets after the title is included in this citation. The direct URL for this report is: https://f1000research.com/articles/13-1180/v1#referee-response-340429 Alongside their report, reviewers assign a status to the article: Approved - the paper is scientifically sound in its current form and only minor, if any, improvements are suggested Approved with reservations - A number of small changes, sometimes more significant revisions are required to address specific details and improve the paper's academic merit. 
Not approved - fundamental flaws in the paper seriously undermine the findings and conclusions Adjust parameters to alter display View on desktop for interactive features Includes Interactive Elements View on desktop for interactive features Competing Interests Policy Provide sufficient details of any financial or non-financial competing interests to enable users to assess whether your comments might lead a reasonable person to question your impartiality. Consider the following examples, but note that this is not an exhaustive list: Examples of 'Non-Financial Competing Interests' Within the past 4 years, you have held joint grants, published or collaborated with any of the authors of the selected paper. You have a close personal relationship (e.g. parent, spouse, sibling, or domestic partner) with any of the authors. You are a close professional associate of any of the authors (e.g. scientific mentor, recent student). You work at the same institute as any of the authors. You hope/expect to benefit (e.g. favour or employment) as a result of your submission. You are an Editor for the journal in which the article is published. Examples of 'Financial Competing Interests' You expect to receive, or in the past 4 years have received, any of the following from any commercial organisation that may gain financially from your submission: a salary, fees, funding, reimbursements. You expect to receive, or in the past 4 years have received, shared grant support or other funding with any of the authors. You hold, or are currently applying for, any patents or significant stocks/shares relating to the subject matter of the paper you are commenting on. Stay Updated Sign up for content alerts and receive a weekly or monthly email with all newly published articles Register with F1000Research Already registered? 
Sign in Not now, thanks close PLEASE NOTE If you are an AUTHOR of this article, please check that you signed in with the account associated with this article otherwise we cannot automatically identify your role as an author and your comment will be labelled as a “User Comment”. If you are a REVIEWER of this article, please check that you have signed in with the account associated with this article and then go to your account to submit your report, please do not post your review here. If you do not have access to your original account, please contact us . All commenters must hold a formal affiliation as per our Policies . The information that you give us will be displayed next to your comment. User comments must be in English, comprehensible and relevant to the article under discussion. We reserve the right to remove any comments that we consider to be inappropriate, offensive or otherwise in breach of the User Comment Terms and Conditions . Commenters must not use a comment for personal attacks. When criticisms of the article are based on unpublished data, the data should be made available. I accept the User Comment Terms and Conditions Please confirm that you accept the User Comment Terms and Conditions. Affiliation ✕ refresh Please enter your institution. Note: To add your institution or organisation, start typing the name and then select the correct name from the list. Where applicable, the name will appear in both the original language and in English. Do not paste in the name. If the name does not appear in the drop-down list, we will display the information you have entered. 
✕ refresh Country/Region * USA UK Canada China France Germany Afghanistan Aland Islands Albania Algeria American Samoa Andorra Angola Anguilla Antarctica Antigua and Barbuda Argentina Armenia Aruba Australia Austria Azerbaijan Bahamas Bahrain Bangladesh Barbados Belarus Belgium Belize Benin Bermuda Bhutan Bolivia Bosnia and Herzegovina Botswana Bouvet Island Brazil British Indian Ocean Territory British Virgin Islands Brunei Bulgaria Burkina Faso Burundi Cambodia Cameroon Canada Cape Verde Cayman Islands Central African Republic Chad Chile China Christmas Island Cocos (Keeling) Islands Colombia Comoros Congo Cook Islands Costa Rica Cote d'Ivoire Croatia Cuba Cyprus Czech Republic Democratic Republic of the Congo Denmark Djibouti Dominica Dominican Republic Ecuador Egypt El Salvador Equatorial Guinea Eritrea Estonia Ethiopia Falkland Islands Faroe Islands Federated States of Micronesia Fiji Finland France French Guiana French Polynesia French Southern Territories Gabon Georgia Germany Ghana Gibraltar Greece Greenland Grenada Guadeloupe Guam Guatemala Guernsey Guinea Guinea-Bissau Guyana Haiti Heard Island and Mcdonald Islands Holy See (Vatican City State) Honduras Hong Kong Hungary Iceland India Indonesia Iran Iraq Ireland Israel Italy Jamaica Japan Jersey Jordan Kazakhstan Kenya Kiribati Kosovo (Serbia and Montenegro) Kuwait Kyrgyzstan Lao People's Democratic Republic Latvia Lebanon Lesotho Liberia Libya Liechtenstein Lithuania Luxembourg Macao Madagascar Malawi Malaysia Maldives Mali Malta Marshall Islands Martinique Mauritania Mauritius Mayotte Mexico Minor Outlying Islands of the United States Moldova Monaco Mongolia Montenegro Montserrat Morocco Mozambique Myanmar Namibia Nauru Nepal Netherlands Antilles New Caledonia New Zealand Nicaragua Niger Nigeria Niue Norfolk Island North Korea North Macedonia Northern Mariana Islands Norway Oman Pakistan Palau Palestinian Territory Panama Papua New Guinea Paraguay Peru Philippines Pitcairn Poland Portugal Puerto Rico 
Qatar Reunion Romania Russian Federation Rwanda Saint Helena Saint Kitts and Nevis Saint Lucia Saint Pierre and Miquelon Saint Vincent and the Grenadines Samoa San Marino Sao Tome and Principe Saudi Arabia Senegal Serbia Seychelles Sierra Leone Singapore Slovakia Slovenia Solomon Islands Somalia South Africa South Georgia and the South Sandwich Is South Korea South Sudan Spain Sri Lanka Sudan Suriname Svalbard and Jan Mayen Swaziland Sweden Switzerland Syria Taiwan Tajikistan Tanzania Thailand The Gambia The Netherlands Timor-Leste Togo Tokelau Tonga Trinidad and Tobago Tunisia Turkey Turkmenistan Turks and Caicos Islands Tuvalu UK USA Uganda Ukraine United Arab Emirates United States Virgin Islands Uruguay Uzbekistan Vanuatu Venezuela Vietnam Wallis and Futuna West Bank and Gaza Strip Western Sahara Yemen Zambia Zimbabwe Please select your country/region. You must enter a comment. Competing Interests Please disclose any competing interests that might be construed to influence your judgment of the article's or peer review report's validity or importance. Competing Interests Policy Provide sufficient details of any financial or non-financial competing interests to enable users to assess whether your comments might lead a reasonable person to question your impartiality. Consider the following examples, but note that this is not an exhaustive list: Examples of 'Non-Financial Competing Interests' Within the past 4 years, you have held joint grants, published or collaborated with any of the authors of the selected paper. You have a close personal relationship (e.g. parent, spouse, sibling, or domestic partner) with any of the authors. You are a close professional associate of any of the authors (e.g. scientific mentor, recent student). You work at the same institute as any of the authors. You hope/expect to benefit (e.g. favour or employment) as a result of your submission. You are an Editor for the journal in which the article is published. 
Examples of 'Financial Competing Interests' You expect to receive, or in the past 4 years have received, any of the following from any commercial organisation that may gain financially from your submission: a salary, fees, funding, reimbursements. You expect to receive, or in the past 4 years have received, shared grant support or other funding with any of the authors. You hold, or are currently applying for, any patents or significant stocks/shares relating to the subject matter of the paper you are commenting on. Please state your competing interests The comment has been saved. An error has occurred. Please try again. Cancel Post var lTitle = "Computational Study Protocol: Leveraging...".replace("'", ''); var linkedInUrl = "http://www.linkedin.com/shareArticle?url=https://f1000research.com/articles/13-1180/v2" + "&title=" + encodeURIComponent(lTitle) + "&summary=" + encodeURIComponent('Read the article by '); var deliciousUrl = "https://del.icio.us/post?url=https://f1000research.com/articles/13-1180/v2&title=" + encodeURIComponent(lTitle); var redditUrl = "http://reddit.com/submit?url=https://f1000research.com/articles/13-1180/v2" + "&title=" + encodeURIComponent(lTitle); linkedInUrl += encodeURIComponent('Kohnert E and Kreutz C'); var offsetTop = /chrome/i.test( navigator.userAgent ) ? 
4 : -10; var addthis_config = { ui_offset_top: offsetTop, services_compact : "facebook,twitter,www.linkedin.com,www.mendeley.com,reddit.com", services_expanded : "facebook,twitter,www.linkedin.com,www.mendeley.com,reddit.com", services_custom : [ { name: "LinkedIn", url: linkedInUrl, icon:"/img/icon/at_linkedin.svg" }, { name: "Mendeley", url: "http://www.mendeley.com/import/?url=https://f1000research.com/articles/13-1180/v2/mendeley", icon:"/img/icon/at_mendeley.svg" }, { name: "Reddit", url: redditUrl, icon:"/img/icon/at_reddit.svg" }, ] }; var addthis_share = { url: "https://f1000research.com/articles/13-1180", templates : { twitter : "Computational Study Protocol: Leveraging Synthetic Data to Validate.... Kohnert E and Kreutz C, published by " + "@F1000Research" + ", https://f1000research.com/articles/13-1180/v2" } }; if (typeof(addthis) != "undefined"){ addthis.addEventListener('addthis.ready', checkCount); addthis.addEventListener('addthis.menu.share', checkCount); } $(".f1r-shares-twitter").attr("href", "https://twitter.com/intent/tweet?text=" + addthis_share.templates.twitter); $(".f1r-shares-facebook").attr("href", "https://www.facebook.com/sharer/sharer.php?u=" + addthis_share.url); $(".f1r-shares-linkedin").attr("href", addthis_config.services_custom[0].url); $(".f1r-shares-reddit").attr("href", addthis_config.services_custom[2].url); $(".f1r-shares-mendelay").attr("href", addthis_config.services_custom[1].url); function checkCount(){ setTimeout(function(){ $(".addthis_button_expanded").each(function(){ var count = $(this).text(); if (count !== "" && count != "0") $(this).removeClass("is-hidden"); else $(this).addClass("is-hidden"); }); }, 1000); } close How to cite this report {{reportCitation}} Cancel Copy Citation Details $(function(){R.ui.buttonDropdowns('.dropdown-for-downloads');}); $(function(){R.ui.toolbarDropdowns('.toolbar-dropdown-for-downloads');}); $.get("/articles/acj/155230/176118") new F1000.Clipboard(); new 
F1000.ThesaurusTermsDisplay("articles", "article", "176118"); $(document).ready(function() { $( "#frame1" ).on('load', function() { var mydiv = $(this).contents().find("div"); var h = mydiv.height(); console.log(h) }); var tooltipLivingFigure = jQuery(".interactive-living-figure-label .icon-more-info"), titleLivingFigure = tooltipLivingFigure.attr("title"); tooltipLivingFigure.simpletip({ fixed: true, position: ["-115", "30"], baseClass: 'small-tooltip', content:titleLivingFigure + " " }); tooltipLivingFigure.removeAttr("title"); $("body").on("click", ".cite-living-figure", function(e) { e.preventDefault(); var ref = $(this).attr("data-ref"); $(this).closest(".living-figure-list-container").find("#" + ref).fadeIn(200); }); $("body").on("click", ".close-cite-living-figure", function(e) { e.preventDefault(); $(this).closest(".popup-window-wrapper").fadeOut(200); }); $(document).on("mouseup", function(e) { var metricsContainer = $(".article-metrics-popover-wrapper"); if (!metricsContainer.is(e.target) && metricsContainer.has(e.target).length === 0) { $(".article-metrics-close-button").click(); } }); var articleId = $('#articleId').val(); if($("#main-article-count-box").attachArticleMetrics) { $("#main-article-count-box").attachArticleMetrics(articleId, { articleMetricsView: true }); } }); var figshareWidget = $(".new_figshare_widget"); if (figshareWidget.length > 0) { window.figshare.load("f1000", function(Widget) { // Select a tag/tags defined in your page. In this tag we will place the widget. _.map(figshareWidget, function(el){ var widget = new Widget({ articleId: $(el).attr("figshare_articleId") //height:300 // this is the height of the viewer part. [Default: 550] }); widget.initialize(); // initialize the widget widget.mount(el); // mount it in a tag that's on your page // this will save the widget on the global scope for later use from // your JS scripts. This line is optional. 
//window.widget = widget; }); }); } close Error Close Add Reset F1000.MICROSERVICES.AFFILIATION = ''; $(document).ready(function () { $('.js-affiliations-form').each((index, form) => { new AffiliationForm({ formId: form.id, institutionErrorSelector: '.comment-enter-institution', departmentErrorSelector: '.comment-enter-department', placeSelector: '.js-add-comment-place', stateSelector: '.js-add-comment-state', zipCodeSelector: '.js-add-comment-zipcode', countrySelector: '.js-add-comment-country', countryErrorSelector: '.comment-enter-country', }); }); }); $(document).ready(function () { var reportIds = { "331405": 0, "331404": 0, "331407": 0, "360591": 0, "331406": 0, "331401": 0, "331403": 0, "331402": 0, "331409": 0, "360593": 0, "331408": 0, "360592": 0, "360595": 0, "331410": 0, "360594": 0, "334637": 0, "337325": 0, "337324": 0, "355372": 7, "334639": 0, "337327": 0, "334638": 0, "337326": 0, "337323": 0, "355371": 6, "334645": 0, "334644": 0, "337332": 0, "334646": 0, "334641": 0, "337329": 0, "334640": 0, "337328": 0, "334643": 0, "337331": 0, "334642": 0, "337330": 0, "340423": 0, "340429": 25, "340428": 0, "340425": 0, "340424": 0, "340427": 17, "340426": 0, "359655": 0, "359661": 0, "356333": 0, "356332": 0, "359660": 0, "356335": 0, "359663": 0, "359662": 0, "356334": 0, "359657": 0, "359656": 0, "359659": 0, "359658": 0, "356340": 0, "356337": 0, "359664": 0, "356336": 0, "356339": 0, "356338": 0, "360700": 0, "356351": 0, "360697": 0, "360696": 0, "360699": 0, "360698": 0, }; $(".referee-response-container,.js-referee-report").each(function(index, el) { var reportId = $(el).attr("data-reportid"), reportCount = reportIds[reportId] || 0; $(el).find(".comments-count-container,.js-referee-report-views").html(reportCount); }); var uuidInput = $("#article_uuid"), oldUUId = uuidInput.val(), newUUId = "1ad32358-60ab-4b45-a957-90a0f618df93"; uuidInput.val(newUUId); $("a[href*='article_uuid=']").each(function(index, el) { var newHref = 
$(el).attr("href").replace(oldUUId, newUUId); $(el).attr("href", newHref); }); }); An innovative open access publishing platform offering rapid publication and open peer review, whilst supporting data deposition and sharing. Browse Gateways Collections How it Works Contact For Developers Cookie Notice Privacy Notice RSS Submit Your Research Follow us © 2012-2026 F1000 Research Ltd. ISSN 2046-1402 | Legal | Partner of Research4Life • CrossRef • ORCID • FAIRSharing R.templateTests.simpleTemplate = R.template(' $text $text $text $text $text '); R.templateTests.runTests(); var F1000platform = new F1000.Platform({ name: "f1000research", displayName: "F1000Research", hostName: "f1000research.com", id: "1", editorialEmail: "[email protected]", infoEmail: "[email protected]", usePmcStats: true }); $(function(){R.ui.dropdowns('.dropdown-for-authors, .dropdown-for-about, .dropdown-for-myresearch');}); // $(function(){R.ui.dropdowns('.dropdown-for-referees');}); $(document).ready(function () { if ($(".cookie-warning").is(":visible")) { $(".sticky").css("margin-bottom", "35px"); $(".devices").addClass("devices-and-cookie-warning"); } $(".cookie-warning .close-button").click(function (e) { $(".devices").removeClass("devices-and-cookie-warning"); $(".sticky").css("margin-bottom", "0"); }); $("#tweeter-feed .tweet-message").each(function (i, message) { var self = $(message); self.html(linkify(self.html())); }); $(".partner").on("mouseenter mouseleave", function() { $(this).find(".gray-scale, .colour").toggleClass("is-hidden"); }); }); Sign In Remember me Forgotten your password? Sign In Cancel Email or password not correct. Please try again Please wait... $(function(){ // Note: All the setup needs to run against a name attribute and *not* the id due the clonish // nature of facebox... 
$("a[id=googleSignInButton]").click(function(event){ event.preventDefault(); $("input[id=oAuthSystem]").val("GOOGLE"); $("form[id=oAuthForm]").submit(); }); $("a[id=facebookSignInButton]").click(function(event){ event.preventDefault(); $("input[id=oAuthSystem]").val("FACEBOOK"); $("form[id=oAuthForm]").submit(); }); $("a[id=orcidSignInButton]").click(function(event){ event.preventDefault(); $("input[id=oAuthSystem]").val("ORCID"); $("form[id=oAuthForm]").submit(); }); }); If you've forgotten your password, please enter your email address below and we'll send you instructions on how to reset your password. The email address should be the one you originally registered with F1000. Email address not valid, please try again You registered with F1000 via Google, so we cannot reset your password. To sign in, please click here . If you still need help with your Google account password, please click here . You registered with F1000 via Facebook, so we cannot reset your password. To sign in, please click here . If you still need help with your Facebook account password, please click here . Code not correct, please try again Reset password Cancel Email us for further assistance. Server error, please try again. If your email address is registered with us, we will email you instructions to reset your password. If you think you should have received this email but it has not arrived, please check your spam filters and/or contact for further assistance. Please wait... Register $(document).ready(function () { signIn.createSignInAsRow($("#sign-in-form-gfb-popup")); $(".target-field").each(function () { var uris = $(this).val().split("/"); if (uris.pop() === "login") { $(this).val(uris.toString().replace(",","/")); } }); });

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00