Addressing common inferential mistakes when failing to reject the null-hypothesis

doi:10.12688/f1000research.158434.3

Addressing common inferential mistakes when failing to reject the null-hypothesis

2025 · doi:10.12688/f1000research.158434.3

preprint OA: closed CC-BY-4.0

🔓 Open OA copy Full text JSON View at publisher

Full text 278,236 characters · extracted from preprint-html · click to expand

Addressing common inferential mistakes when failing... | F1000Research "use strict";function _typeof(t){return(_typeof="function"==typeof Symbol&&"symbol"==typeof Symbol.iterator?function(t){return typeof t}:function(t){return t&&"function"==typeof Symbol&&t.constructor===Symbol&&t!==Symbol.prototype?"symbol":typeof t})(t)}!function(){var t=function(){var t,e,o=[],n=window,r=n;for(;r;){try{if(r.frames.__tcfapiLocator){t=r;break}}catch(t){}if(r===n.top)break;r=r.parent}t||(!function t(){var e=n.document,o=!!n.frames.__tcfapiLocator;if(!o)if(e.body){var r=e.createElement("iframe");r.style.cssText="display:none",r.name="__tcfapiLocator",e.body.appendChild(r)}else setTimeout(t,5);return!o}(),n.__tcfapi=function(){for(var t=arguments.length,n=new Array(t),r=0;r 3&&2===parseInt(n[1],10)&&"boolean"==typeof n[3]&&(e=n[3],"function"==typeof n[2]&&n[2]("set",!0)):"ping"===n[0]?"function"==typeof n[2]&&n[2]({gdprApplies:e,cmpLoaded:!1,cmpStatus:"stub"}):o.push(n)},n.addEventListener("message",(function(t){var e="string"==typeof t.data,o={};if(e)try{o=JSON.parse(t.data)}catch(t){}else o=t.data;var n="object"===_typeof(o)&&null!==o?o.__tcfapiCall:null;n&&window.__tcfapi(n.command,n.version,(function(o,r){var a={__tcfapiReturn:{returnValue:o,success:r,callId:n.callId}};t&&t.source&&t.source.postMessage&&t.source.postMessage(e?JSON.stringify(a):a,"*")}),n.parameter)}),!1))};"undefined"!=typeof module?module.exports=t:t()}(); dataLayer = dataLayer || []; // Standard GTM initialization - Google Consent Mode handles consent automatically (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= 'https://www.googletagmanager.com/gtm.js?id='+i+dl+ '>m_auth=hzk0Vc3qFsQYhCrIoHz68A>m_preview=env-1>m_cookies_win=x';f.parentNode.insertBefore(j,f); })(window,document,'script','dataLayer','GTM-MWFK8L5J'); 
;window.NREUM||(NREUM={});NREUM.init={distributed_tracing:{enabled:true},privacy:{cookies_enabled:true},ajax:{deny_list:["bam.nr-data.net"]}}; ;NREUM.loader_config={accountID:"438030",trustKey:"438030",agentID:"772317073",licenseKey:"97f8f67f26",applicationID:"772317073"} ;NREUM.info={beacon:"bam.nr-data.net",errorBeacon:"bam.nr-data.net",licenseKey:"97f8f67f26",applicationID:"772317073",sa:1} ;/*! For license information please see nr-loader-spa-1.236.0.min.js.LICENSE.txt */ (()=>{"use strict";var e,t,r={5763:(e,t,r)=>{r.d(t,{P_:()=>l,Mt:()=>g,C5:()=>s,DL:()=>v,OP:()=>T,lF:()=>D,Yu:()=>y,Dg:()=>h,CX:()=>c,GE:()=>b,sU:()=>_});var n=r(8632),i=r(9567);const o={beacon:n.ce.beacon,errorBeacon:n.ce.errorBeacon,licenseKey:void 0,applicationID:void 0,sa:void 0,queueTime:void 0,applicationTime:void 0,ttGuid:void 0,user:void 0,account:void 0,product:void 0,extra:void 0,jsAttributes:{},userAttributes:void 0,atts:void 0,transactionName:void 0,tNamePlain:void 0},a={};function s(e){if(!e)throw new Error("All info objects require an agent identifier!");if(!a[e])throw new Error("Info for ".concat(e," was never set"));return a[e]}function c(e,t){if(!e)throw new Error("All info objects require an agent identifier!");a[e]=(0,i.D)(t,o),(0,n.Qy)(e,a[e],"info")}var u=r(7056);const d=()=>{const e={blockSelector:"[data-nr-block]",maskInputOptions:{password:!0}};return{allow_bfcache:!0,privacy:{cookies_enabled:!0},ajax:{deny_list:void 0,enabled:!0,harvestTimeSeconds:10},distributed_tracing:{enabled:void 0,exclude_newrelic_header:void 0,cors_use_newrelic_header:void 0,cors_use_tracecontext_headers:void 0,allowed_origins:void 0},session:{domain:void 0,expiresMs:u.oD,inactiveMs:u.Hb},ssl:void 0,obfuscate:void 
0,jserrors:{enabled:!0,harvestTimeSeconds:10},metrics:{enabled:!0},page_action:{enabled:!0,harvestTimeSeconds:30},page_view_event:{enabled:!0},page_view_timing:{enabled:!0,harvestTimeSeconds:30,long_task:!1},session_trace:{enabled:!0,harvestTimeSeconds:10},harvest:{tooManyRequestsDelay:60},session_replay:{enabled:!1,harvestTimeSeconds:60,sampleRate:.1,errorSampleRate:.1,maskTextSelector:"*",maskAllInputs:!0,get blockClass(){return"nr-block"},get ignoreClass(){return"nr-ignore"},get maskTextClass(){return"nr-mask"},get blockSelector(){return e.blockSelector},set blockSelector(t){e.blockSelector+=",".concat(t)},get maskInputOptions(){return e.maskInputOptions},set maskInputOptions(t){e.maskInputOptions={...t,password:!0}}},spa:{enabled:!0,harvestTimeSeconds:10}}},f={};function l(e){if(!e)throw new Error("All configuration objects require an agent identifier!");if(!f[e])throw new Error("Configuration for ".concat(e," was never set"));return f[e]}function h(e,t){if(!e)throw new Error("All configuration objects require an agent identifier!");f[e]=(0,i.D)(t,d()),(0,n.Qy)(e,f[e],"config")}function g(e,t){if(!e)throw new Error("All configuration objects require an agent identifier!");var r=l(e);if(r){for(var n=t.split("."),i=0;i {r.d(t,{D:()=>i});var n=r(50);function i(e,t){try{if(!e||"object"!=typeof e)return(0,n.Z)("Setting a Configurable requires an object as input");if(!t||"object"!=typeof t)return(0,n.Z)("Setting a Configurable requires a model to set its initial properties");const r=Object.create(Object.getPrototypeOf(t),Object.getOwnPropertyDescriptors(t)),o=0===Object.keys(r).length?e:r;for(let a in o)if(void 0!==e[a])try{"object"==typeof e[a]&&"object"==typeof t[a]?r[a]=i(e[a],t[a]):r[a]=e[a]}catch(e){(0,n.Z)("An error occurred while setting a property of a Configurable",e)}return r}catch(e){(0,n.Z)("An error occured while setting a Configurable",e)}}},6818:(e,t,r)=>{r.d(t,{Re:()=>i,gF:()=>o,q4:()=>n});const 
n="1.236.0",i="PROD",o="CDN"},385:(e,t,r)=>{r.d(t,{FN:()=>a,IF:()=>u,Nk:()=>f,Tt:()=>s,_A:()=>o,il:()=>n,pL:()=>c,v6:()=>i,w1:()=>d});const n="undefined"!=typeof window&&!!window.document,i="undefined"!=typeof WorkerGlobalScope&&("undefined"!=typeof self&&self instanceof WorkerGlobalScope&&self.navigator instanceof WorkerNavigator||"undefined"!=typeof globalThis&&globalThis instanceof WorkerGlobalScope&&globalThis.navigator instanceof WorkerNavigator),o=n?window:"undefined"!=typeof WorkerGlobalScope&&("undefined"!=typeof self&&self instanceof WorkerGlobalScope&&self||"undefined"!=typeof globalThis&&globalThis instanceof WorkerGlobalScope&&globalThis),a=""+o?.location,s=/iPad|iPhone|iPod/.test(navigator.userAgent),c=s&&"undefined"==typeof SharedWorker,u=(()=>{const e=navigator.userAgent.match(/Firefox[/\s](\d+\.\d+)/);return Array.isArray(e)&&e.length>=2?+e[1]:0})(),d=Boolean(n&&window.document.documentMode),f=!!navigator.sendBeacon},1117:(e,t,r)=>{r.d(t,{w:()=>o});var n=r(50);const i={agentIdentifier:"",ee:void 0};class o{constructor(e){try{if("object"!=typeof e)return(0,n.Z)("shared context requires an object as input");this.sharedContext={},Object.assign(this.sharedContext,i),Object.entries(e).forEach((e=>{let[t,r]=e;Object.keys(i).includes(t)&&(this.sharedContext[t]=r)}))}catch(e){(0,n.Z)("An error occured while setting SharedContext",e)}}}},8e3:(e,t,r)=>{r.d(t,{L:()=>d,R:()=>c});var n=r(2177),i=r(1284),o=r(4322),a=r(3325);const s={};function c(e,t){const r={staged:!1,priority:a.p[t]||0};u(e),s[e].get(t)||s[e].set(t,r)}function u(e){e&&(s[e]||(s[e]=new Map))}function d(){let e=arguments.length>0&&void 0!==arguments[0]?arguments[0]:"",t=arguments.length>1&&void 0!==arguments[1]?arguments[1]:"feature";if(u(e),!e||!s[e].get(t))return a(t);s[e].get(t).staged=!0;const r=[...s[e]];function a(t){const r=e?n.ee.get(e):n.ee,a=o.X.handlers;if(r.backlog&&a){var s=r.backlog[t],c=a[t];if(c){for(var u=0;s&&u {let[t,r]=e;return 
r.staged}))&&(r.sort(((e,t)=>e[1].priority-t[1].priority)),r.forEach((e=>{let[t]=e;a(t)})))}function f(e,t){var r=e[1];(0,i.D)(t[r],(function(t,r){var n=e[0];if(r[0]===n){var i=r[1],o=e[3],a=e[2];i.apply(o,a)}}))}},2177:(e,t,r)=>{r.d(t,{c:()=>f,ee:()=>u});var n=r(8632),i=r(2210),o=r(1284),a=r(5763),s="nr@context";let c=(0,n.fP)();var u;function d(){}function f(e){return(0,i.X)(e,s,l)}function l(){return new d}function h(){u.aborted=!0,u.backlog={}}c.ee?u=c.ee:(u=function e(t,r){var n={},c={},f={},g=!1;try{g=16===r.length&&(0,a.OP)(r).isolatedBacklog}catch(e){}var p={on:b,addEventListener:b,removeEventListener:y,emit:v,get:x,listeners:w,context:m,buffer:A,abort:h,aborted:!1,isBuffering:E,debugId:r,backlog:g?{}:t&&"object"==typeof t.backlog?t.backlog:{}};return p;function m(e){return e&&e instanceof d?e:e?(0,i.X)(e,s,l):l()}function v(e,r,n,i,o){if(!1!==o&&(o=!0),!u.aborted||i){t&&o&&t.emit(e,r,n);for(var a=m(n),s=w(e),d=s.length,f=0;fn,p:()=>i});var n=r(2177).ee.get("handle");function i(e,t,r,i,o){o?(o.buffer([e],i),o.emit(e,t,r)):(n.buffer([e],i),n.emit(e,t,r))}},4322:(e,t,r)=>{r.d(t,{X:()=>o});var n=r(5546);o.on=a;var i=o.handlers={};function o(e,t,r,o){a(o||n.E,i,e,t,r)}function a(e,t,r,i,o){o||(o="feature"),e||(e=n.E);var a=t[o]=t[o]||{};(a[r]=a[r]||[]).push([e,i])}},3239:(e,t,r)=>{r.d(t,{bP:()=>s,iz:()=>c,m$:()=>a});var n=r(385);let i=!1,o=!1;try{const e={get passive(){return i=!0,!1},get signal(){return o=!0,!1}};n._A.addEventListener("test",null,e),n._A.removeEventListener("test",null,e)}catch(e){}function a(e,t){return i||o?{capture:!!e,passive:i,signal:t}:!!e}function s(e,t){let r=arguments.length>2&&void 0!==arguments[2]&&arguments[2],n=arguments.length>3?arguments[3]:void 0;window.addEventListener(e,t,a(r,n))}function c(e,t){let r=arguments.length>2&&void 0!==arguments[2]&&arguments[2],n=arguments.length>3?arguments[3]:void 0;document.addEventListener(e,t,a(r,n))}},4402:(e,t,r)=>{r.d(t,{Ht:()=>u,M:()=>c,Rl:()=>a,ky:()=>s});var n=r(385);const 
i="xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx";function o(e,t){return e?15&e[t]:16*Math.random()|0}function a(){const e=n._A?.crypto||n._A?.msCrypto;let t,r=0;return e&&e.getRandomValues&&(t=e.getRandomValues(new Uint8Array(31))),i.split("").map((e=>"x"===e?o(t,++r).toString(16):"y"===e?(3&o()|8).toString(16):e)).join("")}function s(e){const t=n._A?.crypto||n._A?.msCrypto;let r,i=0;t&&t.getRandomValues&&(r=t.getRandomValues(new Uint8Array(31)));const a=[];for(var s=0;s {r.d(t,{Bq:()=>n,Hb:()=>o,oD:()=>i});const n="NRBA",i=144e5,o=18e5},7894:(e,t,r)=>{function n(){return Math.round(performance.now())}r.d(t,{z:()=>n})},7243:(e,t,r)=>{r.d(t,{e:()=>o});var n=r(385),i={};function o(e){if(e in i)return i[e];if(0===(e||"").indexOf("data:"))return{protocol:"data"};let t;var r=n._A?.location,o={};if(n.il)t=document.createElement("a"),t.href=e;else try{t=new URL(e,r.href)}catch(e){return o}o.port=t.port;var a=t.href.split("://");!o.port&&a[1]&&(o.port=a[1].split("/")[0].split("@").pop().split(":")[1]),o.port&&"0"!==o.port||(o.port="https"===a[0]?"443":"80"),o.hostname=t.hostname||r.hostname,o.pathname=t.pathname,o.protocol=a[0],"/"!==o.pathname.charAt(0)&&(o.pathname="/"+o.pathname);var s=!t.protocol||":"===t.protocol||t.protocol===r.protocol,c=t.hostname===r.hostname&&t.port===r.port;return o.sameOrigin=s&&(!t.hostname||c),"/"===o.pathname&&(i[e]=o),o}},50:(e,t,r)=>{function n(e,t){"function"==typeof console.warn&&(console.warn("New Relic: ".concat(e)),t&&console.warn(t))}r.d(t,{Z:()=>n})},2587:(e,t,r)=>{r.d(t,{N:()=>c,T:()=>u});var n=r(2177),i=r(5546),o=r(8e3),a=r(3325);const s={stn:[a.D.sessionTrace],err:[a.D.jserrors,a.D.metrics],ins:[a.D.pageAction],spa:[a.D.spa],sr:[a.D.sessionReplay,a.D.sessionTrace]};function c(e,t){const r=n.ee.get(t);e&&"object"==typeof e&&(Object.entries(e).forEach((e=>{let[t,n]=e;void 0===u[t]&&(s[t]?s[t].forEach((e=>{n?(0,i.p)("feat-"+t,[],void 0,e,r):(0,i.p)("block-"+t,[],void 0,e,r),(0,i.p)("rumresp-"+t,[Boolean(n)],void 
0,e,r)})):n&&(0,i.p)("feat-"+t,[],void 0,void 0,r),u[t]=Boolean(n))})),Object.keys(s).forEach((e=>{void 0===u[e]&&(s[e]?.forEach((t=>(0,i.p)("rumresp-"+e,[!1],void 0,t,r))),u[e]=!1)})),(0,o.L)(t,a.D.pageViewEvent))}const u={}},2210:(e,t,r)=>{r.d(t,{X:()=>i});var n=Object.prototype.hasOwnProperty;function i(e,t,r){if(n.call(e,t))return e[t];var i=r();if(Object.defineProperty&&Object.keys)try{return Object.defineProperty(e,t,{value:i,writable:!0,enumerable:!1}),i}catch(e){}return e[t]=i,i}},1284:(e,t,r)=>{r.d(t,{D:()=>n});const n=(e,t)=>Object.entries(e||{}).map((e=>{let[r,n]=e;return t(r,n)}))},4351:(e,t,r)=>{r.d(t,{P:()=>o});var n=r(2177);const i=()=>{const e=new WeakSet;return(t,r)=>{if("object"==typeof r&&null!==r){if(e.has(r))return;e.add(r)}return r}};function o(e){try{return JSON.stringify(e,i())}catch(e){try{n.ee.emit("internal-error",[e])}catch(e){}}}},3960:(e,t,r)=>{r.d(t,{K:()=>a,b:()=>o});var n=r(3239);function i(){return"undefined"==typeof document||"complete"===document.readyState}function o(e,t){if(i())return e();(0,n.bP)("load",e,t)}function a(e){if(i())return e();(0,n.iz)("DOMContentLoaded",e)}},8632:(e,t,r)=>{r.d(t,{EZ:()=>u,Qy:()=>c,ce:()=>o,fP:()=>a,gG:()=>d,mF:()=>s});var n=r(7894),i=r(385);const o={beacon:"bam.nr-data.net",errorBeacon:"bam.nr-data.net"};function a(){return i._A.NREUM||(i._A.NREUM={}),void 0===i._A.newrelic&&(i._A.newrelic=i._A.NREUM),i._A.NREUM}function s(){let e=a();return e.o||(e.o={ST:i._A.setTimeout,SI:i._A.setImmediate,CT:i._A.clearTimeout,XHR:i._A.XMLHttpRequest,REQ:i._A.Request,EV:i._A.Event,PR:i._A.Promise,MO:i._A.MutationObserver,FETCH:i._A.fetch}),e}function c(e,t,r){let i=a();const o=i.initializedAgents||{},s=o[e]||{};return Object.keys(s).length||(s.initializedAt={ms:(0,n.z)(),date:new Date}),i.initializedAgents={...o,[e]:{...s,[r]:t}},i}function u(e,t){a()[e]=t}function d(){return function(){let e=a();const t=e.info||{};e.info={beacon:o.beacon,errorBeacon:o.errorBeacon,...t}}(),function(){let e=a();const 
t=e.init||{};e.init={...t}}(),s(),function(){let e=a();const t=e.loader_config||{};e.loader_config={...t}}(),a()}},7956:(e,t,r)=>{r.d(t,{N:()=>i});var n=r(3239);function i(e){let t=arguments.length>1&&void 0!==arguments[1]&&arguments[1],r=arguments.length>2?arguments[2]:void 0,i=arguments.length>3?arguments[3]:void 0;return void(0,n.iz)("visibilitychange",(function(){if(t)return void("hidden"==document.visibilityState&&e());e(document.visibilityState)}),r,i)}},1214:(e,t,r)=>{r.d(t,{em:()=>v,u5:()=>N,QU:()=>S,_L:()=>I,Gm:()=>L,Lg:()=>M,gy:()=>U,BV:()=>Q,Kf:()=>ee});var n=r(2177);const i="nr@original";var o=Object.prototype.hasOwnProperty,a=!1;function s(e,t){return e||(e=n.ee),r.inPlace=function(e,t,n,i,o){n||(n="");var a,s,c,u="-"===n.charAt(0);for(c=0;c 2?n-2:0),o=2;o {r(A[T],e,w),r(E[T],e,w)})),r(l._A,"fetch",y),t.on(y+"end",(function(e,r){var n=this;if(r){var i=r.headers.get("content-length");null!==i&&(n.rxSize=i),t.emit(y+"done",[null,r],n)}else t.emit(y+"done",[e],n)})),t}const O={},j=["pushState","replaceState"];function S(e){const t=function(e){return(e||n.ee).get("history")}(e);return!l.il||O[t.debugId]++||(O[t.debugId]=1,s(t).inPlace(window.history,j,"-")),t}var P=r(3239);const C={},R=["appendChild","insertBefore","replaceChild"];function I(e){const t=function(e){return(e||n.ee).get("jsonp")}(e);if(!l.il||C[t.debugId])return t;C[t.debugId]=!0;var r=s(t),i=/[?&](?:callback|cb)=([^&#]+)/,o=/(.*)\.([^.]+)/,a=/^(\w+)(\.|$)(.*)$/;function c(e,t){var r=e.match(a),n=r[1],i=r[3];return i?c(i,t[n]):t[n]}return r.inPlace(Node.prototype,R,"dom-"),t.on("dom-start",(function(e){!function(e){if(!e||"string"!=typeof e.nodeName||"script"!==e.nodeName.toLowerCase())return;if("function"!=typeof e.addEventListener)return;var n=(a=e.src,s=a.match(i),s?s[1]:null);var a,s;if(!n)return;var u=function(e){var t=e.match(o);if(t&&t.length>=3)return{key:t[2],parent:c(t[1],window)};return{key:e,parent:window}}(n);if("function"!=typeof u.parent[u.key])return;var d={};function 
f(){t.emit("jsonp-end",[],d),e.removeEventListener("load",f,(0,P.m$)(!1)),e.removeEventListener("error",l,(0,P.m$)(!1))}function l(){t.emit("jsonp-error",[],d),t.emit("jsonp-end",[],d),e.removeEventListener("load",f,(0,P.m$)(!1)),e.removeEventListener("error",l,(0,P.m$)(!1))}r.inPlace(u.parent,[u.key],"cb-",d),e.addEventListener("load",f,(0,P.m$)(!1)),e.addEventListener("error",l,(0,P.m$)(!1)),t.emit("new-jsonp",[e.src],d)}(e[0])})),t}var k=r(5763);const H={};function L(e){const t=function(e){return(e||n.ee).get("mutation")}(e);if(!l.il||H[t.debugId])return t;H[t.debugId]=!0;var r=s(t),i=k.Yu.MO;return i&&(window.MutationObserver=function(e){return this instanceof i?new i(r(e,"fn-")):i.apply(this,arguments)},MutationObserver.prototype=i.prototype),t}const z={};function M(e){const t=function(e){return(e||n.ee).get("promise")}(e);if(z[t.debugId])return t;z[t.debugId]=!0;var r=n.c,o=s(t),a=k.Yu.PR;return a&&function(){function e(r){var n=t.context(),i=o(r,"executor-",n,null,!1);const s=Reflect.construct(a,[i],e);return t.context(s).getCtx=function(){return n},s}l._A.Promise=e,Object.defineProperty(e,"name",{value:"Promise"}),e.toString=function(){return a.toString()},Object.setPrototypeOf(e,a),["all","race"].forEach((function(r){const n=a[r];e[r]=function(e){let i=!1;[...e||[]].forEach((e=>{this.resolve(e).then(a("all"===r),a(!1))}));const o=n.apply(this,arguments);return o;function a(e){return function(){t.emit("propagate",[null,!i],o,!1,!1),i=i||!e}}}})),["resolve","reject"].forEach((function(r){const n=a[r];e[r]=function(e){const r=n.apply(this,arguments);return e!==r&&t.emit("propagate",[e,!0],r,!1,!1),r}})),e.prototype=a.prototype;const n=a.prototype.then;a.prototype.then=function(){var e=this,i=r(e);i.promise=e;for(var a=arguments.length,s=new Array(a),c=0;c e())),t};function m(e,t){i.inPlace(t,["onreadystatechange"],"fn-",E)}function b(){var 
e=this,t=r.context(e);e.readyState>3&&!t.resolved&&(t.resolved=!0,r.emit("xhr-resolved",[],e)),i.inPlace(e,f,"fn-",E)}if(function(e,t){for(var r in e)t[r]=e[r]}(o,p),p.prototype=o.prototype,i.inPlace(p.prototype,J,"-xhr-",E),r.on("send-xhr-start",(function(e,t){m(e,t),function(e){h.push(e),a&&(y?y.then(A):u?u(A):(w=-w,x.data=w))}(t)})),r.on("open-xhr-start",m),a){var y=c&&c.resolve();if(!u&&!c){var w=1,x=document.createTextNode(w);new a(A).observe(x,{characterData:!0})}}else t.on("fn-end",(function(e){e[0]&&e[0].type===d||A()}));function A(){for(var e=0;e {r.d(t,{t:()=>n});const n=r(3325).D.ajax},6660:(e,t,r)=>{r.d(t,{A:()=>i,t:()=>n});const n=r(3325).D.jserrors,i="nr@seenError"},3081:(e,t,r)=>{r.d(t,{gF:()=>o,mY:()=>i,t9:()=>n,vz:()=>s,xS:()=>a});const n=r(3325).D.metrics,i="sm",o="cm",a="storeSupportabilityMetrics",s="storeEventMetrics"},4649:(e,t,r)=>{r.d(t,{t:()=>n});const n=r(3325).D.pageAction},7633:(e,t,r)=>{r.d(t,{Dz:()=>i,OJ:()=>a,qw:()=>o,t9:()=>n});const n=r(3325).D.pageViewEvent,i="firstbyte",o="domcontent",a="windowload"},9251:(e,t,r)=>{r.d(t,{t:()=>n});const n=r(3325).D.pageViewTiming},3614:(e,t,r)=>{r.d(t,{BST_RESOURCE:()=>i,END:()=>s,FEATURE_NAME:()=>n,FN_END:()=>u,FN_START:()=>c,PUSH_STATE:()=>d,RESOURCE:()=>o,START:()=>a});const n=r(3325).D.sessionTrace,i="bstResource",o="resource",a="-start",s="-end",c="fn"+a,u="fn"+s,d="pushState"},7836:(e,t,r)=>{r.d(t,{BODY:()=>A,CB_END:()=>E,CB_START:()=>u,END:()=>x,FEATURE_NAME:()=>i,FETCH:()=>_,FETCH_BODY:()=>v,FETCH_DONE:()=>m,FETCH_START:()=>p,FN_END:()=>c,FN_START:()=>s,INTERACTION:()=>l,INTERACTION_API:()=>d,INTERACTION_EVENTS:()=>o,JSONP_END:()=>b,JSONP_NODE:()=>g,JS_TIME:()=>T,MAX_TIMER_BUDGET:()=>a,REMAINING:()=>f,SPA_NODE:()=>h,START:()=>w,originalSetTimeout:()=>y});var n=r(5763);const 
i=r(3325).D.spa,o=["click","submit","keypress","keydown","keyup","change"],a=999,s="fn-start",c="fn-end",u="cb-start",d="api-ixn-",f="remaining",l="interaction",h="spaNode",g="jsonpNode",p="fetch-start",m="fetch-done",v="fetch-body-",b="jsonp-end",y=n.Yu.ST,w="-start",x="-end",A="-body",E="cb"+x,T="jsTime",_="fetch"},5938:(e,t,r)=>{r.d(t,{W:()=>o});var n=r(5763),i=r(2177);class o{constructor(e,t,r){this.agentIdentifier=e,this.aggregator=t,this.ee=i.ee.get(e,(0,n.OP)(this.agentIdentifier).isolatedBacklog),this.featureName=r,this.blocked=!1}}},9144:(e,t,r)=>{r.d(t,{j:()=>m});var n=r(3325),i=r(5763),o=r(5546),a=r(2177),s=r(7894),c=r(8e3),u=r(3960),d=r(385),f=r(50),l=r(3081),h=r(8632);function g(){const e=(0,h.gG)();["setErrorHandler","finished","addToTrace","inlineHit","addRelease","addPageAction","setCurrentRouteName","setPageViewName","setCustomAttribute","interaction","noticeError","setUserId"].forEach((t=>{e[t]=function(){for(var r=arguments.length,n=new Array(r),i=0;i 1?r-1:0),i=1;i {e.exposed&&e.api[t]&&o.push(e.api[t](...n))})),o.length>1?o:o[0]}(t,...n)}}))}var p=r(2587);function m(e){let t=arguments.length>1&&void 0!==arguments[1]?arguments[1]:{},m=arguments.length>2?arguments[2]:void 0,v=arguments.length>3?arguments[3]:void 0,{init:b,info:y,loader_config:w,runtime:x={loaderType:m},exposed:A=!0}=t;const E=(0,h.gG)();y||(b=E.init,y=E.info,w=E.loader_config),(0,i.Dg)(e,b||{}),(0,i.GE)(e,w||{}),(0,i.sU)(e,x),y.jsAttributes??={},d.v6&&(y.jsAttributes.isWorker=!0),(0,i.CX)(e,y),g();const T=function(e,t){t||(0,c.R)(e,"api");const h={};var g=a.ee.get(e),p=g.get("tracer"),m="api-",v=m+"ixn-";function b(t,r,n,o){const a=(0,i.C5)(e);return null===r?delete a.jsAttributes[t]:(0,i.CX)(e,{...a,jsAttributes:{...a.jsAttributes,[t]:r}}),x(m,n,!0,o||null===r?"session":void 0)(t,r)}function 
y(){}["setErrorHandler","finished","addToTrace","inlineHit","addRelease"].forEach((e=>h[e]=x(m,e,!0,"api"))),h.addPageAction=x(m,"addPageAction",!0,n.D.pageAction),h.setCurrentRouteName=x(m,"routeName",!0,n.D.spa),h.setPageViewName=function(t,r){if("string"==typeof t)return"/"!==t.charAt(0)&&(t="/"+t),(0,i.OP)(e).customTransaction=(r||"http://custom.transaction")+t,x(m,"setPageViewName",!0)()},h.setCustomAttribute=function(e,t){let r=arguments.length>2&&void 0!==arguments[2]&&arguments[2];if("string"==typeof e){if(["string","number"].includes(typeof t)||null===t)return b(e,t,"setCustomAttribute",r);(0,f.Z)("Failed to execute setCustomAttribute.\nNon-null value must be a string or number type, but a type of was provided."))}else(0,f.Z)("Failed to execute setCustomAttribute.\nName must be a string type, but a type of was provided."))},h.setUserId=function(e){if("string"==typeof e||null===e)return b("enduser.id",e,"setUserId",!0);(0,f.Z)("Failed to execute setUserId.\nNon-null value must be a string type, but a type of was provided."))},h.interaction=function(){return(new y).get()};var w=y.prototype={createTracer:function(e,t){var r={},i=this,a="function"==typeof t;return(0,o.p)(v+"tracer",[(0,s.z)(),e,r],i,n.D.spa,g),function(){if(p.emit((a?"":"no-")+"fn-start",[(0,s.z)(),i,a],r),a)try{return t.apply(this,arguments)}catch(e){throw p.emit("fn-err",[arguments,this,"string"==typeof e?new Error(e):e],r),e}finally{p.emit("fn-end",[(0,s.z)()],r)}}}};function x(e,t,r,i){return function(){return(0,o.p)(l.xS,["API/"+t+"/called"],void 0,n.D.metrics,g),i&&(0,o.p)(e+t,[(0,s.z)(),...arguments],r?null:this,i,g),r?void 0:this}}function A(){r.e(439).then(r.bind(r,7438)).then((t=>{let{setAPI:r}=t;r(e),(0,c.L)(e,"api")})).catch((()=>(0,f.Z)("Downloading runtime APIs failed...")))}return["actionText","setName","setAttribute","save","ignore","onEnd","getContext","end","get"].forEach((e=>{w[e]=x(v,e,void 0,n.D.spa)})),h.noticeError=function(e,t){"string"==typeof e&&(e=new 
Error(e)),(0,o.p)(l.xS,["API/noticeError/called"],void 0,n.D.metrics,g),(0,o.p)("err",[e,(0,s.z)(),!1,t],void 0,n.D.jserrors,g)},d.il?(0,u.b)((()=>A()),!0):A(),h}(e,v);return(0,h.Qy)(e,T,"api"),(0,h.Qy)(e,A,"exposed"),(0,h.EZ)("activatedFeatures",p.T),T}},3325:(e,t,r)=>{r.d(t,{D:()=>n,p:()=>i});const n={ajax:"ajax",jserrors:"jserrors",metrics:"metrics",pageAction:"page_action",pageViewEvent:"page_view_event",pageViewTiming:"page_view_timing",sessionReplay:"session_replay",sessionTrace:"session_trace",spa:"spa"},i={[n.pageViewEvent]:1,[n.pageViewTiming]:2,[n.metrics]:3,[n.jserrors]:4,[n.ajax]:5,[n.sessionTrace]:6,[n.pageAction]:7,[n.spa]:8,[n.sessionReplay]:9}}},n={};function i(e){var t=n[e];if(void 0!==t)return t.exports;var o=n[e]={exports:{}};return r[e](o,o.exports,i),o.exports}i.m=r,i.d=(e,t)=>{for(var r in t)i.o(t,r)&&!i.o(e,r)&&Object.defineProperty(e,r,{enumerable:!0,get:t[r]})},i.f={},i.e=e=>Promise.all(Object.keys(i.f).reduce(((t,r)=>(i.f[r](e,t),t)),[])),i.u=e=>(({78:"page_action-aggregate",147:"metrics-aggregate",242:"session-manager",317:"jserrors-aggregate",348:"page_view_timing-aggregate",412:"lazy-feature-loader",439:"async-api",538:"recorder",590:"session_replay-aggregate",675:"compressor",733:"session_trace-aggregate",786:"page_view_event-aggregate",873:"spa-aggregate",898:"ajax-aggregate"}[e]||e)+"."+{78:"ac76d497",147:"3dc53903",148:"1a20d5fe",242:"2a64278a",317:"49e41428",348:"bd6de33a",412:"2f55ce66",439:"30bd804e",538:"1b18459f",590:"cf0efb30",675:"ae9f91a8",733:"83105561",786:"06482edd",860:"03a8b7a5",873:"e6b09d52",898:"998ef92b"}[e]+"-1.236.0.min.js"),i.o=(e,t)=>Object.prototype.hasOwnProperty.call(e,t),e={},t="NRBA:",i.l=(r,n,o,a)=>{if(e[r])e[r].push(n);else{var s,c;if(void 0!==o)for(var u=document.getElementsByTagName("script"),d=0;d {s.onerror=s.onload=null,clearTimeout(h);var i=e[r];if(delete e[r],s.parentNode&&s.parentNode.removeChild(s),i&&i.forEach((e=>e(n))),t)return t(n)},h=setTimeout(l.bind(null,void 
0,{type:"timeout",target:s}),12e4);s.onerror=l.bind(null,s.onerror),s.onload=l.bind(null,s.onload),c&&document.head.appendChild(s)}},i.r=e=>{"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},i.j=364,i.p="https://js-agent.newrelic.com/",(()=>{var e={364:0,953:0};i.f.j=(t,r)=>{var n=i.o(e,t)?e[t]:void 0;if(0!==n)if(n)r.push(n[2]);else{var o=new Promise(((r,i)=>n=e[t]=[r,i]));r.push(n[2]=o);var a=i.p+i.u(t),s=new Error;i.l(a,(r=>{if(i.o(e,t)&&(0!==(n=e[t])&&(e[t]=void 0),n)){var o=r&&("load"===r.type?"missing":r.type),a=r&&r.target&&r.target.src;s.message="Loading chunk "+t+" failed.\n("+o+": "+a+")",s.name="ChunkLoadError",s.type=o,s.request=a,n[1](s)}}),"chunk-"+t,t)}};var t=(t,r)=>{var n,o,[a,s,c]=r,u=0;if(a.some((t=>0!==e[t]))){for(n in s)i.o(s,n)&&(i.m[n]=s[n]);if(c)c(i)}for(t&&t(r);u {i.r(o);var e=i(3325),t=i(5763);const r=Object.values(e.D);function n(e){const n={};return r.forEach((r=>{n[r]=function(e,r){return!1!==(0,t.Mt)(r,"".concat(e,".enabled"))}(r,e)})),n}var a=i(9144);var s=i(5546),c=i(385),u=i(8e3),d=i(5938),f=i(3960),l=i(50);class h extends d.W{constructor(e,t,r){let n=!(arguments.length>3&&void 0!==arguments[3])||arguments[3];super(e,t,r),this.auto=n,this.abortHandler,this.featAggregate,this.onAggregateImported,n&&(0,u.R)(e,r)}importAggregator(){let e=arguments.length>0&&void 0!==arguments[0]?arguments[0]:{};if(this.featAggregate||!this.auto)return;const r=c.il&&!0===(0,t.Mt)(this.agentIdentifier,"privacy.cookies_enabled");let n;this.onAggregateImported=new Promise((e=>{n=e}));const o=async()=>{let t;try{if(r){const{setupAgentSession:e}=await Promise.all([i.e(860),i.e(242)]).then(i.bind(i,3228));t=e(this.agentIdentifier)}}catch(e){(0,l.Z)("A problem occurred when starting up session manager. 
This page will not start or extend any session.",e)}try{if(!this.shouldImportAgg(this.featureName,t))return void(0,u.L)(this.agentIdentifier,this.featureName);const{lazyFeatureLoader:r}=await i.e(412).then(i.bind(i,8582)),{Aggregate:o}=await r(this.featureName,"aggregate");this.featAggregate=new o(this.agentIdentifier,this.aggregator,e),n(!0)}catch(e){(0,l.Z)("Downloading and initializing ".concat(this.featureName," failed..."),e),this.abortHandler?.(),n(!1)}};c.il?(0,f.b)((()=>o()),!0):o()}shouldImportAgg(r,n){return r!==e.D.sessionReplay||!1!==(0,t.Mt)(this.agentIdentifier,"session_trace.enabled")&&(!!n?.isNew||!!n?.state.sessionReplay)}}var g=i(7633),p=i(7894);class m extends h{static featureName=g.t9;constructor(r,n){let i=!(arguments.length>2&&void 0!==arguments[2])||arguments[2];if(super(r,n,g.t9,i),("undefined"==typeof PerformanceNavigationTiming||c.Tt)&&"undefined"!=typeof PerformanceTiming){const n=(0,t.OP)(r);n[g.Dz]=Math.max(Date.now()-n.offset,0),(0,f.K)((()=>n[g.qw]=Math.max((0,p.z)()-n[g.Dz],0))),(0,f.b)((()=>{const t=(0,p.z)();n[g.OJ]=Math.max(t-n[g.Dz],0),(0,s.p)("timing",["load",t],void 0,e.D.pageViewTiming,this.ee)}))}this.importAggregator()}}var v=i(1117),b=i(1284);class y extends v.w{constructor(e){super(e),this.aggregatedData={}}store(e,t,r,n,i){var o=this.getBucket(e,t,r,i);return o.metrics=function(e,t){t||(t={count:0});return t.count+=1,(0,b.D)(e,(function(e,r){t[e]=w(r,t[e])})),t}(n,o.metrics),o}merge(e,t,r,n,i){var o=this.getBucket(e,t,n,i);if(o.metrics){var a=o.metrics;a.count+=r.count,(0,b.D)(r,(function(e,t){if("count"!==e){var n=a[e],i=r[e];i&&!i.c?a[e]=w(i.t,n):a[e]=function(e,t){if(!t)return e;t.c||(t=x(t.t));return t.min=Math.min(e.min,t.min),t.max=Math.max(e.max,t.max),t.t+=e.t,t.sos+=e.sos,t.c+=e.c,t}(i,a[e])}}))}else o.metrics=r}storeMetric(e,t,r,n){var i=this.getBucket(e,t,r);return i.stats=w(n,i.stats),i}getBucket(e,t,r,n){this.aggregatedData[e]||(this.aggregatedData[e]={});var i=this.aggregatedData[e][t];return 
i||(i=this.aggregatedData[e][t]={params:r||{}},n&&(i.custom=n)),i}get(e,t){return t?this.aggregatedData[e]&&this.aggregatedData[e][t]:this.aggregatedData[e]}take(e){for(var t={},r="",n=!1,i=0;i t.max&&(t.max=e),e 2&&void 0!==arguments[2])||arguments[2];super(e,r,j.t,n),c.il&&((0,t.OP)(e).initHidden=Boolean("hidden"===document.visibilityState),(0,N.N)((()=>(0,s.p)("docHidden",[(0,p.z)()],void 0,j.t,this.ee)),!0),(0,O.bP)("pagehide",(()=>(0,s.p)("winPagehide",[(0,p.z)()],void 0,j.t,this.ee))),this.importAggregator())}}var P=i(3081);class C extends h{static featureName=P.t9;constructor(e,t){let r=!(arguments.length>2&&void 0!==arguments[2])||arguments[2];super(e,t,P.t9,r),this.importAggregator()}}var R,I=i(2210),k=i(1214),H=i(2177),L={};try{R=localStorage.getItem("__nr_flags").split(","),console&&"function"==typeof console.log&&(L.console=!0,-1!==R.indexOf("dev")&&(L.dev=!0),-1!==R.indexOf("nr_dev")&&(L.nrDev=!0))}catch(e){}function z(e){try{L.console&&z(e)}catch(e){}}L.nrDev&&H.ee.on("internal-error",(function(e){z(e.stack)})),L.dev&&H.ee.on("fn-err",(function(e,t,r){z(r.stack)})),L.dev&&(z("NR AGENT IN DEVELOPMENT MODE"),z("flags: "+(0,b.D)(L,(function(e,t){return e})).join(", ")));var M=i(6660);class B extends h{static featureName=M.t;constructor(r,n){let i=!(arguments.length>2&&void 0!==arguments[2])||arguments[2];super(r,n,M.t,i),this.skipNext=0;try{this.removeOnAbort=new AbortController}catch(e){}const o=this;o.ee.on("fn-start",(function(e,t,r){o.abortHandler&&(o.skipNext+=1)})),o.ee.on("fn-err",(function(t,r,n){o.abortHandler&&!n[M.A]&&((0,I.X)(n,M.A,(function(){return!0})),this.thrown=!0,(0,s.p)("err",[n,(0,p.z)()],void 0,e.D.jserrors,o.ee))})),o.ee.on("fn-end",(function(){o.abortHandler&&!this.thrown&&o.skipNext>0&&(o.skipNext-=1)})),o.ee.on("internal-error",(function(t){(0,s.p)("ierr",[t,(0,p.z)(),!0],void 0,e.D.jserrors,o.ee)})),this.origOnerror=c._A.onerror,c._A.onerror=this.onerrorHandler.bind(this),c._A.addEventListener("unhandledrejection",(t=>{const 
r=function(e){let t="Unhandled Promise Rejection: ";if(e instanceof Error)try{return e.message=t+e.message,e}catch(t){return e}if(void 0===e)return new Error(t);try{return new Error(t+(0,D.P)(e))}catch(e){return new Error(t)}}(t.reason);(0,s.p)("err",[r,(0,p.z)(),!1,{unhandledPromiseRejection:1}],void 0,e.D.jserrors,this.ee)}),(0,O.m$)(!1,this.removeOnAbort?.signal)),(0,k.gy)(this.ee),(0,k.BV)(this.ee),(0,k.em)(this.ee),(0,t.OP)(r).xhrWrappable&&(0,k.Kf)(this.ee),this.abortHandler=this.#e,this.importAggregator()}#e(){this.removeOnAbort?.abort(),this.abortHandler=void 0}onerrorHandler(t,r,n,i,o){"function"==typeof this.origOnerror&&this.origOnerror(...arguments);try{this.skipNext?this.skipNext-=1:(0,s.p)("err",[o||new F(t,r,n),(0,p.z)()],void 0,e.D.jserrors,this.ee)}catch(t){try{(0,s.p)("ierr",[t,(0,p.z)(),!0],void 0,e.D.jserrors,this.ee)}catch(e){}}return!1}}function F(e,t,r){this.message=e||"Uncaught error with no additional information",this.sourceURL=t,this.line=r}let U=1;const q="nr@id";function G(e){const t=typeof e;return!e||"object"!==t&&"function"!==t?-1:e===c._A?0:(0,I.X)(e,q,(function(){return U++}))}function V(e){if("string"==typeof e&&e.length)return e.length;if("object"==typeof e){if("undefined"!=typeof ArrayBuffer&&e instanceof ArrayBuffer&&e.byteLength)return e.byteLength;if("undefined"!=typeof Blob&&e instanceof Blob&&e.size)return e.size;if(!("undefined"!=typeof FormData&&e instanceof FormData))try{return(0,D.P)(e).length}catch(e){return}}}var X=i(7243);class W{constructor(e){this.agentIdentifier=e,this.generateTracePayload=this.generateTracePayload.bind(this),this.shouldGenerateTrace=this.shouldGenerateTrace.bind(this)}generateTracePayload(e){if(!this.shouldGenerateTrace(e))return null;var r=(0,t.DL)(this.agentIdentifier);if(!r)return null;var n=(r.accountID||"").toString()||null,i=(r.agentID||"").toString()||null,o=(r.trustKey||"").toString()||null;if(!n||!i)return null;var 
a=(0,_.M)(),s=(0,_.Ht)(),c=Date.now(),u={spanId:a,traceId:s,timestamp:c};return(e.sameOrigin||this.isAllowedOrigin(e)&&this.useTraceContextHeadersForCors())&&(u.traceContextParentHeader=this.generateTraceContextParentHeader(a,s),u.traceContextStateHeader=this.generateTraceContextStateHeader(a,c,n,i,o)),(e.sameOrigin&&!this.excludeNewrelicHeader()||!e.sameOrigin&&this.isAllowedOrigin(e)&&this.useNewrelicHeaderForCors())&&(u.newrelicHeader=this.generateTraceHeader(a,s,c,n,i,o)),u}generateTraceContextParentHeader(e,t){return"00-"+t+"-"+e+"-01"}generateTraceContextStateHeader(e,t,r,n,i){return i+"@nr=0-1-"+r+"-"+n+"-"+e+"----"+t}generateTraceHeader(e,t,r,n,i,o){if(!("function"==typeof c._A?.btoa))return null;var a={v:[0,1],d:{ty:"Browser",ac:n,ap:i,id:e,tr:t,ti:r}};return o&&n!==o&&(a.d.tk=o),btoa((0,D.P)(a))}shouldGenerateTrace(e){return this.isDtEnabled()&&this.isAllowedOrigin(e)}isAllowedOrigin(e){var r=!1,n={};if((0,t.Mt)(this.agentIdentifier,"distributed_tracing")&&(n=(0,t.P_)(this.agentIdentifier).distributed_tracing),e.sameOrigin)r=!0;else if(n.allowed_origins instanceof Array)for(var i=0;i 2&&void 0!==arguments[2])||arguments[2];super(r,n,Z.t,i),(0,t.OP)(r).xhrWrappable&&(this.dt=new W(r),this.handler=(e,t,r,n)=>(0,s.p)(e,t,r,n,this.ee),(0,k.u5)(this.ee),(0,k.Kf)(this.ee),function(r,n,i,o){function a(e){var t=this;t.totalCbs=0,t.called=0,t.cbTime=0,t.end=E,t.ended=!1,t.xhrGuids={},t.lastSize=null,t.loadCaptureCalled=!1,t.params=this.params||{},t.metrics=this.metrics||{},e.addEventListener("load",(function(r){_(t,e)}),(0,O.m$)(!1)),c.IF||e.addEventListener("progress",(function(e){t.lastSize=e.loaded}),(0,O.m$)(!1))}function s(e){this.params={method:e[0]},T(this,e[1]),this.metrics={}}function u(e,n){var i=(0,t.DL)(r);i.xpid&&this.sameOrigin&&n.setRequestHeader("X-NewRelic-ID",i.xpid);var a=o.generateTracePayload(this.parsedOrigin);if(a){var 
s=!1;a.newrelicHeader&&(n.setRequestHeader("newrelic",a.newrelicHeader),s=!0),a.traceContextParentHeader&&(n.setRequestHeader("traceparent",a.traceContextParentHeader),a.traceContextStateHeader&&n.setRequestHeader("tracestate",a.traceContextStateHeader),s=!0),s&&(this.dt=a)}}function d(e,t){var r=this.metrics,i=e[0],o=this;if(r&&i){var a=V(i);a&&(r.txSize=a)}this.startTime=(0,p.z)(),this.listener=function(e){try{"abort"!==e.type||o.loadCaptureCalled||(o.params.aborted=!0),("load"!==e.type||o.called===o.totalCbs&&(o.onloadCalled||"function"!=typeof t.onload)&&"function"==typeof o.end)&&o.end(t)}catch(e){try{n.emit("internal-error",[e])}catch(e){}}};for(var s=0;s 1?e[1]=i:e.push(i)}else e[0]&&e[0].headers&&s(e[0].headers,n)&&(this.dt=n);function s(e,t){var r=!1;return t.newrelicHeader&&(e.set("newrelic",t.newrelicHeader),r=!0),t.traceContextParentHeader&&(e.set("traceparent",t.traceContextParentHeader),t.traceContextStateHeader&&e.set("tracestate",t.traceContextStateHeader),r=!0),r}}function x(e,t){this.params={},this.metrics={},this.startTime=(0,p.z)(),this.dt=t,e.length>=1&&(this.target=e[0]),e.length>=2&&(this.opts=e[1]);var r,n=this.opts||{},i=this.target;"string"==typeof i?r=i:"object"==typeof i&&i instanceof Y?r=i.url:c._A?.URL&&"object"==typeof i&&i instanceof URL&&(r=i.href),T(this,r);var o=(""+(i&&i instanceof Y&&i.method||n.method||"GET")).toUpperCase();this.params.method=o,this.txSize=V(n.body)||0}function A(t,r){var n;this.endTime=(0,p.z)(),this.params||(this.params={}),this.params.status=r?r.status:0,"string"==typeof this.rxSize&&this.rxSize.length>0&&(n=+this.rxSize);var o={txSize:this.txSize,rxSize:n,duration:(0,p.z)()-this.startTime};i("xhr",[this.params,o,this.startTime,this.endTime,"fetch"],this,e.D.ajax)}function E(t){var r=this.params,n=this.metrics;if(!this.ended){this.ended=!0;for(var o=0;o 2&&void 0!==arguments[2])||arguments[2];super(e,t,we.t,r),this.importAggregator()}}new class{constructor(e){let t=arguments.length>1&&void 
0!==arguments[1]?arguments[1]:(0,_.ky)(16);c._A?(this.agentIdentifier=t,this.sharedAggregator=new y({agentIdentifier:this.agentIdentifier}),this.features={},this.desiredFeatures=new Set(e.features||[]),this.desiredFeatures.add(m),Object.assign(this,(0,a.j)(this.agentIdentifier,e,e.loaderType||"agent")),this.start()):(0,l.Z)("Failed to initial the agent. Could not determine the runtime environment.")}get config(){return{info:(0,t.C5)(this.agentIdentifier),init:(0,t.P_)(this.agentIdentifier),loader_config:(0,t.DL)(this.agentIdentifier),runtime:(0,t.OP)(this.agentIdentifier)}}start(){const t="features";try{const r=n(this.agentIdentifier),i=[...this.desiredFeatures];i.sort(((t,r)=>e.p[t.featureName]-e.p[r.featureName])),i.forEach((t=>{if(r[t.featureName]||t.featureName===e.D.pageViewEvent){const n=function(t){switch(t){case e.D.ajax:return[e.D.jserrors];case e.D.sessionTrace:return[e.D.ajax,e.D.pageViewEvent];case e.D.sessionReplay:return[e.D.sessionTrace];case e.D.pageViewTiming:return[e.D.pageViewEvent];default:return[]}}(t.featureName);n.every((e=>r[e]))||(0,l.Z)("".concat(t.featureName," is enabled but one or more dependent features has been disabled (").concat((0,D.P)(n),"). 
This may cause unintended consequences or missing data...")),this.features[t.featureName]=new t(this.agentIdentifier,this.sharedAggregator)}})),(0,T.Qy)(this.agentIdentifier,this.features,t)}catch(e){(0,l.Z)("Failed to initialize all enabled instrument classes (agent aborted) -",e);for(const e in this.features)this.features[e].abortHandler?.();const r=(0,T.fP)();return delete r.initializedAgents[this.agentIdentifier]?.api,delete r.initializedAgents[this.agentIdentifier]?.[t],delete this.sharedAggregator,r.ee?.abort(),delete r.ee?.get(this.agentIdentifier),!1}}}({features:[J,m,S,class extends h{static featureName=oe;constructor(t,r){if(super(t,r,oe,!(arguments.length>2&&void 0!==arguments[2])||arguments[2]),!c.il)return;const n=this.ee;let i;(0,k.QU)(n),this.eventsEE=(0,k.em)(n),this.eventsEE.on(se,(function(e,t){this.bstStart=(0,p.z)()})),this.eventsEE.on(ae,(function(t,r){(0,s.p)("bst",[t[0],r,this.bstStart,(0,p.z)()],void 0,e.D.sessionTrace,n)})),n.on(ce+ne,(function(e){this.time=(0,p.z)(),this.startPath=location.pathname+location.hash})),n.on(ce+ie,(function(t){(0,s.p)("bstHist",[location.pathname+location.hash,this.startPath,this.time],void 0,e.D.sessionTrace,n)}));try{i=new PerformanceObserver((t=>{const r=t.getEntries();(0,s.p)(te,[r],void 0,e.D.sessionTrace,n)})),i.observe({type:re,buffered:!0})}catch(e){}this.importAggregator({resourceObserver:i})}},C,xe,B,class extends h{static featureName=de;constructor(e,r){if(super(e,r,de,!(arguments.length>2&&void 0!==arguments[2])||arguments[2]),!c.il)return;if(!(0,t.OP)(e).xhrWrappable)return;try{this.removeOnAbort=new AbortController}catch(e){}let n,i=0;const o=this.ee.get("tracer"),a=(0,k._L)(this.ee),s=(0,k.Lg)(this.ee),u=(0,k.BV)(this.ee),d=(0,k.Kf)(this.ee),f=this.ee.get("events"),l=(0,k.u5)(this.ee),h=(0,k.QU)(this.ee),g=(0,k.Gm)(this.ee);function m(e,t){h.emit("newURL",[""+window.location,t])}function v(){i++,n=window.location.hash,this[ve]=(0,p.z)()}function b(){i--,window.location.hash!==n&&m(0,!0);var 
e=(0,p.z)();this[pe]=~~this[pe]+e-this[ve],this[ye]=e}function y(e,t){e.on(t,(function(){this[t]=(0,p.z)()}))}this.ee.on(ve,v),s.on(be,v),a.on(be,v),this.ee.on(ye,b),s.on(ge,b),a.on(ge,b),this.ee.buffer([ve,ye,"xhr-resolved"],this.featureName),f.buffer([ve],this.featureName),u.buffer(["setTimeout"+le,"clearTimeout"+fe,ve],this.featureName),d.buffer([ve,"new-xhr","send-xhr"+fe],this.featureName),l.buffer([me+fe,me+"-done",me+he+fe,me+he+le],this.featureName),h.buffer(["newURL"],this.featureName),g.buffer([ve],this.featureName),s.buffer(["propagate",be,ge,"executor-err","resolve"+fe],this.featureName),o.buffer([ve,"no-"+ve],this.featureName),a.buffer(["new-jsonp","cb-start","jsonp-error","jsonp-end"],this.featureName),y(l,me+fe),y(l,me+"-done"),y(a,"new-jsonp"),y(a,"jsonp-end"),y(a,"cb-start"),h.on("pushState-end",m),h.on("replaceState-end",m),window.addEventListener("hashchange",m,(0,O.m$)(!0,this.removeOnAbort?.signal)),window.addEventListener("load",m,(0,O.m$)(!0,this.removeOnAbort?.signal)),window.addEventListener("popstate",(function(){m(0,i>1)}),(0,O.m$)(!0,this.removeOnAbort?.signal)),this.abortHandler=this.#e,this.importAggregator()}#e(){this.removeOnAbort?.abort(),this.abortHandler=void 0}}],loaderType:"spa"})})(),window.NRBA=o})(); window.jQuery || document.write(' ') CKEDITOR_BASEPATH='https://f1000research.com/js/vendor/ckeditor/' window.reactTheme = 'research'; window.MathJax = { CommonHTML: { linebreaks: { automatic: true } }, 'HTML-CSS': { linebreaks: { automatic: true } }, SVG: { linebreaks: { automatic: true } }, AuthorInit: function() { MathJax.Hub.Register.MessageHook('End Process', function () { let timeout = false; // holder for timeout id const delay = 250; // delay after event is "complete" to run callback const reflowMath = function() { const dispFormulas = document.querySelectorAll('.disp-formula.panel'); if (!dispFormulas) { return; } for (const dispFormula of dispFormulas) { const child = 
dispFormula.querySelector('.MathJax_Preview').nextSibling.firstChild; const isMultiline = MathJax.Hub.getAllJax(dispFormula)[0].root.isMultiline; if (dispFormula.offsetWidth < child.offsetWidth || isMultiline) { MathJax.Hub.Queue(['Rerender', MathJax.Hub, dispFormula]); } } }; window.addEventListener('resize', function() { clearTimeout(timeout); // clear the timeout timeout = setTimeout(reflowMath, delay); // start timing for event "completion" }); }); }, }; if (window.location.hash == '#_=_'){ window.location = window.location.href.split('#')[0] } !function(f,b,e,v,n,t,s){if(f.fbq)return;n=f.fbq=function() {n.callMethod? n.callMethod.apply(n,arguments):n.queue.push(arguments)} ;if(!f._fbq)f._fbq=n; n.push=n;n.loaded=!0;n.version='2.0';n.queue=[];t=b.createElement(e);t.async=!0; t.src=v;s=b.getElementsByTagName(e)[0];s.parentNode.insertBefore(t,s)}(window, document,'script','https://connect.facebook.net/en_US/fbevents.js'); fbq('init', '1641728616063202'); fbq('track', "PixelInitialized", {}); (function(h,o,t,j,a,r){ h.hj=h.hj||function(){(h.hj.q=h.hj.q||[]).push(arguments)}; h._hjSettings={hjid:2318163,hjsv:6}; a=o.getElementsByTagName('head')[0]; r=o.createElement('script');r.async=1; r.src=t+h._hjSettings.hjid+j+h._hjSettings.hjsv; a.appendChild(r); })(window,document,'https://static.hotjar.com/c/hotjar-','.js?sv='); search file_upload Submit your research search menu close search Browse Gateways & Collections How to Publish Submit your Research My Submissions Article Guidelines Article Guidelines (New Versions) Open Data, Software and Code Guidelines Open Data and Accessible Source Materials Guidelines (HSS) Open Data, Software and Code Guidelines (PSE) Prepublication Checks Production Process Posters and Slides Guidelines Document Guidelines Article Processing Charges Peer Review Finding Article Reviewers About How it Works For Reviewers Our Advisors Policies Glossary FAQs For Developers Newsroom Contact My Research Submissions Content and Tracking Alerts My 
Details Sign In file_upload Submit your research { "@context": "https://schema.org", "@type": "ScholarlyArticle", "mainEntityOfPage": { "@type": "WebPage", "@id": "https://f1000research.com/articles/13-1488" }, "headline": "Addressing common inferential mistakes when failing to reject the null-hypothesis", "datePublished": "2024-12-05T13:56:58", "dateModified": "2025-04-01T10:40:25", "author": [ { "@type": "Person", "name": "Amand Schmidt" } ], "publisher": { "@type": "Organization", "name": "F1000Research", "logo": { "@type": "ImageObject", "url": "https://f1000research.com/img/AMP/F1000Research_image.png", "height": 480, "width": 60 } }, "image": { "@type": "ImageObject", "url": "https://f1000research.com/img/AMP/F1000Research_image.png", "height": 1200, "width": 150 }, "description": "Failure to reject a null-hypothesis may lead to erroneous conclusions regarding the absence of an association or inadequate statistical power. Because an estimate (and its variance) can never be exactly zero, traditional statistical tests cannot conclusively demonstrate the absence of an association. Instead, estimates of accuracy should be used to identify settings in which an association and its variability are sufficiently small to be clinically acceptable, directly providing information on safety and efficacy. Post-hoc power calculations should be avoided, as they offer no additional information beyond statistical tests and p-values. Furthermore, post-hoc power calculations can be misleading because of an inability to distinguish between results based on insufficient sample size and results that reflect clinically irrelevant differences. Most multiple testing procedures unrealistically assume that all positive results are false positives. However, in applied settings, results typically represent a mix of true and false positives. This implies that multiplicity corrections do not effectively differentiate between true and false positives. 
Instead, considering the distributions of p-values and the proportion of significant results can help to identify bodies of evidence unlikely to be driven by false-positive results. In conclusion, rather than attempting to categorize results as true or false, medical research should embrace established statistical methods that focus on estimation accuracy, replication, and consistency." } { "@context": "http://schema.org", "@type": "BreadcrumbList", "itemListElement": [ { "@type": "ListItem", "position": "1", "item": { "@id": "https://f1000research.com/", "name": "Home" } }, { "@type": "ListItem", "position": "2", "item": { "@id": "https://f1000research.com/browse/articles", "name": "Browse" } }, { "@type": "ListItem", "position": "3", "item": { "@id": "https://f1000research.com/articles/13-1488/v3", "name": "Addressing common inferential mistakes when failing to reject the..." } } ] } Home Browse Addressing common inferential mistakes when failing to reject the... ALL Metrics - Views Downloads Get PDF Get XML Cite How to cite this article Schmidt A. Addressing common inferential mistakes when failing to reject the null-hypothesis [version 3; peer review: 2 approved] . F1000Research 2025, 13 :1488 ( https://doi.org/10.12688/f1000research.158434.3 ) NOTE: If applicable, it is important to ensure the information in square brackets after the title is included in all citations of this article. Close Copy Citation Details Export Export Citation Sciwheel EndNote Ref. 
Manager Bibtex ProCite Sente EXPORT Select a format first Track Share ▬ ✚ Opinion Article Revised Addressing common inferential mistakes when failing to reject the null-hypothesis [version 3; peer review: 2 approved] Amand Schmidt https://orcid.org/0000-0003-1327-0424 1-4 Amand Schmidt https://orcid.org/0000-0003-1327-0424 1-4 PUBLISHED 01 Apr 2025 Author details Author details 1 Department of Cardiology, University of Amsterdam, Amsterdam Zuidoost, 22660, Netherlands Antilles 2 University College London Faculty of Population Health Sciences, London, England, UK 3 Department of Cardiology, Utrecht University, Heidelberglaan, 3584 CX, Netherlands Antilles 4 UCL British Heart Foundation Research Accelerator, London, Chenies Mews, WC1E6HX, UK Amand Schmidt Roles: Conceptualization, Formal Analysis, Methodology, Visualization, Writing – Original Draft Preparation, Writing – Review & Editing OPEN PEER REVIEW DETAILS REVIEWER STATUS This article is included in the Research on Research, Policy & Culture gateway. This article is included in the University College London collection. Abstract Failure to reject a null-hypothesis may lead to erroneous conclusions regarding the absence of an association or inadequate statistical power. Because an estimate (and its variance) can never be exactly zero, traditional statistical tests cannot conclusively demonstrate the absence of an association. Instead, estimates of accuracy should be used to identify settings in which an association and its variability are sufficiently small to be clinically acceptable, directly providing information on safety and efficacy. Post-hoc power calculations should be avoided, as they offer no additional information beyond statistical tests and p-values. Furthermore, post-hoc power calculations can be misleading because of an inability to distinguish between results based on insufficient sample size and results that reflect clinically irrelevant differences. 
Most multiple testing procedures unrealistically assume that all positive results are false positives. However, in applied settings, results typically represent a mix of true and false positives. This implies that multiplicity corrections do not effectively differentiate between true and false positives. Instead, considering the distributions of p-values and the proportion of significant results can help to identify bodies of evidence unlikely to be driven by false-positive results. In conclusion, rather than attempting to categorize results as true or false, medical research should embrace established statistical methods that focus on estimation accuracy, replication, and consistency. READ ALL READ LESS Keywords Statistical inference, null-hypothesis, equivalence testing, statistical power, accuracy. Corresponding Author(s) Amand Schmidt ( [email protected] ) Close Corresponding author: Amand Schmidt Competing interests: AFS has received funding from New Amsterdam Pharma unrelated projects. Grant information: AFS was supported by BHF grant PG/22/10989, the UCL BHF Research Accelerator AA/18/6/34223, MR/V033867/1, and the National Institute for Health and Care Research University College London Hospitals Biomedical Research Centre. The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript. Copyright: © 2025 Schmidt A. This is an open access article distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. How to cite: Schmidt A. Addressing common inferential mistakes when failing to reject the null-hypothesis [version 3; peer review: 2 approved] . 
F1000Research 2025, 13 :1488 ( https://doi.org/10.12688/f1000research.158434.3 ) First published: 05 Dec 2024, 13 :1488 ( https://doi.org/10.12688/f1000research.158434.1 ) Latest published: 01 Apr 2025, 13 :1488 ( https://doi.org/10.12688/f1000research.158434.3 ) Revised Amendments from Version 2 The supplementary note has been updated reflecting comments by reviewer 2 - please see the figshare link https://doi.org/10.5522/04/27854043. The supplementary note has been updated reflecting comments by reviewer 2 - please see the figshare link https://doi.org/10.5522/04/27854043. See the author's detailed response to the review by Ying Cui See the author's detailed response to the review by J. Alexander Heimel READ REVIEWER RESPONSES Background Statistical tests and p-values are used to estimate the compatibility of the available data with a specific null-hypothesis. While statistical tests and p-values are strongly embraced by applied researchers, statistical science has raised important concerns about their interpretability, which may lead to incorrect statements about the presence, absence, or importance of an association. 1 This has led some researchers to suggest the replacement of p-values with alternative metrics, such as the S-value (i.e., S for surprise). 2 , 3 The current manuscript attempts to provide an accessible overview of these voiced concerns, particularly focusing on the appropriate interpretation of “non-significant” results when a p-value or statistical test does not provide sufficient reason to reject the posed null-hypothesis. In line with previous guidance in this area, this manuscript suggests moving away from categorizing results as True or False , instead suggesting research focusses on obtaining sufficiently accurate results on potential benefits and harms. 
As an illustrative example, we will consider results from the VOYAGER PAD 4 trial, which randomized patients with peripheral artery disease (PAD) to twice-daily rivaroxaban (2.5 mg) or a placebo, evaluating differences in the incidence of ischemic cardiovascular disease. The reported hazard ratio (HR) for rivaroxaban was 0.85 (95%CI 0.76; 0.96), tested against a null-hypothesis HR of 1.00, which resulted in a p-value of 0.006. Here the p-value indicates the proportion of subsequent trials (using the same design, intervention, and types of patients) which would result in an HR of 0.85 or more extreme, assuming the true population HR is 1. By convention, a p-value smaller than 0.05 is considered “significant”, however more stringent or liberal choices may also be applied. Aside from whether the CI rejects an HR of 1, the size of the 95%CI provides an indication of the variability of the HR, which in this case supports a smaller (HR of 0.96) or larger (HR of 0.76) benefit. Irrespective of the effect magnitude, there is reason to question the null-hypothesis HR of 1, where the data provides most support for an HR of 0.85. In the aforementioned example, there is substantial evidence against the null-hypothesis, and hence, inference is fairly straightforward and uncontested. Conversely, interpreting results of “non-significant” analyses, where the null-hypothesis cannot be rejected, may lead to erroneous conclusions such as claiming that a lack of “statistical significance” supports the null-hypothesis. For example, based on an HR of 0.86 (95%CI 0.40;1.87; p-value=0.71), the VOYAGER-PAD authors concluded that for the subgroup of patients with endovascular PAD, there was “no increase in intracranial or fatal bleeding”. 5 While it is clear that the null-hypothesis of no difference cannot be rejected, with the CI including an HR of 1.87, it is also clear that there is little evidence to support the absence of a harmful effect. 
Instead of claiming an absence of a risk-increasing effect, the presented results suggest that additional research is needed before drawing conclusions on bleeding risks. In the following, we will discuss three common mistakes when interpreting results from statistical tests that fail to reject the null-hypothesis: 1) claiming that the null-hypothesis is true, 2) claiming that the study was underpowered, and 3) using multiple testing corrections to support claims about true or false associations. Why a non-significant result does not rule out a potentially meaningful association There are two types of hypotheses: a strict null-hypothesis, where a supposed population parameter $\mu$ (e.g., the mean, mean difference, or hazard ratio) takes on a single value (e.g., $\mu_0 = 0$), and a composite null-hypothesis, where $\mu$ follows a range of values (e.g., $0.9 < \mu_0 < 1.1$). As with the illustrative examples, most often a strict null-hypothesis is (implicitly) evaluated. Given that a strict null-hypothesis postulates that $\mu$ is equal to a single value, it can be readily demonstrated that a statistical test cannot support the strict null-hypothesis. This understanding stems directly from the fact that for an arbitrary function $f$ its integral with limits $a = b$ is zero, i.e. $\int_a^b f(x)\,dx = 0$ when $a = b$ (see Extended data for a formal proof). More intuitively, to prove that $\mu$ is equal to any single value, one must obtain an estimate with zero bias and infinite precision. Essentially, this requires divine knowledge about the value of $\mu$. While a strict null-hypothesis cannot be supported by the data, no matter how much data is collected, there may often be a need to rule out certain effect(s). For example, the risk of bleeding when treating patients with rivaroxaban, or identifying interventions with limited efficacy. The solution is to simply use composite null-hypotheses. 
Returning to the illustrative example, one might consider a risk-increasing effect of 1.25 or less as sufficiently modest to not offset the observed benefit on ischemic cardiovascular disease. In this case, the null-hypothesis would be $\mu_0 \geq 1.25$, which would be rejected when both the estimated HR and the upper bound of the CI are smaller than 1.25 (or similarly using a one-sided test). For example, estimate 2 in Figure 1 reflects the VOYAGER PAD estimate of the rivaroxaban bleeding effect, and it is clear that the confidence interval includes HRs above 1.25; hence, the null-hypothesis should not be rejected and there is no reason to conclude that rivaroxaban has a relatively small effect on bleeding risk. However, estimate 4, an HR of 1.15 (95%CI 1.06; 1.24), does exclude an HR larger than 1.25. Thus, while estimate 4 supports a risk-increasing effect, one can nevertheless conclude that the effect size is sufficiently small to suggest that rivaroxaban is relatively safe. Figure 1. A forest plot illustrating non-inferiority and equivalence testing. N.b. Points represent hazard ratios (HR) with horizontal lines indicating 95% confidence intervals (CI). The orange vertical lines indicate margins of equivalence at 0.80 and 1.25, with the vertical line at 1.00 indicating no difference. Estimates 1-2 are based on the VOYAGER PAD 5 rivaroxaban results for ischemic cardiovascular disease and bleeding risk, respectively. Estimates 3-5 are purely hypothetical and included as illustrations. This procedure is referred to as non-inferiority testing, where 1.25 is the bound of equivalence. Depending on what is deemed clinically non-inferior, such a bound can be substantially larger or smaller. For example, in the EBBINGHAUS trial of the PCSK9-inhibitor evolocumab the equivalence bound was set equal to 20% of the standard deviation of the cognitive function score measured in the placebo group. 
6 The key characteristic of non-inferiority testing for safety is that both the point estimate (here, the HR) and the upper bound of the CI should be smaller than the supposed bound of equivalence. Given that one is not testing against a strict null-hypothesis of no difference, whether the confidence interval (or statistical test) refutes a neutral value (e.g. an HR of 1) is immaterial. The consideration of two bounds, one on each side of a neutral HR of 1, naturally leads to equivalence testing. For example, in Figure 1 , both estimates 4 and 5 are deemed equivalent, despite both rejecting an HR of 1.0. Defining bounds of equivalence or non-inferiority is challenging and a possible source of contention. Typically, such bounds are defined by combining statistical and clinical considerations. For example, evidence from previous studies can be meta-analysed to obtain a pooled effect estimate and confidence interval, where the confidence interval limits can be multiplied by a constant representing the amount of effect that one would like to preserve or rule out (for safety). 7 Ideas about non-inferiority and equivalence can be further generalized by considering the entire range of CIs providing information on precision and the HRs supported by collected data 2 ( Figure 2 ). For example, the rivaroxaban trial data support a wide range of bleeding risk HRs, and an HR of 1.25 would only be excluded using a 66%CI ( Figure 2 , left panel). This can be contrasted with the small range of HRs supported by the rivaroxaban trial data for a protective effect on ischemic cardiovascular disease, indicating that the trial data are highly supportive of a protective effect ( Figure 2 , right panel). Figure 2. A compatibility graph comparing the confidence interval coverage against a range of hazard ratios for the rivaroxaban estimates on ischemic cardiovascular disease (left) and bleeding outcomes (right). 
Vertical lines indicate a hazard ratio (HR) of 1.00, and a possible margin of equivalence for an HR of 1.25. The shaded area indicates the HRs supported for a given coverage probability, indicated on the y-axis. Why post-hoc power provides the same information as p-values and null-hypothesis tests Failure to reject a null-hypothesis naturally raises concerns about whether the study was sufficiently powered to detect a difference, often tempting researchers to conduct “post-hoc” or “observed” power calculations utilizing the observed point estimates (e.g. HRs) and variance estimates. Briefly, power reflects the probability of rejecting the null-hypothesis if it is false. This is the direct opposite of statistical tests and related quantities, such as p-values, which reflect the rejection probability assuming the null-hypothesis is true. As such, p-values and observed power are equivalent, and no additional information is obtained by considering both ( Figure 3 , see Extended data for a formal proof). To see this, suppose we observe a p-value of 0.05, which is equal to the conventional level of significance indicated by an alpha (i.e., the type 1 error rate) of 0.05. In this case, observed power can be obtained by calculating the probability of rejecting the null-hypothesis assuming the point estimate and its standard error are the true population values (which implies that the null-hypothesis is false), which in this case would be exactly 50% ( Figure 3 ). As such, while p-values evaluate the difference between the estimated values and the null-hypothesis assuming the latter is true, power evaluates the same estimated values assuming the null-hypothesis is false. Hence, when p-values have been calculated, calculating observed power offers no additional information about the absence or presence of a difference. Figure 3. The relationship between p-values and observed power. Alpha refers to the type 1 error rate of a test. 
The dashed lines on the x-axis indicate the locations where the p-value is exactly equal to the alpha. The dashed lines on the y-axis indicate an observed power of 50%. More concerning is that post-hoc power calculations may lead to erroneous conclusions regarding the lack of statistical power or the absence of an effect. 8 , 9 For example, the following two HR estimates both have a p-value of 0.71 and a power of 7%: HR 0.86 (95% CI 0.39; 1.91), HR 1.00 (95%CI 0.99; 1.01). Looking at the post-hoc power, one might conclude that both analyses were underpowered; however, the HR of 0.86 is non-significant due to considerable variability, whereas the latter HR of 1.00 simply reflects a clinically irrelevant association. Hence, a more relevant alternative to post-hoc power calculations is to evaluate the extent to which the CI includes clinically relevant effect estimates, which is in line with the aforementioned equivalence/non-inferiority approach. For example, the VOYAGER PAD HR estimate of 0.86 (95%CI 0.40;1.87) for bleeding risk in people with endovascular PAD clearly shows that the collected data supports a wide range of effect estimates, including potentially harmful associations. However, because the confidence interval only partially overlaps with the proposed (hypothetical) upper bound of acceptable harm of 1.25, testing against this bound results in a p-value of 0.17, which is considerably smaller than testing against the complete absence of an effect: p-value 0.70. By comparison, the observed power estimate for these results is 7%, which implies that if the true HR was 0.86 one would have rejected the strict null-hypothesis in 7 out of 100 repeated experiments. As such, observed power provides limited information relative to the presented alternative approaches, particularly the confidence interval based approach which allows for an informative discussion of benefits and harms in terms of effect magnitude(s). 
Researchers may alternatively wish to calculate the power to reject a clinically meaningful difference other than the point estimate. Such calculations can meaningfully inform the design and viability of future studies; although sample size estimates may be more readily interpretable. However, when such power calculations are used to make statements about the presence or absence of an effect, or even lack of sample size, the described approach utilising confidence intervals and equivalence/non-inferiority margins provides more relevant information on study accuracy. The futility of using power to make claims of the absence of an effect is further illustrated by noting that in the absence of an effect (i.e. when the p-value is 1) observed power is equal to the employed alpha threshold (e.g. 0.05). Hence, rather counterintuitively, low power may actually argue for the absence of an effect. Because power can only be calculated assuming the null-hypothesis is false, this metric cannot be used to make claims in favour of the null-hypothesis. Furthermore, as discussed in the preceding section, statistical tests cannot be used to support the strict null-hypothesis, as such this also holds for derived metrics such as p-value and power. While power remains essential when designing a future study, it should not be used to interpret results of a completed study. At this stage more relevant metrics such as confidence intervals are available which do not condition on the presence or absence of an effect, and provide information on accuracy as well as on effect magnitude. Why multiple testing correction does not differentiate between true and false results Considerations of power and type 1 error rate are extremely relevant when designing a study, ensuring that a sufficiently accurate effect estimate may be realistically obtained given the available resources. 
However, both power and type 1 error rate are conditional probabilities assuming that the null-hypothesis is either true or false. As such these concepts are less relevant after the data have been collected, which would generally not consist of null-hypotheses which are either all true or all false, but instead will include an unknown mixture of both. It is important to realize that the type 1 error rate itself does not reflect the proportion of false positive results, but merely reflects the expected proportion of false positive results should all null-hypotheses be true. When considering the results of two or more null-hypothesis tests, in an attempt to decrease the number of false positive results, there is an expectation to perform multiple testing corrections. For example, the Bonferroni method is a popular multiplicity correction evaluating p-values against an alpha (e.g., 0.05) divided by the number of conducted tests. It is well known that post-hoc multiple testing correction (e.g., corrections not accounted for during the design stage by increasing the collected sample size) will decrease power ( Figure 3 ). An often overlooked point is that, depending on the unknown balance between false positives and true positives in a set of test results, applying multiplicity correction can sometimes increase the false discovery rate (i.e., the number of false positives divided by the total number of positive tests) instead of reducing it. For example, Figure 4A presents a naive expectation of multiple testing corrections, where the false discovery rate decreases from 1/3 to 0. However, there is no reason why the scenario depicted in Figure 4B may not occur; here, the false discovery rate increases from 1/3 to 1. Figure 4. Illustrating the impact of multiple testing correction on the false discovery rate. In each panel, four tests are true positives (orangered) and six are false positives (black). 
Horizontal lines are drawn at 0.05 and 0.005, the latter reflecting a Bonferroni correction for the 10 applied null-hypothesis tests. As shown in Figure 5 , in which the distribution of p-values is generated in the absence and presence of an association, small p-values may occur in both settings. As such, while a small p-value (or equivalently, an extreme test statistic) is unlikely to occur when there is no association, observing a single small p-value is insufficient to differentiate between true and false positive results. Concepts such as indirect or direct replication and internal consistency are more relevant to differentiate between true and false positives. For example, in the case of rivaroxaban, associations with multiple types of ischemic cardiovascular events (e.g., myocardial infarction, ischemic stroke, and acute limb ischemia) will be more convincing than an association with any one outcome. Figure 5. The distribution of p-values when the null-hypothesis is true or false. N.b. The p-values were derived by arbitrarily sampling 1,000 test statistics from a normal distribution and leveraging its cumulative distribution function to calculate the area on the left and right side of the sampled test statistic. Specifically, the employed normal distribution had a standard deviation of 1 and mean of either 0 or 2, when the null-hypothesis was true and false, respectively. Please note that the normal distribution is only used as an exemplar, and alternative distributions with a known cumulative distribution function (e.g. chi-square, beta, or gamma) could have been used instead. While individual p-values and null-hypothesis tests cannot differentiate between false and true positive results, a set of p-values ( Figure 5 ) can be compared against a uniform distribution to determine the likelihood that the entire set is driven by false positive results. This approach is independent of the specific statistical test used to derive individual p-values. 
Moreover, the method can be generalised to account for dependencies among p-values, such as dependencies arising from the inclusion of both composite and individual outcomes (e.g., evaluating both any stroke and ischaemic stroke). Similarly, one can determine the proportion of p-values that are smaller than a predefined alpha; for example, in Figure 5 , the proportion of p-values smaller than 0.05 is of course 0.05 for the top panel and 0.49 for the lower panel. Returning to our illustrative example, despite showing a potentially protective association with myocardial infarction, ischemic stroke, major amputation, and venous thromboembolism, the null-hypothesis could only be rejected for the association between rivaroxaban and acute limb ischemia: HR 0.67 (95% CI 0.55; 0.82). 4 Utilizing a non-parametric Kolmogorov-Smirnov test to compare the set of p-values for all the aforementioned outcomes against a uniform distribution nevertheless resulted in a p-value of 0.02, suggesting that the protective effect of rivaroxaban is shared across multiple cardiovascular outcomes. Discussion In the current manuscript, we have addressed why statistical tests cannot be used to support the strict null-hypothesis. Instead, concerns regarding the safety or lack of efficacy should be evaluated using equivalence testing. This can be readily implemented in any study by combining confidence intervals with bounds of clinical insignificance. Such an approach provides direct information on whether a non-significant test is due to an association and its variability being sufficiently small or simply reflects a lack of accuracy. Furthermore, contrary to expectation and depending on the unknown proportion of true positive results, multiple testing corrections may increase the false discovery rate. 
Finally, because power and type 1 error rate make extreme assumptions about whether all results are true or false positives, these concepts have limited relevance after the data have been collected and analysed. The current manuscript provides guidance on how standard null-hypothesis testing can be used to provide clinically meaningful insights, and attempts to move beyond the current erroneous modus vivendi, categorizing associations as true and false. Contrary to recent calls to completely abandon significance testing, 2 this contribution calls for a more considerate and bespoke application of the currently available and ubiquitously accepted methods. Specifically, researchers should routinely indicate bounds between which an effect is sufficiently small to be considered clinically irrelevant. Related to this, the idea that any intervention should (or even can) be without harmful side effects needs to be dismissed and replaced with a notion of benefit versus harm, where clinically supported bounds of irrelevance can help to directly inform. Second, while notions of power and type 1 errors are essential at the study design phase, because these deal in hypothetical scenarios where all results are either true or false, such metrics have limited relevance when interpreting results. Power and type 1 errors can be framed in terms of probabilities because the analysis has not yet been conducted. Once the experiment has been completed, these hypothetical probabilities are immaterial, and one is simply confronted with an unknown proportion of true-positive results. At this stage, concepts of power and type 1 error must be replaced by indicators of precision, such as confidence intervals. Instead of using confidence intervals as a proxy for null-hypothesis testing (i.e., whether the null-hypothesis value is excluded), inference should focus on determining to what extent there is sufficient precision to exclude meaningful differences. 
Finally, while decreasing the significance threshold (e.g. from 0.05 to 0.005) decreases the type 1 error rate, this decreases power as well, and hence may decrease the number of true associations discovered. Depending on the area of research, overlooked true positive results may be more harmful than false positive results. For example, protein drug targets identified in early drug development are often subjected to a substantial number of follow-up analyses, which filter out false positive results. Such follow-up studies, however, rarely expand the number of candidates, hence suggesting a more inclusive approach might be more considerate. In settings more proximal to clinical implementation and less discovery oriented, such as phase 3 clinical trials, stringent multiple testing correction is clearly called for. Notwithstanding, it is important to realize that not every study needs to be designed as a clinical trial. In conclusion, failure to reject a strict null-hypothesis does not support the absence of a clinically meaningful association. Instead, researchers should routinely apply composite null-hypothesis tests evaluated against meaningful bounds of insignificance. Genuine consideration of estimation accuracy, as provided through confidence intervals, precludes the need for questionable post-hoc power calculations. Finally, because power and type 1 errors unrealistically assume that all results are true or false, these concepts have limited value once data collection and analysis have been completed. Author contributions AFS designed the illustrations and wrote the manuscript. Ethics and consent Ethics and consent were not required. Data availability No data associated with this article. Extended data Addressing common inferential mistakes when failing to reject the null-hypothesis https://doi.org/10.5522/04/27854043 . 10 Data are available under the terms of the Creative Commons Attribution 4.0 International license (CC-BY 4.0). References 1. 
Wasserstein RL, Lazar NA: The ASA Statement on p-Values: Context, Process, and Purpose. Am. Stat. 2016; 70 : 129–133. Publisher Full Text 2. Rafi Z, Greenland S: Semantic and cognitive tools to aid statistical science: replace confidence and significance by compatibility and surprise. BMC Med. Res. Methodol. 2020; 20 : 244. PubMed Abstract | Publisher Full Text | Free Full Text 3. Cole SR, Edwards JK, Greenland S: Surprise!. Am. J. Epidemiol. 2021; 190 : 191–193. PubMed Abstract | Publisher Full Text | Free Full Text 4. Bonaca MP, et al. : Rivaroxaban in Peripheral Artery Disease after Revascularization. N. Engl. J. Med. 2020; 382 : 1994–2004. Publisher Full Text 5. Rymer J, et al. : Rivaroxaban Plus Aspirin Versus Aspirin Alone After Endovascular Revascularization for Symptomatic PAD: Insights From VOYAGER PAD. Circulation. 2023; 148 : 1919–1928. PubMed Abstract | Publisher Full Text 6. Calabro P, Gragnano F, Pirro M: Cognitive function in a randomized trial of evolocumab. N. Engl. J. Med. 2017; 377 : 1996–1997. PubMed Abstract | Publisher Full Text 7. Althunian TA, de Boer A, Groenwold RHH, et al. : Defining the noninferiority margin and analysing noninferiority: An overview. Br. J. Clin. Pharmacol. 2017 Aug; 83 (8): 1636–1642. PubMed Abstract | Publisher Full Text | Free Full Text 8. Hoenig JM, Heisey DM: The Abuse of Power. Am. Stat. 2001; 55 : 19–24. Publisher Full Text 9. Thomas L: Retrospective Power Analysis. Conserv. Biol. 1997; 11 : 276–280. 10. Addressing common inferential mistakes when failing to reject the null-hypothesis. 
DOI: 10.5522/04/27854043 Comments on this article Comments (0) Version 3 VERSION 3 PUBLISHED 05 Dec 2024 ADD YOUR COMMENT Comment Author details Author details 1 Department of Cardiology, University of Amsterdam, Amsterdam Zuidoost, 22660, Netherlands Antilles 2 University College London Faculty of Population Health Sciences, London, England, UK 3 Department of Cardiology, Utrecht University, Heidelberglaan, 3584 CX, Netherlands Antilles 4 UCL British Heart Foundation Research Accelerator, London, Chenies Mews, WC1E6HX, UK Amand Schmidt Roles: Conceptualization, Formal Analysis, Methodology, Visualization, Writing – Original Draft Preparation, Writing – Review & Editing Competing interests AFS has received funding from New Amsterdam Pharma unrelated projects. Grant information AFS was supported by BHF grant PG/22/10989, the UCL BHF Research Accelerator AA/18/6/34223, MR/V033867/1, and the National Institute for Health and Care Research University College London Hospitals Biomedical Research Centre. The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript. Article Versions (3) version 3 Revised Published: 01 Apr 2025, 13:1488 https://doi.org/10.12688/f1000research.158434.3 version 2 Revised Published: 25 Feb 2025, 13:1488 https://doi.org/10.12688/f1000research.158434.2 version 1 Published: 05 Dec 2024, 13:1488 https://doi.org/10.12688/f1000research.158434.1 Copyright © 2025 Schmidt A. This is an open access article distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. Download Export To Sciwheel Bibtex EndNote ProCite Ref. Manager (RIS) Sente metrics Views Downloads F1000Research - - PubMed Central info_outline Data from PMC are received and updated monthly. 
- - Citations open_in_new 0 open_in_new 0 open_in_new SEE MORE DETAILS CITE how to cite this article Schmidt A. Addressing common inferential mistakes when failing to reject the null-hypothesis [version 3; peer review: 2 approved] . F1000Research 2025, 13 :1488 ( https://doi.org/10.12688/f1000research.158434.3 ) NOTE: If applicable, it is important to ensure the information in square brackets after the title is included in all citations of this article. COPY CITATION DETAILS track receive updates on this article Track an article to receive email alerts on any updates to this article. TRACK THIS ARTICLE Share Open Peer Review Current Reviewer Status: ? Key to Reviewer Statuses VIEW HIDE Approved The paper is scientifically sound in its current form and only minor, if any, improvements are suggested Approved with reservations A number of small changes, sometimes more significant revisions are required to address specific details and improve the papers academic merit. Not approved Fundamental flaws in the paper seriously undermine the findings and conclusions Version 2 VERSION 2 PUBLISHED 25 Feb 2025 Revised Views 0 Cite How to cite this report: Heimel JA. Reviewer Report For: Addressing common inferential mistakes when failing to reject the null-hypothesis [version 3; peer review: 2 approved] . F1000Research 2025, 13 :1488 ( https://doi.org/10.5256/f1000research.177945.r368387 ) The direct URL for this report is: https://f1000research.com/articles/13-1488/v2#referee-response-368387 NOTE: it is important to ensure the information in square brackets after the title is included in this citation. Close Copy Citation Details Reviewer Report 21 Mar 2025 J. Alexander Heimel , Netherlands Institute for Neuroscience, Amsterdam, The Netherlands Approved VIEWS 0 https://doi.org/10.5256/f1000research.177945.r368387 The author has answered my comments to my satisfaction, except for one small point. 
In his reply to my question, the author wrote that the relationship between p-value and observed power is independent of sample size n. ... Continue reading READ ALL The author has answered my comments to my satisfaction, except for one small point. In his reply to my question, the author wrote that the relationship between p-value and observed power is independent of sample size n. I do not believe this to be true. In the new supplementary material, the author sketches this relationship, but ignores the fact the t-distribution (denoted as g in the supplementary material) is dependent on the degrees of freedom (d.o.f. = n-1) and therefore on the sample size. The value t_c will thus depend on n and the observed power, given by the author as g(t_0-t_c)+g(-t_0-t_c) will also depend on n, through g and t_c. In fact, I believe the correct formula for observed power should be: Observed_power(t) = g_{d.o.f, t}(t_c) + g_{d.o.f, -t}(t_c) With g_{d.o.f.,t}(t_c) the cumulative non-central t-distribution with d.o.f. degrees of freedom shift t evaluated at t_c. For large n, the approximations used by the author converge, and the dependence on the sample size disappears. This dependence on sample size, however, does not fundamentally change or invalidate the author’s argument that a post-hoc power calculation using the observed effect is not useful. While studying the relationship between observed power and p-value for my reply, I came across the Wikipedia page on power analysis that also makes the author’s point that post-hoc power analysis is fundamentally flawed, with references to Hoenig (The American Statistician, 2001), cited more than 2000 times, and Thomas (Conservation Biology, 1997) already pointing out the same flaw. The author’s case is thus not new, but may still be very relevant and a good reminder. It certainly was for me. 
Minor details about the new supplementary note: I think that when writing “the p-value for a one-sided tests would simply be 1-g(|t_0|)”, already the assumption of f being symmetrical about zero is made. The statement is not true for a general f and t_0<0. “In this case the two-sided p-value is 2x(1-g(2)) \approx 0.46”. I believe this should be “\approx 0.046” (and is only true for large sample size). References 1. Hoenig J, Heisey D: The Abuse of Power. The American Statistician . 2001; 55 (1): 19-24 Publisher Full Text 2. Thomas, Len. "Retrospective power analysis." Conservation Biology 11.1 (1997): 276-280. Competing Interests: No competing interests were disclosed. Reviewer Expertise: Neuroscience I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard. Close READ LESS CITE CITE HOW TO CITE THIS REPORT Heimel JA. Reviewer Report For: Addressing common inferential mistakes when failing to reject the null-hypothesis [version 3; peer review: 2 approved] . F1000Research 2025, 13 :1488 ( https://doi.org/10.5256/f1000research.177945.r368387 ) The direct URL for this report is: https://f1000research.com/articles/13-1488/v2#referee-response-368387 NOTE: it is important to ensure the information in square brackets after the title is included in all citations of this article. COPY CITATION DETAILS Report a concern Author Response 01 Apr 2025 Amand Schmidt , UCL British Heart Foundation Research Accelerator, London, WC1E6HX, UK 01 Apr 2025 Author Response Reviewer 2 Comment: The author has answered my comments to my satisfaction, except for one small point. In his reply to my question, the author wrote that the relationship ... Continue reading Reviewer 2 Comment: The author has answered my comments to my satisfaction, except for one small point. 
In his reply to my question, the author wrote that the relationship between p-value and observed power is independent of sample size n. I do not believe this to be true. In the new supplementary material, the author sketches this relationship, but ignores the fact the t-distribution (denoted as g in the supplementary material) is dependent on the degrees of freedom (d.o.f. = n-1) and therefore on the sample size. The value t_c will thus depend on n and the observed power, given by the author as g(t_0-t_c)+g(-t_0-t_c) will also depend on n, through g and t_c. In fact, I believe the correct formula for observed power should be: Observed_power(t) = g_{d.o.f, t}(t_c) + g_{d.o.f, -t}(t_c) With g_{d.o.f.,t}(t_c) the cumulative non-central t-distribution with d.o.f. degrees of freedom shift t evaluated at t_c. For large n, the approximations used by the author converge, and the dependence on the sample size disappears. This dependence on sample size, however, does not fundamentally change or invalidate the author’s argument that a post-hoc power calculation using the observed effect is not useful. Response: I am pleased to note that the reviewer agrees with the main conclusions of the manuscript. The reviewer appears to be referring to the sample size–dependent convergence of the t -distribution to the standard normal distribution. While this is, of course, entirely correct and uncontested, it is not directly relevant to the point under discussion. The relationship between p-values and statistical power is independent of the choice of distribution, and does not rely on any asymptotic approximation. Although the illustrative examples in the manuscript use the standard normal distribution -which does not depend on sample size or degrees of freedom - the relationship between p-value and power holds equally when using distributions that do, such as the t- or F-distributions. 
This is because both the p-value and power are calculated using the same distribution, with the same degrees of freedom. For example, if one calculates a p-value using a t-distribution with 7 degrees of freedom, the corresponding power would also be calculated using that same t-distribution with 7 degrees of freedom. Thus, while the distribution itself may depend on sample size, the relationship between p-value and power remains exact and independent of sample size. The following was added to the supplementary to help clarify this. “ As in the previous section, these expressions do not depend on the amount of data collected and are therefore exact. If one uses a statistical distribution with degrees of freedom (which often depend on sample size), such as the t or F distributions, the relationship between the p-value and power remains exact, simply because both quantities are calculated using the same distribution with the same degrees of freedom. ” Comment: While studying the relationship between observed power and p-value for my reply, I came across the Wikipedia page on power analysis that also makes the author’s point that post-hoc power analysis is fundamentally flawed, with references to Hoenig (The American Statistician, 2001), cited more than 2000 times, and Thomas (Conservation Biology, 1997) already pointing out the same flaw. The author’s case is thus not new, but may still be very relevant and a good reminder. It certainly was for me. Response: Thank you, these references have been included in the updated manuscript. Comment: Minor details about the new supplementary note: I think that when writing “the p-value for a one-sided tests would simply be 1-g(|t_0|)”, already the assumption of f being symmetrical about zero is made. The statement is not true for a general f and t_0<0. Response: This section briefly describes p-values for one-sided and two-sided tests. 
The section has been clarified to explain that this statement is not an assumption but merely reflects the known property that some distributions are symmetrical about zero whereas others are not. The expanded section now provides examples of alternative distributions which are not symmetrical about zero. The reason that this was not originally included is that commonly known non-symmetrical distributions such as the Chi-square, the F distribution, or even the Gamma distribution are not defined for negative values, making them less applicable for two-sided tests. “ Next, say we observe test-statistic $t_o$; in that case the p-value for a one-sided test would simply be $1 - g(|t_o|)$, and assuming f is symmetrical about zero (e.g. the normal distribution or t-distribution) the p-value for a two-sided test would simply be $2\times(1 - g(|t_o|))$. For distributions that are not symmetric about zero, both sides of the distribution should be considered separately. It is important to note, however, that many commonly used asymmetric distributions—such as the F, Chi-squared, and Gamma distributions—are defined only for positive values. This restricts their direct applicability for two-sided hypothesis testing. ” Comment: “In this case the two-sided p-value is 2x(1-g(2)) \approx 0.46”. I believe this should be “\approx 0.046” (and is only true for large sample size). Response: Thanks for spotting this typo, this should be 0.046 indeed. However, this is not dependent on any sample size assumption. The approximate symbol (≈) is used purely to reflect rounding to three decimal places, not to imply asymptotic behaviour or large-sample properties. It’s possible the reviewer is conflating this with the well-known approximation of the t-distribution by the standard normal distribution as sample size increases. While that is certainly valid in a different context, it is not relevant here. 
I am not making any claims about inference or distributional convergence; rather, I am simply referring to the numerical value of the cumulative distribution function. To clarify: if g denotes the cumulative distribution function of the standard normal distribution, then $2\times(1 - g(2)) \approx 0.046$, regardless of sample size. Similarly, if one were to use the t-distribution with 7 degrees of freedom, the corresponding value would be approximately 0.086 - the choice of 7 here being arbitrary and illustrative. This is a purely mathematical statement about evaluating a distribution function at a specific point, and not a comment on sampling or estimation. Reviewer 2 Comment: The author has answered my comments to my satisfaction, except for one small point. In his reply to my question, the author wrote that the relationship between p-value and observed power is independent of sample size n. I do not believe this to be true. In the new supplementary material, the author sketches this relationship, but ignores the fact the t-distribution (denoted as g in the supplementary material) is dependent on the degrees of freedom (d.o.f. = n-1) and therefore on the sample size. The value t_c will thus depend on n and the observed power, given by the author as g(t_0-t_c)+g(-t_0-t_c) will also depend on n, through g and t_c. In fact, I believe the correct formula for observed power should be: Observed_power(t) = g_{d.o.f, t}(t_c) + g_{d.o.f, -t}(t_c) With g_{d.o.f.,t}(t_c) the cumulative non-central t-distribution with d.o.f. degrees of freedom shift t evaluated at t_c. For large n, the approximations used by the author converge, and the dependence on the sample size disappears. This dependence on sample size, however, does not fundamentally change or invalidate the author’s argument that a post-hoc power calculation using the observed effect is not useful. Response: I am pleased to note that the reviewer agrees with the main conclusions of the manuscript. 
The reviewer appears to be referring to the sample size–dependent convergence of the t-distribution to the standard normal distribution. While this is, of course, entirely correct and uncontested, it is not directly relevant to the point under discussion. The relationship between p-values and statistical power is independent of the choice of distribution, and does not rely on any asymptotic approximation. Although the illustrative examples in the manuscript use the standard normal distribution - which does not depend on sample size or degrees of freedom - the relationship between p-value and power holds equally when using distributions that do, such as the t- or F-distributions. This is because both the p-value and power are calculated using the same distribution, with the same degrees of freedom. For example, if one calculates a p-value using a t-distribution with 7 degrees of freedom, the corresponding power would also be calculated using that same t-distribution with 7 degrees of freedom. Thus, while the distribution itself may depend on sample size, the relationship between p-value and power remains exact and independent of sample size. The following was added to the supplementary to help clarify this. “ As in the previous section, these expressions do not depend on the amount of data collected and are therefore exact. If one uses a statistical distribution with degrees of freedom (which often depend on sample size), such as the t or F distributions, the relationship between the p-value and power remains exact, simply because both quantities are calculated using the same distribution with the same degrees of freedom. 
” Comment: While studying the relationship between observed power and p-value for my reply, I came across the Wikipedia page on power analysis that also makes the author’s point that post-hoc power analysis is fundamentally flawed, with references to Hoenig (The American Statistician, 2001), cited more than 2000 times, and Thomas (Conservation Biology, 1997) already pointing out the same flaw. The author’s case is thus not new, but may still be very relevant and a good reminder. It certainly was for me. Response: Thank you, these references have been included in the updated manuscript. Comment: Minor details about the new supplementary note: I think that when writing “the p-value for a one-sided tests would simply be 1-g(|t_0|)”, already the assumption of f being symmetrical about zero is made. The statement is not true for a general f and t_0<0. Response: This section briefly describes p-values for one-sided and two-sided tests. The section has been clarified to explain that this statement is not an assumption but merely reflects the known property that some distributions are symmetrical about zero whereas others are not. The expanded section now provides examples of alternative distributions which are not symmetrical about zero. The reason that this was not originally included is that commonly known non-symmetrical distributions such as the Chi-square, the F distribution, or even the Gamma distribution are not defined for negative values, making them less applicable for two-sided tests. “ Next, say we observe test-statistic $t_o$; in that case the p-value for a one-sided test would simply be $1 - g(|t_o|)$, and assuming f is symmetrical about zero (e.g. the normal distribution or t-distribution) the p-value for a two-sided test would simply be $2\times(1 - g(|t_o|))$. For distributions that are not symmetric about zero, both sides of the distribution should be considered separately. 
It is important to note, however, that many commonly used asymmetric distributions—such as the F, Chi-squared, and Gamma distributions—are defined only for positive values. This restricts their direct applicability for two-sided hypothesis testing. ” Comment: “In this case the two-sided p-value is 2x(1-g(2)) \approx 0.46”. I believe this should be “\approx 0.046” (and is only true for large sample size). Response: Thanks for spotting this typo, this should be 0.046 indeed. However, this is not dependent on any sample size assumption. The approximate symbol (≈) is used purely to reflect rounding to three decimal places, not to imply asymptotic behaviour or large-sample properties. It’s possible the reviewer is conflating this with the well-known approximation of the t-distribution by the standard normal distribution as sample size increases. While that is certainly valid in a different context, it is not relevant here. I am not making any claims about inference or distributional convergence; rather, I am simply referring to the numerical value of the cumulative distribution function. To clarify: if g denotes the cumulative distribution function of the standard normal distribution, then $2\times(1 - g(2)) \approx 0.046$, regardless of sample size. Similarly, if one were to use the t-distribution with 7 degrees of freedom, the corresponding value would be approximately 0.086 - the choice of 7 here being arbitrary and illustrative. This is a purely mathematical statement about evaluating a distribution function at a specific point, and not a comment on sampling or estimation. Competing Interests: No competing interests were disclosed. Close Report a concern Respond or Comment COMMENTS ON THIS REPORT Author Response 01 Apr 2025 Amand Schmidt , UCL British Heart Foundation Research Accelerator, London, WC1E6HX, UK 01 Apr 2025 Author Response Reviewer 2 Comment: The author has answered my comments to my satisfaction, except for one small point. 
In his reply to my question, the author wrote that the relationship ... Continue reading Reviewer 2 Comment: The author has answered my comments to my satisfaction, except for one small point. In his reply to my question, the author wrote that the relationship between p-value and observed power is independent of sample size n. I do not believe this to be true. In the new supplementary material, the author sketches this relationship, but ignores the fact the t-distribution (denoted as g in the supplementary material) is dependent on the degrees of freedom (d.o.f. = n-1) and therefore on the sample size. The value t_c will thus depend on n and the observed power, given by the author as g(t_0-t_c)+g(-t_0-t_c) will also depend on n, through g and t_c. In fact, I believe the correct formula for observed power should be: Observed_power(t) = g_{d.o.f, t}(t_c) + g_{d.o.f, -t}(t_c) With g_{d.o.f.,t}(t_c) the cumulative non-central t-distribution with d.o.f. degrees of freedom shift t evaluated at t_c. For large n, the approximations used by the author converge, and the dependence on the sample size disappears. This dependence on sample size, however, does not fundamentally change or invalidate the author’s argument that a post-hoc power calculation using the observed effect is not useful. Response: I am pleased to note that the reviewer agrees with the main conclusions of the manuscript. The reviewer appears to be referring to the sample size–dependent convergence of the t -distribution to the standard normal distribution. While this is, of course, entirely correct and uncontested, it is not directly relevant to the point under discussion. The relationship between p-values and statistical power is independent of the choice of distribution, and does not rely on any asymptotic approximation. 
Although the illustrative examples in the manuscript use the standard normal distribution - which does not depend on sample size or degrees of freedom - the relationship between p-value and power holds equally when using distributions that do, such as the t- or F-distributions. This is because both the p-value and power are calculated using the same distribution, with the same degrees of freedom. For example, if one calculates a p-value using a t-distribution with 7 degrees of freedom, the corresponding power would also be calculated using that same t-distribution with 7 degrees of freedom. Thus, while the distribution itself may depend on sample size, the relationship between p-value and power remains exact and independent of sample size. The following was added to the supplementary material to help clarify this. “ As in the previous section, these expressions do not depend on the amount of data collected and are therefore exact. If one uses a statistical distribution with degrees of freedom (which often depend on sample size), such as the t or F distributions, the relationship between the p-value and power remains exact, simply because both quantities are calculated using the same distribution with the same degrees of freedom.” Comment: While studying the relationship between observed power and p-value for my reply, I came across the Wikipedia page on power analysis that also makes the author’s point that post-hoc power analysis is fundamentally flawed, with references to Hoenig (The American Statistician, 2001), cited more than 2000 times, and Thomas (Conservation Biology, 1997) already pointing out the same flaw. The author’s case is thus not new, but may still be very relevant and a good reminder. It certainly was for me. Response: Thank you, these references have been included in the updated manuscript. 
Comment: Minor details about the new supplementary note: I think that when writing “the p-value for a one-sided tests would simply be 1-g(|t_0|)”, already the assumption of f being symmetrical about zero is made. The statement is not true for a general f and t_0<0. Response: This section briefly describes p-values for one-sided and two-sided tests. The section has been clarified to explain that this statement is not an assumption but merely reflects a known quantity about some distributions being symmetrical about zero where others are not. The expanded section now provides examples of alternative distributions which are not symmetrical about zero. The reason that this was not originally included is that commonly known non-symmetrical distributions such as the Chi-square, the F distribution, or even the Gamma distribution are not defined for negative values, making them less applicable for two-sided tests. “ Next, say we observe test-statistic $t_o$, in that case the p-value for a one-sided test would simply be $1-g(|t_o|)$, and assuming $f$ is symmetrical about zero (e.g. the normal distribution or t-distribution) the p-value for a two-sided test would simply be $2\times(1-g(|t_o|))$. For distributions that are not symmetric about zero, both sides of the distribution should be considered separately. It is important to note, however, that many commonly used asymmetric distributions—such as the F, Chi-squared, and Gamma distributions—are defined only for positive values. This restricts their direct applicability for two-sided hypothesis testing.” Comment: “In this case the two-sided p-value is 2x(1-g(2)) \approx 0.46”. I believe this should be “\approx 0.046” (and is only true for large sample size). Response: Thanks for spotting this typo, this should be 0.046 indeed. However, this is not dependent on any sample size assumption. The approximate symbol (≈) is used purely to reflect rounding to three decimal places, not to imply asymptotic behaviour or large-sample properties. 
It’s possible the reviewer is conflating this with the well-known approximation of the t-distribution by the standard normal distribution as sample size increases. While that is certainly valid in a different context, it is not relevant here. I am not making any claims about inference or distributional convergence; rather, I am simply referring to the numerical value of the cumulative distribution function. To clarify: if $g$ denotes the cumulative distribution function of the standard normal distribution, then $2\times(1-g(2)) \approx 0.046$, regardless of sample size. Similarly, if one were to use the t-distribution with 7 degrees of freedom, the corresponding value would be approximately 0.086 - the choice of 7 here being arbitrary and illustrative. This is a purely mathematical statement about evaluating a distribution function at a specific point, and not a comment on sampling or estimation. Reviewer 2 Comment: The author has answered my comments to my satisfaction, except for one small point. In his reply to my question, the author wrote that the relationship between p-value and observed power is independent of sample size n. I do not believe this to be true. In the new supplementary material, the author sketches this relationship, but ignores the fact the t-distribution (denoted as g in the supplementary material) is dependent on the degrees of freedom (d.o.f. = n-1) and therefore on the sample size. The value t_c will thus depend on n and the observed power, given by the author as g(t_0-t_c)+g(-t_0-t_c) will also depend on n, through g and t_c. In fact, I believe the correct formula for observed power should be: Observed_power(t) = g_{d.o.f, t}(t_c) + g_{d.o.f, -t}(t_c) With g_{d.o.f.,t}(t_c) the cumulative non-central t-distribution with d.o.f. degrees of freedom shift t evaluated at t_c. For large n, the approximations used by the author converge, and the dependence on the sample size disappears. 
This dependence on sample size, however, does not fundamentally change or invalidate the author’s argument that a post-hoc power calculation using the observed effect is not useful. Response: I am pleased to note that the reviewer agrees with the main conclusions of the manuscript. The reviewer appears to be referring to the sample size–dependent convergence of the t-distribution to the standard normal distribution. While this is, of course, entirely correct and uncontested, it is not directly relevant to the point under discussion. The relationship between p-values and statistical power is independent of the choice of distribution, and does not rely on any asymptotic approximation. Although the illustrative examples in the manuscript use the standard normal distribution - which does not depend on sample size or degrees of freedom - the relationship between p-value and power holds equally when using distributions that do, such as the t- or F-distributions. This is because both the p-value and power are calculated using the same distribution, with the same degrees of freedom. For example, if one calculates a p-value using a t-distribution with 7 degrees of freedom, the corresponding power would also be calculated using that same t-distribution with 7 degrees of freedom. Thus, while the distribution itself may depend on sample size, the relationship between p-value and power remains exact and independent of sample size. The following was added to the supplementary material to help clarify this. “ As in the previous section, these expressions do not depend on the amount of data collected and are therefore exact. If one uses a statistical distribution with degrees of freedom (which often depend on sample size), such as the t or F distributions, the relationship between the p-value and power remains exact, simply because both quantities are calculated using the same distribution with the same degrees of freedom. 
” Comment: While studying the relationship between observed power and p-value for my reply, I came across the Wikipedia page on power analysis that also makes the author’s point that post-hoc power analysis is fundamentally flawed, with references to Hoenig (The American Statistician, 2001), cited more than 2000 times, and Thomas (Conservation Biology, 1997) already pointing out the same flaw. The author’s case is thus not new, but may still be very relevant and a good reminder. It certainly was for me. Response: Thank you, these references have been included in the updated manuscript. Comment: Minor details about the new supplementary note: I think that when writing “the p-value for a one-sided tests would simply be 1-g(|t_0|)”, already the assumption of f being symmetrical about zero is made. The statement is not true for a general f and t_0<0. Response: This section briefly describes p-values for one-sided and two-sided tests. The section has been clarified to explain that this statement is not an assumption but merely reflects a known quantity about some distributions being symmetrical about zero where others are not. The expanded section now provides examples of alternative distributions which are not symmetrical about zero. The reason that this was not originally included is that commonly known non-symmetrical distributions such as the Chi-square, the F distribution, or even the Gamma distribution are not defined for negative values, making them less applicable for two-sided tests. “ Next, say we observe test-statistic $t_o$, in that case the p-value for a one-sided test would simply be $1-g(|t_o|)$, and assuming $f$ is symmetrical about zero (e.g. the normal distribution or t-distribution) the p-value for a two-sided test would simply be $2\times(1-g(|t_o|))$. For distributions that are not symmetric about zero, both sides of the distribution should be considered separately. 
It is important to note, however, that many commonly used asymmetric distributions—such as the F, Chi-squared, and Gamma distributions—are defined only for positive values. This restricts their direct applicability for two-sided hypothesis testing.” Comment: “In this case the two-sided p-value is 2x(1-g(2)) \approx 0.46”. I believe this should be “\approx 0.046” (and is only true for large sample size). Response: Thanks for spotting this typo, this should be 0.046 indeed. However, this is not dependent on any sample size assumption. The approximate symbol (≈) is used purely to reflect rounding to three decimal places, not to imply asymptotic behaviour or large-sample properties. It’s possible the reviewer is conflating this with the well-known approximation of the t-distribution by the standard normal distribution as sample size increases. While that is certainly valid in a different context, it is not relevant here. I am not making any claims about inference or distributional convergence; rather, I am simply referring to the numerical value of the cumulative distribution function. To clarify: if $g$ denotes the cumulative distribution function of the standard normal distribution, then $2\times(1-g(2)) \approx 0.046$, regardless of sample size. Similarly, if one were to use the t-distribution with 7 degrees of freedom, the corresponding value would be approximately 0.086 - the choice of 7 here being arbitrary and illustrative. This is a purely mathematical statement about evaluating a distribution function at a specific point, and not a comment on sampling or estimation. Competing Interests: No competing interests were disclosed. Close Report a concern COMMENT ON THIS REPORT Views 0 Cite How to cite this report: Cui Y. Reviewer Report For: Addressing common inferential mistakes when failing to reject the null-hypothesis [version 3; peer review: 2 approved] . 
F1000Research 2025, 13 :1488 ( https://doi.org/10.5256/f1000research.177945.r368388 ) The direct URL for this report is: https://f1000research.com/articles/13-1488/v2#referee-response-368388 NOTE: it is important to ensure the information in square brackets after the title is included in this citation. Close Copy Citation Details Reviewer Report 06 Mar 2025 Ying Cui , Stanford University, Stanford,, California, USA Approved VIEWS 0 https://doi.org/10.5256/f1000research.177945.r368388 No ... Continue reading READ ALL No further comments. Competing Interests: No competing interests were disclosed. Reviewer Expertise: Hypothesis testing, Biostatistics I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard. Close READ LESS CITE CITE HOW TO CITE THIS REPORT Cui Y. Reviewer Report For: Addressing common inferential mistakes when failing to reject the null-hypothesis [version 3; peer review: 2 approved] . F1000Research 2025, 13 :1488 ( https://doi.org/10.5256/f1000research.177945.r368388 ) The direct URL for this report is: https://f1000research.com/articles/13-1488/v2#referee-response-368388 NOTE: it is important to ensure the information in square brackets after the title is included in all citations of this article. COPY CITATION DETAILS Report a concern Respond or Comment COMMENT ON THIS REPORT Version 1 VERSION 1 PUBLISHED 05 Dec 2024 Views 0 Cite How to cite this report: Heimel JA. Reviewer Report For: Addressing common inferential mistakes when failing to reject the null-hypothesis [version 3; peer review: 2 approved] . F1000Research 2025, 13 :1488 ( https://doi.org/10.5256/f1000research.174018.r350494 ) The direct URL for this report is: https://f1000research.com/articles/13-1488/v1#referee-response-350494 NOTE: it is important to ensure the information in square brackets after the title is included in this citation. 
Close Copy Citation Details Reviewer Report 07 Jan 2025 J. Alexander Heimel , Netherlands Institute for Neuroscience, Amsterdam, The Netherlands Approved with Reservations VIEWS 0 https://doi.org/10.5256/f1000research.174018.r350494 The manuscript, “Addressing common inferential mistakes when failing to reject the null-hypothesis” contains two main messages. First, the author argues that post-hoc power calculations should be avoided, as “they offer no additional information beyond statistical tests and p-values” and they ... Continue reading READ ALL The manuscript, “Addressing common inferential mistakes when failing to reject the null-hypothesis” contains two main messages. First, the author argues that post-hoc power calculations should be avoided, as “they offer no additional information beyond statistical tests and p-values” and they “can be misleading because of an inability to distinguish between results based on insufficient sample size and results that reflect clinically irrelevant differences”. Second, for interpreting the results of multiple comparison, the authors recommends to consider “the distribution of p-values and the proportion of significant results to identify bodies” rather than standard multiple testing procedures, because they “unrealistically assume that all positive results are false positive”. The paper is an interesting read, but I have a few remarks. I have two comments on the first point. “p-values and observed power are equivalent, and no additional information is obtained by considering both ( Figure 3).” No proof is given of this statement and it is not completely trivial. Can a (hint of a) proof be given or a reference with a proof cited? I believe the relationship shown and figure 3 between p and power depends on the number of samples, but this is not shown in the text or figure. Can you comment on this? 
The dependence on the sample number could give some added value for making this post-hoc power calculation, and p-values and observed power are thus not completely equivalent. Are there examples that people do this particular post-hoc power calculation using the measured effect and the measured standard deviation? What I have encountered, is post-hoc power calculations based on the measured standard deviation of the sample, and for an assumed effect size (rather than the measured effect). If there are publications doing the post-hoc power analysis in the way suggested by this publication, then please give some example references. Otherwise, the reader is left to wonder if this part of the manuscript is arguing against hypothetical reasoning that people do not actually use. The second point of the paper is that “While individual p-values and null hypothesis tests cannot differentiate between false and true positive results, a set of p-values (Figure 5) can be compared against a uniform distribution to determine the likelihood that the entire set is driven by false positive results.” This can indeed be helpful. However, if assumptions like independence between tests or normality of the measurements for some tests fail, then this meta-analysis on the p-value may give a false positive outcome. Without a good understanding of the underlying data and statistics, I would use this test only to suggest that results are possibly false positives if the distribution of p-values is not distinguishable from a uniform distribution, rather than concluding that there must be some true positive effect if the distribution of p-values is not uniform, as is done by the author for the specific example for the effect of rivaroxaban. These are my main comments. I also have a number of smaller remarks. 
The manuscript also discusses whether null hypothesis testing can prove whether difference in means between two (infinite) populations is truly zero, but I fail to see the connection of this more philosophical point on the interpretation of hypothesis testing to the main messages. I think that the manuscript would be stronger if it is left out. The meaning of the sentence “Given that power and type 1 error make extreme assumptions where either all results are true or false positives, these concepts are less relevant after the data have been collected. ” is not at all clear to me. Perhaps rephrase to clarify. “applying multiplicity correction can sometimes increase the false positive rate instead of reducing it.” In this sentence, and the ensuing section the term “false positive rate” is used differently from what is the common use in the statistical literature. In the literature, “false positive rate” is the expectation of a false positive result when repeating the procedure. The meaning that is taken in this manuscript is that of the fraction of the significant results of a given set of experiments that is a false positive. It is not clear that the interpretation chosen here, and which is shown by example to lead to a false expectation, actually often occurs in the scientific literature. Can some example references be provided? Fig. 5: The number of samples used for producing the figure is missing. “comparing the set of p-values for all the aforementioned outcomes against a uniform distribution resulted in a p-value of 0.02,” Which test is used for the comparison. “researchers should routinely indicate the bounds between which an effect is sufficiently small to be considered clinically irrelevant” This is a valuable recommendation. Before the start of a study it is necessary to choose an expected effect size for determining the power of a study or the number of samples. 
If the expected effect size, is below the bound, then either the study should not be started (if one is looking to prove a positive effect) or the bound should be taken as the effect size for the power calculation. “Second, while notions of power and type 1 errors are essential at the study design phase because these deal in hypothetical scenarios where all results are either true or false, such metrics have limited relevance when interpreting results.” I do not fully agree. For a power analysis at the design phase, typically both an effect size and the variance need to be estimated. In my opinion, a post-hoc power analysis could be relevant for interpreting a negative result, to take into account the observed variance. If the variance is a lot larger than a priori expected, then the study could have been underpowered. Perhaps adjust the text, or explain why I am wrong. I do not understand “Finally, depending on the area of research overlooked, true positive results may be more harmful than false positive results”. Is meant that setting too stringent boundaries for significance can be harmful? Consider rephrasing to make its meaning clear. Is the topic of the opinion article discussed accurately in the context of the current literature? Partly Are all factual statements correct and adequately supported by citations? Partly Are arguments sufficiently supported by evidence from the published literature? Partly Are the conclusions drawn balanced and justified on the basis of the presented arguments? Partly Competing Interests: No competing interests were disclosed. Reviewer Expertise: Neuroscience I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however I have significant reservations, as outlined above. Close READ LESS CITE CITE HOW TO CITE THIS REPORT Heimel JA. 
Reviewer Report For: Addressing common inferential mistakes when failing to reject the null-hypothesis [version 3; peer review: 2 approved] . F1000Research 2025, 13 :1488 ( https://doi.org/10.5256/f1000research.174018.r350494 ) The direct URL for this report is: https://f1000research.com/articles/13-1488/v1#referee-response-350494 NOTE: it is important to ensure the information in square brackets after the title is included in all citations of this article. COPY CITATION DETAILS Report a concern Author Response 25 Feb 2025 Amand Schmidt , UCL British Heart Foundation Research Accelerator, London, WC1E6HX, UK 25 Feb 2025 Author Response The manuscript, “Addressing common inferential mistakes when failing to reject the null-hypothesis” contains two main messages. First, the author argues that post-hoc power calculations should be avoided, as “they offer ... Continue reading The manuscript, “Addressing common inferential mistakes when failing to reject the null-hypothesis” contains two main messages. First, the author argues that post-hoc power calculations should be avoided, as “they offer no additional information beyond statistical tests and p-values” and they “can be misleading because of an inability to distinguish between results based on insufficient sample size and results that reflect clinically irrelevant differences”. Second, for interpreting the results of multiple comparison, the authors recommends to consider “the distribution of p-values and the proportion of significant results to identify bodies” rather than standard multiple testing procedures, because they “unrealistically assume that all positive results are false positive”. The paper is an interesting read, but I have a few remarks. I have two comments on the first point. “p-values and observed power are equivalent, and no additional information is obtained by considering both ( Figure 3).” No proof is given of this statement and it is not completely trivial. 
Can a (hint of a) proof be given or a reference with a proof cited? I believe the relationship shown in figure 3 between p and power depends on the number of samples, but this is not shown in the text or figure. Can you comment on this? The dependence on the sample number could give some added value for making this post-hoc power calculation, and p-values and observed power are thus not completely equivalent. Response: Thank you, the following proof is provided in the updated supplementary material, also confirming this relationship is independent of sample size. “ To describe the relationship between p-values and observed power we first let $g$ represent the cumulative density function of the p.d.f. $f$, that is $g(t) = \int_{-\infty}^{t} f(x)\,dx$. Next, say we observe test-statistic $t_o$, in that case the p-value for a one-sided test would simply be $1-g(|t_o|)$, and assuming $f$ is symmetrical about zero the p-value for a two-sided test would simply be $2\times(1-g(|t_o|))$. If we let $t_c$ represent the critical value of a statistical test beyond which the null-hypothesis is rejected, power can be calculated as $g(t_o - t_c) + g(-t_o - t_c)$, where in most cases either the first or the second term will be close to zero and can be ignored. As an example, let $f$ be the p.d.f. of the standard normal distribution, with $t_o = 2$ and $t_c = 1.96$. In this case the two-sided p-value is $2\times(1-g(2)) \approx 0.46$, and power would be $g(2-1.96) + g(-2-1.96) \approx 0.52 + 0.00 = 0.52$. Similarly, say $t_o = 0$ (i.e., the null-hypothesis is true), and $f$ and $t_c$ are the same as before, we then find that $2\times(1-g(0)) = 1$ and $g(-1.96) + g(-1.96) = 0.05$. In other words, when the null-hypothesis is true the p-value would be 1 and power would be 0.05 (or more generally power would be equal to the area excluded by $t_c$). As such there exists a one-to-one relationship between the p-value and observed power based on $t_o$. 
Furthermore, and perhaps counterintuitively observed power is smallest when the p-value provides most support for the null-hypothesis and vice versa. This is a reflection of the p-value evaluating t o assuming the null-hypothesis is true, whereas observed power evaluates t o assuming this hypothesis is false. As in the previous section, these expressions do not depend on the amount of data collected and are therefore exact. Furthermore, given that a statistical test, comparing t o to t c , cannot be used to prove the null-hypothesis is true (as shown in the preceding section), the same holds for derived metrics including p-values and power. In fact power can only be calculated assuming the null-hypothesis is false, and hence cannot be used to argue in favour of the null-hypothesis being true. “ Are there examples that people do this particular post-hoc power calculation using the measured effect and the measured standard deviation? What I have encountered, is post-hoc power calculations based on the measured standard deviation of the sample, and for an assumed effect size (rather than the measured effect). If there are publications doing the post-hoc power analysis in the way suggested by this publication, then please give some example references. Otherwise, the reader is left to wonder if this part of the manuscript is arguing against hypothetical reasoning that people do not actually use. Response: Thank you for raising this important point. The manuscript has been updated to clarify that the raised issues with observed power are of general concern irrespective of the whether the point estimates reflect the study results or some pre-existing notion of relevant difference. The concerns about power pertain to its use as a metric to interpret current results. In doing so the manuscript more clearly differentiates with power and/or sample size calculations to inform future research (which are without fault) and power calculations for already completed studies. 
Page 6-7 “ Researchers may alternatively wish to calculate the power to reject a clinically meaningful difference other than the point estimate. Such calculations can meaningfully inform the design and viability of future studies; although sample size estimates may be more readily interpretable. However, when such power calculations are used to make statements about the presence or absence of an effect, or even lack of sample size, the described approach utilising confidence intervals and equivalence/non-inferiority margins provides more relevant information on study accuracy. The futility of using power to make claims of the absence of an effect is further illustrated by noting that in the absence of an effect (i.e. when the p-value is 1) observed power is equal to the employed alpha threshold (e.g. 0.05). Hence, rather counterintuitively, low power may actually argue for the absence of an effect. Because power can only be calculated assuming the null-hypothesis is false, this metric cannot be used to make claims in favour of the null-hypothesis. Furthermore, as discussed in the preceding section, statistical tests cannot be used to support the strict null-hypothesis, as such this also holds for derived metrics such as p-value and power. While power remains essential when designing a future study, it should not be used to interpret results of a completed study. At this stage more relevant metrics such as confidence intervals are available which do not condition on the presence or absence of an effect, and provide information on accuracy as well as on effect magnitude. “ As well as page 9 “ Second, while notions of power and type 1 errors are essential at the study design phase, because these deal in hypothetical scenarios where all results are either true or false, such metrics have limited relevance when interpreting results. Power and type 1 errors can be framed in terms of probabilities because the analysis has not yet been conducted. 
Once the experiment has been completed, these hypothetical probabilities are immaterial, and one is simply confronted with an unknown proportion of true-positive results. “ The second point of the paper is that “While individual p-values and null hypothesis tests cannot differentiate between false and true positive results, a set of p-values (Figure 5) can be compared against a uniform distribution to determine the likelihood that the entire set is driven by false positive results.” This can indeed be helpful. However, if assumptions like independence between tests or normality of the measurements for some tests fail, then this meta-analysis on the p-value may give a false positive outcome. Without a good understanding of the underlying data and statistics, I would use this test only to suggest that results are possibly false positives if the distribution of p-values is not distinguishable from a uniform distribution, rather than concluding that there must be some true positive effect if the distribution of p-values is not uniform, as is done by the author for the specific example for the effect of rivaroxaban. Response: Thanks for raising these important points. The manuscript has been updated to clarify this involves a general testing procedure which does not make assumptions on the test statistics or the set of p-values following a normal distribution. Furthermore, the manuscript now explains that there may be a need to account for dependencies between sets of p-values. The following has been included on page 8 “ While individual p-values and null-hypothesis tests cannot differentiate between false and true positive results, a set of p-values ( Figure 5 ) can be compared against a uniform distribution to determine the likelihood that the entire set is driven by false positive results. This approach is independent of the specific statistical test used to derive individual p-values. 
Moreover, the method can be generalised to account for dependencies among p-values, such as dependencies arising from the inclusion of both composite and individual outcomes (e.g., evaluating both any stroke and ischaemic stroke).” And the figure 5 note “ N.b. The p-values were derived by arbitrarily sampling 1,000 test statistics from a normal distribution and leveraging its cumulative density function to calculate the area on the left and right side of the sampled test-statistic. Specifically, the test statistics were based on random draws from a normal distribution with a standard deviation of 1 and a mean of either 0 or 2, when the null-hypothesis was true or false, respectively. Please note that the normal distribution is only used as an exemplar, and alternative distributions with a known cumulative density function (e.g. chi-square, beta, or gamma) could have been used instead.” These are my main comments. I also have a number of smaller remarks. The manuscript also discusses whether null hypothesis testing can prove whether difference in means between two (infinite) populations is truly zero, but I fail to see the connection of this more philosophical point on the interpretation of hypothesis testing to the main messages. I think that the manuscript would be stronger if it is left out. Response: This has now been integrated with the sections describing power, to further support the argumentation that (observed) power calculations are futile because the null-hypothesis can never be empirically shown to be true. As such this section is far from philosophical or redundant. The meaning of the sentence “Given that power and type 1 error make extreme assumptions where either all results are true or false positives, these concepts are less relevant after the data have been collected. ” is not at all clear to me. Perhaps rephrase to clarify. Response: The following clarification was added. 
Page 7 “ Considerations of power and type 1 error rate are extremely relevant when designing a study, ensuring that a sufficiently accurate effect estimate may be realistically obtained given the available resources. However, both power and type 1 error rate are conditional probabilities assuming that the null-hypothesis is either true or false. As such these concepts are less relevant after the data have been collected, which would generally not consist of null-hypotheses which are either all true or all false, but instead will include an unknown mixture of both. “ “applying multiplicity correction can sometimes increase the false positive rate instead of reducing it.” In this sentence and the ensuing section, the term “false positive rate” is used differently from what is the common use in the statistical literature. In the literature, “false positive rate” is the expectation of a false positive result when repeating the procedure. The meaning that is taken in this manuscript is that of the fraction of the significant results of a given set of experiments that is a false positive. It is not clear that the interpretation chosen here, and which is shown by example to lead to a false expectation, actually often occurs in the scientific literature. Can some example references be provided? Response: Thanks for pointing this out; the manuscript has been updated to refer to this as the false discovery rate, including a formal definition. Page 7 “ An often overlooked point is that, depending on the unknown balance between false positives and true positives in a set of test results, applying multiplicity correction can sometimes increase the false discovery rate (i.e., the number of false positives divided by the total number of positive tests) instead of reducing it. For example, Figure 4A presents a naive expectation of multiple testing corrections, where the false discovery rate decreases from 1/3 to 0. 
However, there is no reason why the scenario depicted in Figure 4B may not occur; here, the false discovery rate increases from 1/3 to 1. “ Given that the false discovery rate can only be defined based on the knowledge about which hypotheses are true and which are false, the second part of the question is impossible to answer for empirical research. However, the following thought experiment can provide some intuition. We know that the incidence of cardiovascular disease increases with age, which implies that in a sufficiently large study there would be a significant association between age and cardiovascular disease, say the p-value is 0.001. If we conduct a single test at an alpha (type 1 error rate) of 0.05 we would correctly reject a null hypothesis of no association. If we subsequently performed four additional tests of association using completely random pairs of variables (i.e., where the null hypothesis is true), the Bonferroni multiplicity corrected alpha would be 0.01 and we would not reject the null hypothesis that age associates with cardiovascular disease. If, however, due to random chance the p-value for one of the remaining tests is smaller than 0.01, we would incorrectly reject that null-hypothesis, thereby increasing the false discovery rate from 0 (when we only test for age) to 1/5. Fig. 5: The number of samples used for producing the figure is missing. Response: This was added. “comparing the set of p-values for all the aforementioned outcomes against a uniform distribution resulted in a p-value of 0.02,” Which test is used for the comparison? Response: This has been added. “ Utilizing a non-parametric Kolmogorov-Smirnov test to compare the set of p-values for all the aforementioned outcomes against a uniform distribution nevertheless resulted in a p-value of 0.02, suggesting that the protective effect of rivaroxaban is shared across multiple cardiovascular outcomes. 
“ “researchers should routinely indicate the bounds between which an effect is sufficiently small to be considered clinically irrelevant” This is a valuable recommendation. Before the start of a study it is necessary to choose an expected effect size for determining the power of a study or the number of samples. If the expected effect size, is below the bound, then either the study should not be started (if one is looking to prove a positive effect) or the bound should be taken as the effect size for the power calculation. Response: Thank you. “Second, while notions of power and type 1 errors are essential at the study design phase because these deal in hypothetical scenarios where all results are either true or false, such metrics have limited relevance when interpreting results.” I do not fully agree. For a power analysis at the design phase, typically both an effect size and the variance need to be estimated. In my opinion, a post-hoc power analysis could be relevant for interpreting a negative result, to take into account the observed variance. If the variance is a lot larger than a priori expected, then the study could have been underpowered. Perhaps adjust the text, or explain why I am wrong. Response: The requested explanation has been included. Please refer to page 7-8. “ Researchers may alternatively wish to calculate the power to reject a clinically meaningful difference other than the point estimate. Such calculations can meaningfully inform the design and viability of future studies; although sample size estimates may be more readily interpretable. However, when such power calculations are used to make statements about the presence or absence of an effect, or even lack of sample size, the described approach utilising confidence intervals and equivalence/non-inferiority margins provides more relevant information on study accuracy. 
The futility of using power to make claims of the absence of an effect is further illustrated by noting that in the absence of an effect (i.e. when the p-value is 1) observed power is equal to the employed alpha threshold (e.g. 0.05). Hence, rather counterintuitively, low power may actually argue for the absence of an effect. Because power can only be calculated assuming the null-hypothesis is false, this metric cannot be used to make claims in favour of the null-hypothesis. Furthermore, as discussed in the preceding section, statistical tests cannot be used to support the strict null-hypothesis; as such, this also holds for derived metrics such as p-values and power. While power remains essential when designing a future study, it should not be used to interpret results of a completed study. At this stage more relevant metrics such as confidence intervals are available which do not condition on the presence or absence of an effect, and provide information on accuracy as well as on effect magnitude. “ I do not understand “Finally, depending on the area of research overlooked, true positive results may be more harmful than false positive results”. Is it meant that setting too stringent boundaries for significance can be harmful? Consider rephrasing to make its meaning clear. Response: The following has been added to clarify this matter further. Page 9 “ Finally, while decreasing the significance threshold (e.g. from 0.05 to 0.005) decreases the type 1 error rate, this decreases power as well, and hence may decrease the number of true associations discovered. Depending on the area of research, overlooked true positive results may be more harmful than false positive results. For example, protein drug targets identified in early drug development are often subjected to a substantial number of follow-up analyses, which filter out false positive results. 
Such follow-up studies, however, rarely expand the number of candidates, hence suggesting a more inclusive approach might be more considerate. In settings more proximal to clinical implementation and less discovery oriented, such as phase 3 clinical trials, stringent multiple testing correction is clearly called for. “ The manuscript “Addressing common inferential mistakes when failing to reject the null-hypothesis” contains two main messages. First, the author argues that post-hoc power calculations should be avoided, as “they offer no additional information beyond statistical tests and p-values” and they “can be misleading because of an inability to distinguish between results based on insufficient sample size and results that reflect clinically irrelevant differences”. Second, for interpreting the results of multiple comparisons, the author recommends considering “the distribution of p-values and the proportion of significant results to identify bodies” rather than standard multiple testing procedures, because they “unrealistically assume that all positive results are false positive”. The paper is an interesting read, but I have a few remarks. I have two comments on the first point. “p-values and observed power are equivalent, and no additional information is obtained by considering both (Figure 3).” No proof is given of this statement and it is not completely trivial. Can a (hint of a) proof be given or a reference with a proof cited? I believe the relationship between p and power shown in Figure 3 depends on the number of samples, but this is not shown in the text or figure. Can you comment on this? The dependence on the sample number could give some added value for making this post-hoc power calculation, and p-values and observed power are thus not completely equivalent. Response: Thank you; the following proof is provided in the updated supplementary material, also confirming this relationship is independent of sample size. 
“ To describe the relationship between p-values and observed power we first let $g$ represent the cumulative distribution function of the p.d.f. $f$, that is $g(t) = \int_{-\infty}^{t} f(x)\,dx$. Next, say we observe test-statistic $t_o$; in that case the p-value for a one-sided test would simply be $1 - g(t_o)$, and assuming $f$ is symmetrical about zero the p-value for a two-sided test would simply be $2 \times (1 - g(t_o))$. If we let $t_c$ represent the critical value of a statistical test beyond which the null-hypothesis is rejected, power can be calculated as $g(t_o - t_c) + g(-t_o - t_c)$, where in most cases either the first or the second term will be close to zero and can be ignored. As an example, let $f$ be the p.d.f. of the standard normal distribution, with $t_o = 2$ and $t_c = 1.96$. In this case the two-sided p-value is $2 \times (1 - g(2)) \approx 0.046$, and power would be $g(2 - 1.96) + g(-2 - 1.96) \approx 0.52 + 0.00 = 0.52$. Similarly, say $t_o = 0$ (i.e., the null-hypothesis is true), and $f$ and $t_c$ are the same as before; we then find that $2 \times (1 - g(0)) = 1$ and $g(-1.96) + g(-1.96) = 0.05$. In other words, when the null-hypothesis is true the p-value would be 1 and power would be 0.05 (or more generally power would be equal to the area excluded by $t_c$). As such there exists a one-to-one relationship between the p-value and observed power based on $t_o$. Furthermore, and perhaps counterintuitively, observed power is smallest when the p-value provides most support for the null-hypothesis and vice versa. This is a reflection of the p-value evaluating $t_o$ assuming the null-hypothesis is true, whereas observed power evaluates $t_o$ assuming this hypothesis is false. As in the previous section, these expressions do not depend on the amount of data collected and are therefore exact. Furthermore, given that a statistical test, comparing $t_o$ to $t_c$, cannot be used to prove the null-hypothesis is true (as shown in the preceding section), the same holds for derived metrics including p-values and power. 
In fact, power can only be calculated assuming the null-hypothesis is false, and hence cannot be used to argue in favour of the null-hypothesis being true. “ Are there examples of people doing this particular post-hoc power calculation using the measured effect and the measured standard deviation? What I have encountered is post-hoc power calculations based on the measured standard deviation of the sample, and for an assumed effect size (rather than the measured effect). If there are publications doing the post-hoc power analysis in the way suggested by this publication, then please give some example references. Otherwise, the reader is left to wonder if this part of the manuscript is arguing against hypothetical reasoning that people do not actually use. Response: Thank you for raising this important point. The manuscript has been updated to clarify that the raised issues with observed power are of general concern irrespective of whether the point estimates reflect the study results or some pre-existing notion of relevant difference. The concerns about power pertain to its use as a metric to interpret current results. In doing so the manuscript more clearly differentiates between power and/or sample size calculations to inform future research (which are without fault) and power calculations for already completed studies. Page 6-7 “ Researchers may alternatively wish to calculate the power to reject a clinically meaningful difference other than the point estimate. Such calculations can meaningfully inform the design and viability of future studies, although sample size estimates may be more readily interpretable. However, when such power calculations are used to make statements about the presence or absence of an effect, or even lack of sample size, the described approach utilising confidence intervals and equivalence/non-inferiority margins provides more relevant information on study accuracy. 
The futility of using power to make claims of the absence of an effect is further illustrated by noting that in the absence of an effect (i.e. when the p-value is 1) observed power is equal to the employed alpha threshold (e.g. 0.05). Hence, rather counterintuitively, low power may actually argue for the absence of an effect. Because power can only be calculated assuming the null-hypothesis is false, this metric cannot be used to make claims in favour of the null-hypothesis. Furthermore, as discussed in the preceding section, statistical tests cannot be used to support the strict null-hypothesis, as such this also holds for derived metrics such as p-value and power. While power remains essential when designing a future study, it should not be used to interpret results of a completed study. At this stage more relevant metrics such as confidence intervals are available which do not condition on the presence or absence of an effect, and provide information on accuracy as well as on effect magnitude. “ As well as page 9 “ Second, while notions of power and type 1 errors are essential at the study design phase, because these deal in hypothetical scenarios where all results are either true or false, such metrics have limited relevance when interpreting results. Power and type 1 errors can be framed in terms of probabilities because the analysis has not yet been conducted. Once the experiment has been completed, these hypothetical probabilities are immaterial, and one is simply confronted with an unknown proportion of true-positive results. “ The second point of the paper is that “While individual p-values and null hypothesis tests cannot differentiate between false and true positive results, a set of p-values (Figure 5) can be compared against a uniform distribution to determine the likelihood that the entire set is driven by false positive results.” This can indeed be helpful. 
However, if assumptions like independence between tests or normality of the measurements for some tests fail, then this meta-analysis on the p-value may give a false positive outcome. Without a good understanding of the underlying data and statistics, I would use this test only to suggest that results are possibly false positives if the distribution of p-values is not distinguishable from a uniform distribution, rather than concluding that there must be some true positive effect if the distribution of p-values is not uniform, as is done by the author for the specific example for the effect of rivaroxaban. Response: Thanks for raising these important points. The manuscript has been updated to clarify this involves a general testing procedure which does not make assumptions on the test statistics or the set of p-values following a normal distribution. Furthermore, the manuscript now explains that there may be a need to account for dependencies between sets of p-values. The following has been included on page 8 “ While individual p-values and null-hypothesis tests cannot differentiate between false and true positive results, a set of p-values ( Figure 5 ) can be compared against a uniform distribution to determine the likelihood that the entire set is driven by false positive results. This approach is independent of the specific statistical test used to derive individual p-values. Moreover, the method can be generalised to account for dependencies among p-values, such as dependencies arising from the inclusion of both composite and individual outcomes (e.g., evaluating both any stroke and ischaemic stroke). “ And the figure 5 note “ N.b. The p-values were derived by arbitrarily sampling 1,000 test statistic from a normal distribution and leveraging its cumulative density function to calculate the area on the left and right side of the sampled test-statistic. 
Specifically, the test statistics were based on random draws from a normal distribution with a standard deviation of 1 and a mean of either 0 or 2, when the null-hypothesis was true or false, respectively. Please note that the normal distribution is only used as an exemplar, and alternative distributions with a known cumulative distribution function (e.g. chi-square, beta, or gamma) could have been used instead. “ These are my main comments. I also have a number of smaller remarks. The manuscript also discusses whether null hypothesis testing can prove whether the difference in means between two (infinite) populations is truly zero, but I fail to see the connection of this more philosophical point on the interpretation of hypothesis testing to the main messages. I think that the manuscript would be stronger if it is left out. Response: This has now been integrated with the sections describing power, to further support the argumentation that (observed) power calculations are futile because the null-hypothesis can never be empirically shown to be true. As such this section is far from philosophical or redundant. The meaning of the sentence “Given that power and type 1 error make extreme assumptions where either all results are true or false positives, these concepts are less relevant after the data have been collected. ” is not at all clear to me. Perhaps rephrase to clarify. Response: The following clarification was added. Page 7 “ Considerations of power and type 1 error rate are extremely relevant when designing a study, ensuring that a sufficiently accurate effect estimate may be realistically obtained given the available resources. However, both power and type 1 error rate are conditional probabilities assuming that the null-hypothesis is either true or false. 
As such these concepts are less relevant after the data have been collected, which would generally not consist of null-hypotheses which are either all true or all false, but instead will include an unknown mixture of both. “ “applying multiplicity correction can sometimes increase the false positive rate instead of reducing it.” In this sentence and the ensuing section, the term “false positive rate” is used differently from what is the common use in the statistical literature. In the literature, “false positive rate” is the expectation of a false positive result when repeating the procedure. The meaning that is taken in this manuscript is that of the fraction of the significant results of a given set of experiments that is a false positive. It is not clear that the interpretation chosen here, and which is shown by example to lead to a false expectation, actually often occurs in the scientific literature. Can some example references be provided? Response: Thanks for pointing this out; the manuscript has been updated to refer to this as the false discovery rate, including a formal definition. Page 7 “ An often overlooked point is that, depending on the unknown balance between false positives and true positives in a set of test results, applying multiplicity correction can sometimes increase the false discovery rate (i.e., the number of false positives divided by the total number of positive tests) instead of reducing it. For example, Figure 4A presents a naive expectation of multiple testing corrections, where the false discovery rate decreases from 1/3 to 0. However, there is no reason why the scenario depicted in Figure 4B may not occur; here, the false discovery rate increases from 1/3 to 1. “ Given that the false discovery rate can only be defined based on the knowledge about which hypotheses are true and which are false, the second part of the question is impossible to answer for empirical research. 
However, the following thought experiment can provide some intuition. We know that the incidence of cardiovascular disease increases with age, which implies that in a sufficiently large study there would be a significant association between age and cardiovascular disease, say the p-value is 0.001. If we conduct a single test at an alpha (type 1 error rate) of 0.05 we would correctly reject a null hypothesis of no association. If we subsequently performed four additional tests of association using completely random pairs of variables (i.e., where the null hypothesis is true), the Bonferroni multiplicity corrected alpha would be 0.01 and we would not reject the null hypothesis that age associates with cardiovascular disease. If, however, due to random chance the p-value for one of the remaining tests is smaller than 0.01, we would incorrectly reject that null-hypothesis, thereby increasing the false discovery rate from 0 (when we only test for age) to 1/5. Fig. 5: The number of samples used for producing the figure is missing. Response: This was added. “comparing the set of p-values for all the aforementioned outcomes against a uniform distribution resulted in a p-value of 0.02,” Which test is used for the comparison? Response: This has been added. “ Utilizing a non-parametric Kolmogorov-Smirnov test to compare the set of p-values for all the aforementioned outcomes against a uniform distribution nevertheless resulted in a p-value of 0.02, suggesting that the protective effect of rivaroxaban is shared across multiple cardiovascular outcomes. “ “researchers should routinely indicate the bounds between which an effect is sufficiently small to be considered clinically irrelevant” This is a valuable recommendation. Before the start of a study it is necessary to choose an expected effect size for determining the power of a study or the number of samples. 
If the expected effect size, is below the bound, then either the study should not be started (if one is looking to prove a positive effect) or the bound should be taken as the effect size for the power calculation. Response: Thank you. “Second, while notions of power and type 1 errors are essential at the study design phase because these deal in hypothetical scenarios where all results are either true or false, such metrics have limited relevance when interpreting results.” I do not fully agree. For a power analysis at the design phase, typically both an effect size and the variance need to be estimated. In my opinion, a post-hoc power analysis could be relevant for interpreting a negative result, to take into account the observed variance. If the variance is a lot larger than a priori expected, then the study could have been underpowered. Perhaps adjust the text, or explain why I am wrong. Response: The requested explanation has been included. Please refer to page 7-8. “ Researchers may alternatively wish to calculate the power to reject a clinically meaningful difference other than the point estimate. Such calculations can meaningfully inform the design and viability of future studies; although sample size estimates may be more readily interpretable. However, when such power calculations are used to make statements about the presence or absence of an effect, or even lack of sample size, the described approach utilising confidence intervals and equivalence/non-inferiority margins provides more relevant information on study accuracy. The futility of using power to make claims of the absence of an effect is further illustrated by noting that in the absence of an effect (i.e. when the p-value is 1) observed power is equal to the employed alpha threshold (e.g. 0.05). Hence, rather counterintuitively, low power may actually argue for the absence of an effect. 
Because power can only be calculated assuming the null-hypothesis is false, this metric cannot be used to make claims in favour of the null-hypothesis. Furthermore, as discussed in the preceding section, statistical tests cannot be used to support the strict null-hypothesis; as such, this also holds for derived metrics such as p-values and power. While power remains essential when designing a future study, it should not be used to interpret results of a completed study. At this stage more relevant metrics such as confidence intervals are available which do not condition on the presence or absence of an effect, and provide information on accuracy as well as on effect magnitude. “ I do not understand “Finally, depending on the area of research overlooked, true positive results may be more harmful than false positive results”. Is it meant that setting too stringent boundaries for significance can be harmful? Consider rephrasing to make its meaning clear. Response: The following has been added to clarify this matter further. Page 9 “ Finally, while decreasing the significance threshold (e.g. from 0.05 to 0.005) decreases the type 1 error rate, this decreases power as well, and hence may decrease the number of true associations discovered. Depending on the area of research, overlooked true positive results may be more harmful than false positive results. For example, protein drug targets identified in early drug development are often subjected to a substantial number of follow-up analyses, which filter out false positive results. Such follow-up studies, however, rarely expand the number of candidates, hence suggesting a more inclusive approach might be more considerate. In settings more proximal to clinical implementation and less discovery oriented, such as phase 3 clinical trials, stringent multiple testing correction is clearly called for. “ Competing Interests: No competing interests were disclosed. 
Close Report a concern Respond or Comment COMMENTS ON THIS REPORT Author Response 25 Feb 2025 Amand Schmidt , UCL British Heart Foundation Research Accelerator, London, WC1E6HX, UK 25 Feb 2025 Author Response The manuscript “Addressing common inferential mistakes when failing to reject the null-hypothesis” contains two main messages. First, the author argues that post-hoc power calculations should be avoided, as “they offer ... Continue reading The manuscript “Addressing common inferential mistakes when failing to reject the null-hypothesis” contains two main messages. First, the author argues that post-hoc power calculations should be avoided, as “they offer no additional information beyond statistical tests and p-values” and they “can be misleading because of an inability to distinguish between results based on insufficient sample size and results that reflect clinically irrelevant differences”. Second, for interpreting the results of multiple comparisons, the author recommends considering “the distribution of p-values and the proportion of significant results to identify bodies” rather than standard multiple testing procedures, because they “unrealistically assume that all positive results are false positive”. The paper is an interesting read, but I have a few remarks. I have two comments on the first point. “p-values and observed power are equivalent, and no additional information is obtained by considering both (Figure 3).” No proof is given of this statement and it is not completely trivial. Can a (hint of a) proof be given or a reference with a proof cited? I believe the relationship between p and power shown in Figure 3 depends on the number of samples, but this is not shown in the text or figure. Can you comment on this? The dependence on the sample number could give some added value for making this post-hoc power calculation, and p-values and observed power are thus not completely equivalent. 
Response: Thank you; the following proof is provided in the updated supplementary material, also confirming this relationship is independent of sample size. “ To describe the relationship between p-values and observed power we first let $g$ represent the cumulative distribution function of the p.d.f. $f$, that is $g(t) = \int_{-\infty}^{t} f(x)\,dx$. Next, say we observe test-statistic $t_o$; in that case the p-value for a one-sided test would simply be $1 - g(t_o)$, and assuming $f$ is symmetrical about zero the p-value for a two-sided test would simply be $2 \times (1 - g(t_o))$. If we let $t_c$ represent the critical value of a statistical test beyond which the null-hypothesis is rejected, power can be calculated as $g(t_o - t_c) + g(-t_o - t_c)$, where in most cases either the first or the second term will be close to zero and can be ignored. As an example, let $f$ be the p.d.f. of the standard normal distribution, with $t_o = 2$ and $t_c = 1.96$. In this case the two-sided p-value is $2 \times (1 - g(2)) \approx 0.046$, and power would be $g(2 - 1.96) + g(-2 - 1.96) \approx 0.52 + 0.00 = 0.52$. Similarly, say $t_o = 0$ (i.e., the null-hypothesis is true), and $f$ and $t_c$ are the same as before; we then find that $2 \times (1 - g(0)) = 1$ and $g(-1.96) + g(-1.96) = 0.05$. In other words, when the null-hypothesis is true the p-value would be 1 and power would be 0.05 (or more generally power would be equal to the area excluded by $t_c$). As such there exists a one-to-one relationship between the p-value and observed power based on $t_o$. Furthermore, and perhaps counterintuitively, observed power is smallest when the p-value provides most support for the null-hypothesis and vice versa. This is a reflection of the p-value evaluating $t_o$ assuming the null-hypothesis is true, whereas observed power evaluates $t_o$ assuming this hypothesis is false. As in the previous section, these expressions do not depend on the amount of data collected and are therefore exact. 
Furthermore, given that a statistical test, comparing $t_o$ to $t_c$, cannot be used to prove the null-hypothesis is true (as shown in the preceding section), the same holds for derived metrics including p-values and power. In fact, power can only be calculated assuming the null-hypothesis is false, and hence cannot be used to argue in favour of the null-hypothesis being true. “ Are there examples of people doing this particular post-hoc power calculation using the measured effect and the measured standard deviation? What I have encountered is post-hoc power calculations based on the measured standard deviation of the sample, and for an assumed effect size (rather than the measured effect). If there are publications doing the post-hoc power analysis in the way suggested by this publication, then please give some example references. Otherwise, the reader is left to wonder if this part of the manuscript is arguing against hypothetical reasoning that people do not actually use. Response: Thank you for raising this important point. The manuscript has been updated to clarify that the raised issues with observed power are of general concern irrespective of whether the point estimates reflect the study results or some pre-existing notion of relevant difference. The concerns about power pertain to its use as a metric to interpret current results. In doing so the manuscript more clearly differentiates between power and/or sample size calculations to inform future research (which are without fault) and power calculations for already completed studies. Page 6-7 “ Researchers may alternatively wish to calculate the power to reject a clinically meaningful difference other than the point estimate. Such calculations can meaningfully inform the design and viability of future studies, although sample size estimates may be more readily interpretable. 
However, when such power calculations are used to make statements about the presence or absence of an effect, or even lack of sample size, the described approach utilising confidence intervals and equivalence/non-inferiority margins provides more relevant information on study accuracy. The futility of using power to make claims of the absence of an effect is further illustrated by noting that in the absence of an effect (i.e. when the p-value is 1) observed power is equal to the employed alpha threshold (e.g. 0.05). Hence, rather counterintuitively, low power may actually argue for the absence of an effect. Because power can only be calculated assuming the null-hypothesis is false, this metric cannot be used to make claims in favour of the null-hypothesis. Furthermore, as discussed in the preceding section, statistical tests cannot be used to support the strict null-hypothesis, as such this also holds for derived metrics such as p-value and power. While power remains essential when designing a future study, it should not be used to interpret results of a completed study. At this stage more relevant metrics such as confidence intervals are available which do not condition on the presence or absence of an effect, and provide information on accuracy as well as on effect magnitude. “ As well as page 9 “ Second, while notions of power and type 1 errors are essential at the study design phase, because these deal in hypothetical scenarios where all results are either true or false, such metrics have limited relevance when interpreting results. Power and type 1 errors can be framed in terms of probabilities because the analysis has not yet been conducted. Once the experiment has been completed, these hypothetical probabilities are immaterial, and one is simply confronted with an unknown proportion of true-positive results. 
“ The second point of the paper is that “While individual p-values and null hypothesis tests cannot differentiate between false and true positive results, a set of p-values (Figure 5) can be compared against a uniform distribution to determine the likelihood that the entire set is driven by false positive results.” This can indeed be helpful. However, if assumptions like independence between tests or normality of the measurements for some tests fail, then this meta-analysis on the p-value may give a false positive outcome. Without a good understanding of the underlying data and statistics, I would use this test only to suggest that results are possibly false positives if the distribution of p-values is not distinguishable from a uniform distribution, rather than concluding that there must be some true positive effect if the distribution of p-values is not uniform, as is done by the author for the specific example for the effect of rivaroxaban. Response: Thanks for raising these important points. The manuscript has been updated to clarify this involves a general testing procedure which does not make assumptions on the test statistics or the set of p-values following a normal distribution. Furthermore, the manuscript now explains that there may be a need to account for dependencies between sets of p-values. The following has been included on page 8 “ While individual p-values and null-hypothesis tests cannot differentiate between false and true positive results, a set of p-values ( Figure 5 ) can be compared against a uniform distribution to determine the likelihood that the entire set is driven by false positive results. This approach is independent of the specific statistical test used to derive individual p-values. Moreover, the method can be generalised to account for dependencies among p-values, such as dependencies arising from the inclusion of both composite and individual outcomes (e.g., evaluating both any stroke and ischaemic stroke). 
“ And the figure 5 note “ N.b. The p-values were derived by arbitrarily sampling 1,000 test statistics from a normal distribution and leveraging its cumulative density function to calculate the area on the left and right side of the sampled test-statistic. Specifically, the test statistics were based on random draws from a normal distribution with a standard deviation of 1 and a mean of either 0 or 2, when the null-hypothesis was true and false, respectively. Please note that the normal distribution is only used as an exemplar, and alternative distributions with a known cumulative density function (e.g. chi-square, beta, or gamma) could have been used instead. “ These are my main comments. I also have a number of smaller remarks. The manuscript also discusses whether null hypothesis testing can prove whether the difference in means between two (infinite) populations is truly zero, but I fail to see the connection of this more philosophical point on the interpretation of hypothesis testing to the main messages. I think that the manuscript would be stronger if it is left out. Response: This has now been integrated with the sections describing power, to further support the argumentation that (observed) power calculations are futile because the null-hypothesis can never be empirically shown to be true. As such this section is far from philosophical or redundant. The meaning of the sentence “Given that power and type 1 error make extreme assumptions where either all results are true or false positives, these concepts are less relevant after the data have been collected. ” is not at all clear to me. Perhaps rephrase to clarify. Response: The following clarification was added. Page 7 “ Considerations of power and type 1 error rate are extremely relevant when designing a study, ensuring that a sufficiently accurate effect estimate may be realistically obtained given the available resources. 
However, both power and type 1 error rate are conditional probabilities assuming that the null-hypothesis is either true or false. As such these concepts are less relevant after the data have been collected, which would generally not consist of null-hypothesises which are either all true or all false, but instead will include an unknown mixture of both. “ “applying multiplicity correction can sometimes increase the false positive rate instead of reducing it.” In this sentence, and the ensuing section the term “false positive rate” is used differently from what is the common use in the statistical literature. In the literature, “false positive rate” is the expectation of a false positive result when repeating the procedure. The meaning that is taken in this manuscript is that of the fraction of the significant results of a given set of experiments that is a false positive. It is not clear that the interpretation chosen here, and which is shown by example to lead to a false expectation, actually often occurs in the scientific literature. Can some example references be provided? Response: Thanks for pointing this out, the manuscript has been updated to refer to this as the false discovery rate, including a formal definition. Page 7 “ An often overlooked point is that, depending on the unknown balance between false positives and true positives in a set of test results, applying multiplicity correction can sometimes increase the false discovery rate (i.e., the fraction of false positives divided by the total number of positive tests) instead of reducing it. For example, Figure 4A presents a naive expectation of multiple testing corrections, where the false discovery rate decreases from 1/3 to 0. However, there is no reason why the scenario depicted in Figure 4B may not occur; here, the false discovery rate increases from 1/3 to 1. 
“ Given that the false discovery rate can only be defined based on the knowledge about which hypotheses are true and which are false, the second part of the question is impossible to answer for empirical research. However, the following thought experiment can provide some intuition. We know that the incidence of cardiovascular disease increases with age, which implies that in a sufficiently large study there would be a significant association between age and cardiovascular disease, say the p-value is 0.001. If we conduct a single test at an alpha (type 1 error rate) of 0.05 we would correctly reject a null hypothesis of no association. If we subsequently performed four additional tests of association using completely random pairs of variables (i.e., where the null hypothesis is true), the Bonferroni multiplicity corrected alpha would be 0.01 and we would not reject the null hypothesis of no association between age and cardiovascular disease. If, however, due to random chance the p-value for one of the remaining tests is smaller than 0.01 we would incorrectly reject the null-hypothesis, thereby increasing the false discovery rate from 0 (when we only test for age) to 1/5. Fig. 5: The number of samples used for producing the figure is missing. Response: This was added. “comparing the set of p-values for all the aforementioned outcomes against a uniform distribution resulted in a p-value of 0.02,” Which test is used for the comparison? Response: This has been added. “ Utilizing a non-parametric Kolmogorov-Smirnov test to compare the set of p-values for all the aforementioned outcomes against a uniform distribution nevertheless resulted in a p-value of 0.02, suggesting that the protective effect of rivaroxaban is shared across multiple cardiovascular outcomes. “ “researchers should routinely indicate the bounds between which an effect is sufficiently small to be considered clinically irrelevant” This is a valuable recommendation. 
Before the start of a study it is necessary to choose an expected effect size for determining the power of a study or the number of samples. If the expected effect size, is below the bound, then either the study should not be started (if one is looking to prove a positive effect) or the bound should be taken as the effect size for the power calculation. Response: Thank you. “Second, while notions of power and type 1 errors are essential at the study design phase because these deal in hypothetical scenarios where all results are either true or false, such metrics have limited relevance when interpreting results.” I do not fully agree. For a power analysis at the design phase, typically both an effect size and the variance need to be estimated. In my opinion, a post-hoc power analysis could be relevant for interpreting a negative result, to take into account the observed variance. If the variance is a lot larger than a priori expected, then the study could have been underpowered. Perhaps adjust the text, or explain why I am wrong. Response: The requested explanation has been included. Please refer to page 7-8. “ Researchers may alternatively wish to calculate the power to reject a clinically meaningful difference other than the point estimate. Such calculations can meaningfully inform the design and viability of future studies; although sample size estimates may be more readily interpretable. However, when such power calculations are used to make statements about the presence or absence of an effect, or even lack of sample size, the described approach utilising confidence intervals and equivalence/non-inferiority margins provides more relevant information on study accuracy. The futility of using power to make claims of the absence of an effect is further illustrated by noting that in the absence of an effect (i.e. when the p-value is 1) observed power is equal to the employed alpha threshold (e.g. 0.05). 
Hence, rather counterintuitively, low power may actually argue for the absence of an effect. Because power can only be calculated assuming the null-hypothesis is false, this metric cannot be used to make claims in favour of the null-hypothesis. Furthermore, as discussed in the preceding section, statistical tests cannot be used to support the strict null-hypothesis, as such this also holds for derived metrics such as p-value and power. While power remains essential when designing a future study, it should not be used to interpret results of a completed study. At this stage more relevant metrics such as confidence intervals are available which do not condition on the presence or absence of an effect, and provide information on accuracy as well as on effect magnitude. “ I do not understand “Finally, depending on the area of research overlooked, true positive results may be more harmful than false positive results”. Is meant that setting too stringent boundaries for significance can be harmful? Consider rephrasing to make its meaning clear. Response: The following has been added to clarify this matter further. Page 9 “ Finally, while decreasing the significance threshold (e.g. from 0.05 to 0.005) decreases the type 1 error rate this decreases power as well, and hence may decrease the number of true associations discovered. Depending on the area of research overlooked, true positive results may be more harmful than false positive results. For example, protein drug targets identified in early drug development are often subjected to a substantial number of follow-up analyses, which filter out false positive results. Such follow-up studies, however, rarely expand the number of candidates, hence suggesting a more inclusive approach might be more considerate. In settings more proximal to clinical implementation and less discovery oriented, such as phase 3 clinical trials, stringent multiple testing correction is clearly called for. 
“ The manuscript, “Addressing common inferential mistakes when failing to reject the null-hypothesis” contains two main messages. First, the author argues that post-hoc power calculations should be avoided, as “they offer no additional information beyond statistical tests and p-values” and they “can be misleading because of an inability to distinguish between results based on insufficient sample size and results that reflect clinically irrelevant differences”. Second, for interpreting the results of multiple comparisons, the author recommends considering “the distribution of p-values and the proportion of significant results to identify bodies” rather than standard multiple testing procedures, because they “unrealistically assume that all positive results are false positive”. The paper is an interesting read, but I have a few remarks. I have two comments on the first point. “p-values and observed power are equivalent, and no additional information is obtained by considering both ( Figure 3).” No proof is given of this statement and it is not completely trivial. Can a (hint of a) proof be given or a reference with a proof cited? I believe the relationship shown in figure 3 between p and power depends on the number of samples, but this is not shown in the text or figure. Can you comment on this? The dependence on the sample number could give some added value for making this post-hoc power calculation, and p-values and observed power are thus not completely equivalent. Response: Thank you, the following proof is provided in the updated supplementary material, also confirming this relationship is independent of sample size. “ To describe the relationship between p-values and observed power we first let $g$ represent the cumulative density function of the p.d.f. $f$, that is $g(t) = \int_{-\infty}^{t} f(x)\,dx$. 
Next, say we observe test-statistic $t_o$; in that case the p-value for a one-sided test would simply be $1 - g(t_o)$, and assuming $f$ is symmetrical about zero the p-value for a two-sided test would simply be $2 \times (1 - g(t_o))$. If we let $t_c$ represent the critical value of a statistical test beyond which the null-hypothesis is rejected, power can be calculated as $g(t_o - t_c) + g(-t_o - t_c)$, where in most cases either the first or the second term will be close to zero and can be ignored. As an example, let $f$ be the p.d.f. of the standard normal distribution, with $t_o = 2$ and $t_c = 1.96$. In this case the two-sided p-value is $2 \times (1 - g(2)) \approx 0.046$, and power would be $g(2 - 1.96) + g(-2 - 1.96) \approx 0.52 + 0.00 = 0.52$. Similarly, say $t_o = 0$ (i.e., the null-hypothesis is true), and $f$ and $t_c$ are the same as before, we then find that $2 \times (1 - g(0)) = 1$ and $g(-1.96) + g(-1.96) = 0.05$. In other words, when the null-hypothesis is true the p-value would be 1 and power would be 0.05 (or more generally power would be equal to the area excluded by $t_c$). As such there exists a one-to-one relationship between the p-value and observed power based on $t_o$. Furthermore, and perhaps counterintuitively, observed power is smallest when the p-value provides most support for the null-hypothesis and vice versa. This is a reflection of the p-value evaluating $t_o$ assuming the null-hypothesis is true, whereas observed power evaluates $t_o$ assuming this hypothesis is false. As in the previous section, these expressions do not depend on the amount of data collected and are therefore exact. Furthermore, given that a statistical test, comparing $t_o$ to $t_c$, cannot be used to prove the null-hypothesis is true (as shown in the preceding section), the same holds for derived metrics including p-values and power. In fact power can only be calculated assuming the null-hypothesis is false, and hence cannot be used to argue in favour of the null-hypothesis being true. 
“ Are there examples that people do this particular post-hoc power calculation using the measured effect and the measured standard deviation? What I have encountered, is post-hoc power calculations based on the measured standard deviation of the sample, and for an assumed effect size (rather than the measured effect). If there are publications doing the post-hoc power analysis in the way suggested by this publication, then please give some example references. Otherwise, the reader is left to wonder if this part of the manuscript is arguing against hypothetical reasoning that people do not actually use. Response: Thank you for raising this important point. The manuscript has been updated to clarify that the raised issues with observed power are of general concern irrespective of the whether the point estimates reflect the study results or some pre-existing notion of relevant difference. The concerns about power pertain to its use as a metric to interpret current results. In doing so the manuscript more clearly differentiates with power and/or sample size calculations to inform future research (which are without fault) and power calculations for already completed studies. Page 6-7 “ Researchers may alternatively wish to calculate the power to reject a clinically meaningful difference other than the point estimate. Such calculations can meaningfully inform the design and viability of future studies; although sample size estimates may be more readily interpretable. However, when such power calculations are used to make statements about the presence or absence of an effect, or even lack of sample size, the described approach utilising confidence intervals and equivalence/non-inferiority margins provides more relevant information on study accuracy. The futility of using power to make claims of the absence of an effect is further illustrated by noting that in the absence of an effect (i.e. 
when the p-value is 1) observed power is equal to the employed alpha threshold (e.g. 0.05). Hence, rather counterintuitively, low power may actually argue for the absence of an effect. Because power can only be calculated assuming the null-hypothesis is false, this metric cannot be used to make claims in favour of the null-hypothesis. Furthermore, as discussed in the preceding section, statistical tests cannot be used to support the strict null-hypothesis, as such this also holds for derived metrics such as p-value and power. While power remains essential when designing a future study, it should not be used to interpret results of a completed study. At this stage more relevant metrics such as confidence intervals are available which do not condition on the presence or absence of an effect, and provide information on accuracy as well as on effect magnitude. “ As well as page 9 “ Second, while notions of power and type 1 errors are essential at the study design phase, because these deal in hypothetical scenarios where all results are either true or false, such metrics have limited relevance when interpreting results. Power and type 1 errors can be framed in terms of probabilities because the analysis has not yet been conducted. Once the experiment has been completed, these hypothetical probabilities are immaterial, and one is simply confronted with an unknown proportion of true-positive results. “ The second point of the paper is that “While individual p-values and null hypothesis tests cannot differentiate between false and true positive results, a set of p-values (Figure 5) can be compared against a uniform distribution to determine the likelihood that the entire set is driven by false positive results.” This can indeed be helpful. However, if assumptions like independence between tests or normality of the measurements for some tests fail, then this meta-analysis on the p-value may give a false positive outcome. 
Without a good understanding of the underlying data and statistics, I would use this test only to suggest that results are possibly false positives if the distribution of p-values is not distinguishable from a uniform distribution, rather than concluding that there must be some true positive effect if the distribution of p-values is not uniform, as is done by the author for the specific example for the effect of rivaroxaban. Response: Thanks for raising these important points. The manuscript has been updated to clarify this involves a general testing procedure which does not make assumptions on the test statistics or the set of p-values following a normal distribution. Furthermore, the manuscript now explains that there may be a need to account for dependencies between sets of p-values. The following has been included on page 8 “ While individual p-values and null-hypothesis tests cannot differentiate between false and true positive results, a set of p-values ( Figure 5 ) can be compared against a uniform distribution to determine the likelihood that the entire set is driven by false positive results. This approach is independent of the specific statistical test used to derive individual p-values. Moreover, the method can be generalised to account for dependencies among p-values, such as dependencies arising from the inclusion of both composite and individual outcomes (e.g., evaluating both any stroke and ischaemic stroke). “ And the figure 5 note “ N.b. The p-values were derived by arbitrarily sampling 1,000 test statistics from a normal distribution and leveraging its cumulative density function to calculate the area on the left and right side of the sampled test-statistic. Specifically, the test statistics were based on random draws from a normal distribution with a standard deviation of 1 and a mean of either 0 or 2, when the null-hypothesis was true and false, respectively. 
Please note that the normal distribution is only used as an exemplar, and alternative distributions with a known cumulative density function (e.g. chi-square, beta, or gamma) could have been used instead. “ These are my main comments. I also have a number of smaller remarks. The manuscript also discusses whether null hypothesis testing can prove whether difference in means between two (infinite) populations is truly zero, but I fail to see the connection of this more philosophical point on the interpretation of hypothesis testing to the main messages. I think that the manuscript would be stronger if it is left out. Response: This has now been integrated with the sections describing power, to further support the argumentation that (observed) power calculations are futile because the null-hypothesis can never be empirically shown to be true. As such this section is far from philosophical or redundant. The meaning of the sentence “Given that power and type 1 error make extreme assumptions where either all results are true or false positives, these concepts are less relevant after the data have been collected. ” is not at all clear to me. Perhaps rephrase to clarify. Response: The following clarification was added. Page 7 “ Considerations of power and type 1 error rate are extremely relevant when designing a study, ensuring that a sufficiently accurate effect estimate may be realistically obtained given the available resources. However, both power and type 1 error rate are conditional probabilities assuming that the null-hypothesis is either true or false. As such these concepts are less relevant after the data have been collected, which would generally not consist of null-hypothesises which are either all true or all false, but instead will include an unknown mixture of both. 
“ “applying multiplicity correction can sometimes increase the false positive rate instead of reducing it.” In this sentence, and the ensuing section the term “false positive rate” is used differently from what is the common use in the statistical literature. In the literature, “false positive rate” is the expectation of a false positive result when repeating the procedure. The meaning that is taken in this manuscript is that of the fraction of the significant results of a given set of experiments that is a false positive. It is not clear that the interpretation chosen here, and which is shown by example to lead to a false expectation, actually often occurs in the scientific literature. Can some example references be provided? Response: Thanks for pointing this out, the manuscript has been updated to refer to this as the false discovery rate, including a formal definition. Page 7 “ An often overlooked point is that, depending on the unknown balance between false positives and true positives in a set of test results, applying multiplicity correction can sometimes increase the false discovery rate (i.e., the fraction of false positives divided by the total number of positive tests) instead of reducing it. For example, Figure 4A presents a naive expectation of multiple testing corrections, where the false discovery rate decreases from 1/3 to 0. However, there is no reason why the scenario depicted in Figure 4B may not occur; here, the false discovery rate increases from 1/3 to 1. “ Given that the false discovery rate can only be defined based on the knowledge about which hypotheses are true and which are false, the second part of the question is impossible to answer for empirical research. However, the following thought experiment can provide some intuition. 
We know that the incidence of cardiovascular disease increases with age, which implies that in a sufficiently large study there would be a significant association between age and cardiovascular disease, say the p-value is 0.001. If we conduct a single test at an alpha (type 1 error rate) of 0.05 we would correctly reject a null hypothesis of no association. If we subsequently performed four additional tests of association using completely random pairs of variables (i.e., where the null hypothesis is true), the Bonferroni multiplicity corrected alpha would be 0.01 and we would not reject the null hypothesis of no association between age and cardiovascular disease. If, however, due to random chance the p-value for one of the remaining tests is smaller than 0.01 we would incorrectly reject the null-hypothesis, thereby increasing the false discovery rate from 0 (when we only test for age) to 1/5. Fig. 5: The number of samples used for producing the figure is missing. Response: This was added. “comparing the set of p-values for all the aforementioned outcomes against a uniform distribution resulted in a p-value of 0.02,” Which test is used for the comparison? Response: This has been added. “ Utilizing a non-parametric Kolmogorov-Smirnov test to compare the set of p-values for all the aforementioned outcomes against a uniform distribution nevertheless resulted in a p-value of 0.02, suggesting that the protective effect of rivaroxaban is shared across multiple cardiovascular outcomes. “ “researchers should routinely indicate the bounds between which an effect is sufficiently small to be considered clinically irrelevant” This is a valuable recommendation. Before the start of a study it is necessary to choose an expected effect size for determining the power of a study or the number of samples. 
If the expected effect size, is below the bound, then either the study should not be started (if one is looking to prove a positive effect) or the bound should be taken as the effect size for the power calculation. Response: Thank you. “Second, while notions of power and type 1 errors are essential at the study design phase because these deal in hypothetical scenarios where all results are either true or false, such metrics have limited relevance when interpreting results.” I do not fully agree. For a power analysis at the design phase, typically both an effect size and the variance need to be estimated. In my opinion, a post-hoc power analysis could be relevant for interpreting a negative result, to take into account the observed variance. If the variance is a lot larger than a priori expected, then the study could have been underpowered. Perhaps adjust the text, or explain why I am wrong. Response: The requested explanation has been included. Please refer to page 7-8. “ Researchers may alternatively wish to calculate the power to reject a clinically meaningful difference other than the point estimate. Such calculations can meaningfully inform the design and viability of future studies; although sample size estimates may be more readily interpretable. However, when such power calculations are used to make statements about the presence or absence of an effect, or even lack of sample size, the described approach utilising confidence intervals and equivalence/non-inferiority margins provides more relevant information on study accuracy. The futility of using power to make claims of the absence of an effect is further illustrated by noting that in the absence of an effect (i.e. when the p-value is 1) observed power is equal to the employed alpha threshold (e.g. 0.05). Hence, rather counterintuitively, low power may actually argue for the absence of an effect. 
Because power can only be calculated assuming the null-hypothesis is false, this metric cannot be used to make claims in favour of the null-hypothesis. Furthermore, as discussed in the preceding section, statistical tests cannot be used to support the strict null-hypothesis, as such this also holds for derived metrics such as p-value and power. While power remains essential when designing a future study, it should not be used to interpret results of a completed study. At this stage more relevant metrics such as confidence intervals are available which do not condition on the presence or absence of an effect, and provide information on accuracy as well as on effect magnitude. “ I do not understand “Finally, depending on the area of research overlooked, true positive results may be more harmful than false positive results”. Is meant that setting too stringent boundaries for significance can be harmful? Consider rephrasing to make its meaning clear. Response: The following has been added to clarify this matter further. Page 9 “ Finally, while decreasing the significance threshold (e.g. from 0.05 to 0.005) decreases the type 1 error rate this decreases power as well, and hence may decrease the number of true associations discovered. Depending on the area of research overlooked, true positive results may be more harmful than false positive results. For example, protein drug targets identified in early drug development are often subjected to a substantial number of follow-up analyses, which filter out false positive results. Such follow-up studies, however, rarely expand the number of candidates, hence suggesting a more inclusive approach might be more considerate. In settings more proximal to clinical implementation and less discovery oriented, such as phase 3 clinical trials, stringent multiple testing correction is clearly called for. “ Competing Interests: No competing interests were disclosed. 
Close Report a concern COMMENT ON THIS REPORT Views 0 Cite How to cite this report: Cui Y. Reviewer Report For: Addressing common inferential mistakes when failing to reject the null-hypothesis [version 3; peer review: 2 approved] . F1000Research 2025, 13 :1488 ( https://doi.org/10.5256/f1000research.174018.r350489 ) The direct URL for this report is: https://f1000research.com/articles/13-1488/v1#referee-response-350489 NOTE: it is important to ensure the information in square brackets after the title is included in this citation. Close Copy Citation Details Reviewer Report 02 Jan 2025 Ying Cui , Stanford University, Stanford,, California, USA Approved VIEWS 0 https://doi.org/10.5256/f1000research.174018.r350489 The author have made very interesting points on common mistakes when interpreting results from statistical tests that fail to reject the null hypothesis. There are some minor comments as listed below: 1. In Page 5, it was mentioned that ... Continue reading READ ALL The author have made very interesting points on common mistakes when interpreting results from statistical tests that fail to reject the null hypothesis. There are some minor comments as listed below: 1. In Page 5, it was mentioned that "Hence, a more relevant alternative to post-hoc power calculations is to evaluate the extent to which the CI includes clinically relevant effect estimates, which is in line with the aforementioned equivalence/non-inferiority approach.". It can be helpful if the author could provide a specific example (e.g. further discussion with the two HR estimates using equivalence/non-inferiority approach) to further illustrate this. 2. In the last line of Page 5, "naïve" should be "naive". 3. In the last lines of Page 7, it was recommended that "researchers should routinely apply composite null hypothesis tests evaluated against meaningful bounds of insignificance". 
It would be helpful if the author could provide more discussion with suggestions on how to identify the "bounds of insignificance". Is the topic of the opinion article discussed accurately in the context of the current literature? Yes Are all factual statements correct and adequately supported by citations? Yes Are arguments sufficiently supported by evidence from the published literature? Yes Are the conclusions drawn balanced and justified on the basis of the presented arguments? Yes Competing Interests: No competing interests were disclosed. Reviewer Expertise: Hypothesis testing, Biostatistics I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard. Close READ LESS CITE CITE HOW TO CITE THIS REPORT Cui Y. Reviewer Report For: Addressing common inferential mistakes when failing to reject the null-hypothesis [version 3; peer review: 2 approved] . F1000Research 2025, 13 :1488 ( https://doi.org/10.5256/f1000research.174018.r350489 ) The direct URL for this report is: https://f1000research.com/articles/13-1488/v1#referee-response-350489 NOTE: it is important to ensure the information in square brackets after the title is included in all citations of this article. COPY CITATION DETAILS Report a concern Author Response 25 Feb 2025 Amand Schmidt , UCL British Heart Foundation Research Accelerator, London, WC1E6HX, UK 25 Feb 2025 Author Response Reviewer Comments: The author have made very interesting points on common mistakes when interpreting results from statistical tests that fail to reject the null hypothesis. There are some minor comments as listed ... Continue reading Reviewer Comments: The author have made very interesting points on common mistakes when interpreting results from statistical tests that fail to reject the null hypothesis. There are some minor comments as listed below: 1. 
In Page 5, it was mentioned that "Hence, a more relevant alternative to post-hoc power calculations is to evaluate the extent to which the CI includes clinically relevant effect estimates, which is in line with the aforementioned equivalence/non-inferiority approach.". It can be helpful if the author could provide a specific example (e.g. further discussion with the two HR estimates using equivalence/non-inferiority approach) to further illustrate this. Response: Thank you, we have included the following example, using the VOYAGER PAD trial results. Page 6 “ For example, the VOYAGER PAD HR estimate of 0.86 (95%CI 0.40;1.87) for bleeding risk in people with endovascular PAD clearly shows that the collected data supports a wide range of effect estimates, including potentially harmful associations. However, because the confidence interval only partially overlaps with the proposed (hypothetical) upper bound of acceptable harm of 1.25, testing against this bound results in a p-value of 0.17, which is considerably smaller than testing against the complete absence of an effect: p-value 0.70. By comparison, the observed power estimate for these results is 7%, which implies that if the true HR was 0.86 one would have rejected the strict null-hypothesis in 7 out of 100 repeated experiments. As such, observed power provides limited information relative to the presented alternative approaches, particularly the confidence interval based approach which allows for an informative discussion of benefits and harms in terms of effect magnitude(s). ” Reviewer Comments: 2. In the last line of Page 5, "naïve" should be "naive". Response: Thank you. Reviewer Comments: 3. In the last lines of Page 7, it was recommended that "researchers should routinely apply composite null hypothesis tests evaluated against meaningful bounds of insignificance". It would be helpful if the author could provide more discussion with suggestions on how to identify the "bounds of insignificance". 
Response: The following has been included on page 4 “ Defining bounds of equivalence or non-inferiority is challenging and a possible source of contention. Typically, such bounds are defined by combining statistical and clinical considerations. For example, evidence from previous studies can be meta-analysed to obtain a pooled effect estimate and confidence interval, where the confidence interval limits can be multiplied by a constants representing the amount of effect that one would like to preserve or rule out (for safety). 7 “ Reviewer Comments: The author have made very interesting points on common mistakes when interpreting results from statistical tests that fail to reject the null hypothesis. There are some minor comments as listed below: 1. In Page 5, it was mentioned that "Hence, a more relevant alternative to post-hoc power calculations is to evaluate the extent to which the CI includes clinically relevant effect estimates, which is in line with the aforementioned equivalence/non-inferiority approach.". It can be helpful if the author could provide a specific example (e.g. further discussion with the two HR estimates using equivalence/non-inferiority approach) to further illustrate this. Response: Thank you, we have included the following example, using the VOYAGER PAD trial results. Page 6 “ For example, the VOYAGER PAD HR estimate of 0.86 (95%CI 0.40;1.87) for bleeding risk in people with endovascular PAD clearly shows that the collected data supports a wide range of effect estimates, including potentially harmful associations. However, because the confidence interval only partially overlap with the proposed (hypothetical) upper bounds of acceptable harm of 1.25, testing against this bound results a p-value of 0.17 which is considerably smaller than testing against the complete absence of an effect: p-value 0.70. 
By comparison, the observed power estimate for these results is 7%, which implies that if the true HR was 0.86 one would have rejected the strict null-hypothesis in 7 out of 100 repeated experiment. As such observed power provide limit information relative to the presented alternative approaches, particularly the confidence interval based approach which allows for an informative discussion of benefits and harms in terms of effect magnitude(s). “ Reviewer Comments: 2. In the last line of Page 5, "naïve" should be "naive". Response: Thank you. Reviewer Comments: 3. In the last lines of Page 7, it was recommended that "researchers should routinely apply composite null hypothesis tests evaluated against meaningful bounds of insignificance". It would be helpful if the author could provide more discussion with suggestions on how to identify the "bounds of insignificance". Response: The following has been included on page 4 “ Defining bounds of equivalence or non-inferiority is challenging and a possible source of contention. Typically, such bounds are defined by combining statistical and clinical considerations. For example, evidence from previous studies can be meta-analysed to obtain a pooled effect estimate and confidence interval, where the confidence interval limits can be multiplied by a constants representing the amount of effect that one would like to preserve or rule out (for safety). 7 “ Competing Interests: No competing interests were disclosed. Close Report a concern Respond or Comment COMMENTS ON THIS REPORT Author Response 25 Feb 2025 Amand Schmidt , UCL British Heart Foundation Research Accelerator, London, WC1E6HX, UK 25 Feb 2025 Author Response Reviewer Comments: The author have made very interesting points on common mistakes when interpreting results from statistical tests that fail to reject the null hypothesis. There are some minor comments as listed ... 
Continue reading Reviewer Comments: The author have made very interesting points on common mistakes when interpreting results from statistical tests that fail to reject the null hypothesis. There are some minor comments as listed below: 1. In Page 5, it was mentioned that "Hence, a more relevant alternative to post-hoc power calculations is to evaluate the extent to which the CI includes clinically relevant effect estimates, which is in line with the aforementioned equivalence/non-inferiority approach.". It can be helpful if the author could provide a specific example (e.g. further discussion with the two HR estimates using equivalence/non-inferiority approach) to further illustrate this. Response: Thank you, we have included the following example, using the VOYAGER PAD trial results. Page 6 “ For example, the VOYAGER PAD HR estimate of 0.86 (95%CI 0.40;1.87) for bleeding risk in people with endovascular PAD clearly shows that the collected data supports a wide range of effect estimates, including potentially harmful associations. However, because the confidence interval only partially overlap with the proposed (hypothetical) upper bounds of acceptable harm of 1.25, testing against this bound results a p-value of 0.17 which is considerably smaller than testing against the complete absence of an effect: p-value 0.70. By comparison, the observed power estimate for these results is 7%, which implies that if the true HR was 0.86 one would have rejected the strict null-hypothesis in 7 out of 100 repeated experiment. As such observed power provide limit information relative to the presented alternative approaches, particularly the confidence interval based approach which allows for an informative discussion of benefits and harms in terms of effect magnitude(s). “ Reviewer Comments: 2. In the last line of Page 5, "naïve" should be "naive". Response: Thank you. Reviewer Comments: 3. 
In the last lines of Page 7, it was recommended that "researchers should routinely apply composite null hypothesis tests evaluated against meaningful bounds of insignificance". It would be helpful if the author could provide more discussion with suggestions on how to identify the "bounds of insignificance". Response: The following has been included on page 4 “ Defining bounds of equivalence or non-inferiority is challenging and a possible source of contention. Typically, such bounds are defined by combining statistical and clinical considerations. For example, evidence from previous studies can be meta-analysed to obtain a pooled effect estimate and confidence interval, where the confidence interval limits can be multiplied by a constants representing the amount of effect that one would like to preserve or rule out (for safety). 7 “ Reviewer Comments: The author have made very interesting points on common mistakes when interpreting results from statistical tests that fail to reject the null hypothesis. There are some minor comments as listed below: 1. In Page 5, it was mentioned that "Hence, a more relevant alternative to post-hoc power calculations is to evaluate the extent to which the CI includes clinically relevant effect estimates, which is in line with the aforementioned equivalence/non-inferiority approach.". It can be helpful if the author could provide a specific example (e.g. further discussion with the two HR estimates using equivalence/non-inferiority approach) to further illustrate this. Response: Thank you, we have included the following example, using the VOYAGER PAD trial results. Page 6 “ For example, the VOYAGER PAD HR estimate of 0.86 (95%CI 0.40;1.87) for bleeding risk in people with endovascular PAD clearly shows that the collected data supports a wide range of effect estimates, including potentially harmful associations. 
However, because the confidence interval only partially overlap with the proposed (hypothetical) upper bounds of acceptable harm of 1.25, testing against this bound results a p-value of 0.17 which is considerably smaller than testing against the complete absence of an effect: p-value 0.70. By comparison, the observed power estimate for these results is 7%, which implies that if the true HR was 0.86 one would have rejected the strict null-hypothesis in 7 out of 100 repeated experiment. As such observed power provide limit information relative to the presented alternative approaches, particularly the confidence interval based approach which allows for an informative discussion of benefits and harms in terms of effect magnitude(s). “ Reviewer Comments: 2. In the last line of Page 5, "naïve" should be "naive". Response: Thank you. Reviewer Comments: 3. In the last lines of Page 7, it was recommended that "researchers should routinely apply composite null hypothesis tests evaluated against meaningful bounds of insignificance". It would be helpful if the author could provide more discussion with suggestions on how to identify the "bounds of insignificance". Response: The following has been included on page 4 “ Defining bounds of equivalence or non-inferiority is challenging and a possible source of contention. Typically, such bounds are defined by combining statistical and clinical considerations. For example, evidence from previous studies can be meta-analysed to obtain a pooled effect estimate and confidence interval, where the confidence interval limits can be multiplied by a constants representing the amount of effect that one would like to preserve or rule out (for safety). 7 “ Competing Interests: No competing interests were disclosed. 
Close Report a concern COMMENT ON THIS REPORT Comments on this article Comments (0) Version 3 VERSION 3 PUBLISHED 05 Dec 2024 ADD YOUR COMMENT Comment keyboard_arrow_left keyboard_arrow_right Open Peer Review Reviewer Status info_outline Alongside their report, reviewers assign a status to the article: Approved The paper is scientifically sound in its current form and only minor, if any, improvements are suggested Approved with reservations A number of small changes, sometimes more significant revisions are required to address specific details and improve the papers academic merit. Not approved Fundamental flaws in the paper seriously undermine the findings and conclusions Reviewer Reports Invited Reviewers 1 2 Version 3 (revision) 01 Apr 25 Version 2 (revision) 25 Feb 25 read read Version 1 05 Dec 24 read read Ying Cui , Stanford University, Stanford,, USA J. Alexander Heimel , Netherlands Institute for Neuroscience, Amsterdam, The Netherlands Comments on this article All Comments (0) Add a comment Sign up for content alerts Sign Up You are now signed up to receive this alert Browse by related subjects keyboard_arrow_left Back to all reports Reviewer Report 0 Views copyright © 2025 Heimel J. This is an open access peer review report distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. 21 Mar 2025 | for Version 2 J. Alexander Heimel , Netherlands Institute for Neuroscience, Amsterdam, The Netherlands 0 Views copyright © 2025 Heimel J. This is an open access peer review report distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. 
format_quote Cite this report speaker_notes Responses (1) Approved info_outline Alongside their report, reviewers assign a status to the article: Approved The paper is scientifically sound in its current form and only minor, if any, improvements are suggested Approved with reservations A number of small changes, sometimes more significant revisions are required to address specific details and improve the paper's academic merit. Not approved Fundamental flaws in the paper seriously undermine the findings and conclusions The author has answered my comments to my satisfaction, except for one small point. In his reply to my question, the author wrote that the relationship between p-value and observed power is independent of sample size n. I do not believe this to be true. In the new supplementary material, the author sketches this relationship, but ignores the fact that the t-distribution (denoted as g in the supplementary material) is dependent on the degrees of freedom (d.o.f. = n-1) and therefore on the sample size. The value t_c will thus depend on n, and the observed power, given by the author as g(t_0-t_c)+g(-t_0-t_c), will also depend on n, through g and t_c. In fact, I believe the correct formula for observed power should be: Observed_power(t) = g_{d.o.f., t}(t_c) + g_{d.o.f., -t}(t_c), where g_{d.o.f., t}(t_c) is the cumulative non-central t-distribution with d.o.f. degrees of freedom and shift t, evaluated at t_c. For large n, the approximations used by the author converge, and the dependence on the sample size disappears. This dependence on sample size, however, does not fundamentally change or invalidate the author’s argument that a post-hoc power calculation using the observed effect is not useful. 
While studying the relationship between observed power and p-value for my reply, I came across the Wikipedia page on power analysis that also makes the author’s point that post-hoc power analysis is fundamentally flawed, with references to Hoenig (The American Statistician, 2001), cited more than 2000 times, and Thomas (Conservation Biology, 1997) already pointing out the same flaw. The author’s case is thus not new, but may still be very relevant and a good reminder. It certainly was for me. Minor details about the new supplementary note: I think that when writing “the p-value for a one-sided tests would simply be 1-g(|t_0|)”, already the assumption of f being symmetrical about zero is made. The statement is not true for a general f and t_0<0. “In this case the two-sided p-value is 2x(1-g(2)) \approx 0.46”. I believe this should be “\approx 0.046” (and is only true for large sample size). References 1. Hoenig J, Heisey D: The Abuse of Power. The American Statistician . 2001; 55 (1): 19-24 Publisher Full Text 2. Thomas, Len. "Retrospective power analysis." Conservation Biology 11.1 (1997): 276-280. Competing Interests No competing interests were disclosed. Reviewer Expertise Neuroscience I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard. reply Respond to this report Responses (1) Author Response 01 Apr 2025 Amand Schmidt, UCL British Heart Foundation Research Accelerator, London, WC1E6HX, UK Reviewer 2 Comment: The author has answered my comments to my satisfaction, except for one small point. In his reply to my question, the author wrote that the relationship between p-value and observed power is independent of sample size n. I do not believe this to be true. 
In the new supplementary material, the author sketches this relationship, but ignores the fact the t-distribution (denoted as g in the supplementary material) is dependent on the degrees of freedom (d.o.f. = n-1) and therefore on the sample size. The value t_c will thus depend on n and the observed power, given by the author as g(t_0-t_c)+g(-t_0-t_c) will also depend on n, through g and t_c. In fact, I believe the correct formula for observed power should be: Observed_power(t) = g_{d.o.f, t}(t_c) + g_{d.o.f, -t}(t_c) With g_{d.o.f.,t}(t_c) the cumulative non-central t-distribution with d.o.f. degrees of freedom shift t evaluated at t_c. For large n, the approximations used by the author converge, and the dependence on the sample size disappears. This dependence on sample size, however, does not fundamentally change or invalidate the author’s argument that a post-hoc power calculation using the observed effect is not useful. Response: I am pleased to note that the reviewer agrees with the main conclusions of the manuscript. The reviewer appears to be referring to the sample size–dependent convergence of the t -distribution to the standard normal distribution. While this is, of course, entirely correct and uncontested, it is not directly relevant to the point under discussion. The relationship between p-values and statistical power is independent of the choice of distribution, and does not rely on any asymptotic approximation. Although the illustrative examples in the manuscript use the standard normal distribution -which does not depend on sample size or degrees of freedom - the relationship between p-value and power holds equally when using distributions that do, such as the t- or F-distributions. This is because both the p -value and power are calculated using the same distribution, with the same degrees of freedom. 
For example, if one calculates a p-value using a t-distribution with 7 degrees of freedom, the corresponding power would also be calculated using that same t-distribution with 7 degrees of freedom. Thus, while the distribution itself may depend on sample size, the relationship between p-value and power remains exact and independent of sample size. The following was added to the supplementary material to help clarify this. “ As in the previous section, these expressions do not depend on the amount of data collected and are therefore exact. If one uses a statistical distribution with degrees of freedom (which often depend on sample size), such as the t or F distributions, the relationship between the p-value and power remains exact, simply because both quantities are calculated using the same distribution with the same degrees of freedom. ” Comment: While studying the relationship between observed power and p-value for my reply, I came across the Wikipedia page on power analysis that also makes the author’s point that post-hoc power analysis is fundamentally flawed, with references to Hoenig (The American Statistician, 2001), cited more than 2000 times, and Thomas (Conservation Biology, 1997) already pointing out the same flaw. The author’s case is thus not new, but may still be very relevant and a good reminder. It certainly was for me. Response: Thank you, these references have been included in the updated manuscript. Comment: Minor details about the new supplementary note: I think that when writing “the p-value for a one-sided tests would simply be 1-g(|t_0|)”, already the assumption of f being symmetrical about zero is made. The statement is not true for a general f and t_0<0. Response: This section briefly describes p-values for one-sided and two-sided tests. The section has been clarified to explain that this statement is not an assumption but merely reflects a known property: some distributions are symmetrical about zero whereas others are not. 
The expanded section now provides examples of alternative distributions which are not symmetrical about zero. The reason that this was not originally included is that commonly known non-symmetrical distributions such as the Chi-square, the F distribution, or even the Gamma distribution are not defined for negative values, making them less applicable for two-sided tests. “ Next, say we observe test-statistic $t_o$; in that case the p-value for a one-sided test would simply be $1-g(|t_o|)$, and assuming f is symmetrical about zero (e.g. the normal distribution or t-distribution) the p-value for a two-sided test would simply be $2\times\left(1-g(|t_o|)\right)$. For distributions that are not symmetric about zero, both sides of the distribution should be considered separately. It is important to note, however, that many commonly used asymmetric distributions—such as the F, Chi-squared, and Gamma distributions—are defined only for positive values. This restricts their direct applicability for two-sided hypothesis testing. ” Comment: “In this case the two-sided p-value is 2x(1-g(2)) \approx 0.46”. I believe this should be “\approx 0.046” (and is only true for large sample size). Response: Thanks for spotting this typo; this should indeed be 0.046. However, this is not dependent on any sample size assumption. The approximate symbol (≈) is used purely to reflect rounding to three decimal places, not to imply asymptotic behaviour or large-sample properties. It’s possible the reviewer is conflating this with the well-known approximation of the t-distribution by the standard normal distribution as sample size increases. While that is certainly valid in a different context, it is not relevant here. I am not making any claims about inference or distributional convergence; rather, I am simply referring to the numerical value of the cumulative distribution function. 
To clarify: if g denotes the cumulative distribution function of the standard normal distribution, then $2\times(1-g(2)) \approx 0.046$, regardless of sample size. Similarly, if one were to use the t-distribution with 7 degrees of freedom, the corresponding value would be approximately 0.086 - the choice of 7 here being arbitrary and illustrative. This is a purely mathematical statement about evaluating a distribution function at a specific point, and not a comment on sampling or estimation. View more View less Competing Interests No competing interests were disclosed. reply Respond Report a concern Heimel JA. Peer Review Report For: Addressing common inferential mistakes when failing to reject the null-hypothesis [version 3; peer review: 2 approved] . F1000Research 2025, 13 :1488 ( https://doi.org/10.5256/f1000research.177945.r368387) NOTE: it is important to ensure the information in square brackets after the title is included in this citation. The direct URL for this report is: https://f1000research.com/articles/13-1488/v2#referee-response-368387 keyboard_arrow_left Back to all reports Reviewer Report 0 Views copyright © 2025 Cui Y. This is an open access peer review report distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. 06 Mar 2025 | for Version 2 Ying Cui , Stanford University, Stanford, California, USA 0 Views copyright © 2025 Cui Y. This is an open access peer review report distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. 
format_quote Cite this report speaker_notes Responses (0) Approved info_outline Alongside their report, reviewers assign a status to the article: Approved The paper is scientifically sound in its current form and only minor, if any, improvements are suggested Approved with reservations A number of small changes, sometimes more significant revisions are required to address specific details and improve the papers academic merit. Not approved Fundamental flaws in the paper seriously undermine the findings and conclusions No further comments. Competing Interests No competing interests were disclosed. Reviewer Expertise Hypothesis testing, Biostatistics I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard. reply Respond to this report Responses (0) Cui Y. Peer Review Report For: Addressing common inferential mistakes when failing to reject the null-hypothesis [version 3; peer review: 2 approved] . F1000Research 2025, 13 :1488 ( https://doi.org/10.5256/f1000research.177945.r368388) NOTE: it is important to ensure the information in square brackets after the title is included in this citation. The direct URL for this report is: https://f1000research.com/articles/13-1488/v2#referee-response-368388 keyboard_arrow_left Back to all reports Reviewer Report 0 Views copyright © 2025 Heimel J. This is an open access peer review report distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. 07 Jan 2025 | for Version 1 J. Alexander Heimel , Netherlands Institute for Neuroscience, Amsterdam, The Netherlands 0 Views copyright © 2025 Heimel J. 
This is an open access peer review report distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. format_quote Cite this report speaker_notes Responses (1) Approved With Reservations info_outline Alongside their report, reviewers assign a status to the article: Approved The paper is scientifically sound in its current form and only minor, if any, improvements are suggested Approved with reservations A number of small changes, sometimes more significant revisions are required to address specific details and improve the paper's academic merit. Not approved Fundamental flaws in the paper seriously undermine the findings and conclusions The manuscript, “Addressing common inferential mistakes when failing to reject the null-hypothesis” contains two main messages. First, the author argues that post-hoc power calculations should be avoided, as “they offer no additional information beyond statistical tests and p-values” and they “can be misleading because of an inability to distinguish between results based on insufficient sample size and results that reflect clinically irrelevant differences”. Second, for interpreting the results of multiple comparisons, the author recommends to consider “the distribution of p-values and the proportion of significant results to identify bodies” rather than standard multiple testing procedures, because they “unrealistically assume that all positive results are false positive”. The paper is an interesting read, but I have a few remarks. I have two comments on the first point. “p-values and observed power are equivalent, and no additional information is obtained by considering both ( Figure 3).” No proof is given of this statement and it is not completely trivial. Can a (hint of a) proof be given or a reference with a proof cited? 
I believe the relationship shown in Figure 3 between p and power depends on the number of samples, but this is not shown in the text or figure. Can you comment on this? The dependence on the sample number could give some added value for making this post-hoc power calculation, and p-values and observed power are thus not completely equivalent. Are there examples that people do this particular post-hoc power calculation using the measured effect and the measured standard deviation? What I have encountered is post-hoc power calculations based on the measured standard deviation of the sample, and for an assumed effect size (rather than the measured effect). If there are publications doing the post-hoc power analysis in the way suggested by this publication, then please give some example references. Otherwise, the reader is left to wonder if this part of the manuscript is arguing against hypothetical reasoning that people do not actually use. The second point of the paper is that “While individual p-values and null hypothesis tests cannot differentiate between false and true positive results, a set of p-values (Figure 5) can be compared against a uniform distribution to determine the likelihood that the entire set is driven by false positive results.” This can indeed be helpful. However, if assumptions like independence between tests or normality of the measurements for some tests fail, then this meta-analysis on the p-values may give a false positive outcome. Without a good understanding of the underlying data and statistics, I would use this test only to suggest that results are possibly false positives if the distribution of p-values is not distinguishable from a uniform distribution, rather than concluding that there must be some true positive effect if the distribution of p-values is not uniform, as is done by the author for the specific example for the effect of rivaroxaban. These are my main comments. I also have a number of smaller remarks. 
The manuscript also discusses whether null hypothesis testing can prove whether difference in means between two (infinite) populations is truly zero, but I fail to see the connection of this more philosophical point on the interpretation of hypothesis testing to the main messages. I think that the manuscript would be stronger if it is left out. The meaning of the sentence “Given that power and type 1 error make extreme assumptions where either all results are true or false positives, these concepts are less relevant after the data have been collected. ” is not at all clear to me. Perhaps rephrase to clarify. “applying multiplicity correction can sometimes increase the false positive rate instead of reducing it.” In this sentence, and the ensuing section the term “false positive rate” is used differently from what is the common use in the statistical literature. In the literature, “false positive rate” is the expectation of a false positive result when repeating the procedure. The meaning that is taken in this manuscript is that of the fraction of the significant results of a given set of experiments that is a false positive. It is not clear that the interpretation chosen here, and which is shown by example to lead to a false expectation, actually often occurs in the scientific literature. Can some example references be provided? Fig. 5: The number of samples used for producing the figure is missing. “comparing the set of p-values for all the aforementioned outcomes against a uniform distribution resulted in a p-value of 0.02,” Which test is used for the comparison. “researchers should routinely indicate the bounds between which an effect is sufficiently small to be considered clinically irrelevant” This is a valuable recommendation. Before the start of a study it is necessary to choose an expected effect size for determining the power of a study or the number of samples. 
If the expected effect size, is below the bound, then either the study should not be started (if one is looking to prove a positive effect) or the bound should be taken as the effect size for the power calculation. “Second, while notions of power and type 1 errors are essential at the study design phase because these deal in hypothetical scenarios where all results are either true or false, such metrics have limited relevance when interpreting results.” I do not fully agree. For a power analysis at the design phase, typically both an effect size and the variance need to be estimated. In my opinion, a post-hoc power analysis could be relevant for interpreting a negative result, to take into account the observed variance. If the variance is a lot larger than a priori expected, then the study could have been underpowered. Perhaps adjust the text, or explain why I am wrong. I do not understand “Finally, depending on the area of research overlooked, true positive results may be more harmful than false positive results”. Is meant that setting too stringent boundaries for significance can be harmful? Consider rephrasing to make its meaning clear. Is the topic of the opinion article discussed accurately in the context of the current literature? Partly Are all factual statements correct and adequately supported by citations? Partly Are arguments sufficiently supported by evidence from the published literature? Partly Are the conclusions drawn balanced and justified on the basis of the presented arguments? Partly Competing Interests No competing interests were disclosed. Reviewer Expertise Neuroscience I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard, however I have significant reservations, as outlined above. 
reply Respond to this report Responses (1) Author Response 25 Feb 2025 Amand Schmidt, UCL British Heart Foundation Research Accelerator, London, WC1E6HX, UK The manuscript, “Addressing common inferential mistakes when failing to reject the null-hypothesis” contains two main messages. First, the author argues that post-hoc power calculations should be avoided, as “they offer no additional information beyond statistical tests and p-values” and they “can be misleading because of an inability to distinguish between results based on insufficient sample size and results that reflect clinically irrelevant differences”. Second, for interpreting the results of multiple comparison, the authors recommends to consider “the distribution of p-values and the proportion of significant results to identify bodies” rather than standard multiple testing procedures, because they “unrealistically assume that all positive results are false positive”. The paper is an interesting read, but I have a few remarks. I have two comments on the first point. “p-values and observed power are equivalent, and no additional information is obtained by considering both ( Figure 3).” No proof is given of this statement and it is not completely trivial. Can a (hint of a) proof be given or a reference with a proof cited? I believe the relationship shown and figure 3 between p and power depends on the number of samples, but this is not shown in the text or figure. Can you comment on this? The dependence on the sample number could give some added value for making this post-hoc power calculation, and p-values and observed power are thus not completely equivalent. Response: thank you, the following proof is provided in the updated supplementary material, also confirming this relationship is independent of sample size. “ To describe the relationship between p-values and observed power we first let $g$ represent the cumulative density function of the p.d.f. $f$, that is $g(t) = \int_{-\infty}^{t} f(x)\,dx$. 
Next, say we observe test-statistic $t_o$, in that case the p-value for a one-sided test would simply be $1 - g(t_o)$, and assuming $f$ is symmetrical about zero the p-value for a two-sided test would simply be $2 \times (1 - g(t_o))$. If we let $t_c$ represent the critical value of a statistical test beyond which the null-hypothesis is rejected, power can be calculated as $g(t_o - t_c) + g(-t_o - t_c)$, where in most cases either the first or the second term will be close to zero and can be ignored. As an example, let $f$ be the p.d.f. of the standard normal distribution, with $t_o = 2$ and $t_c = 1.96$. In this case the two-sided p-value is $2 \times (1 - g(2)) \approx 0.046$, and power would be $g(2 - 1.96) + g(-2 - 1.96) \approx 0.52 + 0.00 = 0.52$. Similarly, say $t_o = 0$ (i.e., the null-hypothesis is true), and $f$ and $t_c$ are the same as before, we then find that $2 \times (1 - g(0)) = 1$ and $g(-1.96) + g(-1.96) = 0.05$. In other words when the null-hypothesis is true the p-value would be 1 and power would be 0.05 (or more generally power would be equal to the area excluded by $t_c$). As such there exists a one-to-one relationship between the p-value and observed power based on $t_o$. Furthermore, and perhaps counterintuitively observed power is smallest when the p-value provides most support for the null-hypothesis and vice versa. This is a reflection of the p-value evaluating $t_o$ assuming the null-hypothesis is true, whereas observed power evaluates $t_o$ assuming this hypothesis is false. As in the previous section, these expressions do not depend on the amount of data collected and are therefore exact. Furthermore, given that a statistical test, comparing $t_o$ to $t_c$, cannot be used to prove the null-hypothesis is true (as shown in the preceding section), the same holds for derived metrics including p-values and power. In fact power can only be calculated assuming the null-hypothesis is false, and hence cannot be used to argue in favour of the null-hypothesis being true. 
“ Are there examples that people do this particular post-hoc power calculation using the measured effect and the measured standard deviation? What I have encountered, is post-hoc power calculations based on the measured standard deviation of the sample, and for an assumed effect size (rather than the measured effect). If there are publications doing the post-hoc power analysis in the way suggested by this publication, then please give some example references. Otherwise, the reader is left to wonder if this part of the manuscript is arguing against hypothetical reasoning that people do not actually use. Response: Thank you for raising this important point. The manuscript has been updated to clarify that the raised issues with observed power are of general concern irrespective of whether the point estimates reflect the study results or some pre-existing notion of relevant difference. The concerns about power pertain to its use as a metric to interpret current results. In doing so the manuscript more clearly differentiates between power and/or sample size calculations to inform future research (which are without fault) and power calculations for already completed studies. Page 6-7 “ Researchers may alternatively wish to calculate the power to reject a clinically meaningful difference other than the point estimate. Such calculations can meaningfully inform the design and viability of future studies; although sample size estimates may be more readily interpretable. However, when such power calculations are used to make statements about the presence or absence of an effect, or even lack of sample size, the described approach utilising confidence intervals and equivalence/non-inferiority margins provides more relevant information on study accuracy. The futility of using power to make claims of the absence of an effect is further illustrated by noting that in the absence of an effect (i.e. 
when the p-value is 1) observed power is equal to the employed alpha threshold (e.g. 0.05). Hence, rather counterintuitively, low power may actually argue for the absence of an effect. Because power can only be calculated assuming the null-hypothesis is false, this metric cannot be used to make claims in favour of the null-hypothesis. Furthermore, as discussed in the preceding section, statistical tests cannot be used to support the strict null-hypothesis, as such this also holds for derived metrics such as p-value and power. While power remains essential when designing a future study, it should not be used to interpret results of a completed study. At this stage more relevant metrics such as confidence intervals are available which do not condition on the presence or absence of an effect, and provide information on accuracy as well as on effect magnitude. “ As well as page 9 “ Second, while notions of power and type 1 errors are essential at the study design phase, because these deal in hypothetical scenarios where all results are either true or false, such metrics have limited relevance when interpreting results. Power and type 1 errors can be framed in terms of probabilities because the analysis has not yet been conducted. Once the experiment has been completed, these hypothetical probabilities are immaterial, and one is simply confronted with an unknown proportion of true-positive results. “ The second point of the paper is that “While individual p-values and null hypothesis tests cannot differentiate between false and true positive results, a set of p-values (Figure 5) can be compared against a uniform distribution to determine the likelihood that the entire set is driven by false positive results.” This can indeed be helpful. However, if assumptions like independence between tests or normality of the measurements for some tests fail, then this meta-analysis on the p-value may give a false positive outcome. 
Without a good understanding of the underlying data and statistics, I would use this test only to suggest that results are possibly false positives if the distribution of p-values is not distinguishable from a uniform distribution, rather than concluding that there must be some true positive effect if the distribution of p-values is not uniform, as is done by the author for the specific example for the effect of rivaroxaban. Response: Thanks for raising these important points. The manuscript has been updated to clarify this involves a general testing procedure which does not make assumptions on the test statistics or the set of p-values following a normal distribution. Furthermore, the manuscript now explains that there may be a need to account for dependencies between sets of p-values. The following has been included on page 8 “ While individual p-values and null-hypothesis tests cannot differentiate between false and true positive results, a set of p-values ( Figure 5 ) can be compared against a uniform distribution to determine the likelihood that the entire set is driven by false positive results. This approach is independent of the specific statistical test used to derive individual p-values. Moreover, the method can be generalised to account for dependencies among p-values, such as dependencies arising from the inclusion of both composite and individual outcomes (e.g., evaluating both any stroke and ischaemic stroke). “ And the figure 5 note “ N.b. The p-values were derived by arbitrarily sampling 1,000 test statistics from a normal distribution and leveraging its cumulative density function to calculate the area on the left and right side of the sampled test-statistic. Specifically, the test statistics were based on random draws from a normal distribution with a standard deviation of 1 and a mean of either 0 or 2, when the null-hypothesis was true and false, respectively. 
Please note that the normal distribution is only used as an exemplar, and alternative distributions with a known cumulative density function (e.g. chi-square, beta, or gamma) could have been used instead. “ These are my main comments. I also have a number of smaller remarks. The manuscript also discusses whether null hypothesis testing can prove whether difference in means between two (infinite) populations is truly zero, but I fail to see the connection of this more philosophical point on the interpretation of hypothesis testing to the main messages. I think that the manuscript would be stronger if it is left out. Response: This has now been integrated with the sections describing power, to further support the argumentation that (observed) power calculations are futile because the null-hypothesis can never be empirically shown to be true. As such this section is far from philosophical or redundant. The meaning of the sentence “Given that power and type 1 error make extreme assumptions where either all results are true or false positives, these concepts are less relevant after the data have been collected. ” is not at all clear to me. Perhaps rephrase to clarify. Response: The following clarification was added. Page 7 “ Considerations of power and type 1 error rate are extremely relevant when designing a study, ensuring that a sufficiently accurate effect estimate may be realistically obtained given the available resources. However, both power and type 1 error rate are conditional probabilities assuming that the null-hypothesis is either true or false. As such these concepts are less relevant after the data have been collected, which would generally not consist of null-hypothesises which are either all true or all false, but instead will include an unknown mixture of both. 
“ “applying multiplicity correction can sometimes increase the false positive rate instead of reducing it.” In this sentence, and the ensuing section the term “false positive rate” is used differently from what is the common use in the statistical literature. In the literature, “false positive rate” is the expectation of a false positive result when repeating the procedure. The meaning that is taken in this manuscript is that of the fraction of the significant results of a given set of experiments that is a false positive. It is not clear that the interpretation chosen here, and which is shown by example to lead to a false expectation, actually often occurs in the scientific literature. Can some example references be provided? Response: Thanks for pointing this out, the manuscript has been updated to refer to this as the false discovery rate, including a formal definition. Page 7 “ An often overlooked point is that, depending on the unknown balance between false positives and true positives in a set of test results, applying multiplicity correction can sometimes increase the false discovery rate (i.e., the fraction of false positives divided by the total number of positive tests) instead of reducing it. For example, Figure 4A presents a naive expectation of multiple testing corrections, where the false discovery rate decreases from 1/3 to 0. However, there is no reason why the scenario depicted in Figure 4B may not occur; here, the false discovery rate increases from 1/3 to 1. “ Given that the false discovery rate can only be defined based on the knowledge about which hypotheses are true and which are false, the second part of the question is impossible to answer for empirical research. However, the following thought experiment can provide some intuition. 
We know that the incidence of cardiovascular disease increases with age, which implies that in a sufficiently large study there would be a significant association between age and cardiovascular disease, say the p-value is 0.001. If we conduct a single test at an alpha (type 1 error rate) of 0.05 we would correctly reject a null hypothesis of no association. If we subsequently performed four additional tests of association using completely random pairs of variables (i.e., where the null hypothesis is true), the Bonferroni multiplicity corrected alpha would be 0.01 and we would not reject the null hypothesis of no association between age and cardiovascular disease. If, however, due to random chance the p-value for one of the remaining tests is smaller than 0.01 we would incorrectly reject the null-hypothesis thereby increasing the false discovery rate from 0 (when we only test for age) to 1/5. Fig. 5: The number of samples used for producing the figure is missing. Response: This was added. “comparing the set of p-values for all the aforementioned outcomes against a uniform distribution resulted in a p-value of 0.02,” Which test is used for the comparison? Response: This has been added. “ Utilizing a non-parametric Kolmogorov-Smirnov test to compare the set of p-values for all the aforementioned outcomes against a uniform distribution nevertheless resulted in a p-value of 0.02, suggesting that the protective effect of rivaroxaban is shared across multiple cardiovascular outcomes. “ “researchers should routinely indicate the bounds between which an effect is sufficiently small to be considered clinically irrelevant” This is a valuable recommendation. Before the start of a study it is necessary to choose an expected effect size for determining the power of a study or the number of samples. 
If the expected effect size, is below the bound, then either the study should not be started (if one is looking to prove a positive effect) or the bound should be taken as the effect size for the power calculation. Response: Thank you. “Second, while notions of power and type 1 errors are essential at the study design phase because these deal in hypothetical scenarios where all results are either true or false, such metrics have limited relevance when interpreting results.” I do not fully agree. For a power analysis at the design phase, typically both an effect size and the variance need to be estimated. In my opinion, a post-hoc power analysis could be relevant for interpreting a negative result, to take into account the observed variance. If the variance is a lot larger than a priori expected, then the study could have been underpowered. Perhaps adjust the text, or explain why I am wrong. Response: The requested explanation has been included. Please refer to page 7-8. “ Researchers may alternatively wish to calculate the power to reject a clinically meaningful difference other than the point estimate. Such calculations can meaningfully inform the design and viability of future studies; although sample size estimates may be more readily interpretable. However, when such power calculations are used to make statements about the presence or absence of an effect, or even lack of sample size, the described approach utilising confidence intervals and equivalence/non-inferiority margins provides more relevant information on study accuracy. The futility of using power to make claims of the absence of an effect is further illustrated by noting that in the absence of an effect (i.e. when the p-value is 1) observed power is equal to the employed alpha threshold (e.g. 0.05). Hence, rather counterintuitively, low power may actually argue for the absence of an effect. 
Because power can only be calculated assuming the null-hypothesis is false, this metric cannot be used to make claims in favour of the null-hypothesis. Furthermore, as discussed in the preceding section, statistical tests cannot be used to support the strict null-hypothesis, as such this also holds for derived metrics such as p-value and power. While power remains essential when designing a future study, it should not be used to interpret results of a completed study. At this stage more relevant metrics such as confidence intervals are available which do not condition on the presence or absence of an effect, and provide information on accuracy as well as on effect magnitude. “ I do not understand “Finally, depending on the area of research overlooked, true positive results may be more harmful than false positive results”. Is meant that setting too stringent boundaries for significance can be harmful? Consider rephrasing to make its meaning clear. Response: The following has been added to clarify this matter further. Page 9 “ Finally, while decreasing the significance threshold (e.g. from 0.05 to 0.005) decreases the type 1 error rate this decreases power as well, and hence may decrease the number of true associations discovered. Depending on the area of research overlooked, true positive results may be more harmful than false positive results. For example, protein drug targets identified in early drug development are often subjected to a substantial number of follow-up analyses, which filter out false positive results. Such follow-up studies, however, rarely expand the number of candidates, hence suggesting a more inclusive approach might be more considerate. In settings more proximal to clinical implementation and less discovery oriented, such as phase 3 clinical trials, stringent multiple testing correction is clearly called for. “ View more View less Competing Interests No competing interests were disclosed. reply Respond Report a concern Heimel JA. 
Peer Review Report For: Addressing common inferential mistakes when failing to reject the null-hypothesis [version 3; peer review: 2 approved] . F1000Research 2025, 13 :1488 ( https://doi.org/10.5256/f1000research.174018.r350494) NOTE: it is important to ensure the information in square brackets after the title is included in this citation. The direct URL for this report is: https://f1000research.com/articles/13-1488/v1#referee-response-350494 keyboard_arrow_left Back to all reports Reviewer Report 0 Views copyright © 2025 Cui Y. This is an open access peer review report distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. 02 Jan 2025 | for Version 1 Ying Cui , Stanford University, Stanford,, California, USA 0 Views copyright © 2025 Cui Y. This is an open access peer review report distributed under the terms of the Creative Commons Attribution License , which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. format_quote Cite this report speaker_notes Responses (1) Approved info_outline Alongside their report, reviewers assign a status to the article: Approved The paper is scientifically sound in its current form and only minor, if any, improvements are suggested Approved with reservations A number of small changes, sometimes more significant revisions are required to address specific details and improve the papers academic merit. Not approved Fundamental flaws in the paper seriously undermine the findings and conclusions The author have made very interesting points on common mistakes when interpreting results from statistical tests that fail to reject the null hypothesis. There are some minor comments as listed below: 1. 
In Page 5, it was mentioned that "Hence, a more relevant alternative to post-hoc power calculations is to evaluate the extent to which the CI includes clinically relevant effect estimates, which is in line with the aforementioned equivalence/non-inferiority approach.". It can be helpful if the author could provide a specific example (e.g. further discussion with the two HR estimates using equivalence/non-inferiority approach) to further illustrate this. 2. In the last line of Page 5, "naïve" should be "naive". 3. In the last lines of Page 7, it was recommended that "researchers should routinely apply composite null hypothesis tests evaluated against meaningful bounds of insignificance". It would be helpful if the author could provide more discussion with suggestions on how to identify the "bounds of insignificance". Is the topic of the opinion article discussed accurately in the context of the current literature? Yes Are all factual statements correct and adequately supported by citations? Yes Are arguments sufficiently supported by evidence from the published literature? Yes Are the conclusions drawn balanced and justified on the basis of the presented arguments? Yes Competing Interests No competing interests were disclosed. Reviewer Expertise Hypothesis testing, Biostatistics I confirm that I have read this submission and believe that I have an appropriate level of expertise to confirm that it is of an acceptable scientific standard. reply Respond to this report Responses (1) Author Response 25 Feb 2025 Amand Schmidt, UCL British Heart Foundation Research Accelerator, London, WC1E6HX, UK Reviewer Comments: The author have made very interesting points on common mistakes when interpreting results from statistical tests that fail to reject the null hypothesis. There are some minor comments as listed below: 1. 
In Page 5, it was mentioned that "Hence, a more relevant alternative to post-hoc power calculations is to evaluate the extent to which the CI includes clinically relevant effect estimates, which is in line with the aforementioned equivalence/non-inferiority approach.". It can be helpful if the author could provide a specific example (e.g. further discussion with the two HR estimates using equivalence/non-inferiority approach) to further illustrate this. Response: Thank you, we have included the following example, using the VOYAGER PAD trial results. Page 6 “ For example, the VOYAGER PAD HR estimate of 0.86 (95%CI 0.40;1.87) for bleeding risk in people with endovascular PAD clearly shows that the collected data supports a wide range of effect estimates, including potentially harmful associations. However, because the confidence interval only partially overlaps with the proposed (hypothetical) upper bound of acceptable harm of 1.25, testing against this bound results in a p-value of 0.17 which is considerably smaller than testing against the complete absence of an effect: p-value 0.70. By comparison, the observed power estimate for these results is 7%, which implies that if the true HR was 0.86 one would have rejected the strict null-hypothesis in 7 out of 100 repeated experiments. As such observed power provides limited information relative to the presented alternative approaches, particularly the confidence interval based approach which allows for an informative discussion of benefits and harms in terms of effect magnitude(s). “ Reviewer Comments: 2. In the last line of Page 5, "naïve" should be "naive". Response: Thank you. Reviewer Comments: 3. In the last lines of Page 7, it was recommended that "researchers should routinely apply composite null hypothesis tests evaluated against meaningful bounds of insignificance". It would be helpful if the author could provide more discussion with suggestions on how to identify the "bounds of insignificance". 
Response: The following has been included on page 4 “ Defining bounds of equivalence or non-inferiority is challenging and a possible source of contention. Typically, such bounds are defined by combining statistical and clinical considerations. For example, evidence from previous studies can be meta-analysed to obtain a pooled effect estimate and confidence interval, where the confidence interval limits can be multiplied by a constants representing the amount of effect that one would like to preserve or rule out (for safety). 7 “ View more View less Competing Interests No competing interests were disclosed. reply Respond Report a concern Cui Y. Peer Review Report For: Addressing common inferential mistakes when failing to reject the null-hypothesis [version 3; peer review: 2 approved] . F1000Research 2025, 13 :1488 ( https://doi.org/10.5256/f1000research.174018.r350489) NOTE: it is important to ensure the information in square brackets after the title is included in this citation. The direct URL for this report is: https://f1000research.com/articles/13-1488/v1#referee-response-350489 Alongside their report, reviewers assign a status to the article: Approved - the paper is scientifically sound in its current form and only minor, if any, improvements are suggested Approved with reservations - A number of small changes, sometimes more significant revisions are required to address specific details and improve the papers academic merit. Not approved - fundamental flaws in the paper seriously undermine the findings and conclusions Adjust parameters to alter display View on desktop for interactive features Includes Interactive Elements View on desktop for interactive features Competing Interests Policy Provide sufficient details of any financial or non-financial competing interests to enable users to assess whether your comments might lead a reasonable person to question your impartiality. 
Consider the following examples, but note that this is not an exhaustive list: Examples of 'Non-Financial Competing Interests' Within the past 4 years, you have held joint grants, published or collaborated with any of the authors of the selected paper. You have a close personal relationship (e.g. parent, spouse, sibling, or domestic partner) with any of the authors. You are a close professional associate of any of the authors (e.g. scientific mentor, recent student). You work at the same institute as any of the authors. You hope/expect to benefit (e.g. favour or employment) as a result of your submission. You are an Editor for the journal in which the article is published. Examples of 'Financial Competing Interests' You expect to receive, or in the past 4 years have received, any of the following from any commercial organisation that may gain financially from your submission: a salary, fees, funding, reimbursements. You expect to receive, or in the past 4 years have received, shared grant support or other funding with any of the authors. You hold, or are currently applying for, any patents or significant stocks/shares relating to the subject matter of the paper you are commenting on. Stay Updated Sign up for content alerts and receive a weekly or monthly email with all newly published articles Register with F1000Research Already registered? Sign in Not now, thanks close PLEASE NOTE If you are an AUTHOR of this article, please check that you signed in with the account associated with this article otherwise we cannot automatically identify your role as an author and your comment will be labelled as a “User Comment”. If you are a REVIEWER of this article, please check that you have signed in with the account associated with this article and then go to your account to submit your report, please do not post your review here. If you do not have access to your original account, please contact us . All commenters must hold a formal affiliation as per our Policies . 
The information that you give us will be displayed next to your comment. User comments must be in English, comprehensible and relevant to the article under discussion. We reserve the right to remove any comments that we consider to be inappropriate, offensive or otherwise in breach of the User Comment Terms and Conditions . Commenters must not use a comment for personal attacks. When criticisms of the article are based on unpublished data, the data should be made available. I accept the User Comment Terms and Conditions Please confirm that you accept the User Comment Terms and Conditions. Affiliation ✕ refresh Please enter your institution. Note: To add your institution or organisation, start typing the name and then select the correct name from the list. Where applicable, the name will appear in both the original language and in English. Do not paste in the name. If the name does not appear in the drop-down list, we will display the information you have entered. ✕ refresh Country/Region * USA UK Canada China France Germany Afghanistan Aland Islands Albania Algeria American Samoa Andorra Angola Anguilla Antarctica Antigua and Barbuda Argentina Armenia Aruba Australia Austria Azerbaijan Bahamas Bahrain Bangladesh Barbados Belarus Belgium Belize Benin Bermuda Bhutan Bolivia Bosnia and Herzegovina Botswana Bouvet Island Brazil British Indian Ocean Territory British Virgin Islands Brunei Bulgaria Burkina Faso Burundi Cambodia Cameroon Canada Cape Verde Cayman Islands Central African Republic Chad Chile China Christmas Island Cocos (Keeling) Islands Colombia Comoros Congo Cook Islands Costa Rica Cote d'Ivoire Croatia Cuba Cyprus Czech Republic Democratic Republic of the Congo Denmark Djibouti Dominica Dominican Republic Ecuador Egypt El Salvador Equatorial Guinea Eritrea Estonia Ethiopia Falkland Islands Faroe Islands Federated States of Micronesia Fiji Finland France French Guiana French Polynesia French Southern Territories Gabon Georgia Germany Ghana Gibraltar Greece 
Greenland Grenada Guadeloupe Guam Guatemala Guernsey Guinea Guinea-Bissau Guyana Haiti Heard Island and Mcdonald Islands Holy See (Vatican City State) Honduras Hong Kong Hungary Iceland India Indonesia Iran Iraq Ireland Israel Italy Jamaica Japan Jersey Jordan Kazakhstan Kenya Kiribati Kosovo (Serbia and Montenegro) Kuwait Kyrgyzstan Lao People's Democratic Republic Latvia Lebanon Lesotho Liberia Libya Liechtenstein Lithuania Luxembourg Macao Madagascar Malawi Malaysia Maldives Mali Malta Marshall Islands Martinique Mauritania Mauritius Mayotte Mexico Minor Outlying Islands of the United States Moldova Monaco Mongolia Montenegro Montserrat Morocco Mozambique Myanmar Namibia Nauru Nepal Netherlands Antilles New Caledonia New Zealand Nicaragua Niger Nigeria Niue Norfolk Island North Korea North Macedonia Northern Mariana Islands Norway Oman Pakistan Palau Palestinian Territory Panama Papua New Guinea Paraguay Peru Philippines Pitcairn Poland Portugal Puerto Rico Qatar Reunion Romania Russian Federation Rwanda Saint Helena Saint Kitts and Nevis Saint Lucia Saint Pierre and Miquelon Saint Vincent and the Grenadines Samoa San Marino Sao Tome and Principe Saudi Arabia Senegal Serbia Seychelles Sierra Leone Singapore Slovakia Slovenia Solomon Islands Somalia South Africa South Georgia and the South Sandwich Is South Korea South Sudan Spain Sri Lanka Sudan Suriname Svalbard and Jan Mayen Swaziland Sweden Switzerland Syria Taiwan Tajikistan Tanzania Thailand The Gambia The Netherlands Timor-Leste Togo Tokelau Tonga Trinidad and Tobago Tunisia Turkey Turkmenistan Turks and Caicos Islands Tuvalu UK USA Uganda Ukraine United Arab Emirates United States Virgin Islands Uruguay Uzbekistan Vanuatu Venezuela Vietnam Wallis and Futuna West Bank and Gaza Strip Western Sahara Yemen Zambia Zimbabwe Please select your country/region. You must enter a comment. 
Competing Interests Please disclose any competing interests that might be construed to influence your judgment of the article's or peer review report's validity or importance. Competing Interests Policy Provide sufficient details of any financial or non-financial competing interests to enable users to assess whether your comments might lead a reasonable person to question your impartiality. Consider the following examples, but note that this is not an exhaustive list: Examples of 'Non-Financial Competing Interests' Within the past 4 years, you have held joint grants, published or collaborated with any of the authors of the selected paper. You have a close personal relationship (e.g. parent, spouse, sibling, or domestic partner) with any of the authors. You are a close professional associate of any of the authors (e.g. scientific mentor, recent student). You work at the same institute as any of the authors. You hope/expect to benefit (e.g. favour or employment) as a result of your submission. You are an Editor for the journal in which the article is published. Examples of 'Financial Competing Interests' You expect to receive, or in the past 4 years have received, any of the following from any commercial organisation that may gain financially from your submission: a salary, fees, funding, reimbursements. You expect to receive, or in the past 4 years have received, shared grant support or other funding with any of the authors. You hold, or are currently applying for, any patents or significant stocks/shares relating to the subject matter of the paper you are commenting on. Please state your competing interests The comment has been saved. An error has occurred. Please try again. 
Cancel Post var lTitle = "Addressing common inferential mistakes when...".replace("'", ''); var linkedInUrl = "http://www.linkedin.com/shareArticle?url=https://f1000research.com/articles/13-1488/v3" + "&title=" + encodeURIComponent(lTitle) + "&summary=" + encodeURIComponent('Read the article by '); var deliciousUrl = "https://del.icio.us/post?url=https://f1000research.com/articles/13-1488/v3&title=" + encodeURIComponent(lTitle); var redditUrl = "http://reddit.com/submit?url=https://f1000research.com/articles/13-1488/v3" + "&title=" + encodeURIComponent(lTitle); linkedInUrl += encodeURIComponent('Schmidt A'); var offsetTop = /chrome/i.test( navigator.userAgent ) ? 4 : -10; var addthis_config = { ui_offset_top: offsetTop, services_compact : "facebook,twitter,www.linkedin.com,www.mendeley.com,reddit.com", services_expanded : "facebook,twitter,www.linkedin.com,www.mendeley.com,reddit.com", services_custom : [ { name: "LinkedIn", url: linkedInUrl, icon:"/img/icon/at_linkedin.svg" }, { name: "Mendeley", url: "http://www.mendeley.com/import/?url=https://f1000research.com/articles/13-1488/v3/mendeley", icon:"/img/icon/at_mendeley.svg" }, { name: "Reddit", url: redditUrl, icon:"/img/icon/at_reddit.svg" }, ] }; var addthis_share = { url: "https://f1000research.com/articles/13-1488", templates : { twitter : "Addressing common inferential mistakes when failing to reject.... 
Schmidt A, published by " + "@F1000Research" + ", https://f1000research.com/articles/13-1488/v3" } }; if (typeof(addthis) != "undefined"){ addthis.addEventListener('addthis.ready', checkCount); addthis.addEventListener('addthis.menu.share', checkCount); } $(".f1r-shares-twitter").attr("href", "https://twitter.com/intent/tweet?text=" + addthis_share.templates.twitter); $(".f1r-shares-facebook").attr("href", "https://www.facebook.com/sharer/sharer.php?u=" + addthis_share.url); $(".f1r-shares-linkedin").attr("href", addthis_config.services_custom[0].url); $(".f1r-shares-reddit").attr("href", addthis_config.services_custom[2].url); $(".f1r-shares-mendelay").attr("href", addthis_config.services_custom[1].url); function checkCount(){ setTimeout(function(){ $(".addthis_button_expanded").each(function(){ var count = $(this).text(); if (count !== "" && count != "0") $(this).removeClass("is-hidden"); else $(this).addClass("is-hidden"); }); }, 1000); } close How to cite this report {{reportCitation}} Cancel Copy Citation Details $(function(){R.ui.buttonDropdowns('.dropdown-for-downloads');}); $(function(){R.ui.toolbarDropdowns('.toolbar-dropdown-for-downloads');}); $.get("/articles/acj/158434/179898") new F1000.Clipboard(); new F1000.ThesaurusTermsDisplay("articles", "article", "179898"); $(document).ready(function() { $( "#frame1" ).on('load', function() { var mydiv = $(this).contents().find("div"); var h = mydiv.height(); console.log(h) }); var tooltipLivingFigure = jQuery(".interactive-living-figure-label .icon-more-info"), titleLivingFigure = tooltipLivingFigure.attr("title"); tooltipLivingFigure.simpletip({ fixed: true, position: ["-115", "30"], baseClass: 'small-tooltip', content:titleLivingFigure + " " }); tooltipLivingFigure.removeAttr("title"); $("body").on("click", ".cite-living-figure", function(e) { e.preventDefault(); var ref = $(this).attr("data-ref"); $(this).closest(".living-figure-list-container").find("#" + ref).fadeIn(200); }); $("body").on("click", 
".close-cite-living-figure", function(e) { e.preventDefault(); $(this).closest(".popup-window-wrapper").fadeOut(200); }); $(document).on("mouseup", function(e) { var metricsContainer = $(".article-metrics-popover-wrapper"); if (!metricsContainer.is(e.target) && metricsContainer.has(e.target).length === 0) { $(".article-metrics-close-button").click(); } }); var articleId = $('#articleId').val(); if($("#main-article-count-box").attachArticleMetrics) { $("#main-article-count-box").attachArticleMetrics(articleId, { articleMetricsView: true }); } }); var figshareWidget = $(".new_figshare_widget"); if (figshareWidget.length > 0) { window.figshare.load("f1000", function(Widget) { // Select a tag/tags defined in your page. In this tag we will place the widget. _.map(figshareWidget, function(el){ var widget = new Widget({ articleId: $(el).attr("figshare_articleId") //height:300 // this is the height of the viewer part. [Default: 550] }); widget.initialize(); // initialize the widget widget.mount(el); // mount it in a tag that's on your page // this will save the widget on the global scope for later use from // your JS scripts. This line is optional. 
//window.widget = widget; }); }); } close Error Close Add Reset F1000.MICROSERVICES.AFFILIATION = ''; $(document).ready(function () { $('.js-affiliations-form').each((index, form) => { new AffiliationForm({ formId: form.id, institutionErrorSelector: '.comment-enter-institution', departmentErrorSelector: '.comment-enter-department', placeSelector: '.js-add-comment-place', stateSelector: '.js-add-comment-state', zipCodeSelector: '.js-add-comment-zipcode', countrySelector: '.js-add-comment-country', countryErrorSelector: '.comment-enter-country', }); }); }); $(document).ready(function () { var reportIds = { "368388": 6, "368387": 14, "374927": 0, "374928": 0, "374749": 0, "350493": 0, "346781": 0, "374748": 0, "350492": 0, "346780": 0, "350495": 0, "346783": 0, "350494": 16, "346782": 0, "350489": 25, "350488": 0, "350491": 0, "346779": 0, "350490": 0, "346778": 0, "374949": 0, "374951": 0, "374950": 0, "350497": 0, "346785": 0, "350496": 0, "346784": 0, "346787": 0, "346786": 0, "374953": 0, "374952": 0, "374954": 0, }; $(".referee-response-container,.js-referee-report").each(function(index, el) { var reportId = $(el).attr("data-reportid"), reportCount = reportIds[reportId] || 0; $(el).find(".comments-count-container,.js-referee-report-views").html(reportCount); }); var uuidInput = $("#article_uuid"), oldUUId = uuidInput.val(), newUUId = "93146fa1-d415-4376-ad02-20837d506341"; uuidInput.val(newUUId); $("a[href*='article_uuid=']").each(function(index, el) { var newHref = $(el).attr("href").replace(oldUUId, newUUId); $(el).attr("href", newHref); }); }); An innovative open access publishing platform offering rapid publication and open peer review, whilst supporting data deposition and sharing. Browse Gateways Collections How it Works Contact For Developers Cookie Notice Privacy Notice RSS Submit Your Research Follow us © 2012-2026 F1000 Research Ltd. 
ISSN 2046-1402 | Legal | Partner of Research4Life • CrossRef • ORCID • FAIRSharing R.templateTests.simpleTemplate = R.template(' $text $text $text $text $text '); R.templateTests.runTests(); var F1000platform = new F1000.Platform({ name: "f1000research", displayName: "F1000Research", hostName: "f1000research.com", id: "1", editorialEmail: "[email protected]", infoEmail: "[email protected]", usePmcStats: true }); $(function(){R.ui.dropdowns('.dropdown-for-authors, .dropdown-for-about, .dropdown-for-myresearch');}); // $(function(){R.ui.dropdowns('.dropdown-for-referees');}); $(document).ready(function () { if ($(".cookie-warning").is(":visible")) { $(".sticky").css("margin-bottom", "35px"); $(".devices").addClass("devices-and-cookie-warning"); } $(".cookie-warning .close-button").click(function (e) { $(".devices").removeClass("devices-and-cookie-warning"); $(".sticky").css("margin-bottom", "0"); }); $("#tweeter-feed .tweet-message").each(function (i, message) { var self = $(message); self.html(linkify(self.html())); }); $(".partner").on("mouseenter mouseleave", function() { $(this).find(".gray-scale, .colour").toggleClass("is-hidden"); }); }); Sign In Remember me Forgotten your password? Sign In Cancel Email or password not correct. Please try again Please wait... $(function(){ // Note: All the setup needs to run against a name attribute and *not* the id due the clonish // nature of facebox... 
$("a[id=googleSignInButton]").click(function(event){ event.preventDefault(); $("input[id=oAuthSystem]").val("GOOGLE"); $("form[id=oAuthForm]").submit(); }); $("a[id=facebookSignInButton]").click(function(event){ event.preventDefault(); $("input[id=oAuthSystem]").val("FACEBOOK"); $("form[id=oAuthForm]").submit(); }); $("a[id=orcidSignInButton]").click(function(event){ event.preventDefault(); $("input[id=oAuthSystem]").val("ORCID"); $("form[id=oAuthForm]").submit(); }); }); If you've forgotten your password, please enter your email address below and we'll send you instructions on how to reset your password. The email address should be the one you originally registered with F1000. Email address not valid, please try again You registered with F1000 via Google, so we cannot reset your password. To sign in, please click here . If you still need help with your Google account password, please click here . You registered with F1000 via Facebook, so we cannot reset your password. To sign in, please click here . If you still need help with your Facebook account password, please click here . Code not correct, please try again Reset password Cancel Email us for further assistance. Server error, please try again. If your email address is registered with us, we will email you instructions to reset your password. If you think you should have received this email but it has not arrived, please check your spam filters and/or contact for further assistance. Please wait... Register $(document).ready(function () { signIn.createSignInAsRow($("#sign-in-form-gfb-popup")); $(".target-field").each(function () { var uris = $(this).val().split("/"); if (uris.pop() === "login") { $(this).val(uris.toString().replace(",","/")); } }); });

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall: last seen: 2026-05-23T02:00:01.238055+00:00

License: CC-BY-4.0