Full text
41,017 characters
· extracted from
preprint-html
· click to expand
Dramatic increases in redundant publications in the Generative AI era | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Dramatic increases in redundant publications in the Generative AI era View ORCID Profile Danny Maupin , View ORCID Profile Tulsi Suchak , View ORCID Profile Adrian Barnett , View ORCID Profile Matt Spick doi: https://doi.org/10.1101/2025.09.09.25335401 Danny Maupin 1 School of Health Sciences, Faculty of Health and Medical Sciences, University of Surrey , Guildford, Surrey, United Kingdom, GU2 7XH Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Danny Maupin Tulsi Suchak 1 School of Health Sciences, Faculty of Health and Medical Sciences, University of Surrey , Guildford, Surrey, United Kingdom, GU2 7XH Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Tulsi Suchak Adrian Barnett 2 School of Public Health and Social Work, Queensland University of Technology , Kelvin Grove, Australia Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Adrian Barnett Matt Spick 1 School of Health Sciences, Faculty of Health and Medical Sciences, University of Surrey , Guildford, Surrey, United Kingdom, GU2 7XH Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Matt Spick For correspondence: matt.spick{at}surrey.ac.uk Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Background Redundant publication, the practice of submitting the same or substantially overlapping manuscripts multiple times, distorts the scientific record and wastes resources. Since 2022, publications using large open-science data resources have increased substantially, raising concerns that Generative AI (GenAI) may be facilitating the production of formulaic, redundant manuscripts. In this work we aim to quantify the extent of redundant publication from a single, large health dataset and to investigate whether GenAI can create submissions that evade standard integrity checks. Methods We conducted a systematic search for the years 2021 to 2025 (year to end-July) to identify redundant publications using the US Centers for Disease Control and Prevention National Health and Nutrition Examination Survey (NHANES) dataset. Redundancy was defined as publications analysing the same exposures associated with the same outcomes in the same national population. To test whether GenAI could facilitate creating these papers, we prompted large language models to write three synthetic manuscripts using redundant publications from our dataset as input, instructing them to maximise syntactic differences and evade plagiarism detectors. These three synthetic manuscripts were then tested using a leading plagiarism detection platform to assess their similarity scores. Findings Our search identified 411 redundant publications across 156 unique exposure-outcome pairings; for example, the association between oxidative balance score and chronic kidney disease using NHANES data was published six times in one year. In many instances, redundant articles appeared within the same journals. The three synthetic manuscripts created by GenAI to evade detection yielded overall similarity scores of 26%, 19%, and 14%, with individual similarity contributions below the typical 5% warning thresholds used by plagiarism detectors. Interpretation The rapid growth in redundant publications (a 17-fold increase between 2022 and 2024) is suggestive of a systemic failure of editorial checks. These papers distort meta-analyses and scientometric studies, waste scarce peer review resources and pose a significant threat to the integrity of the scientific record. We conclude that current checks for redundant publications and plagiarism are no longer fit for purpose in the GenAI era. Introduction Artificial intelligence tools, including both large language models (LLMs) and easily deployable machine learning workflows, offer potential for both new insights and productivity gains to researchers. 1 One promising area is large and complex data resources, such as in healthcare and the biosciences. 2 , 3 Inevitably, however, AI will multiply negative as well as positive outputs. 4 The publishing industry has always had to contend with unethical behaviours, and guidelines on best practices in publishing are provided by the Committee on Publication Ethics (COPE). These negative behaviours include text and image recycling by authors; 5 plagiarism of others’ work; 6 redundant submissions, with the latter including salami slicing or “dividing data or research findings into their smallest publishable units to increase the author’s publication output”; 7 , 8 and systematic manipulation. 9 , 10 We and others have previously documented the exploitation of Open Science data resources for the production of misleading research by entities such as paper mills (organizations that produce manuscripts for sale using derivative or fabricated text, images or data), 11 – 13 including via p-hacking and data dredging. 14 , 15 The ability of GenAI to synthesise new text in a handful of clicks poses an additional problem, as it bypasses both text recycling and plagiarism editorial checks, allowing mass manufacture of redundant manuscripts or simultaneous submission to journals. Whilst some publishers have processes to detect duplicate submissions, 16 GenAI can assist with evading these checks. Entities such as paper mills might even be able to submit unlimited conceptually identical but syntactically different manuscripts, dramatically increasing the potential production rate of redundant publications. When LLMs are combined with large public datasets such as NHANES or the FDA Adverse Events Reporting System (FAERS), the only rate limiting step remaining is the ability of publishers to review and accept manuscripts, placing ever more strain on editors and peer reviewers. In addition to negative outcomes arising from the use of GenAI by bad actors such as paper mills, there may also be risks from cross-domain work that substitutes traditional human collaboration with AI. LLMs can act as expert summarisers, allowing a researcher in one field to rapidly acquire a functional understanding of another, thereby reducing the need for direct partnership with domain experts. For instance, a researcher looking for evidence to support a hypothesis might use a LLM to write statistical code to interrogate a large open access database. This approach streamlines the preliminary stages of interdisciplinary research, allowing for quick hypothesis generation and project scoping. This represents a clear productivity gain (which will be hard for small teams to pass up in a highly competitive landscape), 17 reducing the difficulties associated with assembling and managing cross-disciplinary teams (conflicting terminologies, differing methodological standards, logistical overheads). 18 It can, however, also lead to epistemic trespassing, whereby researchers have a surface level understanding but lack deep domain expertise. Whilst this has been previously described for systematic reviews and meta-analyses, 19 , 20 GenAI can enable such opportunistic authorship more broadly. Here, we aim to investigate whether GenAI is contributing to a breakdown in the effectiveness of integrity checks in the scientific publishing process. First, we examine whether redundant publication is becoming more prevalent, whether this is inadvertent (through epistemic trespassing) or deliberate (through systematic manipulation). We use the NHANES dataset as our primary source 21 and investigate whether the incidence of redundant publications from this dataset has changed over time. Second, we investigate whether LLMs can facilitate the duplication of published works, by rewriting existing manuscripts and testing whether they pass plagiarism checks used by major publishers. Materials and methods Literature search and manuscript matching PubMed was used as the primary data source, with the following search string “ (nhanes[tiab] OR national health and nutrition examination survey[tiab]) AND ((“2021/01/01”[Date - Publication] : “2025/07/31”[Date - Publication])) ”, which produced an initial list of 15 597 publications. Following this, articles referencing the distinct and independent Korean NHANES survey were excluded (n = 1 657). In addition, comments on publications were excluded (n = 50). A list of exposures and outcomes was derived from our previous work on inappropriate manuscripts using the NHANES dataset, 15 and this was added to based on the authors’ experience of more recent literature analysing biomarkers and predictors. In total, a list of 217 exposures / predictors and 162 outcomes was generated (Table S1, Supplementary Material), including alternatives where appropriate. Manuscript titles were searched for matching exposures and predictors by simple token matching, resulting in a set of 3 465 manuscripts analysing the predefined list of simple exposures and outcomes. Those with more than one paper analysing the same exposure and outcome were recorded. Prior to matching, alternatives were manually replaced with general tokens (for example depressive symptoms was converted to depression, as both outcomes are measured using the same NHANES questionnaire). Titles and abstracts were then manually inspected; studies were treated as effectively redundant publications if they analysed the same exposure and outcome with only variations such as cohort included (e.g. 2014–2018 versus 2016–2020), male versus female, or variations in age ranges. More specific investigations, for example measuring associations between an exposure and an outcome in cancer survivors, were not included as redundant publications (n = 3 054), leaving 411 publications which under the definitions used in this work were redundant. This screening process was not intended to identify problematic publications in general (such as salami slicing or plagiarism); this workflow was intended to identify the specific issue of redundant / duplicate publication. Investigation of plagiarism detection In order to pilot whether simple click-science workflows were sufficient to evade existing mainstream plagiarism detection, three ‘new’ manuscripts were generated from existing published papers. The exposure and outcomes chosen are set out in Table 2 alongside the PMIDs of the ‘source’ publications for the synthesis. View this table: View inline View popup Download powerpoint Table 2: Pairings of exposures and outcomes selected for the production of synthetic manuscripts These exposure / outcome pairings were selected as examples due to the high number of duplicates found during pilot searches, and the large amount of available literature on the topics, which represent sources of considerable public interest. First, ‘new’ manuscripts were generated by three of the authors (D.M., T.S. and M.S.) independently using LLMs to rewrite existing manuscripts, with the only requirement that prompts explicitly specify that the new text should be syntactically different to the original text to avoid plagiarism detection. In addition to the main sections, LLMs were used to generate new syntactically different table titles, figure legends and end disclosures. Second, the ‘investigated’ cohort was changed, and all numbers were adjusted either by instructing an LLM to create new tables, or by manual adjustments to reflect the changes that subtle alterations to NHANES cohorts can produce. Third, all text was manually adjusted where appropriate to reflect the numerical changes, to ensure that text referred to the correct figure and table numbers, and to remove repetition of acronyms (for example spelling out LE8 as “Life’s Essential 8 (LE8)” in full each time mentioned) and formatted. It should be noted that the LLMs could not be prompted to produce completely error-free manuscripts, but the overall production process in each case was around two hours per paper. Subsequently, the three generated documents were submitted to iThenticate to assess whether a major current plagiarism detector would identify them as unoriginal documents. 22 We followed guidance from Springer Nature’s documentation on Crossref Similarity Check / iThenticate that it is “more important to look at the individual scores of the sources than the overall similarity index” and that individual scores in the 1% to 5% range offer “in general no sign of potential plagiarism”. 23 Therefore, we treated individual scores over 5% as generating potential red flags for editors and reviewers. Results Literature search and manuscript matching The literature search and manuscript matching process generated 411 paired redundant exposure-outcome documents. The most common result was two papers covering the same exposure / outcome pairing (n = 190 publications). The largest number of redundant analyses was six, occurring for three pairings (n = 18 publications). The 20 most frequent exposure–outcome pairs and the distribution of redundancies are shown in Figures 3A and 3B respectively. Figure 3C illustrates the incidence of the duplicates identified over time; there were very few redundant publications relating to NHANES prior to 2023 (12 in 2022 and 3 in 2021), with the 2024 count of 198 representing a 17-fold increase in two years. Download figure Open in new tab Figure 1: Existing COPE guidelines around plagiarism and related examples of misconduct Download figure Open in new tab Figure 2: Flowchart of inclusion and exclusion criteria for redundant publication search Download figure Open in new tab Figure 3: (A) 20 most frequent exposure–outcome pairs (B) Distribution of redundant publications (C) time trend of redundant publications, 2025 is YTD to end-July 2025 The 411 articles were published by 112 journals ( Figure 4A ), with 44% concentrated in just five journals. The most common statistical method used in the duplicated research publications was logistic regression across multiple models adding covariates cumulatively, followed by linear regression, with a small number of other methods used ( Figure 4B ). Download figure Open in new tab Figure 4: (A) tree map of journals using the number of redundant papers (B) breakdown of statistical methods used in the redundant papers. We use the abbreviated journal name – full journal names are provided in Supplementary Materials, Table S2 Submission and publication dates Three illustrative pairings were chosen from those with the highest rates of redundant publications for the generation of synthetic manuscripts. These were chosen as they should be harder to be duplicate and evade plagiarism detection (as more redundant instances are available in the public domain). Submission and publication dates for these redundant publications are shown in Figure 5 , together with the journals that published them. Articles commonly followed a pattern of simultaneous or near-simultaneous submission, but also included examples of submissions after the publication of previous works. In some cases, the same journals published the redundant manuscripts. This occurred 67 times across the cohort of 411 publications, with Frontiers in Nutrition publishing the same redundant pairing on 29 occasions, and Scientific Reports on 10 occasions (the full list of journals is included in Supplementary Materials, Table S2). Download figure Open in new tab Figure 5: Submission and publication date plots for three outcome–exposure pairings, with publication titles to the left of the submission dates and journal title and publication PMIDs to the right of acceptance dates : (A) oxidative balance score and chronic kidney disease (B) body roundness index and female infertility (C) Life’s Essential 8 and periodontitis Evasion of plagiarism checks The three synthetic manuscripts generated by LLM were then tested using iThenticate. The results are shown in Table 3 for the total similarity scores and for the five main individual similarity sources for each manuscript. The total scores including bibliographical matching are included for reference. All three manuscripts had percentage matching scores that were below 30% overall and no individual source percentages exceeded 5%, and so would not have raised automated red flags in editorial pipelines. 23 View this table: View inline View popup Download powerpoint Table 3: Summary of iThenticate checks for three synthetic manuscripts, processed on 26 August 2025 The LLM-rewritten manuscripts are presented in Supplementary Materials with watermarking and disclaimers. The watermarks and disclaimers were not included in the versions uploaded to iThenticate. Discussion Plagiarism and related misconduct such as submissions of redundant manuscripts have been a constant in scientific publishing, and many checks and balances exist to guard against these problems. 24 , 25 The results presented here, however, are suggestive that these checks are not adequate in the GenAI era, given the sharp acceleration since 2022 in the number of redundant publications (2024 exhibiting a 17-fold change versus 2022, for example). Our workflow identified 411 total redundant publications from a single data source, NHANES, but it should be noted that the matching procedure was heavily automated, and better-concealed duplications would have escaped this process (a full manual text check of 15 000+ manuscripts being beyond the scope of this work). This has happened at the same time as wider growth in submissions putting editorial and peer reviewing processes under increased strain, 26 , 27 possibly increasing the dependence on automated systems such as iThenticate. The workflow also identified some journals as accepting a disproportionate number of redundant publications, which may reflect paper mills targeting journals where fabricated manuscripts have already been accepted, 28 perhaps reflecting perceived weaknesses in their editorial checks. Our findings on duplicates illustrate the scale of the problem, but do not provide the mechanism by which it is happening. Here, we have demonstrated that ‘new’ syntactically different manuscripts can easily be generated that avoid text recycling or textual plagiarism. Whilst our work does not represent a systematic investigation of how easily plagiarism checks can be avoided, the three pairings chosen here did not exceed warning thresholds for individual source similarity scores of 5%, and were also free of ‘tortured phrases’, another indicator often used to detect problematic publications. 29 Interestingly, for all three synthetic manuscripts the Frontiers family of journals provided the highest ‘individual source similarity’ scores, which likely reflects Frontiers accepting more of these types of publications than other journals (Frontiers accepted 29% of the redundant publications identified in this work). Our synthetic manuscripts also passed thresholds despite our selecting highly duplicated pairings of exposures and outcomes, and the substantial time gap between publication and our submitting of the manuscripts to iThenticate. Redundant publications can also amplify bad research. For example, vitamin D intake can be correlated with depression, but using NHANES data the association between insufficient serum vitamin D and depression has been shown by Diaz-Amaya et al. to be fully attenuated after adjusting for food security and diet quality. 30 Nonetheless, here we have identified three publications taking a simplistic single-factor approach reporting that serum vitamin D is associated with depression, findings which we regard as unsound. Therefore, misleading publications on serum vitamin D using NHANES outnumber the more thorough analysis in a 3:1 ratio. A simple comparison by Google Scholar shows citations for these publications in a 30:2 ratio. This is likely to be a major problem for those that are not domain experts – epistemic trespassers are much more likely to take the ‘weight’ of publication or citation counts as indicative of evidence. In so doing, epistemic trespassers can then further propagate low-quality results. These findings extend previous work highlighting mass-manufactured low-quality research generated from publicly accessible datasets, leading to the literature being oversaturated with repetitive and potentially misleading findings. 15 , 31 Combined with the strong incentive to publish frequently (culturally for authors, and financially for journals), this has created a system where quantity is able to displace quality. There is growing evidence that these features have been exploited by “paper mills”, organisations that will produce manuscripts on a large scale, typically with minimal originality, and offer authorship slots to academic customers to meet the demand for rapid publication. 32 , 33 NHANES is an attractive target for these organisations: it is publicly available, has no registration or reporting requirements, and can be mined repeatedly with surface-level variations in study design. It is, however, important to recognise that the methods described in this work will not just be applicable to NHANES, albeit they may be easier to identify for this dataset due to the sheer volume of manuscripts. There is no reason why redundant submissions would not be enabled by GenAI for other health datasets, or for other types of research, such as systematic review papers, for example. We see risks that paper mills may migrate into these areas if negative attention around data resources such as NHANES limits publication opportunities. Epistemic trespassing is also likely to see growth, as researchers new to an area familiarise themselves with a topic using GenAI and see that it offers an easier route to publication. Addressing these challenges will not be straightforward. COPE definitions do not fully capture this type of misbehaviour, as their categorisation of ‘redundant publication’ deals with the same authors submitting to multiple journals. 8 We contend that this definition does not reflect the reality that the true author is concealed by contract cheating, whereby the disclosed authors appear to be different (but have in fact purchased authorship from the same source). Proving this in individual cases is not easy, but we see no other plausible explanation for the aggregated surge in redundant publications described here. Limitations Whilst the overall trends are informative and illustrate a growing problem, a number of limitations should be stressed. First, from reviewing manuscripts alone, it is not possible to differentiate between researchers with good intentions that have simply written papers where they lack the domain expertise to be aware of competitor groups and publications (epistemic trespassers), versus unethical actors such as paper mills deliberately using strategies such as simultaneous submission to multiple venues. Second, here we took a conservative approach to minimise false positives, by excluding associative studies that analysed specific subsets of data. This may understate the scale of the problem. Third, this work has identified the potential for using LLMs to quickly rewrite manuscripts. Here, however, we have shown illustrative examples. This is not a comprehensive analysis across a large number of manuscripts to identify how often tools such as iThenticate can be ‘fooled’. The evidence from the overall publication of duplicated topics, however, suggests that it is unlikely that iThenticate and similar tools are well prepared for the changing landscape, and it is the nature of LLMs that should a submission fail, the LLM can simply reword the document again until it passes. Fourth, our workflow was designed to test whether documents rewritten by LLMs would pass automated checks. We do not suggest that our documents would pass the desk editor or peer review checks in place at responsible journals, and further interventions and editing would be required to do so. Fifth, interventions were required to correct simple errors introduced, and this typically required an additional two hours of manual checking. The prompts used here were not able to produce new error-free manuscripts in a single click, albeit the reduction in time would in our view represent a massive time saving. Conclusion These results demonstrate that current checks for redundant publications and plagiarism are no longer fit for purpose in the GenAI era. There are, however, few easy solutions. We expect greater emphasis on AI and other forms of detection from publishers as another ‘automated’ check, especially as publication volumes appear to be expanding beyond the ability of manual checks and balances such as peer review to cope. For known ‘templates’ such as NHANES, FAERS or two-sample Mendelian randomization (e.g. derived from FinnGen or the UK Biobank), we believe publishers need to be especially vigilant for formulaic manuscripts. In the long-run, greater co-operation between publishers for a register of submissions in these areas might help to reduce redundancy. We believe that updated / modified guidelines will be required from COPE to address these new innovations in paper mill production, especially to move away from current narrow definitions of redundant publication that focus on visible author overlap, for example. Without action to address these new GenAI related challenges, however, we expect the problem to continue to grow, making scientometric and meta-analyses more challenging, wasting publisher and peer reviewer resources, and potentially reducing the integrity of the scientific record. Supplementary Materials The list of exposures and outcomes used to identify redundant publications is included in Supplementary Material, Table S1. The complete dataset of exposure versus outcome pairings including PMIDs is included in Supplementary Material, Table S2. All three synthetic manuscripts are additionally included as Supplementary Materials. Author Contributions’ Conceptualization, M.S. and A.B.; methodology, M.S. and A.B.; software, M.S.; T.S.; formal analysis, M.S. T.S. and D.M.; investigation, T.S. and D.M.; resources, M.S. and A.B.; data curation, T.S. and D.M.; writing—original draft preparation, M.S. T.S. and D.M.; writing— review and editing, M.S. and A.B.; visualization, M.S.; supervision and project administration, M.S. and A.B.; funding acquisition, M.S. All authors have read and agreed to the published version of the manuscript. Funding Matt Spick was supported by UK Research and Innovation (UKRI1095). Data Availability Statement The data underpinning this work can be reproduced from the PubMed database at https://pubmed.ncbi.nlm.nih.gov/ . The complete dataset of exposure versus outcome pairings including PMIDs for the 411 redundant publications included in this work is included in Supplementary Material, Table S2. Conflicts of Interest The authors declare no conflicts of interest. Acknowledgements The authors wish to acknowledge the work of the United2Act network in its work to highlight the threats to scientific integrity from paper mill activity. 34 The authors also wish to acknowledge the Collection of Open Science Integrity Guides (COSIG). 35 The authors also wish to declare that GenAI (Large Language Models) were used to prepare the synthetic manuscripts in this work, as described under Methods. References 1. ↵ Khalifa , M. & Albadawy , M. Using artificial intelligence in academic writing and research: An essential productivity tool . Computer Methods and Programs in Biomedicine Update 5 , 100145 ( 2024 ). OpenUrl 2. ↵ Yu , K.-H. , Beam , A. L. & Kohane , I. S. Artificial intelligence in healthcare . Nat Biomed Eng 2 , 719 – 731 ( 2018 ). OpenUrl PubMed 3. ↵ Jiang , F. et al. Artificial intelligence in healthcare: past, present and future . Stroke Vasc Neurol 2 , ( 2017 ). 4. ↵ Binz , M. et al. How should the advancement of large language models affect the practice of science? Proceedings of the National Academy of Sciences 122 , e2401227121 ( 2025 ). OpenUrl CrossRef PubMed 5. ↵ Handling text recycling (also known as self-plagiarism ). COPE: Committee on Publication Ethics https://publicationethics.org/guidance/cope-position/handling-text-recycling-also-known-self-plagiarism ( 2024 ). 6. ↵ Wager , L. Suspected Plagiarism in a Submitted Manuscript . https://publicationethics.org/guidance/flowchart/plagiarism-submitted-manuscript ( 2006 ) x doi: 10.24318/cope.2019.2.1 . OpenUrl CrossRef 7. ↵ Handling Duplicated or Redundant Content (Salami Slicing) . https://publicationethics.org/guidance/cope-position/handling-duplicated-or-redundant-content-salami-slicing ( 2024 ) x doi: 10.24318/RMZj3Nqm . OpenUrl CrossRef 8. ↵ Wager , E. Suspected Redundant Publication in a Submitted Manuscript . https://publicationethics.org/guidance/flowchart/redundant-duplicate-publication-submitted-manuscript ( 2015 ) x doi: 10.24318/cope.2019.2.12 . OpenUrl CrossRef 9. ↵ Systematic Manipulation of the Publication Process . https://publicationethics.org/guidance/flowchart/systematic-manipulation-publication-process ( 2018 ) x doi: 10.24318/cope.2019.2.23 . OpenUrl CrossRef 10. ↵ Richardson , R. A. K. , Hong , S. S. , Byrne , J. A. , Stoeger , T. & Amaral , L. A. N. The entities enabling scientific fraud at scale are large, resilient, and growing rapidly . Proceedings of the National Academy of Sciences 122 , e2420092122 ( 2025 ). OpenUrl PubMed 11. ↵ Byrne , J. A. et al. A call for research to address the threat of paper mills . PLoS Biol 22 , e3002931 ( 2024 ). OpenUrl CrossRef PubMed 12. Spick , M. et al. Quantifying new threats to health and biomedical literature integrity from rapidly scaled publications and problematic research . 2025.07.07.25331008 Preprint at doi: 10.1101/2025.07.07.25331008 ( 2025 ). OpenUrl Abstract / FREE Full Text 13. ↵ Scancar , B. , Byrne , J. A. , Causeur , D. & Barnett , A. G. Revealing the Paper Mill Iceberg: AI-Based Screening of Cancer Research Publications . 2025.08.29.673016 Preprint at doi: 10.1101/2025.08.29.673016 ( 2025 ). OpenUrl Abstract / FREE Full Text 14. ↵ Stender , S. , Gellert-Kristensen , H. & Smith , G. D. Reclaiming mendelian randomization from the deluge of papers and misleading findings . Lipids in Health and Disease 23 , 286 ( 2024 ). OpenUrl 15. ↵ Suchak , T. et al. Explosion of formulaic research articles, including inappropriate study designs and false discoveries, based on the NHANES US national health database . PLOS Biology 23 , e3003152 ( 2025 ). OpenUrl PubMed 16. ↵ stmassoc. STM Integrity Hub launches new research integrity tool . STM Association https://stm-assoc.org/stm-integrity-hub-launches-new-research-integrity-tool/ ( 2023 ). 17. ↵ Génova , G. , Astudillo , H. & Fraga , A. The Scientometric Bubble Considered Harmful . Sci Eng Ethics 22 , 227 – 235 ( 2016 ). OpenUrl PubMed 18. ↵ Olenick , M. , Flowers , M. , Maltseva , T. & Diez-Sampedro , A. Research in Academia: Creating and Maintaining High Performance Research Teams . Nursing Research and Practice 2019 , 8423460 ( 2019 ). OpenUrl 19. ↵ Ioannidis , J. P. A. The Mass Production of Redundant, Misleading, and Conflicted Systematic Reviews and Meta-analyses . Milbank Q 94 , 485 – 514 ( 2016 ). OpenUrl CrossRef PubMed 20. ↵ Hauser , A. S. The future of reviews . EMBO reports 1 – 5 ( 2025 ) doi: 10.1038/s44319-025-00560-z . OpenUrl CrossRef 21. ↵ CDC. National Health and Nutrition Examination Survey . National Health and Nutrition Examination Survey https://www.cdc.gov/nchs/nhanes/index.html ( 2025 ). 22. ↵ Turnitin LLC . Plagiarism Detection Software | iThenticate . https://www.ithenticate.com . 23. ↵ Springer Nature . USING CROSSREF SIMILARITY CHECK – TIPS AND TRICKS Version 2.2 . ( 2018 ). 24. ↵ Angell , M. & Relman , A. S. Redundant Publication . New England Journal of Medicine 320 , 1212 – 1214 ( 1989 ). OpenUrl CrossRef PubMed Web of Science 25. ↵ Wager , E. Why Is Redundant Publication a Problem? Int J Occup Environ Med 6 , 3 – 6 ( 2015 ). OpenUrl PubMed 26. ↵ Hanson , M. A. , Barreiro , P. G. , Crosetto , P. & Brockington , D. The strain on scientific publishing . Quantitative Science Studies 5 , 823 – 843 ( 2024 ). OpenUrl 27. ↵ Peterson , C. J. , Orticio , C. & Nugent , K. The challenge of recruiting peer reviewers from one medical journal’s perspective . Baylor University Medical Center Proceedings 35 , 394 – 396 ( 2022 ). OpenUrl PubMed 28. ↵ Paper Mills Research . https://publicationethics.org/node/55256 ( 2022 ) x doi: 10.24318/jtbG8IHL . OpenUrl CrossRef 29. ↵ Cabanac , G. , Labbé , C. & Magazinov , A. Tortured phrases: A dubious writing style emerging in science. Evidence of critical issues affecting established journals . Preprint at doi: 10.48550/arXiv.2107.06751 ( 2021 ). OpenUrl CrossRef 30. ↵ Diaz-Amaya , Y. , Star , Z. & McClure , S. T. Food security and diet quality, not vitamin D status are significantly associated with depression: Results from NHANES 2015-2018 . J Affect Disord 347 , 150 – 155 ( 2024 ). OpenUrl PubMed 31. ↵ Byrne , J. A. & Stender , S. More science friction for less science fiction . PLOS Biology 23 , e3003167 ( 2025 ). OpenUrl PubMed 32. ↵ Parker , L. , Boughton , S. , Bero , L. & Byrne , J. A. Paper mill challenges: past, present, and future . Journal of Clinical Epidemiology 176 , 111549 ( 2024 ). OpenUrl CrossRef PubMed 33. ↵ Byrne , J. A. & Christopher , J. Digital magic, or the dark arts of the 21 st century—how can journals and peer reviewers detect manuscripts and publications from paper mills? FEBS Letters 594 , 583 – 589 ( 2020 ). OpenUrl CrossRef PubMed 34. ↵ Home . United2Act https://united2act.org/ . 35. ↵ Richardson , R. The Collection of Open Science Integrity Guides (COSIG): Expanding participation in post-publication peer review . Preprint at https://zenodo.org/records/15588204 ( 2025 ). View the discussion thread. Back to top Previous Next Posted September 12, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Dramatic increases in redundant publications in the Generative AI era Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Dramatic increases in redundant publications in the Generative AI era Danny Maupin , Tulsi Suchak , Adrian Barnett , Matt Spick medRxiv 2025.09.09.25335401; doi: https://doi.org/10.1101/2025.09.09.25335401 Share This Article: Copy Citation Tools Dramatic increases in redundant publications in the Generative AI era Danny Maupin , Tulsi Suchak , Adrian Barnett , Matt Spick medRxiv 2025.09.09.25335401; doi: https://doi.org/10.1101/2025.09.09.25335401 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Epidemiology Subject Areas All Articles Addiction Medicine (567) Allergy and Immunology (863) Anesthesia (297) Cardiovascular Medicine (4411) Dentistry and Oral Medicine (443) Dermatology (380) Emergency Medicine (606) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1505) Epidemiology (15205) Forensic Medicine (30) Gastroenterology (1119) Genetic and Genomic Medicine (6574) Geriatric Medicine (666) Health Economics (994) Health Informatics (4511) Health Policy (1365) Health Systems and Quality Improvement (1608) Hematology (537) HIV/AIDS (1263) Infectious Diseases (except HIV/AIDS) (15903) Intensive Care and Critical Care Medicine (1103) Medical Education (620) Medical Ethics (144) Nephrology (666) Neurology (6573) Nursing (345) Nutrition (998) Obstetrics and Gynecology (1139) Occupational and Environmental Health (954) Oncology (3319) Ophthalmology (968) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (662) Pediatrics (1689) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5422) Public and Global Health (9205) Radiology and Imaging (2191) Rehabilitation Medicine and Physical Therapy (1367) Respiratory Medicine (1191) Rheumatology (593) Sexual and Reproductive Health (709) Sports Medicine (529) Surgery (709) Toxicology (99) Transplantation (288) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9febc0b23ed28e2e',t:'MTc3OTI4NDQyMQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.