Full text
47,680 characters
· extracted from
preprint-html
· click to expand
Large Language Model-assisted text mining reveals bacterial pathogen diversity | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results Large Language Model-assisted text mining reveals bacterial pathogen diversity View ORCID Profile Michiel Vos , View ORCID Profile Markus Göker , Richard Bendall , View ORCID Profile Fabrizio Costa doi: https://doi.org/10.1101/2025.07.29.667369 Michiel Vos 1 European Centre for Environment and Human Health, University of Exeter Medical School, Environment and Sustainability Institute , Penryn Campus, TR10 9FE, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Michiel Vos For correspondence: m.vos{at}exeter.ac.uk f.costa{at}exeter.ac.uk Markus Göker 2 Phylogenomics and Nomenclature Group, German Collection of Microorganisms and Cell Cultures GmbH (DSMZ) , Braunschweig, 38124, Germany Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Markus Göker Richard Bendall 3 Kai Ping , Goonvrea, St. Agnes, TR5 0NW, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site Fabrizio Costa 4 Department of Computer Science, Faculty of Environment, Science and Economy, University of Exeter , Streatham Campus, EX4 4QF, United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Fabrizio Costa For correspondence: m.vos{at}exeter.ac.uk f.costa{at}exeter.ac.uk Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Compiling and characterising the diversity of bacterial pathogens of humans is a critical challenge to tackle infection risk, especially in the context of global antimicrobial resistance, climate change, and changing demographics. Here, we present a scalable, automated pipeline that harnesses large language models (LLMs) to systematically mine the biomedical literature for information of human pathogenicity across the bacterial domain. By interrogating tens of thousands of PubMed abstracts we identify 1,222 species with at least one abstract documenting human infection, of which 783 species are supported by ≥3 abstracts and are regarded as ‘confirmed pathogens’. We extract, summarise, visualise and interpret data on infection contexts using both expert-curated LLM prompts and unsupervised text vectorisation. We show that these methods enable fine-grained trait mapping across taxa, including quantifying the degree of specialism or generalism in body site specificity for different taxa and the classification of pathogen species into 75 ‘pathogen types’. An objective measure of the rate at which species are reported in the literature coupled to species clustering offers insights into the drivers of pathogen emergence. Our LLM-driven strategy generates an open, updatable, evidence-based catalogue of bacterial human pathogens and their ecological and clinical traits, providing a foundation for public health surveillance, diagnostics, and predictive modelling. This work demonstrates the potential of AI-assisted literature synthesis to transform our understanding of microbial diversity, including its impact on human health. Introduction Pathogenic bacteria form a major burden on global health being linked to one in eight deaths around the world ( 1 ). Bacterial infections are likely to become more problematic in the future as the result of a host of anthropogenic changes, including the global antimicrobial resistance (AMR) crisis ( 2 ), medical advances causing more nosocomial infections ( 3 ), the emergence of new zoonoses ( 4 ), climate change ( 5 , 6 ) and changing lifestyles ( 7 ). To better understand the origin and drivers of bacterial infections in humans, and ultimately mitigate the burden of disease, it is a prerequisite to obtain a comprehensive overview of pathogen diversity. However, the vast amount of literature in which knowledge on the biological and societal complexity of infectious diseases is contained, forms a major barrier to obtain a broad overview of pathogens, detect general trends and test hypotheses. A recent literature study compiled the names of 1513 species associated with human infection reported pre-2021 ( 8 ). Although this has proven a useful resource (e.g. resulting in the inclusion of results in the AMR Package for R ( 9 )), manual curation proved labour intensive and false positives and false negatives were subsequently detected. We here describe an automated, LLM-based pipeline which obviates human effort, allows future workflow re-runs to integrate the continuous publication of new journal articles and consider nomenclatural revisions and extraction of additional species information. This approach can efficiently and reliably extract salient information at scale, producing a checklist of pathogens available to public health workers, (clinical) microbiologists and other researchers interested in the wide variety of bacterial species able to cause human disease. We also demonstrate that the wide range of infection characteristics this approach harvests, permits a rich and nuanced view of pathogen diversity. For instance, it is possible to quantify the diversity of infection-context for different species and genera, uncover groupings of more or less distinct pathogen types through hierarchical clustering and analyse trends of pathogen emergence over time. Results An automated pipeline uncovers species diversity of bacterial pathogens from the scientific literature To generate an automated list of bacterial pathogens of humans, we start with a large-scale keyword search to harvest all (∼24 million) indexed abstracts from PubMed ( https://pubmed.ncbi.nlm.nih.gov/ ) containing one or more prokaryotic species names validly published under the International Code of Nomenclature of Prokaryotes ( 10 ) (∼25,000 species names listed in the LPSN List of Prokaryotic names with Standing in Nomenclature https://lpsn.dsmz.de/ ; ( 11 )) in combination with keywords related to human hosts and infection to constrain the number of abstracts ( Fig. 1 ). A subset of maximum 200 randomly selected abstracts per species are fed to a dedicated Large Language Model prompt (Table S1) in a zero-shot learning setup ( 16 ) to detect whether the authors explicitly assert a causal link between species and infection in each abstract. After obtaining a YES/NO answer to the question whether the bacterium is interpreted by the authors to be the causative agent of disease, each abstract with a YES answer is subjected to a second LLM prompt (Table S1) to exclude false positives (such as microbiome studies describing bacteria associated with disease states). In a final step, causative abstracts describing (homotypic or heterotypic) synonyms or misspellings are merged under a single species name as indicated by LPSN. For instance, abstracts mentioning Haemophilus aphrophilus are merged with records for Aggregatibacter aphrophilus . Download figure Open in new tab Figure 1. Schematic of the pathogen species identification workflow. A: description of main steps of an automated workflow compiling all bacterial pathogens of humans described in all PuBMed abstracts. Some abstracts contain more than one species name; these abstracts can be sampled multiple times and so the abstract count does not refer to unique abstracts but rather to ‘abstract mentions’. As species records are merged as a late step in the workflow (to prevent synonymic names being analysed separately), more than 200 abstracts are interrogated by the LMMs for some species. B: key descriptive statistics. Based on our automated workflow, we initially identified 1222 human pathogen species based on 22,015 evidential (‘causative’) abstracts, out of a total of 88,989 abstracts analysed by the LLM ( Fig. 1 ; https://github.com/fabriziocosta/bacterial_pathogen_atlas ). To provide a ‘hurdle’ against potentially spurious cases (due to erroneous attribution by the LLM or by authors), only the 783 species with 3 or more abstracts evidencing pathogenicity available were considered ‘confirmed’ pathogens. Our method performs extremely well in comparison with a previous, manually curated list ( 8 ), adding species, discounting species wrongly deemed to be pathogenic and only missing a single pathogen species (Supplemental Material). For species with 3 to 200 abstracts this was counted directly, for species with more abstracts it was estimated from a random sample of 200. This index is thus determined by the significance (reflected in publication numbers) and causal association with infection. When species are ranked by pathogen index, species of high priority in ‘real-world’ clinical microbiology laboratory settings ( Clostridioides difficile, Salmonella enterica and Streptococcus pneumoniae ) top the list (Table S2), validating our approach. Large Language Models illustrate the broad diversity of bacterial pathogens of humans The abstracts describing bacterial infection uncovered above contain a host of relevant information which due to their sheer scale and complexity is not readily interpretable by human readers. We here convert text contained in all abstracts describing pathogen species into multidimensional vectors, making relationships between species amenable to visualisation and analysis. Specifically, we adopt two main approaches. A first method, ‘one-hot embedding’, is based on asking 78 author-compiled YES/NO questions concerning key concepts in bacterial infection biology (Table S3) of each abstract (e.g. “patient-related” - “PREGNANCY: Is the patient pregnant at the time of the infection?” or “body system-related” - “NERVOUS SYSTEM: Does the infection affect the brain, spinal cord, or nerves—for example, a brain abscess?”). Since each abstract typically addresses only a limited subset of infection-related concepts, the resulting question-abstract matrix is highly sparse. However, by aggregating vectors across all abstracts associated with each species, we obtain a much denser and more informative species-level representation. A second method of much higher dimensionality involves vectorising individual abstracts using OpenAI’s text-to-vector conversion tools ( https://platform.openai.com/docs/models/embeddings ). Rather than using word frequency, the LLM converts words and phrases into high-dimensional vectors that capture their meaning based on context, for example being able to distinguish homonyms. To solely focus on the meaning of each abstract and prevent the injection of ‘world knowledge’ (i.e. any preconceived notions the LLM has about a particular species), we systematically replaced all occurrences of bacterial species names with placeholders (e.g., ‘bacterium_1’) prior to vectorisation. After converting each abstract into a single 1,536-dimensional vector, we compute the average across all vectors associated with a given bacterial species to obtain a single species-level embedding. Both abstract interpretation methods perform well, as illustrated by an analysis of ten Klebsiella species where the outlier species K. granulomatis causing sexually transmitted disease is clearly separated from the other species which are transmitted mainly by other pathways ( Fig. 2 ). Figs. S1-S4 give examples of consistent detection of outlier species based on a further four genera. Download figure Open in new tab Figure 2. Abstract vectorisation of ten Klebsiella species demonstrating consistent separation of species with different infection characteristics Individual abstract vectors and species-aggregated vectors based on PubMed-extracted abstracts on human infection caused by ten different Klebsiella species. A: OneHot embedding (78 dimensions). B: Species-anonymised text vectorisation (1,536 dimensions). The dimensionality reduction technique t-SNE ( 38 ) was used to plot vectors in two dimensions. We applied the anonymised text vectorisation method to the 783 confirmed pathogen species, to project these high-dimensional vectors into two dimensions for visualisation purposes ( Fig. 3 ). Individual species are differentially coloured and their sizes are represented according to their pathogen index ( Fig. 3A ). When colouring species by phylum affiliation ( Fig. 3B ), it is clear that phylogenetic relatedness explains clustering to some extent, although phyla also show a significant degree of dispersion. Text vectorisation can be combined with One-Hot embedding-based vectorisation by using heat mapping to represent One-Hot categories. Fig. 3C uses this technique to show species that can be sexually transmitted with four species examples indicated. Similarly, Fig. 3D shows zoonotic pathogens, highlighting a zoonotic and a non-zoonotic Chlamydia species. In both heatmap plots, it is apparent that species that share one trait can nevertheless be widely dispersed across total ‘trait space’. Download figure Open in new tab Figure 3. Broad structuring of bacterial pathogen diversity based on text vectorisation. High-dimensional relations between 783 confirmed bacterial pathogen species of humans superimposed in two dimensions via multidimensional scaling t-SNE ( 38 ). The symbol size of individually coloured species is scaled according to the pathogen index (see main text). A: all species, coloured by genus. B: species labelled by Phylum (n=8), three actinomycete species are indicated. C: a heatmap of species for which at least one abstract indicates sexual transmission, with four named examples. D: a heatmap reflective of the mention of zoonotic context for each species; here, symbol colour is reflective of the absolute number of mentions in zoonotic context (rather than the proportion of abstracts) to not miss out on low-zoonotic risk-but highly prevalent pathogen species. Within- and between-genus variation in infection site specificity The One-Hot embedding approach is structured in thematic categories, such as ‘BODY SITE’, which contains eleven concepts related to the capacity to infect different parts of the body (nervous, visual, ear, circulatory, respiratory, digestive, skin, musculoskeletal, genitourinary, systemic, sterile site; Table S3). By investigating the distribution of concepts in a category it is possible to analyse generalist versus specialist traits. We calculate the Shannon Entropy using all eleven concepts for nine species-rich genera, enabling comparisons of within- and between-genus variation. The results fit known clinical microbiology diagnostics ( Fig. 4A ). For instance, the genus Mycobacterium displays a wide range of variation in the range of affected organ systems. The most specialised (i.e. having the lowest Shannon Entropy) Mycobacterium species, M. ulcerans , causes the cutaneous infection Buluri ulcer ( 12 ), and the most generalist species in the genus is M. flavescens , which is a ‘low-level’ pathogen that has been documented from a variety of body sites ( 13 ) ( Fig. 4B ). The genera Streptococcus, Staphylococcus (with the exception of the mainly uropathogenic S. saprophyticus ( 14 )) and Pseudomonas show a narrower distribution of predominantly generalist species able to infect a variety of body sites. The genus Legionella has the lowest mean entropy because most species predominantly cause respiratory infections ( 15 ). Analyses focusing on transmission mode or environmental- and animal reservoirs (Table S3) gave less clear-cut results (results not shown). This can be explained by the fact that we filtered abstracts initially for their description of human infection, rather than for abstracts specifically obtained via keyword filters and LLMs designed for answering epidemiology or environmental distribution-related questions. Download figure Open in new tab Figure 4. Variation in body site occupation calculated for nine species-rich genera. A: Shannon Entropy is used as a summary statistic for generalism (1 maximum generalism, 0 maximum specialism) based on Onehot Embedding Questions in the ‘body site’ categories (nervous, visual, ear, circulatory, respiratory, digestive, skin, musculoskeletal, genitourinary, systemic, sterile site; Table S3). Species in eleven species-rich genera with ≥3 available abstracts were used. B: ‘Radar plot’ depicting body site infection recorded for Mycobacterium ulcerans (low entropy; orange) and M. flavescens (high entropy; blue). Hierarchical clustering delineates species into distinct pathogen types As phylogenetic affiliation or trait-by-trait comparisons cannot always be relied on to meaningfully distinguish different pathogen species, we explore whether species naturally group into distinct pathogen types using hierarchical clustering ( 16 ) of text-vectorised abstracts. Using only species with a minimum of ten available abstracts to ensure sufficient data, we partitioned the data into 75 clusters ( Fig. 5 ). The resulting 75 ‘pathogen types’, consist of 1 one to 24 species (Table S4) and comprise a posteriori logical species groupings. For example, Cluster 5 is made up by eleven relatively infrequently occurring Gram-negative species causing GI-infections ( Aeromonas caviae, A. hydrophila, A. sobria, A. veronii, Campylobacter fetus, C. lari, Edwardsiella tarda, Grimontia hollisae, Laribacter hongkongensis, Plesiomonas shigelloides and Vibrio fluvialis ) ( Fig. 5 ). Cluster 6 consists of nine species associated with dog bites ( Bergeyella zoohelcum, Capnocytophaga canimorsus, Pasteurella canis, P. dagmatis, P. multocida, Rodentibacter pneumotropicus, Staphylococcus intermedius, S. pseudintermedius and Streptococcus canis ). Cluster 24 contains 14 species which favour anaerobic body sites ( Cutibacterium acnes, C. avidum, Finegoldia magna, Gleimia europaea, Mediterraneibacter gnavus, Metamycoplasma hominis, Paraclostridium bifermentans, Parvimonas micra, Peptoniphilus asaccharolyticus, Prevotella bivia, Schaalia turicensis, Trueperella bernardiae, Veillonella parvula and Winkia neuii ). As a final example, Cluster 34 contains five sexually transmitted pathogens ( Chlamydia trachomatis, Haemophilus ducreyi, Mycoplasmoides genitalium, Neisseria gonorrhoeae and Ureaplasma urealyticum ). Such a ‘pathogen type’-based approach could be useful for diagnostic laboratory protocols, where the identification of one species could prompt a search for species with similar infection characteristics. Download figure Open in new tab Figure 5. Hierarchical clustering of pathogen species into 75 ‘pathogen types’. High-dimensional relations between all bacterial pathogen species of humans with ≥10 causative abstracts available clustered into 75 ‘pathogen types’ (Table S4) superimposed via multidimensional scaling t-SNE ( 38 ) in two dimensions. The symbol size of individually coloured species is scaled according to the pathogen index (see main text), each cluster is assigned an arbitrary colour. Four clusters are indicated: cluster 5 ‘infrequently occurring gram-negative species causing GI-infections’, cluster 6 ‘species associated with dog bites’, cluster 24 species favouring anaerobic body sites and cluster 34 ‘sexually transmitted pathogens’ (individual species are listed in the main text). Detecting trends in the emergence of bacterial pathogen species While many well-recognised bacterial pathogens have infected human populations for millennia ( 17 ), species threatening human health continue to be described at an increasing rate ( 8 ). Pathogen species that increase in frequency (globally or in a specific geographic location) are commonly referred to as Emerging Infectious Diseases (EIDs) ( 18 ). The majority of EIDs recognised by (inter)national public health organisations such as the WHO, CDC or UKHSA are viral. The bacteriological literature frequently refers to bacterial species as ‘emerging’ (e.g. ( 19 - 21 )), but an accepted definition is lacking, and published lists are not in agreement ( 22 - 25 ), or only consider a subset of emerging pathogen types ( 26 - 28 ). Our dataset lacks information on case numbers, strain variants or geographical information and so we are unable to compile a list of EIDs sensu stricto . However, our data allow analyses of publication rates on pathogen types through time, which could shed light on which species have risen to prominence in recent history and elucidate their potential drivers. We first filter the dataset for species with ten or more available causative abstracts (i.e. species of some significance) first recorded as a pathogen post-1971 (n=208, Table S5). Counting the number of causative abstracts in the ten years since the first record is used as a measure of importance as an emerging pathogen. Ranking pathogens by this measure identifies important emerging species such as Bartonella henselae and Burkholderia cenocepacea , but also lower-ranked species which some authors have identified as emerging, including Mycobacterium genavense ( 29 ), Corynebacterium kroppenstedtii ( 30 ) and Actinobaculum schaalii ( 31 ). It is clear that significant new pathogen species continue to be recognised apace ( Fig. 6A ). Download figure Open in new tab Figure 6. Detecting trends in the emergence of bacterial pathogen species in the literature Literature analysis reveals trends in the emergence of bacterial pathogens in the literature. A: Number of newly emerging bacterial pathogen species (species described 1971 or later with ≥10 causative abstracts; Table S5) described per half-decade. Species are divided into time periods based on the year in which the tenth causative abstract was published. (Note that the decrease in number of described significant pathogen species in the last time period is attributable to i) not counting abstracts published in the last two years and ii) limited time having passed to accumulate many references). B: Dendrogram of the species most rapidly emerging in the literature (i.e. species that needed ten or fewer years to reach ten causative abstracts, n=81) divided into 13 clusters. C: Heatmap showing species association with the concept ‘zoonosis’. Symbol size reflects the number of causative abstracts in the first ten years of emergence for each species; top scoring species are indicated. D: Heatmap showing species association with the concept ‘multi-drug resistance’ with top scoring species indicated. Symbol size reflects the number of causative abstracts in the first ten years of emergence for each species; top scoring species are indicated. Applying hierarchical clustering to the most rapidly emerging species (those reaching a count of ten causative abstracts in ten or fewer years, n=81, Table S5) and constraining to 13 clusters with a maximum of twelve species per cluster produces logical groupings, including zoonotic species (cluster 6), species causing endocarditis (cluster 8) and gram positive species causing UTIs (cluster 9) as well as two species clusters where shared infection characteristics are not immediately obvious (clusters 10, 11) ( Fig. 6B ; Table S6). These species can be plotted on a timeline, depicting importance and cluster association (Fig. S5); but it is also possible to use a heatmap to depict general trends, such as the association with the concept zoonosis ( Fig. 6C ) or multi-drug resistance ( Fig. 6D ). The latter approach suggests that antimicrobial resistance is currently a more important factor than zoonoses in the emergence of bacterial pathogens. This stands in contrast to viruses where zoonotic emergence is of greatest concern ( 32 ). Discussion We here use LLMs to untap the ‘wisdom of the crowd’ contained in a vast body of peer-reviewed microbiological literature, enabling analyses of bacterial pathogen diversity. We make available an objective, robust pathogen ‘checklist’ with evidential PubMed IDs which is superior to a previous manually curated effort ( 8 ), as well as a separate list of ‘emerging bacterial pathogens’ based on objective publication trend criteria. A combination of user-generated key LLM questions and abstract-text vectorisation could be shown to accurately extract key species characteristics at scale, allowing intuitive and statistical exploration. This extends from the extraction of ‘knowable’ traits, i.e. those that with considerable effort could manually be extracted from the literature, to complex ‘unknowable’ information, for instance revealing natural groupings of pathogen species. Our LLM-based approach holds promise for applied purposes such as clinical diagnostics, an area that is currently for a large part based on expert knowledge of clinicians and technicians. Our pathogen list can serve as a benchmark and, with harvesting of more data, can form the basis of a relational database aided by visualisation options. It is also possible to use information on infection contexts from the literature for predictive modelling, for instance training ML models on information retrieved from literature on zoonotic pathogen species to predict the emergence potential of animal pathogens, or to combine contextual information from the literature with information of antimicrobial resistance gene carriage. Future analyses will require expansion of our methodology to include different filters and LLM queries (e.g. incorporating antibiotic resistance, disease classification and strain-level descriptions) and could also be applied to other pathogen types such as viruses or fungi. Here, analyses were based on Open Access abstracts, which limits monetary- and carbon costs and more readily allows scrutiny by others, but they could be extended to full papers. We demonstrate that Artificial Intelligence-based literature research allows the exploration of broad patterns in bacterial pathogen diversity. LLM-based text mining could hold promise for a wide variety of questions in microbiology, provided there is a rich literature available on a diversity of species, traits, environments or mechanisms. Possible examples include the evolution of traits ( 33 ), specialism versus generalism ( 34 ), adaptive radiations ( 35 ) or host shifts ( 36 ). As literature continues to grow exponentially, LLM-based mining will usable increasingly essential in bridging the gap between ‘information abundance’ and actionable insights such as are needed in clinical and public health contexts. Our analyses illustrate that human expertise can be amplified by artificial intelligence to enhance reproducibility, transparency and accelerate scientific discovery. Methods Automated Literature Search We compiled a local database of PubMed-indexed abstracts ( https://pubmed.ncbi.nlm.nih.gov/ ) that referenced full binomial bacterial species names— e.g., Acinetobacter baumannii , but not abbreviated forms like A. baumannii —in conjunction with at least one term from each of two curated keyword sets ( Fig. 1 ). The first set comprised terms related to human association, while the second targeted infection-related descriptions. This dual-filtering approach ensured that retrieved abstracts were thematically relevant to human infectious disease. A maximum of 10,000 abstracts were downloaded per species name to ensure computational tractability. The list of validly published species names was obtained from the List of Prokaryotic names with Standing in Nomenclature (LPSN) ( https://lpsn.dsmz.de/ ) and used as the basis for the search. To prevent redundancy due to heterotypic or homotypic synonyms—as well as common misspellings—species names were consolidated in the final stage of the workflow by merging variant names under a single, standardised species name ( Fig. 1 ). Keyword matching was performed across multiple PubMed fields, including title, abstract, MeSH terms, and keyword metadata (OT). The human-association keyword set included broad demographic and clinical descriptors such as: case, cases, clinical, clinically, human, humans, male, males, neonates, neonate, neonatal, female, females, girl, girls, boy, boys, adolescent, adolescents, individual, individuals, baby, babies, elderly, man, men, women, woman, infant, infants, patient, patients, child, children, adult, adults . The second keyword set for association with infection consisted of the terms: cause, causation, caused, causative, infected, infection, infections, infectious, infective, disease, morbidity, mortality, sepsis, septic, mycetoma, septicaemia, sinonasal, diarrhoea, bacteraemia, bacteremia, blood, fever, pus, cystic, purulent, pyogenic, abscess, empyema, pneumonia, phlegmon, erysipelas, ulcer, ecthyma, dysentery and systemic . The analysis was run 12/10/2024. Only abstracts published in the period 1950-2024 were analysed. Large Language Models to establish infection causation To support language understanding and representation tasks, we implemented a modular framework incorporating API-based Large Language Models (LLMs) from Anthropic (Claude 3.5 Sonnet, for prompt based query-answer processing) and OpenAI (text-embedding-3-small) to map natural language inputs into dense numerical vectors that capture semantic meaning for tasks like clustering) providing unified interfaces for both text generation (‘answer’) and embedding extraction (‘transform’). The Anthropic LLM class interacts with Anthropic’s Claude models, while the OpenAI LLM class interfaces with OpenAI’s models for embedding via the text-embedding-3-small model. Both classes include custom mechanisms for robust API communication, including key management, configurable generation parameters, and retry logic for fault tolerance. A Python custom LLM wrapper was created for flexible pairing of generation and embedding backends, facilitating integration into downstream tasks such as clustering and visualization. Intermediate workflow results were stored in structured data frames for reproducibility. Question based embedding vectorization To ensure a standardized data collection approach, 78 “YES” or “NO” questions on infection biology (Table S3) were devised by domain experts RB and MV for interrogation of each causation abstract. Each abstract was transformed into a binary vector of length 78 where a value of 1 indicates a ‘YES’ response and 0 a ‘NO’ response in a one-hot encoding approach ( 37 ). Since individual abstracts typically are concerned with only a few concepts, information for a given species was aggregated by computing the element-wise average of all binary vectors associated abstracts for that species. The result is a real-valued vector (also length 78) in which each value represents the proportion of abstracts for that species that mention a given concept. This averaged vector serves as a compact, interpretable species ‘fingerprint’ and can be used to compute scalar values (e.g. via norms or projections) for heat map colouring or clustering (see below). Anonymised text vectorisation Prior to LLM processing, each binomial species name was replaced with a generic placeholder (‘bacterial_species1’, ‘bacterial_species2’) to limit LLM interpretation strictly to information present within each abstract, ensuring that its output is not influenced by its pre-existing knowledge base. Next, OpenAI’s text vectorisation method was used to convert abstracts into fixed-length numerical embeddings into a 1,536-dimensional vector. Each abstract’s raw text undergoes an initial step of tokenization using a specialized tokenizer. Tokens then pass through a pre-trained deep transformer model where each token is given a contextualized representation, capturing both syntactic and semantic relationships. Finally, these individual token representations are combined to create a single, fixed-size 1,536-dimensional vector encoding the abstract’s overall meaning. Visualization To visualise (dis)similarities between bacterial species based on the content of averaged associated scientific abstracts, we apply t-distributed Stochastic Neighbor Embedding (t-SNE) ( 38 ) to reduce high-dimensional vectors to two dimensions. To visualize species-level patterns in concept prevalence, we use the averaged binary vectors that represent the proportion of abstracts affirming each infection-related concept for a given species. We calculate the Euclidean norm of the vector projection of this 78-dimensional vector onto the specific questions that identify the properties of interest for our analysis to yield a single scalar value reflecting the overall prominence of selected concepts in the literature associated with that particular species. Clustering To identify groups of bacterial species with similar literature-derived profiles, we employ a two-stage clustering approach designed to enhance robustness against noise and outliers. First, we apply an Isolation Forest algorithm ( 39 ) to isolate data points by randomly selecting features and splitting values. Using a predefined contamination parameter, this step identifies species whose embeddings deviate markedly from the rest, likely due to sparse or inconsistent textual evidence. These outliers (‘-1’) are flagged and excluded from further clustering, ensuring they do not distort the resulting groupings. Each excluded species is assigned a cluster label of -1 to indicate its anomalous nature. Next, we perform agglomerative hierarchical clustering on the remaining inlier species. We use Ward’s linkage method ( 16 ), which iteratively merges clusters in a manner that minimises the total within-cluster variance. This linkage criterion is well suited for continuous vector representations like the averaged text embeddings used here, as it encourages the formation of compact, homogeneous clusters. The hierarchical structure produced by this method allows for flexible control over granularity e.g. by cutting the dendrogram at a fixed number of clusters or using distance thresholds. Together, this two-step procedure yields interpretable and reliable groupings of pathogen species based on semantic similarity. Data availability Raw data and scripts to generate the figures in this paper can be found at: https://github.com/fabriziocosta/bacterial_pathogen_atlas . Acknowledgments M.V. acknowledges funding support from the National Environment Research Council (NERC; NE/T008083/1) and M.V. and F.C. thank the University of Exeter Institute for Data Science and Artificial Intelligence (IDSAI) for seedcorn funding. We thank Aneesh Krishnaraj Nejikar, Finley Gibson and Michael Saunby for their help with initial stages of this research and Elze Hesse and Mario Recker for helpful comments on the manuscript. Funder Information Declared NERC Environmental Bioinformatics Centre, https://ror.org/03rjhjc18 , NE/T008083/1 Footnotes https://github.com/fabriziocosta/bacterial_pathogen_atlas References 1. ↵ Ikuta KS , Swetschinski LR , Aguilar GR , Sharara F , Mestrovic T , Gray AP , et al. Global mortality associated with 33 bacterial pathogens in 2019: a systematic analysis for the Global Burden of Disease Study 2019 . The Lancet . 2022 ; 400 ( 10369 ): 2221 – 48 . OpenUrl 2. ↵ Murray CJ , Ikuta KS , Sharara F , Swetschinski L , Aguilar GR , Gray A , et al. Global burden of bacterial antimicrobial resistance in 2019: a systematic analysis . The Lancet . 2022 ; 399 ( 10325 ): 629 – 55 . OpenUrl CrossRef 3. ↵ Kollef MH , Torres A , Shorr AF , Martin-Loeches I , Micek ST . Nosocomial infection . Critical Care Medicine . 2021 ; 49 ( 2 ): 169 – 87 . OpenUrl PubMed 4. ↵ Jones BA , Grace D , Kock R , Alonso S , Rushton J , Said MY , et al. Zoonosis emergence linked to agricultural intensification and environmental change . Proceedings of the National Academy of Sciences . 2013 ; 110 ( 21 ): 8399 – 404 . OpenUrl Abstract / FREE Full Text 5. ↵ Vos M. The evolution of bacterial pathogens in the Anthropocene . Infection, Genetics and Evolution . 2020 : 104611 . 6. ↵ Hauser N , Conlon KC , Desai A , Kobziar LN . Climate change and infections on the move in North America . Infection and Drug Resistance . 2021 : 5711 – 23 . 7. ↵ Huttunen R , Syrjänen J. Obesity and the risk and outcome of infection . International Journal of Obesity . 2013 ; 37 ( 3 ): 333 – 40 . OpenUrl CrossRef 8. ↵ Bartlett A , Padfield D , Lear L , Bendall R , Vos M. A comprehensive list of bacterial pathogens infecting humans . Microbiology . 2022 ; 168 ( 12 ): 001269 . OpenUrl CrossRef 9. ↵ Berends MS , Luz CF , Friedrich AW , Sinha BN , Albers CJ , Glasner C. AMR: an R package for working with antimicrobial resistance data . Journal of Statistical Software . 2022 ; 104 : 1 – 31 . OpenUrl 10. ↵ Oren A , Arahal DR , Göker M , Moore ER , Rossello-Mora R , Sutcliffe IC . International code of nomenclature of prokaryotes . Prokaryotic code (2022 revision). International Journal of Systematic and Evolutionary Microbiology . 2023 ; 73 ( 5a ): 005585 . OpenUrl 11. ↵ Meier-Kolthoff JP , Carbasse JS , Peinado-Olarte RL , Göker M. TYGS and LPSN: a database tandem for fast and reliable genome-based classification and nomenclature of prokaryotes . Nucleic Acids Research . 2022 ; 50 ( D1 ):D801-D7. 12. ↵ Portaels F , Silva MT , Meyers WM . Buruli ulcer . Clinics in Dermatology . 2009 ; 27 ( 3 ): 291 – 305 . OpenUrl CrossRef PubMed Web of Science 13. ↵ Sethi S , Gupta V , Bhattacharyya S , Sharma M. Post-laparoscopic wound infection caused by scotochromogenic nontuberculous Mycobacterium . Japanese Journal of Infectious Diseases . 2011 ; 64 ( 5 ): 426 – 7 . OpenUrl PubMed 14. ↵ Hovelius B , Mårdh P-A. Staphylococcus saprophyticus as a common cause of urinary tract infections. Reviews of Infectious Diseases . 1984 ; 6 ( 3 ): 328 – 37 . OpenUrl PubMed 15. ↵ Edelstein PH , Lück C. Legionella . Manual of clinical microbiology . 2015 : 887 – 904 . 16. ↵ Ward Jr JH . Hierarchical grouping to optimize an objective function . Journal of the American Statistical Association . 1963 ; 58 ( 301 ): 236 – 44 . OpenUrl CrossRef Web of Science 17. ↵ Kennedy J. Pathogenesis: How germs made history . London, Torva . 2023 . 18. ↵ Jones KE , Patel NG , Levy MA , Storeygard A , Balk D , Gittleman JL , et al. Global trends in emerging infectious diseases . Nature . 2008 ; 451 ( 7181 ): 990 . OpenUrl CrossRef PubMed Web of Science 19. ↵ Angrup A , Sharma B , Sehgal IS , Biswal M , Ray P. Emerging Bacterial Pathogens in the COVID-19 Era: Chryseobacterium gleum—A Case in Point . Journal of Laboratory Physicians . 2023 ; 15 ( 01 ): 097 – 105 . OpenUrl 20. Gerrard J , Waterfield N , Vohra R. Human infection with Photorhabdus asymbiotica: an emerging bacterial pathogen . Microbes and Infection . 2004 ; 6 ( 2 ): 229 – 37 . OpenUrl CrossRef PubMed Web of Science 21. ↵ Igbinosa IH , Igumbor EU , Aghdasi F , Tom M , Okoh AI . Emerging Aeromonas species infections and their significance in public health . The Scientific World Journal . 2012 ; 2012 ( 1 ): 625023 . OpenUrl PubMed 22. ↵ Houpikian P , Raoult D. Traditional and molecular techniques for the study of emerging bacterial diseases: one laboratory’s perspective . Emerging Infectious Diseases . 2002 ; 8 ( 2 ): 122 . OpenUrl CrossRef PubMed 23. Levitt AM , Khan AS , Hughes JM . Emerging and re-emerging pathogens and diseases . Infectious Diseases . 2012 : 56 . 24. Vouga M , Greub G. Emerging bacterial pathogens: the past and beyond . Clinical Microbiology and Infection . 2016 ; 22 ( 1 ): 12 – 21 . OpenUrl CrossRef PubMed 25. ↵ Woolhouse ME , Gowtage-Sequeria S. Host range and emerging and reemerging pathogens . Emerging Infectious Diseases . 2005 ; 11 ( 12 ): 1842 . OpenUrl CrossRef PubMed Web of Science 26. ↵ Manfredi R , Nanetti A , Ferri M , Chiodo F. Pseudomonas organisms other than Pseudomonas aeruginosa as emerging bacterial pathogens in patients with human immunodeficiency virus infection . Infectious Diseases in Clinical Practice . 2000 ; 9 ( 2 ): 79 – 87 . OpenUrl 27. Mor-Mur M , Yuste J. Emerging bacterial pathogens in meat and poultry: an overview . Food and Bioprocess Technology . 2010 ; 3 ( 1 ): 24 – 35 . OpenUrl CrossRef 28. ↵ Parkins MD , Floto RA . Emerging bacterial pathogens and changing concepts of bacterial pathogenesis in cystic fibrosis . Journal of Cystic Fibrosis . 2015 ; 14 ( 3 ): 293 – 304 . OpenUrl PubMed 29. ↵ Böttger E. Mycobacterium genavense: an emerging pathogen . European Journal of Clinical Microbiology and Infectious Diseases . 1994 ; 13 ( 11 ): 932 – 6 . OpenUrl CrossRef PubMed 30. ↵ Wong SC , Poon RW , Chen JH , Tse H , Lo JY , Ng T-K , et al. , editors. Corynebacterium kroppenstedtii is an emerging cause of mastitis especially in patients with psychiatric illness on antipsychotic medication. Open Forum Infectious Diseases ; 2017 : Oxford University Press US . 31. ↵ Cattoir V. Actinobaculum schaalii: review of an emerging uropathogen . Journal of Infection . 2012 ; 64 ( 3 ): 260 – 7 . OpenUrl CrossRef PubMed Web of Science 32. ↵ Olival KJ , Hosseini PR , Zambrana-Torrelio C , Ross N , Bogich TL , Daszak P. Host and viral traits predict zoonotic spillover from mammals . Nature . 2017 ; 546 ( 7660 ): 646 – 50 . OpenUrl CrossRef PubMed 33. ↵ Martiny JB , Jones SE , Lennon JT , Martiny AC . Microbiomes in light of traits: a phylogenetic perspective . Science . 2015 ; 350 ( 6261 ): aac9323 . OpenUrl Abstract / FREE Full Text 34. ↵ Visher E , Boots M. The problem of mediocre generalists: population genetics and eco-evolutionary perspectives on host breadth evolution in pathogens . Proceedings of the Royal Society B . 2020 ; 287 ( 1933 ): 20201230 . OpenUrl CrossRef PubMed 35. ↵ Vos M , Padfield D , Quince C , Vos R. Adaptive radiations in natural populations of prokaryotes: innovation is key . FEMS Microbiology Ecology . 2023 ; 99 ( 12 ): fiad154 . OpenUrl PubMed 36. ↵ Shaw LP , Wang AD , Dylus D , Meier M , Pogacnik G , Dessimoz C , et al. The phylogenetic range of bacterial and viral pathogens of vertebrates . Molecular ecology . 2020 ; 29 ( 17 ): 3361 – 79 . OpenUrl CrossRef 37. ↵ Manning C , Schutze H. Foundations of statistical natural language processing : MIT press , Cambridge MA ; 1999 . 38. ↵ Maaten Lvd , Hinton G. Visualizing data using t-SNE . Journal of Machine Learning Research . 2008 ; 9 ( Nov ): 2579 – 605 . OpenUrl 39. ↵ Liu FT , Ting KM , Zhou Z-H , editors. Isolation forest. Eighth IEEE International Conference on Data Mining ; 2008 . View the discussion thread. Back to top Previous Next Posted July 29, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Large Language Model-assisted text mining reveals bacterial pathogen diversity Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Large Language Model-assisted text mining reveals bacterial pathogen diversity Michiel Vos , Markus Göker , Richard Bendall , Fabrizio Costa bioRxiv 2025.07.29.667369; doi: https://doi.org/10.1101/2025.07.29.667369 Share This Article: Copy Citation Tools Large Language Model-assisted text mining reveals bacterial pathogen diversity Michiel Vos , Markus Göker , Richard Bendall , Fabrizio Costa bioRxiv 2025.07.29.667369; doi: https://doi.org/10.1101/2025.07.29.667369 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Microbiology Subject Areas All Articles Animal Behavior and Cognition (7636) Biochemistry (17698) Bioengineering (13896) Bioinformatics (41959) Biophysics (21457) Cancer Biology (18597) Cell Biology (25523) Clinical Trials (138) Developmental Biology (13381) Ecology (19904) Epidemiology (2067) Evolutionary Biology (24323) Genetics (15612) Genomics (22511) Immunology (17738) Microbiology (40401) Molecular Biology (17184) Neuroscience (88624) Paleontology (667) Pathology (2833) Pharmacology and Toxicology (4825) Physiology (7644) Plant Biology (15158) Scientific Communication and Education (2046) Synthetic Biology (4296) Systems Biology (9825) Zoology (2271)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.