Full text
70,995 characters
· extracted from
preprint-html
· click to expand
PRSformer: Disease Prediction from Million-Scale Individual Genotypes | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results PRSformer: Disease Prediction from Million-Scale Individual Genotypes Payam Dibaeinia , Chris German , Suyash Shringarpure , Adam Auton , Aly A. 
Khan doi: https://doi.org/10.1101/2025.10.26.684578 Payam Dibaeinia 1 23andMe , Palo Alto, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Chris German 1 23andMe , Palo Alto, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Suyash Shringarpure 1 23andMe , Palo Alto, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Adam Auton 1 23andMe , Palo Alto, CA, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site Aly A. Khan 1 23andMe , Palo Alto, CA, USA 2 University of Chicago , Chicago, IL, USA Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: aakhan{at}uchicago.edu Abstract Full Text Info/History Metrics Preview PDF Abstract Predicting disease risk from DNA presents an unprecedented emerging challenge as biobanks approach population-scale sizes ($N > 10^6$ individuals) with ultra-high-dimensional features ($L > 10^5$ genotypes). Current methods, often linear and reliant on summary statistics, fail to capture complex genetic interactions and discard valuable individual-level information. We introduce PRSformer, a scalable deep learning architecture designed for end-to-end, multitask disease prediction directly from million-scale individual genotypes. PRSformer employs neighborhood attention, achieving linear $\mathcal{O}(L)$ complexity per layer, making Transformers tractable for genome-scale inputs. Crucially, PRSformer utilizes a stacking of these efficient attention layers, progressively increasing the effective receptive field to model local dependencies (e.g., within linkage disequilibrium blocks) before integrating information across wider genomic regions. 
This design, tailored for genomics, allows PRSformer to learn complex, potentially non-linear and long-range interactions directly from raw genotypes. We demonstrate PRSformer’s effectiveness using a unique large private cohort ( N ≈ 5M) for predicting 18 autoimmune and inflammatory conditions using L ≈ 140k variants. PRSformer significantly outperforms highly optimized linear models trained on the same individual-level data and state-of-the-art summary-statistic-based methods (LDPred2) derived from the same cohort, quantifying the benefits of non-linear modeling and multitask learning at scale. Furthermore, experiments reveal that the advantage of non-linearity emerges primarily at large sample sizes ( N > 1M), and that a multi-ancestry trained model improves generalization, establishing PRSformer as a new framework for deep learning in population-scale genomics. 1 Introduction Learning predictive models from high-dimensional, complex structured data is a fundamental machine learning challenge. This challenge is acutely relevant in modern genomics, where biobanks are rapidly scaling towards million-sample sizes ( N > 1,000,000) and individual genomes are characterized by hundreds of thousands to millions of genetic variants (e.g., Single Nucleotide Polymorphisms, SNPs), yielding a regime of ultra-high dimensionality ( L > 100,000). Effectively leveraging individual-level genomic data at this unprecedented N × L scale is critical for unlocking deeper insights into complex trait genetics, such as predicting disease susceptibility [ 1 , 2 ]. Current state-of-the-art methods for disease risk prediction primarily rely on Polygenic Risk Scores (PRS) derived from Genome-Wide Association Study (GWAS) summary statistics [ 3 , 4 , 5 , 6 ]. While effective to a degree, these methods are predominantly linear, capturing additive genetic effects. 
Furthermore, by operating on summary statistics, they discard potentially valuable individual-level information and struggle to model non-additive genetic interactions (epistasis) [ 7 , 8 , 9 , 10 ]. These limitations may cap predictive performance, especially as dataset sizes scale towards the million-sample regime where subtle interaction effects might become detectable. Transformer architectures have revolutionized sequence modeling in other domains by capturing complex, long-range dependencies via self-attention [ 11 ]. We hypothesize that the attention mechanism provides a powerful inductive bias for genomics, enabling more effective modeling of pairwise linear and non-linear interactions between genetic loci compared to traditional architectures or inherently linear models. However, a critical barrier persists: the prohibitive $\mathcal{O}(L^2)$ computational complexity of standard self-attention renders its direct application to genome-scale sequences ( L > 100,000) computationally infeasible. Addressing this scalability bottleneck is critical to harnessing the power of Transformers for large-scale genomics. Here we introduce PRSformer, a novel Transformer-based architecture specifically engineered for scalable, end-to-end, multitask disease risk prediction directly from ultra-high-dimensional ( L > 100k) individual-level genotypes ( N > 1M). PRSformer’s core innovation lies in its scalability, achieved by incorporating neighborhood attention (NA) [ 12 ], an efficient attention mechanism restricting computations to local genomic windows, resulting in $\mathcal{O}(L)$ complexity per layer. This design aligns with the biological structure of the genome, which we treat as a series of linked regions, called linkage disequilibrium (LD) blocks, where genetic variants are often inherited together. 
PRSformer stacks NA layers, which first model interactions within LD blocks, then progressively integrates information between neighboring LD blocks in deeper layers, capturing larger-scale genetic patterns influencing disease. To evaluate PRSformer’s ability to harness the shared genetics underlying multiple, related traits, we trained and validated it in a multitask setting across 18 autoimmune and inflammatory conditions. This trait set provides an ideal testbed for multitask learning due to immune-mediated inflammatory diseases frequently exhibiting shared inflammatory pathophysiology and overlapping genetic factors [ 13 ]. The multitask formulation allows us to test PRSformer’s ability to exploit shared genetic associations while learning disease-specific patterns, aiming to enhance predictive performance through a shared representation. We provide rigorous empirical validation using data from a unique large private cohort ( N ≈ 3.8M European-ancestry individuals) for training, validation, and evaluation across D = 18 autoimmune and inflammatory conditions using L ≈ 140k variants. The scale of this cohort significantly exceeds current public biobanks [ 14 ], providing a critical real-world testbed for methods designed to handle genomic data of the million-scale magnitude. We conduct stringent comparisons against: (i) highly optimized linear models (regularized logistic regression with learnable embeddings) trained on the exact same individual-level genotype data , isolating the benefit of PRSformer’s non-linear architecture; and (ii) state-of-the-art summary-statistic-based PRS methods (LDPred2 [ 5 ]) derived from the exact same cohort , enabling a direct comparison between end-to-end and summary-statistic-based approaches. Our experiments demonstrate statistically significant performance gains for PRSformer. 
Our main contributions are: Scalable deep learning for genomics at population scale: We present an efficient multitask Transformer architecture applied to population-scale data ( N ≈ 5M) with ultra-high-dimensional features ( L ≈ 140K), establishing a blueprint for tackling other genome sequence prediction tasks. Critical scaling law for non-linear models: We empirically establish and quantify a key scaling law demonstrating that the predictive advantage of non-linear models over linear methods emerges primarily at the million-sample scale for complex immune-related conditions. Our analysis quantifies this effect, showing that performance gains grow consistently as the training set size increases beyond one million individuals. Multitask learning improves genomic prediction: We show that multitask training across related traits consistently outperforms the standard single-task paradigm, demonstrating the benefit of learning a shared genetic representation across complex immune-mediated inflammatory diseases. Improved cross-ancestry generalization: We show that training PRSformer on multiancestry data, including an additional ∼ 1.1M non-European individuals, markedly improves prediction accuracy for held-out non-European individuals compared to a model trained only on European-ancestry data, offering a path toward more equitable genomic prediction. 2 Related work This work is situated at the intersection of statistical genetics, genomics, and deep learning. We specifically advance upon prior work in three key areas: polygenic risk prediction, the application of deep learning to genomic data, and the development of efficient Transformer architectures for ultra-long sequences. 2.1 Polygenic risk score methods Traditional complex trait prediction relies heavily on PRS derived from GWAS summary statistics [ 4 , 6 ]. Early methods often involved simple thresholding and summing of SNP effects [ 3 , 15 ]. 
More recent Bayesian approaches, such as LDpred2 [ 5 ] and PRS-CS [ 16 ], explicitly model LD patterns and utilize shrinkage priors to improve predictive accuracy. These methods represent the current state-of-the-art for prediction from summary statistics. However, PRS methods based on summary statistics are fundamentally limited in several ways. First, by discarding individual-level genotype and haplotype information, these methods cannot capture LD structure and must instead rely on LD estimates that are typically imputed from external reference panels, which can introduce biases due to population mismatches [ 17 , 18 ]. Second, they cannot capture variant-variant interactions such as epistasis as they are restricted to using marginal variant effects. Third, the use of precomputed summary statistics constrains these models to largely linear architectures, precluding the discovery of complex multi-locus or hierarchical genetic patterns. Recent summary-statistic approaches to leverage non-additive signal remain constrained by the lack of individual-level haplotype context [ 19 ]. Taken together, these limitations may cap predictive performance, particularly as biobank-scale datasets grow large enough to enable the detection of more subtle and nonlinear genetic effects. Alternative individual-level approaches, such as BOLT-LMM [ 20 ] and GEMMA [ 21 ], estimate SNP effect sizes under a linear mixed model framework to account for population structure and polygenic background effects. However, these methods are computationally demanding at our study’s scale ( N =3.8M, D =18 traits): GEMMA’s cubic complexity in N renders it intractable, while BOLT-LMM, though more scalable, operates on a single trait at a time, requiring 18 separate runs. Prior work has shown that LDPred2 achieves predictive performance comparable to BOLT-LMM across multiple traits [ 22 , 23 ], supporting its use as a strong linear baseline for comparison. 
2.2 Deep learning in genomics and trait prediction Deep learning has been successfully applied to various supervised genomic prediction tasks. Much work has focused on modeling sequence-level information (DNA base pairs) to predict molecular phenotypes like transcription factor binding [ 24 , 25 , 26 ], chromatin accessibility [ 27 ], or gene expression [ 28 , 29 ]. These approaches have predominantly utilized Convolutional Neural Networks (CNNs) or Transformers incorporating CNN-style tokenization, which are well-suited to capturing biologically meaningful motifs and local patterns at base-pair resolution. However, this paradigm is less intuitive when modeling the influence of genetic variants (e.g., SNPs) on complex traits, as causal variants can be spread across the genome and may interact over long distances, often without strong local sequence motifs defining their impact. The application of deep learning to predict complex traits (like disease status) directly from individual-level genotype data (i.e., SNP arrays) remains relatively underexplored, particularly at the population scale addressed in this paper [ 30 , 31 ]. This is largely due to the challenges of ultra-high dimensionality ( L ) and, until recently, the limited statistical power of publicly available cohorts with both individual-level genotypes and phenotypes ( N ). Prior work in this specific domain has often relied on: (i) tree ensemble models such as gradient boosting or simple neural networks trained on reduced feature sets (e.g., using LASSO feature selection); (ii) smaller cohorts where complex interactions are difficult to detect; and (iii) models operating on precomputed PRS or summary statistics rather than raw genotype data [ 8 , 32 , 33 , 34 , 35 , 36 ]. Recently, Phenformer [ 37 ] proposed a multi-scale Transformer that predicts disease risk from DNA sequences by linking genetic variation, gene expression, and phenotype through a pretrained sequence-to-expression backbone. 
While conceptually similar to our end-to-end genotype-to-phenotype goal, Phenformer operates on DNA sequences covering approximately 3% of the genome and is trained on ∼ 150K individuals, whereas our approach models variant-level genotype data and scales to millions of individuals, enabling systematic analysis of how nonlinearity interacts with data scale in complex trait prediction. 2.3 Efficient transformer architectures Applying standard Transformers to genome-scale data ( L > 100k) is computationally prohibitive due to the $\mathcal{O}(L^2)$ complexity of self-attention [ 11 , 38 ]. A wide range of efficient attention mechanisms have been proposed to address this limitation, including sparse attention patterns (e.g., Longformer [ 39 ], BigBird [ 40 ]), low-rank approximations (e.g., Linformer [ 41 ]), and kernel-based methods (e.g., Performer [ 42 ]). Other architectures exploit locality through sliding windows or blockwise mechanisms (e.g., Swin Transformer [ 43 ]) to reduce complexity while capturing local dependencies. In our work, we adopt neighborhood attention [ 12 ], a variant of self-attention in which each query attends only to a fixed-size local window of neighboring tokens, rather than the full sequence. This inductive bias aligns well with the block-like correlation structure of genomic data driven by LD. By limiting attention to a neighborhood of size $k \ll L$, NA reduces both computational and memory complexity to $\mathcal{O}(L \cdot k)$, achieving linear scaling in sequence length. We employ the optimized GPU implementation provided by the NATTEN library [ 12 , 44 ], which supports scalable training on long sequences while maintaining the expressiveness of content-based attention. 3 Methods 3.1 Problem definition We aim to predict susceptibility to multiple ( D = 18) autoimmune and inflammatory conditions from individual-level genotypes. 
Formally, given a dataset of N individuals, the input for individual i is their genotype profile $x_i \in \{0, 1, 2, \mathrm{UNKN}\}^L$, representing genotypes of L pre-selected genetic variants, where UNKN indicates missing data. The target output is a vector $y_i \in \{0, 1, \mathrm{UNKN}\}^D$, representing the binary status (case/control) for D diseases, where UNKN indicates unrecorded status. Our goal is to learn a multitask function $f : \mathbb{Z}^L \to [0, 1]^D$ that predicts the probability of each disease, $\hat{Y}_i = f(x_i)$. The primary challenges lie in the ultra-high dimensionality ( L ≈ 140k in this work), the need to capture potentially non-linear and long-range interactions, and leveraging the statistical power of population-scale datasets ( N ≈ 5M total used in this study). We propose a Transformer-based architecture adapted for this task, leveraging efficient attention mechanisms for scalability and multi-task learning for joint prediction across diseases. 3.2 PRSformer architecture PRSformer adapts the Transformer architecture for disease prediction from ultra-long ( L ≈ 140k) individual-level genotype sequences using the following key designs: Scalability via Neighborhood Attention Standard $\mathcal{O}(L^2)$ self-attention is computationally infeasible. We replace it with Neighborhood Attention (NA) [ 12 ], restricting each query token’s attention to a symmetric local window of size k . This reduces complexity to $\mathcal{O}(L \cdot k)$, enabling efficient processing of the L = 137,245 input variants used in this study. We use k = 385, chosen via hyperparameter tuning ( Section 3.5 , Supplementary Table F8 ), which conceptually aligns with capturing dependencies within local LD blocks ( Figure 1A ) and corresponds to roughly 100 kilobases along the genome [ 45 ]. Download figure Open in new tab Figure 1. Schematic overview of PRSformer and its core components. 
(A) A preselected set of variants across the 22 chromosomes, sorted by genomic position, allows each query variant (e.g., blue or red) to attend within a local block ( k = 385) via neighborhood attention. (B) Model architecture with input, Transformer blocks, and output layer. (C) Variant embedding layer, which encodes each variant and its corresponding genotype (0, 1, 2, or UNKN) into a 64-dimensional representation. (D) Transformer block with pre-layer normalization, neighborhood attention, and GELU activation. Genome-ordered input without explicit positional encodings Input variants are ordered by their chromosomal position and then concatenated from Chr1 to Chr22. This fixed order is used for all individuals. We omit standard positional encodings (e.g., sinusoidal or learned absolute). The fixed genomic order provides implicit relative positional information that NA inherently leverages within its local attention windows. We also experimented with learned positional encodings, but did not observe measurable improvement in performance. The overall data flow of PRSformer proceeds as follows ( Figure 1B ): 1. Learnable variant embedding layer ( Figure 1C ) Each variant in the input sequence of variant genotypes is mapped to a $d_{\text{model}}$-dimensional vector. For each variant j and individual i , an observed genotype $x_{ij} \in \{0, 1, 2\}$ is represented as $E_j \cdot x_{ij}$ using a learned variant-specific embedding $E_j$. A missing genotype ($x_{ij} = \mathrm{UNKN}$) is represented by a separate learned variant-specific embedding $M_j$. This allows the model to distinguish missingness from observed genotypes distinctly for each variant. We use $d_{\text{model}} = 64$. 2. Transformer blocks ( Figure 1D ) The embedded sequence is processed by $N_{\text{layers}} = 2$ Transformer blocks. Each block applies pre-layer normalization [ 46 ] followed by multi-head NA ($N_{\text{heads}} = 4$) and a feed-forward network with GELU activation [ 47 ]. 
The choice of $N_{\text{layers}} = 2$ was based on validation performance, where deeper models did not show significant improvement for this task ( Supplementary Table F7 ), suggesting two stacked NA layers provide a sufficient receptive field to capture interactions between adjacent LD blocks. 3. Output layer The normalized sequence representation from the last block is flattened and optionally concatenated with covariates such as sex and age, and passed to a fully-connected layer generating D = 18 independent disease likelihood predictions. We also evaluated mean pooling and dedicated [CLS] tokens as alternatives to simple flattening of normalized representations, but neither outperformed the flattening-based design (see Supplementary Table F10 ). Key architectural hyperparameters ($d_{\text{model}}$, $N_{\text{heads}}$, $N_{\text{layers}}$, NA window size $k$) were optimized based on validation set performance (Supplementary Tables F5-10). 3.3 Datasets We utilized data from a large, private biobank consisting of individuals who consented to participate in research under an IRB-approved protocol. Starting from an internal GWAS data freeze timestamped 08-2021 (used to prevent information leakage, see Section 3.4 ), we identified individuals genotyped on the same platform and excluded all pairs of individuals related by more than 700 cM (i.e., first cousins or closer), thereby minimizing the risk of learning simple familial signals. Individuals included had recorded phenotypes (i.e., self-reported status) for at least one of the D = 18 autoimmune and inflammatory conditions considered. We also excluded individuals under age 10 who did not have a case diagnosis. This resulted in a training set of $N_{\text{train}} = 3{,}838{,}549$ individuals of European genetic ancestry (throughout this work, ancestry was determined via an internal classifier [ 48 ]). 
We constructed temporally distinct validation and test sets using individuals who enrolled and consented after the 08-2021 data freeze date and up to 12-2024, applying the identical filtering criteria. The validation dataset was used for hyperparameter tuning, while the test dataset was used to report final performance metrics. This yielded $N_{\text{val}} = 525{,}448$ and $N_{\text{test}} = 494{,}265$ individuals. To assess whether models capture familial relationships versus causal genotype–phenotype associations, we constructed a kinship-controlled European test set ( N ≈ 148k) by subsetting test individuals related to any training sample by no more than 300 cM and ensuring that no pair within the subset is related by more than 700 cM. In total, the European dataset comprised N = 4,858,302 individuals across training, validation, and test sets, with an additional N ≈ 1.1M non-European individuals included for cross-ancestry training (see Section 3.7 ). Case/control counts per disease and further details regarding cohort construction (including differences from the subset used for GWAS computations) are provided in Appendix A and Supplementary Tables F2-4. 3.4 Variant selection To define the input feature space ( L ), we selected variants associated with at least one of the D = 18 diseases based on internal GWAS summary statistics (European ancestry cohort, computed prior to an 08-2021 data freeze to prevent information leakage into model training). For each disease, we retained variants that passed standard QC, were located on autosomal chromosomes, had a genotyping rate ≥ 0.95 and MAF > 0.001, and exhibited nominal association with the disease (GWAS p-value $< 1 \times 10^{-2}$). The final PRSformer input set was the union across all 18 diseases, resulting in L = 137,245 variants. 
Further details on GWAS procedures, exploration of LD-based variant pruning, and per-disease variant counts are in Appendix B and Supplementary Table F9. 3.5 Training Given the training data $\{(x_i, y_i)\}_{i=1}^{N}$, we trained PRSformer by minimizing the following loss, summed over individuals and their available (non-UNKN) disease labels $t \in T(i)$: $$\mathcal{L} = -\sum_{i=1}^{N} \sum_{t \in T(i)} \left[ y_{it} \log \hat{y}_{it} + (1 - y_{it}) \log\left(1 - \hat{y}_{it}\right) \right],$$ where $\hat{y}_{it}$ is the predicted probability of disease $t$ for individual $i$ and $T(i)$ denotes the set of recorded disease statuses for individual i . We also evaluated focal loss [ 49 ], task-uncertainty–weighted loss [ 50 ], and standard averaged cross-entropy, all of which were outperformed by the proposed loss function in terms of validation AUROC. We used the AdamW optimizer [ 51 ] ($\beta_1 = 0.9$, $\beta_2 = 0.999$, weight decay = 0.05) with an initial learning rate of $5 \times 10^{-4}$, decreased via a Cosine Annealing scheduler. Training was performed efficiently on this large-scale dataset for 2 epochs consisting of ≈ 120,000 gradient updates with an effective batch size of 64 across four NVIDIA A100 GPUs, leveraging Distributed Data Parallel and mixed-precision (FP16) training (training duration was tuned based on validation AUROC across diseases ( Supplementary Table F5 ); most models showed signs of overfitting beyond two epochs). Hyperparameters, including architecture choices ($N_{\text{layers}}$, $d_{\text{model}}$, $N_{\text{heads}}$, $k$), were selected based on optimal AUROC on the validation set after extensive searches (e.g., see Supplementary Tables F5-10), following standard ML best practices to minimize overfitting to validation data and ensure that test performance provides an unbiased estimate of generalization. To isolate the contribution of genotype to model performance, all models presented in the main text were trained without including covariates such as sex or age. 3.6 Baseline models To comprehensively evaluate PRSformer and validate our main claims regarding the utility of end-to-end non-linear modeling on large-scale individual-level genotypes, we established three rigorous baselines. 
These baselines are specifically designed to: (1) compare against the current state-of-the-art using conventional summary-statistic inputs (LDPred2), (2) benchmark against an enhanced version of this state-of-the-art (Stacked LDPred2), and (3) isolate the specific performance gains attributable to PRSformer’s non-linear Transformer architecture via a carefully matched linear counterpart. 1. LDPred2: state-of-the-art summary-statistic method We selected LDPred2 [ 5 ] due to its strong empirical performance and widespread adoption in the field [ 36 , 52 ]. To ensure the most direct comparison possible, we configured LDPred2 meticulously: Matched data source: LDPred2 was applied to GWAS summary statistics derived from the same European training data freeze used for PRSformer’s data and variant selection. Cohort-specific LD: An LD reference panel from our research cohort was used. Standard QC: Input variants (∼ 445K per disease) passed standard GWAS QC and LDPred2-specific filtering [ 18 ]. Tuning: LDPred2 hyperparameters ($p$, $h^2$) were extensively tuned (up to 100 models per disease) by maximizing AUROC on the same validation set used for PRSformer (details in Appendix C). 2. Stacked LDPred2: enhanced summary-statistic baseline To create a stronger summary-statistic baseline, we ensembled the converged LDPred2 models from the hyperparameter search using elastic-net regression trained via cross-validation on the validation set (Appendix C). PRSformer+ Since Stacked LDPred2 uses the validation set for training ensemble weights, we develop and compare it against PRSformer+ , which is the final PRSformer model retrained on the combined training and validation datasets, ensuring parity in total data usage ( Supplementary Figure E2 ). 3. Linear model: direct architectural ablation This crucial baseline isolates the contribution of PRSformer’s non-linear Transformer architecture. 
It mirrors PRSformer precisely except for omitting the Transformer blocks: Identical data, inputs & training: Uses the exact same L ≈ 140k input variants and individual-level train/validation/test splits. Employs the same multitask framework ( D = 18), loss function, AdamW optimizer, and training schedule ( Section 3.5 ). Uses the same embedding layer ( Figure 1C ) for genotypes and missingness. Architecture difference: The input embeddings are fed directly to the final linear output layer, bypassing the Transformer blocks ( Figure 1D ). This provides a multitask linear model on the same large-scale individual data, allowing direct assessment of the performance gain from PRSformer’s non-linear processing. 3.7 Cross-ancestry experiments To assess generalization, we developed PRSformer-ME (Multi-Ethnic). We performed ancestry-specific GWAS (African American (AFR), European (EUR), Latino (LAT), East Asian (EAS), and South Asian (SAS); determined by internal classifier) using the same 08-2021 data freeze and variant selection criteria ( Section 3.4 , p-value $< 1 \times 10^{-2}$, QC) where sample sizes permitted ( Supplementary Table F12 ). We defined an expanded input set ( L = 251,538 variants) as the union of selected variants across all available disease-ancestry pairs (including Europeans). We constructed a multiancestry training set (∼ 5M total individuals) by combining the European training set ( Section 3.3 ) with N = 1,136,746 non-European individuals meeting the same filtering criteria. PRSformer-ME was trained on this combined dataset using the same architecture and hyperparameters as the European-only PRSformer, without additional ancestry-specific tuning. Evaluation was performed on a combined test set including the European test set and held-out non-European individuals processed identically ( Supplementary Table F3 ). 
4 Experiments and results We present results evaluating PRSformer’s performance against baselines, analyzing the impact of non-linearity and sample scale, assessing the benefit of multitask learning, and testing cross-ancestry generalization using AUROC as the primary metric unless otherwise stated. 4.1 PRSformer outperforms state-of-the-art baselines We first benchmarked PRSformer against the highly optimized linear model and the state-of-the-art summary-statistic method, LDPred2, on the European test set ($N_{\text{test}} \approx 494$k). As shown in Figure 2 , PRSformer consistently achieves higher AUROC scores than its linear counterpart across all 18 autoimmune and inflammatory conditions. This comparison, using identical data and training setups except for the Transformer blocks, directly quantifies the predictive benefit derived from PRSformer’s non-linear architecture. Download figure Open in new tab Figure 2. Benchmarking PRSformer against baseline methods using AUROC. PRSformer consistently outperforms the linear model (trained on identical individual-level data) and LDPred2 (state-of-the-art summary statistic method derived from the same cohort). Error bars: 95% CI (10k bootstraps). p-values: estimated using a one-sided paired bootstrap test (10,000 replicates), sampling with replacement from the test set and comparing AUROCs of PRSformer and LDPred2 on identical sample pairs. The p-value reflects the fraction of replicates where LDPred2’s AUROC ≥ PRSformer’s. Crucially, PRSformer also outperforms LDPred2 (using summary statistics derived from the same cohort) on 16 out of 18 diseases, with 11 of these differences being statistically significant ( p < 0.05, one-sided paired bootstrap test). This demonstrates the advantage of end-to-end modeling on individual-level data compared to state-of-the-art methods relying on summary statistics. 
Consistent improvements were also observed in area under the precision–recall curve ( Supplementary Figure E1 ) and explained variance ( Supplementary Table F11 ), as well as when comparing against an enhanced Stacked LDPred2 baseline (PRSformer+, Supplementary Figure E2 ), confirming the robustness of PRSformer’s advantage. We also evaluated PRSformer and LDPred2 on the kinship-controlled test set, reproducing similar trends ( Supplementary Figure E3 ): PRSformer outperformed LDPred2 in 14 of 18 diseases, maintaining its lead in 13 of the 16 and newly improving Alopecia Areata, with six remaining statistically significant ( p < 0.05). The smaller number of significant improvements is expected given the reduced power of the kinship-controlled test set (∼ 148k vs. ∼ 494k). These results confirm that PRSformer’s advantage is not driven by familial confounding and persists under stringent kinship control, reinforcing the validity of our findings. 4.2 Benefit of non-linearity emerges at million-sample scale To understand when the non-linear modeling capabilities of PRSformer become advantageous, we compared its performance against the linear baseline across varying training dataset sizes (down-sampling the N ≈ 3.8M training set). Figure 3 reveals a critical insight: at smaller sample sizes, comparable to current large public cohorts like UK Biobank [ 53 ] (up to N ≈ 1M), the performance of PRSformer is similar to the simpler linear model. However, as the training size exceeds one million individuals, a clear advantage for the non-linear PRSformer emerges and progressively widens. This trend holds across multiple diseases ( Supplementary Figure E4 ) and persists even when using appropriately subsetted variant sets for smaller scales ( Supplementary Figure E5 ). These results indicate a scaling law: the benefits of non-linear architectures like PRSformer manifest only when sample sizes are sufficient to resolve higher-order genetic interactions. 
Below this threshold, linear models remain competitive, whereas beyond the million-sample regime, PRSformer achieves measurable gains (although with higher computational cost in FLOPs per sample). Download figure Open in new tab Figure 3. Impact of training scale on non-linear model advantage. Performance (AUROC) of PRSformer (non-linear) and the linear baseline across downsampled training sets (multitask setting). The benefit of non-linearity becomes apparent only at N > 1M scale. 4.3 Multitask learning consistently improves performance We investigated the benefit of PRSformer’s multitask design by comparing it against single-task (ST) models trained independently for each disease. Figure 4 shows that multitask (MT) training consistently yields superior AUROC compared to ST training for both PRSformer and the linear baseline, across different data scales (see Supplementary Figure E6 for other diseases). This improvement was robust even when ST models used disease-specific optimized variant sets ( Supplementary Figure E7 ). Thus, the gain stems from leveraging shared information across related immune-mediated inflammatory diseases allowing shared model components (variant embeddings and Transformer blocks in PRSformer, and variant embedding in the linear baseline) to be optimized more effectively. By training these shared layers across multiple related diseases, the model can capture generalizable representations that enhance performance beyond what is achievable with isolated, ST training. Download figure Open in new tab Figure 4. Multitask (MT) vs. Single-Task (ST) training for Celiac disease (left) and T1D (right). MT consistently out-performs ST for both non-linear (based on PRSformer) and the linear baseline across different scales (X-axis: ST sizes / (MT sizes)). 
4.4 Improved cross-ancestry generalization via multitask multi-ancestry training Recognizing the need for equitable genomic prediction [ 54 ], we trained PRSformer-ME on a combined multi-ancestry cohort (∼ 5M individuals, including ∼ 1.1M non-Europeans) using an expanded variant set ( L ≈ 252k, see Section 3.7 Methods). We evaluated its performance on a held-out test set containing individuals from European (EUR), African American (AFR), Latino (LAT), East Asian (EAS), and South Asian (SAS) ancestries, comparing it to the original PRSformer trained only on EUR individuals. As summarized in Table 1 , PRSformer-ME demonstrates significantly improved generalization to non-EUR populations. It achieves substantially higher AUROC scores across most diseases in AFR, LAT, EAS, and SAS individuals compared to the EUR-only model. Importantly, this gain in non-EUR populations is achieved with minimal to no degradation in performance on EUR individuals. These results indicate that training on diverse, aggregated individual-level data allows PRSformer-ME to capture both shared and ancestry-specific genetic signals, leading to more accurate and potentially more equitable predictions across populations compared to models trained on a single ancestry group (incorporating covariates such as sex and age further improves predictive performance, see Supplementary Table F14 ). This is an important finding since state-of-the-art methods generally rely on single-ancestry summary statistics, preventing them from jointly training on individual-level data across multiple ancestries and from leveraging shared cross-population genetic signals. View this table: View inline View popup Download powerpoint Table 1: AUROC of EUR-only PRSformer vs. multi-ancestry PRSformer-ME on the multi-ancestry test set. PRSformer-ME shows improved performance in non-EUR ancestries often without sacrificing EUR performance. Bold font denotes the higher AUROC in each pairwise comparison. 
5 Conclusion We introduced PRSformer, a scalable Transformer architecture leveraging neighborhood attention to enable end-to-end, multitask disease prediction from population-scale individual genotypes ( N ≈ 5M, L ≈ 140k). Our rigorous evaluation on a unique large private cohort, conducted under IRB and using consented research participant data, demonstrates that PRSformer significantly outperforms strong linear and state-of-the-art summary-statistic baselines (LDPred2) derived from the same cohort. A key finding of this work is that the benefit of PRSformer’s non-linear modeling for complex immune-mediated inflammatory diseases emerges primarily at the million-sample scale ( N > 1M). This advantage varies across diseases, with traits like celiac disease and type 1 diabetes benefiting substantially from non-linear modeling to explain disease risk variance [ 55 ]. This scaling law, alongside our findings that multitask training improves performance and multi-ancestry data enhances generalization, establishes a new framework for genomic prediction. While PRSformer advances predictive accuracy, its gains come with higher computational demands that may limit immediate clinical scalability. Furthermore, future work is required to develop interpretation methods to understand the learned non-linear interactions, which is essential for biological hypothesis generation and experimental validation. A key future direction is to extend the framework beyond a single disease domain to a phenome-scale setting spanning thousands of traits. This approach is motivated by widespread genetic pleiotropy, where a single variant can influence multiple, seemingly disparate conditions. A unified model could therefore capture the shared genetic underpinnings linking diverse biological systems, such as the contribution of immune pathways to neurodegeneration and cancer. Our research prioritizes fairness across diverse populations and the responsible deployment of genomic models. 
Recognizing the sensitivity of genomic data, we have taken steps to balance transparency with participant privacy: we provide detailed methodological descriptions and have released our implementation code at https://github.com/23andMe/PRSformer ; however, the data and trained models are not publicly available. Taken together, our results and scaling analyses position PRSformer as a foundation for phenome-scale genetic risk modeling that can fully leverage genetic pleiotropy to improve prediction and generalization at population scale. A Details of the GWAS runs The internal ancestry classifier assigns individuals to one of five major genetic ancestry groups - African American (AFR), European (EUR), East Asian (EAS), South Asian (SAS), or Latino (LAT) - based on local ancestry inference [ 48 ]. To reduce confounding introduced by population structure, GWAS analyses were stratified by these genetically inferred ancestry groups. Principal component analysis (PCA) was conducted separately within each ancestry group using a subset of <100,000 high-quality genotyped variants shared across all internal platforms. A randomly selected subset of individuals was used for each group: 513K for AFR, 398K for EAS, 1M for EUR, 1M for LAT, and 111K for SAS [ 48 ]. For each disease and ancestry group, GWAS was performed using logistic regression with additive allelic effects as predictors. Covariates included age, sex, genotype platform (to adjust for batch effects), and top principal components - specifically, the top 5 PCs for EUR, EAS, and SAS; the top 6 for AFR; and the top 9 for LAT. Association p-values were derived using a likelihood ratio test, comparing a reduced model fitted using covariates only to a full model fitted with both additive genetic effects and covariates [ 56 ]. B Exploration of LD-based variant pruning We additionally experimented with training PRSformer on a subset of variants that had been LD-pruned using PLINK 2 . 0 [ 57 ]. 
LD pruning removes highly correlated SNPs to retain approximately independent markers. In this procedure, a sliding window is moved across the genome, pairwise linkage disequilibrium ( $r^2$ ) is computed among variants, and SNPs exceeding a specified correlation threshold with nearby variants are iteratively removed until no pair within each window remains above that threshold. Starting from a union variant set constructed similarly to that in Section 3.4 (but with slightly adjusted filtering thresholds), we applied PLINK 2.0 with a window size of 6,000 kb (6 Mb), a step size of one variant, and an $r^2$ threshold of 0.5. Supplementary Table F9 compares two models from the hyperparameter tuning round trained with and without LD-based variant pruning. Interestingly, despite reducing multicollinearity among variant features, LD pruning led to lower validation performance, suggesting that PRSformer benefits from leveraging the local correlation structure within LD blocks to capture causal signals more effectively. C Details of LDpred2 runs For each disease we used the Gibbs sampler LDpred2 software [ 5 ] on the summary statistics with an internal LD panel. LD matrix computation included variants with minor allele frequency greater than 0.1%, and genotype call rate greater than 90%. Variants more than 5 cM apart were assumed to be independent. Summary statistics were filtered to keep variants that had a minor allele frequency greater than 0.1% and had a genotype call rate greater than 95%. This consisted of variant sets with roughly 445,000 variants. We estimated posterior SNP-effect sizes using the grid option with a set of 100 combinations of hyperparameters, leading to up to 100 sets of polygenic risk scores (PRS) per disease (depending on convergence). The hyperparameters that LDpred2 takes are an estimate for the proportion of causal variants, $p$ , and trait heritability, $h^2$ . 
We used LD score regression to estimate $h^2$ , then used a grid of the $h^2$ estimate multiplied by 0.6, 0.8, 1, 1.2, and 1.4. For $p$ , we used a sequence of values equally spaced on a logarithmic scale from $10^{-5}$ to 1. The best hyperparameters for each disease were selected based on validation AUROC leading to the final LDPred2 PRS models. For Stacked LDPred2, however, we ensembled all of the converged PRSs per disease (up to 100) by training elastic net on the validation data using 5-fold cross validation. D Subsetting variants for down-sampled experiments When training on smaller datasets, we may not have access to the same high-powered variant selection as in the full-data setting. To account for this, we repeated variant selection using GWAS summary statistics adjusted to reflect the reduced sample size of each downsampled dataset. For each downsampled dataset, we estimated GWAS p-values under the reduced sample size using the original GWAS summary statistics: $$p_{ds} = 2\,\Phi\!\left(-\left|\frac{\beta}{SE}\right|\sqrt{\frac{N_{ds}}{N}}\right)$$ where $\beta$ and $SE$ denote the effect size and standard error from the original GWAS, $N$ is the original sample size, $N_{ds}$ is the downsampled sample size, and $\Phi$ is the standard normal cumulative distribution function. Subsequently, variant selection was performed independently for each disease and each downsampled dataset using the estimated p-values, applying a threshold of $p < 1 \times 10^{-2}$. Multi-task model training was then conducted using the union of the selected variant sets across diseases at each downsampled scale, following the same procedure as in the full-data experiments. Additional details on variant sets, data sizes, and model configurations are provided in Supplementary Tables F1, F2, and F13. E Supplementary figures Download figure Open in new tab Figure E1: Benchmarking PRSformer against baseline methods using AUPRC as the evaluation metric. Numbers above the bars indicate test set AUPRC values; error bars denote 95% confidence intervals estimated via bootstrapped test samples. 
The reported p-values reflect the one-sided statistical significance of PRSformer outperforming LDPred2. Download figure Open in new tab Figure E2: Benchmarking of models trained on the combined training and validation datasets. Numbers above the bars indicate test set AUROC; error bars represent 95% confidence intervals computed via bootstrapped test samples. The two sets of p-values reflect the one-sided statistical significance of PRSformer+ outperforming stacked LDPred2, and PRSformer outperforming non-stacked LDPred2—the latter being the same as those reported in Figure 2 of the main text. Download figure Open in new tab Figure E3: Comparison of PRSformer and LDPred2 on the kinship-controlled European test set, evaluated by AUROC. Numbers above the bars indicate AUROC values, and error bars represent 95% confidence intervals estimated from 10,000 bootstrapped samples. The reported p-values reflect the one-sided statistical significance of PRSformer outperforming LDPred2. Download figure Open in new tab Figure E4: Prediction performance across different training scales for the non-linear model and linear baseline, both trained in a multitask (MT) setting using the same input variant set as PRSformer. For most diseases, performance improves with more training data, with the non-linear model surpassing the linear baseline at larger scales. Fluctuations in performance for Polymyalgia Rheumatica and Axial Spondyloarthritis likely stem from the former’s rarity and the latter’s relatively small training size (see Supplementary Table F2 ). Download figure Open in new tab Figure E5: Prediction performance across different training scales for the non-linear model and linear baseline, both trained in a multitask (MT) setting using subsetted variant sets at each down-sampled scale (see Supplementary section D and Table F13). For most diseases, performance improves with increasing training data, with the non-linear model outperforming the linear baseline at larger scales. 
These trends are consistent with those observed using a fixed input variant set across scales (see Supplementary Figure E4 ). Download figure Open in new tab Figure E6: Comparison of multitask (MT) versus single-task (ST) training for three additional diseases across different training scales, all using the same input variant set as PRSformer. X-axis values outside parentheses indicate ST training sizes, and those inside indicate corresponding MT training sizes. Across all tested diseases, MT training outperforms ST training for both the non-linear model (left) and the linear baseline (right). Download figure Open in new tab Figure E7: Comparison of multitask (MT) versus single-task (ST) training for four tested diseases across different training scales. Both MT and ST models were trained on downsampled datasets using subsetted variants; additionally, ST models used disease-specific variant sets (see Supplementary section D and Table F13). X-axis values outside parentheses indicate ST training sizes, while those inside indicate the corresponding MT sizes. Across all tested diseases, MT training outperforms ST training for both the non-linear model (left) and the linear baseline (right). These results are consistent with those observed using a fixed input variant set for both MT and ST across scales (see Supplementary Figure E6 ). 
F Supplementary tables View this table: View inline View popup Download powerpoint Table F1: Number of selected variants per disease across European datasets View this table: View inline View popup Download powerpoint Table F2: Training sample sizes (case / control) per disease across European and multi-ancestry datasets View this table: View inline View popup Download powerpoint Table F3: Test data sample sizes (case / control) per disease across ancestry groups View this table: View inline View popup Download powerpoint Table F4: Validation dataset sample sizes (case/control) per disease (only European individuals) View this table: View inline View popup Download powerpoint Table F5: Tuning of the training steps based on validation AUROC View this table: View inline View popup Download powerpoint Table F6: Tuning of attention heads and model dimension based on validation AUROC View this table: View inline View popup Download powerpoint Table F7: Tuning of attention dilation and number of transformer blocks based on validation AUROC View this table: View inline View popup Download powerpoint Table F8: Tuning of the window size of neighborhood attention based on validation AUROC View this table: View inline View popup Download powerpoint Table F9: Exploring the impact of LD-based variant set pruning ( r 2 = 0.5) on validation AUROC. View this table: View inline View popup Download powerpoint Table F10: Exploring different output heads based on validation AUROC. 
View this table: View inline View popup Download powerpoint Table F11: Comparison of explained variance between PRSformer and LDpred2 across diseases on the European test set View this table: View inline View popup Download powerpoint Table F12: Number of selected variants per disease across non-European ancestries View this table: View inline View popup Download powerpoint Table F13: Characteristics of multitask models trained on down-sampled datasets with subsetted input variants View this table: View inline View popup Download powerpoint Table F14: AUROC comparison across diseases under different covariate settings (sex and age). For each disease and ancestry group, bold font denotes the best performance. Acknowledgments and Disclosure of Funding The authors thank the past and present employees and research participants of 23andMe for making this work possible. We are grateful to Akele Reed, Teague Sterling, David Hinds, Steve Pitts, Wei Wang, Bertram Koelsch, Michael Holmes, Stella Aslibekyan, Cordell Blakkan and Barry Hicks for their valuable contributions and insightful comments on the manuscript, and to Ali Hassani for helpful discussions on employing Neighborhood Attention. The authors also gratefully acknowledge the support of AWS for providing GPU computing resources and credits. A. A. Khan is supported in part by a Chan Zuckerberg Investigator Award. Footnotes p.dibaeinia{at}gmail.com chrisg{at}23andme.com suyashss{at}gmail.com aauton{at}23andme.com References [1]. ↵ Arnór I Sigurdsson , Ioannis Louloudis , Karina Banasik , David Westergaard , Ole Winther , Ole Lund , Sisse Rye Ostrowski , Christian Erikstrup , Ole Birger Vesterager Pedersen , Mette Nyegaard , DBDS Genomic Consortium , Søren Brunak , Bjarni J Vilhjálmsson , and Simon Rasmussen . Deep integrative models for large-scale human genomics . Nucleic Acids Research , 51 ( 12 ): e67 – e67 , 05 2023 . OpenUrl PubMed [2]. ↵ Erping Long , Peixing Wan , Qingyu Chen , Zhiyong Lu , and Jiyeon Choi . 
From function to translation: Decoding genetic susceptibility to human diseases via artificial intelligence . Cell Genomics , 3 ( 6 ): 100320 , 2023 . OpenUrl PubMed [3]. ↵ Naomi R Wray , Michael E Goddard , and Peter M Visscher . Prediction of individual genetic risk to disease from genome-wide association studies . Genome research , 17 ( 10 ): 1520 – 1528 , 2007 . OpenUrl Abstract / FREE Full Text [4]. ↵ Ali Torkamani , Nathan E Wineinger , and Eric J Topol . The personal and clinical utility of polygenic risk scores . Nature Reviews Genetics , 19 ( 9 ): 581 – 590 , 2018 . OpenUrl CrossRef PubMed [5]. ↵ Florian Privé , Julyan Arbel , and Bjarni J Vilhjálmsson . Ldpred2: better, faster, stronger . Bioinformatics , 36 ( 22-23 ): 5424 – 5431 , 2020 . OpenUrl [6]. ↵ Cathryn M Lewis and Evangelos Vassos . Polygenic risk scores: from research tools to clinical instruments . Genome medicine , 12 ( 1 ): 44 , 2020 . OpenUrl PubMed [7]. ↵ Jana Schwarzerova , Martin Hurta , Vojtech Barton , Matej Lexa , Dirk Walther , Valentine Provaznik , and Wolfram Weckwerth . A perspective on genetic and polygenic risk scores—advances and limitations and overview of associated tools . Briefings in bioinformatics , 25 ( 3 ): bbae240 , 2024 . OpenUrl PubMed [8]. ↵ Michael Elgart , Genevieve Lyons , Santiago Romero-Brufau , Nuzulul Kurniansyah , Jennifer A Brody , Xiuqing Guo , Henry J Lin , Laura Raffield , Yan Gao , Han Chen , et al. Non-linear machine learning models incorporating snps and prs improve polygenic prediction in diverse human populations . Communications biology , 5 ( 1 ): 856 , 2022 . OpenUrl PubMed [9]. ↵ Pankhuri Singhal , Yogasudha Veturi , Scott M Dudek , Anastasia Lucas , Alex Frase , Kristel Van Steen , Steven J Schrodi , David Fasel , Chunhua Weng , Rion Pendergrass , et al. Evidence of epistasis in regions of long-range linkage disequilibrium across five complex diseases in the uk biobank and emerge datasets . 
The American Journal of Human Genetics , 110 ( 4 ): 575 – 591 , 2023 . OpenUrl CrossRef PubMed [10]. ↵ Juannan Zhou , Mandy S Wong , Wei-Chia Chen , Adrian R Krainer , Justin B Kinney , and David M McCandlish . Higher-order epistasis and phenotypic prediction . Proceedings of the National Academy of Sciences , 119 ( 39 ): e2204233119 , 2022 . OpenUrl CrossRef PubMed [11]. ↵ Ashish Vaswani , Noam Shazeer , Niki Parmar , Jakob Uszkoreit , Llion Jones , Aidan N Gomez , Łukasz Kaiser , and Illia Polosukhin . Attention is all you need . Advances in neural information processing systems , 30 , 2017 . [12]. ↵ Ali Hassani , Steven Walton , Jiachen Li , Shen Li , and Humphrey Shi . Neighborhood attention transformer . In Proceedings of the IEEE/CVF conference on computer vision and pattern recognition , pages 6185 – 6194 , 2023 . [13]. ↵ Alexandra Zhernakova , Cleo C Van Diemen , and Cisca Wijmenga . Detecting shared pathogenesis from the shared genetics of immune-related diseases . Nature Reviews Genetics , 10 ( 1 ): 43 – 55 , 2009 . OpenUrl CrossRef PubMed Web of Science [14]. ↵ Clare Bycroft , Colin Freeman , Desislava Petkova , Gavin Band , Lloyd T Elliott , Kevin Sharp , Allan Motyer , Damjan Vukcevic , Olivier Delaneau , Jared O’Connell , et al. The uk biobank resource with deep phenotyping and genomic data . Nature , 562 ( 7726 ): 203 – 209 , 2018 . OpenUrl CrossRef PubMed [15]. ↵ Florian Privé , Bjarni J Vilhjálmsson , Hugues Aschard , and Michael GB Blum . Making the most of clumping and thresholding for polygenic scores . The American journal of human genetics , 105 ( 6 ): 1213 – 1221 , 2019 . OpenUrl CrossRef PubMed [16]. ↵ Tian Ge , Chia-Yen Chen , Yang Ni , Yen-Chen Anne Feng , and Jordan W Smoller . Polygenic prediction via bayesian regression and continuous shrinkage priors . Nature communications , 10 ( 1 ): 1776 , 2019 . OpenUrl PubMed [17]. 
↵ Bjarni J Vilhjálmsson , Jian Yang , Hilary K Finucane , Alexander Gusev , Sara Lindström , Stephan Ripke , Giulio Genovese , Po-Ru Loh , Gaurav Bhatia , Ron Do , et al. Modeling linkage disequilibrium increases accuracy of polygenic risk scores . The american journal of human genetics , 97 ( 4 ): 576 – 592 , 2015 . OpenUrl CrossRef PubMed [18]. ↵ Florian Privé , Julyan Arbel , Hugues Aschard , and Bjarni J Vilhjálmsson . Identifying and correcting for misspecifications in gwas summary statistics and polygenic scores . Human Genetics and Genomics Advances , 3 ( 4 ), 2022 . [19]. ↵ Rikifumi Ohta , Yosuke Tanigawa , Yuta Suzuki , Manolis Kellis , and Shinichi Morishita . A polygenic score method boosted by non-additive models . Nature Communications , 15 ( 1 ): 4433 , 2024 . OpenUrl PubMed [20]. ↵ Po-Ru Loh , George Tucker , Brendan K Bulik-Sullivan , Bjarni J Vilhjálmsson , Hilary K Finucane , Rany M Salem , Daniel I Chasman , Paul M Ridker , Benjamin M Neale , Bonnie Berger , et al. Efficient bayesian mixed-model analysis increases association power in large cohorts . Nature genetics , 47 ( 3 ): 284 – 290 , 2015 . OpenUrl CrossRef PubMed [21]. ↵ Xiang Zhou and Matthew Stephens . Genome-wide efficient mixed-model analysis for association studies . Nature genetics , 44 ( 7 ): 821 – 824 , 2012 . OpenUrl CrossRef PubMed [22]. ↵ Guiyan Ni , Jian Zeng , Joana A Revez , Ying Wang , Zhili Zheng , Tian Ge , Restuadi Restuadi , Jacqueline Kiewa , Dale R Nyholt , Jonathan RI Coleman , et al. A comparison of ten polygenic score methods for psychiatric disorders applied across multiple cohorts . Biological psychiatry , 90 ( 9 ): 611 – 620 , 2021 . OpenUrl CrossRef PubMed [23]. ↵ Ruilin Li , Christopher Chang , Yosuke Tanigawa , Balasubramanian Narasimhan , Trevor Hastie , Robert Tibshirani , and Manuel A Rivas . Fast numerical optimization for genome sequencing data in population biobanks . Bioinformatics , 37 ( 22 ): 4148 – 4155 , 2021 . OpenUrl CrossRef PubMed [24]. 
↵ Babak Alipanahi , Andrew Delong , Matthew T Weirauch , and Brendan J Frey . Predicting the sequence specificities of dna-and rna-binding proteins by deep learning . Nature biotechnology , 33 ( 8 ): 831 – 838 , 2015 . OpenUrl CrossRef PubMed [25]. ↵ Daniel Quang and Xiaohui Xie . Danq: a hybrid convolutional and recurrent deep neural network for quantifying the function of dna sequences . Nucleic acids research , 44 ( 11 ): e107 – e107 , 2016 . OpenUrl CrossRef PubMed [26]. ↵ Žiga Avsec , Melanie Weilert , Avanti Shrikumar , Sabrina Krueger , Amr Alexandari , Khyati Dalal , Robin Fropf , Charles McAnany , Julien Gagneur , Anshul Kundaje , et al. Base-resolution models of transcription-factor binding reveal soft motif syntax . Nature genetics , 53 ( 3 ): 354 – 366 , 2021 . OpenUrl CrossRef PubMed [27]. ↵ David R Kelley , Jasper Snoek , and John L Rinn . Basset: learning the regulatory code of the accessible genome with deep convolutional neural networks . Genome research , 26 ( 7 ): 990 – 999 , 2016 . OpenUrl Abstract / FREE Full Text [28]. ↵ Žiga Avsec , Vikram Agarwal , Daniel Visentin , Joseph R Ledsam , Agnieszka Grabska-Barwinska , Kyle R Taylor , Yannis Assael , John Jumper , Pushmeet Kohli , and David R Kelley . Effective gene expression prediction from sequence by integrating long-range interactions . Nature methods , 18 ( 10 ): 1196 – 1203 , 2021 . OpenUrl PubMed [29]. ↵ Johannes Linder , Divyanshi Srivastava , Han Yuan , Vikram Agarwal , and David R Kelley . Predicting rna-seq coverage from dna sequence as a unifying model of gene regulation . Nature Genetics, pages 1–13 , 2025 . [30]. ↵ Pau Bellot , Gustavo de Los Campos , and Miguel Pérez-Enciso . Can deep learning improve genomic prediction of complex human traits? Genetics , 210 ( 3 ): 809 – 819 , 2018 . OpenUrl Abstract / FREE Full Text [31]. ↵ Rostam Abdollahi-Arpanahi , Daniel Gianola , and Francisco Peñagaricano . 
Deep learning versus parametric and ensemble methods for genomic prediction of complex phenotypes . Genetics Selection Evolution , 52 : 1 – 15 , 2020 . OpenUrl CrossRef PubMed [32]. ↵ Arnór I Sigurdsson , Ioannis Louloudis , Karina Banasik , David Westergaard , Ole Winther , Ole Lund , Sisse Rye Ostrowski , Christian Erikstrup , Ole Birger Vesterager Pedersen , Mette Nyegaard , et al. Deep integrative models for large-scale human genomics . Nucleic Acids Research , 51 ( 12 ): e67 – e67 , 2023 . OpenUrl PubMed [33]. ↵ Adrien Badré , Li Zhang , Wellington Muchero , Justin C Reynolds , and Chongle Pan . Deep neural network improves the estimation of polygenic risk scores for breast cancer . Journal of Human Genetics , 66 ( 4 ): 359 – 369 , 2021 . OpenUrl PubMed [34]. ↵ Clara Albiñana , Zhihong Zhu , Andrew J Schork , Andrés Ingason , Hugues Aschard , Isabell Brikell , Cynthia M Bulik , Liselotte V Petersen , Esben Agerbo , Jakob Grove , et al. Multi-pgs enhances polygenic prediction by combining 937 polygenic scores . Nature communications , 14 ( 1 ): 4702 , 2023 . OpenUrl PubMed [35]. ↵ Han Li , Jianyang Zeng , Michael P Snyder , and Sai Zhang . Prs-net: Interpretable polygenic risk scores via geometric learning . In International Conference on Research in Computational Molecular Biology , pages 377 – 380 . Springer , 2024 . [36]. ↵ Zijie Zhao , Tim Gruenloh , Meiyi Yan , Yixuan Wu , Zhongxuan Sun , Jiacheng Miao , Yuchang Wu , Jie Song , and Qiongshi Lu . Optimizing and benchmarking polygenic risk scores with gwas summary statistics . Genome Biology , 25 ( 1 ): 260 , 2024 . OpenUrl CrossRef PubMed [37]. ↵ Frederik Träuble , Lachlan Stuart , Andreas Georgiou , Pascal Notin , Arash Mehrjou , Ron Schwessinger , Mathieu Chevalley , Kim Branson , Bernhard Schölkopf, Cornelia van Duijn , et al. Multi-megabase scale genome interpretation with genetic language models . arXiv preprint arxiv: 2501.07737 , 2025 . [38]. 
↵ Tri Dao , Dan Fu , Stefano Ermon , Atri Rudra , and Christopher Ré . Flashattention: Fast and memory-efficient exact attention with io-awareness . Advances in neural information processing systems , 35 : 16344 – 16359 , 2022 . OpenUrl [39]. ↵ Iz Beltagy , Matthew E Peters , and Arman Cohan . Longformer: The long-document transformer . arXiv preprint arxiv: 2004.05150 , 2020 . [40]. ↵ Manzil Zaheer , Guru Guruganesh , Kumar Avinava Dubey , Joshua Ainslie , Chris Alberti , Santiago Ontanon , Philip Pham , Anirudh Ravula , Qifan Wang , Li Yang , et al. Big bird: Transformers for longer sequences . Advances in neural information processing systems , 33 : 17283 – 17297 , 2020 . OpenUrl CrossRef [41]. ↵ Sinong Wang , Belinda Z Li , Madian Khabsa , Han Fang , and Hao Ma . Linformer: Self-attention with linear complexity . arXiv preprint arxiv: 2006.04768 , 2020 . [42]. ↵ Krzysztof Choromanski , Valerii Likhosherstov , David Dohan , Xingyou Song , Andreea Gane , Tamas Sarlos , Peter Hawkins , Jared Davis , Afroz Mohiuddin , Lukasz Kaiser , et al. Rethinking attention with performers . arXiv preprint arxiv: 2009.14794 , 2020 . [43]. ↵ Ze Liu , Yutong Lin , Yue Cao , Han Hu , Yixuan Wei , Zheng Zhang , Stephen Lin , and Baining Guo . Swin transformer: Hierarchical vision transformer using shifted windows . In Proceedings of the IEEE/CVF international conference on computer vision , pages 10012 – 10022 , 2021 . [44]. ↵ Ali Hassani , Wen-mei Hwu , and Humphrey Shi . Faster neighborhood attention: Reducing the o(n^2) cost of self attention at the threadblock level . In A. Globerson , L. Mackey , D. Belgrave , A. Fan , U. Paquet , J. Tomczak , and C. Zhang , editors, Advances in Neural Information Processing Systems , volume 37 , pages 64717 – 64734 . Curran Associates, Inc ., 2024 . OpenUrl [45]. ↵ Tomaz Berisa and Joseph K Pickrell . 
Approximately independent linkage disequilibrium blocks in human populations . Bioinformatics , 32 ( 2 ): 283 , 2015 . OpenUrl PubMed [46]. ↵ Mohammad Shoeybi , Mostofa Patwary , Raul Puri , Patrick LeGresley , Jared Casper , and Bryan Catanzaro . Megatron-lm: Training multi-billion parameter language models using model parallelism . arXiv preprint arxiv: 1909.08053 , 2019 . [47]. ↵ Hugo Touvron , Thibaut Lavril , Gautier Izacard , Xavier Martinet , Marie-Anne Lachaux , Timothée Lacroix , Baptiste Rozière , Naman Goyal , Eric Hambro , Faisal Azhar , et al. Llama: Open and efficient foundation language models . arXiv preprint arxiv: 2302.13971 , 2023 . [48]. ↵ Eric Y Durand , Chuong B Do , Peter R Wilton , Joanna L Mountain , Adam Auton , G David Poznik , and J Michael Macpherson . A scalable pipeline for local ancestry inference using tens of thousands of reference haplotypes . bioRxiv , pages 2021 – 01 , 2021 . [49]. ↵ Tsung-Yi Lin , Priya Goyal , Ross Girshick , Kaiming He , and Piotr Dollár . Focal loss for dense object detection . In Proceedings of the IEEE international conference on computer vision , pages 2980 – 2988 , 2017 . [50]. ↵ Alex Kendall , Yarin Gal , and Roberto Cipolla . Multi-task learning using uncertainty to weigh losses for scene geometry and semantics . In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 7482 – 7491 , 2018 . [51]. ↵ Ilya Loshchilov and Frank Hutter . Decoupled weight decay regularization . arXiv preprint arxiv: 1711.05101 , 2017 . [52]. ↵ Oliver Pain , Kylie P Glanville , Saskia P Hagenaars , Saskia Selzam , Anna E Fürtjes , Héléna A Gaspar , Jonathan RI Coleman , Kaili Rimfeld , Gerome Breen , Robert Plomin , et al. Evaluation of polygenic prediction methodology within a reference-standardized framework . PLoS genetics , 17 ( 5 ): e1009021 , 2021 . OpenUrl [53]. 
↵ Hannah Taylor , Melissa Lewins , M George B Foody , Oliver Gray , Jelena Bešević , Megan C Conroy , Rory Collins , Ben Lacey , Naomi Allen , and Lucy Burkitt-Gray . UK Biobank—a unique resource for discovery and translation research on genetics and neurologic disease . Neurology: Genetics , 11 ( 1 ): e200226 , 2025 . OpenUrl [54]. ↵ Alicia R Martin , Masahiro Kanai , Yoichiro Kamatani , Yukinori Okada , Benjamin M Neale , and Mark J Daly . Clinical use of current polygenic risk scores may exacerbate health disparities . Nature genetics , 51 ( 4 ): 584 – 591 , 2019 . OpenUrl CrossRef PubMed [55]. ↵ Tobias L Lenz , Aaron J Deutsch , Buhm Han , Xinli Hu , Yukinori Okada , Stephen Eyre , Michael Knapp , Alexandra Zhernakova , Tom WJ Huizinga , Goncalo Abecasis , et al. Widespread nonadditive and interaction effects within HLA loci modulate the risk of autoimmune diseases . Nature genetics , 47 ( 9 ): 1085 – 1090 , 2015 . OpenUrl CrossRef PubMed [56]. ↵ The 23andMe Research Team. 23andMe technical white paper: Overview of 23andMe GWAS release: r8_g1. Technical report, 23andMe, Inc ., 2023 . [57]. ↵ Christopher C Chang , Carson C Chow , Laurent CAM Tellier , Shashaank Vattikuti , Shaun M Purcell , and James J Lee . Second-generation PLINK: rising to the challenge of larger and richer datasets . GigaScience , 4 ( 1 ): s13742 – 015 , 2015 . OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted October 27, 2025. Download PDF Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. 
You are going to email the following PRSformer: Disease Prediction from Million-Scale Individual Genotypes Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share PRSformer: Disease Prediction from Million-Scale Individual Genotypes Payam Dibaeinia , Chris German , Suyash Shringarpure , Adam Auton , Aly A. Khan bioRxiv 2025.10.26.684578; doi: https://doi.org/10.1101/2025.10.26.684578 Share This Article: Copy Citation Tools PRSformer: Disease Prediction from Million-Scale Individual Genotypes Payam Dibaeinia , Chris German , Suyash Shringarpure , Adam Auton , Aly A. Khan bioRxiv 2025.10.26.684578; doi: https://doi.org/10.1101/2025.10.26.684578 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Genetics Subject Areas All Articles Animal Behavior and Cognition (7618) Biochemistry (17633) Bioengineering (13856) Bioinformatics (41841) Biophysics (21399) Cancer Biology (18529) Cell Biology (25422) Clinical Trials (138) Developmental Biology (13352) Ecology (19860) Epidemiology (2067) Evolutionary Biology (24282) Genetics (15582) Genomics (22462) Immunology (17700) Microbiology (40295) Molecular Biology (17140) Neuroscience (88419) Paleontology (666) Pathology (2823) Pharmacology and Toxicology (4813) Physiology (7632) Plant Biology (15107) Scientific Communication and Education (2042) Synthetic Biology (4284) Systems Biology (9808) Zoology (2267)
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy (via DOI) is the canonical
version.