Full text
61,893 characters
· extracted from
preprint-html
· click to expand
Machine learning and natural language processing for the early detection of potential mental disorders among school-age children: a prospective birth cohort study | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Machine learning and natural language processing for the early detection of potential mental disorders among school-age children: a prospective birth cohort study View ORCID Profile Shanquan Chen , Ting Dang , Mengjie Qian , Huizhi Liang , Diribsa Tsegaye Bedada , Quinette Abegail Louw , Anna Moore , Rudolf N. Cardinal , Tamsin J. 
Ford doi: https://doi.org/10.1101/2025.09.10.25335509 Shanquan Chen 1 International Centre for Evidence in Disability, London School of Hygiene & Tropical Medicine , London, United Kingdom , WC1E 7HT Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Shanquan Chen For correspondence: shanquan0301{at}gmail.com Ting Dang 2 School of Computing and Information Systems, University of Melbourne , Melbourne, Australia , VIC 3010 Find this author on Google Scholar Find this author on PubMed Search for this author on this site Mengjie Qian 3 Department of Engineering, University of Cambridge , Cambridge, United Kingdom , CB2 1PZ Find this author on Google Scholar Find this author on PubMed Search for this author on this site Huizhi Liang 4 School of Computing, Newcastle University , United Kingdom Find this author on Google Scholar Find this author on PubMed Search for this author on this site Diribsa Tsegaye Bedada 5 Department of Health and Rehabilitation Sciences, Faculty of Medicine and Health Sciences, Stellenbosch University , Cape Town, South Africa 6 Department of Statistics and Actuarial Science, University of Waterloo , Canada Find this author on Google Scholar Find this author on PubMed Search for this author on this site Quinette Abegail Louw 7 Department of Health and Rehabilitation Sciences, Faculty of Medicine and Health Sciences, Stellenbosch University , Cape Town, South Africa , 7505 Find this author on Google Scholar Find this author on PubMed Search for this author on this site Anna Moore 8 Department of Psychiatry, University of Cambridge , Cambridge, United Kingdom , CB2 0SZ Find this author on Google Scholar Find this author on PubMed Search for this author on this site Rudolf N. 
Cardinal 8 Department of Psychiatry, University of Cambridge , Cambridge, United Kingdom , CB2 0SZ 9 Cambridgeshire and Peterborough NHS Foundation Trust , United Kingdom , CB21 5EF Find this author on Google Scholar Find this author on PubMed Search for this author on this site Tamsin J. Ford 8 Department of Psychiatry, University of Cambridge , Cambridge, United Kingdom , CB2 0SZ 9 Cambridgeshire and Peterborough NHS Foundation Trust , United Kingdom , CB21 5EF Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Supplementary material Preview PDF Abstract Background Early detection of childhood mental health disorders remains challenging due to gaps in current screening approaches that lack sensitivity to subtle psychological indicators and rely heavily on observable behaviors. We investigated whether integrating machine learning with natural language processing of children’s written expressions could enhance early detection of potential mental disorders among school-age children. Methods This prospective birth cohort study used National Child Development Study (NCDS) data, analyzing 8,981 children born in 1958 in the United Kingdom. Mental health outcomes were assessed using the Bristol Social Adjustment Guide (BSAG) and Rutter A Scale at age 11, with cases defined by scores above 95th and 90th percentiles. Predictive models combined traditional risk factors with natural language features extracted from children’s essays describing their imagined future at age 25. We developed eight machine learning models using various predictor combinations, evaluating performance through receiver operating characteristic (ROC) values. 
Results Using BSAG 95th percentile threshold, models combining top five selected variables with essay features achieved significantly higher predictive capability (ROC:0.77, 95%CI:0.71-0.83) compared to models using all variables (ROC:0.70, 95%CI:0.63-0.76) or essay features alone (ROC:0.67, 95%CI:0.60-0.74). At 90th percentile threshold, this integrated approach showed similar improvement (ROC:0.81, 95%CI:0.78-0.85). Key predictors included gestational length, maternal parity, parental age, residential characteristics, parental engagement metrics, and children’s BMI. Sensitivity analyses using Rutter A Scale confirmed these findings. Conclusion Combining machine learning with natural language processing of children’s future-oriented essays offers a promising approach for early detection of childhood mental health disorders. This integrated screening method could facilitate more timely intervention, though validation in contemporary populations is needed before clinical implementation. Introduction Mental health disorders in childhood represent a critical public health concern with far-reaching implications for individual development and societal well-being. Approximately 50% of mental health conditions manifest before age 14, significantly impacting educational attainment, social relationships, and long-term occupational outcomes( 1 ). These early-onset conditions often predict increased risk for physical health complications, substance use disorders, and reduced quality of life in adulthood( 2 - 4 ), underlining the paramount importance of early identification and intervention. Despite the well-documented significance of early mental health support, substantial underdiagnosis exists among school-age children( 5 , 6 ). While 10-20% of children worldwide experience clinically significant mental health problems( 5 ), only about one-third receive appropriate diagnosis and treatment( 6 ). 
This diagnostic gap stems from limited access to mental health professionals, stigma surrounding mental health, and inherent challenges in identifying mental health concerns in young populations( 5 - 8 ). Young children typically lack the emotional vocabulary and self-awareness to articulate psychological distress, instead expressing difficulties through behavioral changes, physical complaints, or academic struggles. Additionally, they depend on caregivers to recognize these signs and initiate appropriate assessment. Traditional diagnostic approaches such as clinical interviews and standardized assessments rely heavily on observable behavioral symptoms or parent/teacher reports, potentially missing subtle early indicators of psychological distress. Conventional screening methods are also resource-intensive and time-consuming, making widespread implementation challenging. The effectiveness of observational screening is highly dependent on healthcare worker training and confidence( 9 ), while screening tools developed in Western settings may not adequately reflect mental health manifestations in diverse cultural contexts( 10 , 11 ). Machine learning and natural language processing (NLP) technologies offer promising avenues for enhancing early detection of childhood mental health concerns by analyzing complex patterns across multiple data sources( 12 - 16 ). However, current approaches face two distinct challenges. First, automatic screening tools using commonly recognized risk factors remain limited in accuracy and scalability( 17 ). Second, there is a critical need to explore children’s inner psychological experiences that are inaccessible through conventional assessment methods. Research has established connections between language development and mental health in children( 18 , 19 ). 
Language patterns—including vocabulary usage, emotional tone, and narrative coherence—can reveal underlying psychological states when children cannot directly express their emotions( 18 , 19 ). Children’s inner world often remains implicit and easily overlooked from an adult perspective, with traditional observable indicators potentially missing subtle psychological manifestations. Natural language processing of children’s written expressions offers a unique window into their psychological well-being, potentially capturing indicators of emotional and mental states missed through conventional observational methods. The present study aims to evaluate the effectiveness of integrated machine learning and natural language processing approaches for early detection of potential mental disorders among school-age children. While recent research has applied NLP to NCDS data for predicting outcomes such as reading comprehension( 20 ), application specifically for childhood mental health prediction remains underdeveloped. We address three key research questions: ( 1 ) To what extent can machine learning algorithms predict childhood mental health outcomes using traditional risk factors? ( 2 ) What is the predictive value of natural language processing analysis of children’s written narratives? ( 3 ) Does integrating machine learning with natural language processing features enhance predictive accuracy compared to either approach independently? Through addressing these questions, we aim to develop an approach facilitating earlier identification of children at risk for mental health disorders, ultimately enabling more timely intervention and improved outcomes. Methods Study design and participants This prospective birth cohort study utilized data from the National Child Development Study (NCDS), which follows 18,558 individuals born during one week (March 3-9) in 1958 across England, Wales, and Scotland( 21 ). 
The NCDS was selected for its unique collection of children’s written essays at age 11, offering invaluable linguistic data unavailable in newer cohorts. We focused on data from three assessment periods: birth, age 7, and age 11. At birth, detailed information was collected regarding prenatal conditions, birth characteristics, and parental characteristics. Subsequent waves at ages 7 and 11 captured physical, educational, and social development, economic circumstances, family employment, family life, health behaviours, and wellbeing. Of particular importance was an essay-writing task administered at age 11, where participants responded to: “Imagine that you are now 25 years old. Write about the life you are leading, your interests, your home life, and your work at the age of 25.” This task was completed under standardized conditions in school classrooms with 30 minutes allocated. We included participants who completed the age-11 essay task. Participants were excluded if they had more than 30% missing values across variables of interest or if their essays contained fewer than 50 words. The final analytical sample and participant flow are presented in Figure 1 . A comprehensive description of NCDS methodology has been published previously by Power and Elliott( 21 ). Outcomes and measures School-children mental health The primary outcome was childhood mental health, assessed using two validated instruments: the Bristol Social Adjustment Guide (BSAG) and the Rutter A Scale, with the latter serving as sensitivity analysis. The BSAG, administered by teachers at ages 7 and 11, evaluates children’s social adjustment and behavior in educational settings. 
This instrument encompasses 12 behavioral domains: unforthcomingness, withdrawal, depression, anxiety for acceptance by adults, hostility towards adults, writing off of adults and adult standards, anxiety for acceptance by children, hostility towards children, restlessness, inconsequential behaviour, and miscellaneous symptoms (both general and nervous)( 22 , 23 ). Scoring sums coded items across domains, with higher scores indicating more pronounced mental and behavioral difficulties. The Rutter A Scale, completed by mothers at ages 7 and 11, assesses childhood mental health problems using a modified version consisting of 14 items on a three-point Likert scale (0 = ‘Does not apply’, 1 = ‘Applies somewhat’, 2 = ‘Certainly applies’)( 24 ). The scale evaluates emotional problems, peer problems, behavioral problems, hyperactivity, and prosocial behaviour, including items on fidgeting, destructive behavior, fighting, worry, social preferences, irritability, emotional state, physical mannerisms, thumb-sucking, nail-biting, disobedience, attention difficulties, adaptation to new situations, and peer victimization( 24 ). The instrument demonstrates good psychometric properties, with reported inter-rater reliability (r = 0.64) and test-retest reliability (r = 0.74)( 25 ). Potential mental disorders at age 11 were operationalized using threshold-based approaches. Cases were classified using both 95th and 90th percentile thresholds of total aggregate BSAG scores, with parallel analyses conducted using Rutter A Scale scores at the same thresholds to assess model robustness. Potential predictors Our selection of potential predictors was informed by previous research and encompassed three primary domains: socioeconomic position, adverse childhood experiences, and environmental factors. Socioeconomic position We analyzed multiple harmonized indicators of socioeconomic position from the NCDS. 
Paternal occupation and employment status were assessed at birth, age 7, and age 11, using the 1990 Registrar General’s Social Class system( 26 , 27 ). Occupations were categorized as manual or non-manual, and employment as employed or unemployed( 26 , 27 ). Maternal and paternal education was measured by completion of post-compulsory education, distinguishing between those who remained in education beyond the compulsory period and those who left at the minimum permissible age ( 26 , 27 ). Housing tenure at ages 7 and 11 distinguished between owned/mortgaged properties and other arrangements( 26 , 27 ). Additional indicators included bedroom count at ages seven and eleven, housing difficulties, financial hardships, and free school meals eligibility. These measures provided comprehensive assessment of socioeconomic circumstances throughout childhood( 28 , 29 ). Adverse childhood experiences (ACEs) Adverse childhood experiences (ACEs) were operationalized based on established definitions from the National Child Development Study (NCDS) research ( 30 , 31 ). These experiences are defined as traumatic and stressful psychosocial conditions within the familial environment ( 30 , 31 ). Such conditions typically share three key characteristics: they tend to co-occur, persist over time, and remain outside the child’s control. Survey data were collected at two time points, ages seven and eleven years, capturing seven distinct dimensions of childhood adversity. These dimensions encompassed parental separation or divorce, substance misuse by parents, presence of family conflict, parental death, mental health problems among parents, physical neglect experienced from parents, and parental involvement in criminal activities. Each dimension was assessed as a binary variable, creating a comprehensive framework for evaluating early-life adversity. 
The methodology for measuring these adversities has been extensively documented in previous publications, with detailed protocols available for reference( 30 , 31 ). Environmental factors The analysis incorporated a set of established environmental and psychosocial risk factors associated with mental health outcomes( 28 , 29 ). Perinatal factors included infant health conditions at birth (such as haemolytic and respiratory diseases), sex, body mass index z-scores (Zbmi, measured at birth, age seven, and eleven, calculated according to WHO age- and sex-standardized reference values( 32 )), and maternal health conditions during pregnancy including hypertension and diabetes. Maternal characteristics encompassed age at birth, parity, length of gestational period, marital status at birth, and smoking behavior during pregnancy. Parental engagement was assessed through multiple dimensions across two time points (ages seven and eleven). For mothers, these dimensions included interest in the child’s education, engagement in outdoor activities such as walking or park visits, and participation in reading activities. Similarly, father’s involvement was evaluated through educational interest, participation in childcare, engagement in outdoor activities, and reading behaviors. Detailed measurement protocols for all environmental factors are available in previous publications( 28 , 29 ). Demographic variables such as age and race were not included as covariates due to the homogeneous nature of the cohort: all participants were born in March 1958, and the sample was predominantly white (98.7%). Missing Data Cases with substantial missing data, those lacking information for 30% or more variables of interest, were excluded from the analysis. For the remaining cases, multiple imputation with chained equations was employed to generate 25 imputed datasets using all variables of interest as predictors in the imputation model. 
The multiple imputation approach was selected to address potential bias from missing data patterns while preserving the statistical relationships among variables and maintaining analytical power. Statistics and Modeling Categorical variables were reported as numbers (percentages) and continuous variables as means (standard deviations). Our modeling approach used mental health outcomes at age 11 (BSAG and Rutter A Scale scores) as the dependent variables, while incorporating predictor variables collected across multiple time points—specifically at birth, age 7, and age 11. The cohort data, comprising outcomes, potential predictors, and essays, was randomly partitioned into training (80%) and test (20%) datasets. To address class imbalance in the training dataset, given the relatively small number of children with potential mental disorders, we employed the synthetic minority oversampling technique (SMOTE) to generate synthetic samples for the minority class( 33 ). To maximize practical utility, potential predictors were maintained in their original coded form rather than being aggregated (such as summing the seven ACEs) or transformed through dimensional reduction techniques (such as principal component analysis for socioeconomic position). To avoid overfitting, feature selection was performed using recursive feature elimination (RFE) with the caretFuncs algorithm from the ‘caret’ package( 34 ). This model-agnostic approach was chosen to ensure selected variables would be compatible across multiple machine learning algorithms. The importance scores of selected predictors were derived using this RFE process, which ranks variables based on their contribution to model performance when iteratively removed. The RFE process was optimized through 5-fold cross-validation in the training dataset, with the optimal number of features determined by the maximum ROC value in the test dataset. 
For natural language processing, the children’s essays underwent standardized preprocessing steps including lowercase conversion, punctuation removal, stop word filtering, and tokenization to normalize the text data. Then, essays were embedded using the “all-mpnet-base-v2” model, which is based on a pretrained transformer language model architecture developed by Microsoft( 35 ). This model was chosen for its superior performance in capturing semantic relationships and contextual information in text data( 35 ). Unlike traditional word-level embeddings, this sentence-transformer model generates contextualized representations that capture the nuanced meaning of complete sentences and paragraphs. The resulting high-dimensional embeddings (exceeding 700 dimensions) were reduced using uniform manifold approximation and projection (UMAP), selected for its ability to preserve both local and global structure while maintaining computational efficiency( 36 ). We explored reduced dimensionality ranging from 5 to 25 components, with the dimension-reduced essays also evaluated using caretFuncs from the ‘caret’ package. This process underwent 5-fold cross-validation in the training dataset, with optimal UMAP components determined by maximizing ROC values in the test dataset. Eight prediction models were developed to ensure our findings were not model-specific and to identify the most robust approach for mental health prediction: linear approaches (linear discriminant analysis using the “sda” method and logistic regression using “glm” with the binomial family), non-linear algorithms (classification and regression trees using “xgbTree”, k-nearest neighbours using “knn”, neural networks using “nnet”, and naive Bayes using “naive_bayes”), and advanced ensemble methods (support vector machines using “svmRadial” and random forest using “rf”). All models were implemented using the ‘caret’ R package and underwent 5-fold cross-validation during training. 
The models were trained using six distinct predictor combinations: all selected features; all UMAP components; and four incremental combinations pairing UMAP components with top selected features (top 5, 10, 15, and 20 features where applicable). Trained models’ performance was assessed exclusively on the test dataset using receiver operating characteristic (ROC) values. Other metrics including sensitivity, specificity, accuracy, AUC (area under the ROC curve), F-score, kappa statistic, precision, and recall were also provided. These modeling processes were replicated for each mental health outcome measure. All analyses were conducted using R version 4.3, with essay embedding performed in Python. Results In this study, 8,981 children (49.3% female) were included. Table 1 presents the basic characteristics of included participants across three key domains: socioeconomic position, adverse childhood experiences, and environmental factors. Socioeconomic indicators revealed a predominance of manual occupations among fathers (71.9% at birth, declining to 64.5% at age 11), with approximately one-third of parents pursuing post-compulsory education. Housing stability improved moderately through childhood, as evidenced by increased home ownership from 42.1% to 46.0% between ages 7 and 11. Notable adverse childhood experiences included family conflict and parental physical neglect (both 6.3%), alongside parental mental health challenges (6.2%). Environmental factors demonstrated high parental engagement, with sustained maternal interest in children’s education (85.7% at age 7, 86.2% at age 11) and widespread participation in outdoor activities. Most mothers were in stable unions (96.8%), though 23.3% experienced pregnancy-related health conditions. Birth weight and BMI distributions predominantly fell within normal ranges across all assessment points. 
Mental health outcomes at age 11, measured via the Bristol Social Adjustment Guide and Rutter A Scale, yielded mean scores of 15.81 (SD=17.01) and 6.56 (SD=3.65), respectively ( Table 1 ). Figure 2 illustrates the relative predictive importance of variables for childhood mental health outcomes, stratified by two BSAG score thresholds (95th and 90th percentiles). For the more stringent 95th percentile threshold ( Panel A ), gestational period emerged as the strongest predictor, followed by parental age at birth and residential characteristics. The analysis revealed that environmental factors, particularly those related to early-life conditions, demonstrated higher predictive importance than socioeconomic indicators. Parental engagement metrics, including father’s interest in children’s education and maternal smoking during pregnancy, ranked among the top ten predictive factors. The analysis at the broader threshold ( Panel B , 90th percentile) corroborated these findings, with consistent prominence of the identified key predictive factors. Supplementary Table 1 presents a comparison between included and excluded children. Children excluded from the analysis (those with more than 30% missing values or essays containing fewer than 50 words) exhibited significantly poorer mental health than their included counterparts (p values<0.01). Figure 3 and Table 2 demonstrate the comparative performance metrics of machine learning models in detecting childhood mental health disorders. The analysis, stratified by BSAG score thresholds at the 95th and 90th percentiles, revealed significant improvements through feature integration. At the more stringent 95th percentile threshold, the combination of top 5 selected variables with essay features yielded substantial enhancement in predictive capability (ROC: 0.77, 95% CI: 0.71-0.83), markedly outperforming models using either all selected variables (ROC: 0.70, 95% CI: 0.63-0.76) or essay features alone (ROC: 0.67, 95% CI: 0.60-0.74). 
Model performance was further refined through incremental feature integration. The incorporation of top 10 selected variables with essay features preserved the enhanced predictive capacity while improving specificity metrics (0.78, 95% CI: 0.76-0.80). Additional optimization was achieved through the integration of top 15 and 20 selected variables, ultimately reaching peak ROC values of 0.81 (95% CI: 0.76-0.86). Parallel analyses at the 90th percentile threshold exhibited analogous patterns of improvement. The integration of top 5 selected variables with essay features demonstrated marked enhancement in model performance (ROC: 0.81, 95% CI: 0.78-0.85) over single-feature approaches. This performance was slightly elevated through the incorporation of top 10 selected variables (ROC: 0.82, 95% CI: 0.79-0.85), with subsequent feature additions maintaining superior predictive metrics while optimizing sensitivity and specificity parameters. The sensitivity analysis using Rutter A Scale scores corroborated our primary findings. The analysis demonstrated similar patterns in feature selection importance ( Supplementary Figure 1 ) and progressive enhancement in machine learning model performance through the combined use of multiple feature types ( Supplementary Figure 2 and Supplementary Table 2 ); specifically, the additive combination of top 5 selected variables with essay features also yielded substantial improvements in predictive capability. Discussion In this prospective birth cohort study leveraging data from the National Child Development Study (NCDS), we examined the predictive capability of machine learning and natural language processing approaches for early detection of potential mental disorders among school-age children. 
Our findings demonstrated that the integration of traditional risk factors with natural language features derived from childhood essays significantly enhanced predictive performance, with the most notable improvement observed in the parsimonious combination of the top 5 selected variables and essay features. This integrated approach outperformed models using either all selected variables or essay features alone, even though subsequent feature integration achieved incremental improvements. Environmental factors, particularly gestational period, maternal parity, parental age at birth, and early-life socioeconomic status (e.g. residential bedroom count), emerged as the strongest predictors. Parental engagement metrics, including father’s interest in children’s education and maternal smoking during pregnancy, as well as children’s zBMI almost ranked among the top ten predictive factors. The robustness of these findings was further validated through sensitivity analyses using Rutter A Scale scores, underscoring the potential utility of combining targeted traditional risk factors with natural language processing for efficient early mental health screening in pediatric populations. The integration of traditional risk factors with natural language features derived from childhood essays demonstrated superior predictive performance for early detection of mental health concerns, with optimal results achieved through a parsimonious combination of five key variables and essay features. This finding aligns with previous research, which demonstrated the value of multimodal approaches in psychiatric assessment( 13 , 37 , 38 ). However, while prior studies typically emphasized the need for comprehensive data collection across multiple domains, our results suggest that a more streamlined approach may achieve comparable predictive accuracy. 
This apparent divergence might be attributed to our novel application of advanced natural language processing techniques, which potentially capture more nuanced psychological indicators than traditional assessment methods. For example, our approach may detect subtle linguistic patterns such as metaphor usage, emotional tone shifts, self-referential language, and future orientation that could reflect underlying psychological states not readily observable through conventional behavioral assessments or parental reports. The superior performance of our integrated approach compared to single-modality models corroborates previous meta-analytic findings, which identified significant advantages in combining behavioral and linguistic markers for clinical assessment( 39 - 41 ). Our analysis revealed the predominant predictive power of environmental factors, particularly perinatal and early-life conditions, in determining childhood mental health outcomes. The emergence of gestational period as a primary predictor corresponds with previous longitudinal studies, which established robust associations between prenatal development and subsequent psychological outcomes( 42 - 44 ). The significant predictive value of parental characteristics (e.g. maternal parity and parental age) and the parental engagement metrics (e.g. paternal involvement) extends previous findings of parental influences on mental health or psychological development of children( 45 - 47 ). However, while earlier research emphasized socioeconomic indicators as primary predictors( 48 , 49 ), our results suggest that environmental factors may have greater predictive utility. This discrepancy might be attributed to our study’s assessment of early-life conditions and the inclusion of subtle indicators such as residential characteristics. The implications of these findings extend across potential clinical practice, public health policy, and future research directions. 
First, it is important to note that our study assessed school functioning and behavioral adjustment rather than formal clinical diagnoses of mental health disorders. The BSAG and Rutter A Scale measure behavioral and social adjustment problems that, while strongly correlated with mental health conditions, represent functional outcomes rather than diagnostic classifications. This distinction is critical when interpreting our findings within the broader context of mental health screening. While our integrated approach demonstrated strong predictive performance, it should be viewed as a complement to, rather than a replacement for, comprehensive clinical evaluation. While both BSAG and Rutter scales were collected at the same time as the essays in our study, it is worth noting that these traditional scales usually depend on teachers, parents, or caregivers and often require specific training for proper utilization. In contrast, the essay-based approach demonstrated in our study offers potential for wider application with less dependency on observer training or subjective interpretation. The model’s primary utility lies in its potential to efficiently identify children who might benefit from more detailed clinical assessment, thereby enabling earlier intervention for those at greatest risk. The demonstrated efficacy of integrating machine learning with natural language processing suggests a potential approach to early mental health screening in pediatric populations. If established to be useful, this integrated methodology could facilitate more efficient resource allocation in mental health services by enabling targeted interventions for high-risk children, particularly in resource-constrained settings where comprehensive clinical assessments may be limited. 
Furthermore, the identification of key environmental predictors, especially those related to perinatal and early-life conditions, underscores the importance of preventive interventions during critical developmental periods. From a practical implementation perspective, it is noteworthy that several of our strongest predictors – such as gestational period, residential bedroom count, and parental age – are objective measures that are relatively easy to collect in contemporary settings compared to more intensive assessments like parental engagement metrics. This suggests the possibility of developing streamlined screening protocols using readily available data points, which could enhance feasibility in diverse healthcare and educational contexts. The superior performance of parsimonious models combining selected traditional risk factors with linguistic features suggests the possibility of developing streamlined screening protocols that maintain high predictive accuracy while minimizing administrative burden. However, the implementation of essay-based assessments in contemporary settings requires careful consideration of several factors. The future-oriented nature of our essay prompt (“Imagine that you are now 25 years old…”) may have elicited particularly revealing content about children’s psychological outlook and self-perception. Alternative essay topics or formats, including existing school assignments, might yield different linguistic patterns and predictive value. Additionally, the optimal setting for such assessments—whether integrated into educational curricula, administered during healthcare visits, or implemented through digital platforms—requires further research. These findings also highlight the potential value of incorporating routine writing exercises in educational settings as a non-invasive means of monitoring children’s psychological well-being, though careful consideration must be given to ethical implications and implementation strategies. 
While our analysis identified paternal engagement as predictive in the 1960s data, contemporary research by Scott and colleagues confirms that responsive, boundary-setting parenting remains crucial for children’s mental wellbeing despite societal changes( 50 ). This reinforces the continued relevance of integrating family-centered approaches within assessment and intervention frameworks. Strengths and limitations This study presents several strengths in its methodological approach and analytical framework. First, the utilization of prospective birth cohort data from the NCDS provides a robust foundation for investigating developmental trajectories, minimizing recall bias and allowing for temporal sequence establishment between predictors and outcomes. Second, our integrated machine learning approach, combining traditional risk factors with natural language processing of children’s essays, represents a novel methodological advancement in pediatric mental health screening. Third, the inclusion of two validated mental health assessment tools (BSAG and Rutter A Scale) strengthens the reliability of our findings through complementary outcome measures. Fourth, our comprehensive consideration of socioeconomic, environmental, and adverse childhood experiences provides a holistic framework for understanding childhood mental health determinants. However, several limitations warrant consideration when interpreting these findings. First, while the NCDS cohort provides rich historical data, its composition (predominantly white British children born in 1958) may limit generalizability to contemporary, more diverse populations. Second, the mental health measures are constrained by their time period. The Rutter scales were later updated into the Strengths and Difficulties Questionnaire and included items like nail biting, then considered indicative of emotional disturbance but no longer viewed as such( 51 ). 
Third, some potentially relevant factors such as genetic predisposition, cultural influences, social networks, disability status, and general health conditions were not comprehensively incorporated( 10 , 11 , 52 ). Fourth, binary classification of mental health outcomes based on threshold scores, while pragmatic for screening, may not fully capture the nuanced spectrum of childhood mental health experiences. Fifth, our machine learning models require validation in contemporary and diverse populations before widespread clinical implementation. Sixth, we excluded children with substantial missing data or very short essays, potentially introducing selection bias. As shown in Supplementary Table 1 , excluded children demonstrated significantly poorer mental health outcomes and differed on key predictors. This exclusion may have affected model performance, though our integrated approach still demonstrated strong predictive capability (ROC: 0.77-0.81), suggesting potentially greater utility in more diverse populations with pronounced mental health needs. Seventh, we employed internal validation rather than external validation, with feature selection and model optimization performed within the same cohort. While we maintained a separate test dataset, validation on entirely independent cohorts would be more rigorous. This necessitates caution when interpreting performance metrics, as they may not fully generalize to new populations. Eighth, our natural language processing approach treated essays holistically rather than extracting specific linguistic features. While embedding-based approaches capture semantic meaning, their “black box” nature limits interpretation of which writing aspects most strongly predicted mental health outcomes, restricting understanding of underlying psychological mechanisms. 
Ninth, we focused exclusively on children completing the essay task, potentially introducing selection bias, as children producing very short essays or not completing assignments might systematically differ in mental health profiles. Finally, we did not explore how predictive value of children’s writing might vary across different ages, developmental stages, or essay topics. The future-oriented prompt may have uniquely elicited certain psychological themes, and different writing assignments might yield varying predictive patterns. Conclusion This prospective birth cohort study demonstrates that integrating machine learning with natural language processing of children’s written expressions obviously enhances early detection of potential mental health disorders in school-age children. The optimal predictive performance achieved through combining key environmental predictors with linguistic features suggests a promising pathway for developing efficient screening protocols. While this integrated approach shows potential as a screening tool complementary to clinical assessment, validation in contemporary, diverse populations remains essential. Further research is needed to determine the optimal implementation of essay-based screening across different age groups, cultural contexts, and writing topics before clinical application can be recommended. Our findings offer a practical direction for addressing the diagnostic gap in pediatric mental health care through more accessible and efficient screening methods, potentially enabling earlier intervention for children at risk of mental health disorders. Contributors SC contributed to the concept and study design. SC conducted the analysis. SC, TD, MQ, HL, QAL, AMW, RNC, and TJF. SC drafted the manuscript. TD, MQ, HL, QAL, AMW, RNC, and TJF made critical revisions to the manuscript for important intellectual content. All authors edited and approved the final manuscript. 
Funding SC’s research was supported by the PENDA, funded by the UK Foreign, Commonwealth and Development Office. RNC’s research was supported by the Medical Research Council (MR/Z504816/1). All research at the Department of Psychiatry in the University of Cambridge is supported by the NIHR Cambridge Biomedical Research Centre (NIHR203312) and NIHR Applied Research Collaboration East of England. Role of the funding source The funder of the study had no role in study design, data collection, data analysis, data interpretation, or writing of the article. The views expressed are those of the author(s) and not necessarily those of the NIHR or the Department of Health and Social Care. For the purpose of open access, the authors have applied a Creative Commons Attribution (CC BY) licence to any Author Accepted Manuscript version arising from this submission. Conflict of Interest RNC consults for Campden Instruments Ltd; receives royalties from Cambridge University Press, Cambridge Enterprise, and Routledge; and is an unpaid non-executive director of Cambridge University Health Partners. TJF’s research group receives funding for methodology consulting from Place2Be, a third sector organisation that provides mental health training and interventions to UK schools. SC and other authors declare no conflict of interest with this work. Ethics Statement This study uses data from the National Child Development Study (NCDS), a nationally representative longitudinal cohort initiated in 1958. Informed consent was obtained for all data collections, with parental consent provided for assessments conducted during childhood, including the age-11 survey and essay-writing task. Formal ethical approval for NCDS follow-ups has been obtained from the UK NHS Multi-Centre Research Ethics Committee (MREC) for all surveys conducted since 2000. 
Earlier waves, including those in 1958, 1965, 1969, 1974, 1981, and 1991, were conducted prior to the establishment of formal ethics committees or the MREC system. Available documentation indicates that internal ethical review processes were in place for these early waves. For example, the Biomedical Survey and age-55 follow-up were approved by the NHS London-Central Research Ethics Committee (REC) in 2012 (Ref: 12/LO/2010). The current study is a secondary analysis of de-identified public data accessed via the UK Data Service (Study No. 5790). As such, it is exempt from further institutional ethical review under the policies of the London School of Hygiene & Tropical Medicine, as it does not involve human participants or identifiable private information. Data Availability Statement The data that support the findings of this study are openly available in the UK Data Service at http://doi.org/10.5255/UKDA-SN-5790-2 , reference number SN: 5790. Reference 1. ↵ Jones PB . Adult mental health disorders and their age at onset . British Journal of Psychiatry . 2013 ; 202 ( 54 ): s5 – s10 . OpenUrl Abstract / FREE Full Text 2. ↵ Royal College of Psychiatrists College Report CR238 . Infant and early childhood mental health: the case for action . The Royal College of Psychiatrists 2023 . 3. ↵ Copeland WE , Wolke D , Shanahan L , Costello EJ . Adult Functional Outcomes of Common Childhood Psychiatric Problems: A Prospective, Longitudinal Study . JAMA Psychiatry . 2015 ; 72 ( 9 ): 892 – 9 . OpenUrl PubMed 4. ↵ Goodman A , Joyce R , Smith JP . The long shadow cast by childhood physical and mental problems on adult life . Proc Natl Acad Sci U S A . 2011 ; 108 ( 15 ): 6032 – 7 . OpenUrl Abstract / FREE Full Text 5. ↵ Kieling C , Baker-Henningham H , Belfer M , Conti G , Ertem I , Omigbodun O , et al. Child and adolescent mental health worldwide: evidence for action . Lancet . 2011 ; 378 ( 9801 ): 1515 – 25 . OpenUrl CrossRef PubMed Web of Science 6. 
↵ Wang S , Li Q , Lu J , Ran H , Che Y , Fang D , et al. Treatment Rates for Mental Disorders Among Children and Adolescents: A Systematic Review and Meta-Analysis . JAMA Netw Open . 2023 ; 6 ( 10 ): e2338174 . OpenUrl 7. Cummings A , Shelton K. The prevalence of mental health disorders amongst care-experienced young people in the UK: A systematic review . Children and Youth Services Review . 2024 ; 156 : 107367 . OpenUrl 8. ↵ Patel V , Kieling C , Maulik PK , Divan G. Improving access to care for children with mental disorders: a global perspective . Archives of Disease in Childhood . 2013 ; 98 ( 5 ): 323 . OpenUrl Abstract / FREE Full Text 9. ↵ Koly KN , Baskin C , Khanam I , Rao M , Rasheed S , Law GR , et al. Educational and Training Interventions Aimed at Healthcare Workers in the Detection and Management of People With Mental Health Conditions in South and South-East Asia: A Systematic Review . Front Psychiatry . 2021 ; 12 : 741328 . OpenUrl PubMed 10. ↵ Salamanca-Buentello F , Seeman MV , Daar AS , Upshur REG . The ethical, social, and cultural dimensions of screening for mental health in children and adolescents of the developing world . PLoS One . 2020 ; 15 ( 8 ): e0237853 . OpenUrl PubMed 11. ↵ Achenbach TM . Multicultural evidence-based assessment of child and adolescent psychopathology . Transcult Psychiatry . 2010 ; 47 ( 5 ): 707 – 26 . OpenUrl CrossRef PubMed 12. ↵ Iyortsuun NK , Kim SH , Jhon M , Yang HJ , Pant S. A Review of Machine Learning and Deep Learning Approaches on Mental Health Diagnosis . Healthcare (Basel) . 2023 ; 11 ( 3 ). 13. ↵ Sadeghi M , Richer R , Egger B , Schindler-Gmelch L , Rupp LH , Rahimi F , et al. Harnessing multimodal approaches for depression detection using large language models and facial expressions . npj Mental Health Research . 2024 ; 3 ( 1 ): 66 . OpenUrl PubMed 14. Le Glaz A , Haralambous Y , Kim-Dufor DH , Lenca P , Billot R , Ryan TC , et al. 
Machine Learning and Natural Language Processing in Mental Health: Systematic Review . J Med Internet Res . 2021 ; 23 ( 5 ): e15708 . OpenUrl CrossRef PubMed 15. Scherbakov DA , Hubig NC , Lenert LA , Alekseyenko AV , Obeid JS . Natural Language Processing and Social Determinants of Health in Mental Health Research: AI-Assisted Scoping Review . JMIR Ment Health . 2025 ; 12 : e67192 . OpenUrl 16. ↵ Sweeney C , Ennis E , Mulvenna MD , Bond R , O’Neill S. Insights Derived From Text-Based Digital Media, in Relation to Mental Health and Suicide Prevention, Using Data Analysis and Machine Learning: Systematic Review . JMIR Ment Health . 2024 ; 11 : e55747 . OpenUrl PubMed 17. ↵ Shatte ABR , Hutchinson DM , Teague SJ . Machine learning in mental health: a scoping review of methods and applications . Psychol Med . 2019 ; 49 ( 9 ): 1426 – 48 . OpenUrl CrossRef PubMed 18. ↵ Law J , Reilly S , Snow PC . Child speech, language and communication need re-examined in a public health context: a new direction for the speech and language therapy profession . International Journal of Language & Communication Disorders . 2013 ; 48 ( 5 ): 486 – 96 . OpenUrl PubMed 19. ↵ Shu X , Xiao Y , Yang L. The effectiveness of language nursing intervention on mental health in children with poor language skills . PLOS ONE . 2024 ; 19 ( 11 ): e0313095 . OpenUrl PubMed 20. ↵ Wibaek R , Andersen GS , Dahm CC , Witte DR , Hulman A. Large Language Models for Epidemiological Research via Automated Machine Learning: Case Study Using Data From the British National Child Development Study . JMIR Med Inform . 2023 ; 11 : e43638 . OpenUrl 21. ↵ Power C , Elliott J. Cohort profile: 1958 British birth cohort (National Child Development Study) . Int J Epidemiol . 2006 ; 35 ( 1 ): 34 – 41 . OpenUrl CrossRef PubMed Web of Science 22. ↵ Stott DH . The social adjustment of children: Manual to the Bristol Social Adjustment Guides. (No Title) . 1966 . 23. ↵ Shepherd P. 
Bristol Social Adjustment Guides at 7 and 11 Years: 1958 National Child Development Study User Guide . Centre for Longitudinal Studies Institute of Education, University of London , London . 2013 . 24. ↵ Thapar AK , Riglin L , Blakey R , Collishaw S , Davey Smith G , Stergiakouli E , et al. Childhood attention-deficit hyperactivity disorder problems and mid-life cardiovascular risk: prospective population cohort study . Br J Psychiatry . 2023 ; 223 ( 4 ): 472 – 7 . OpenUrl PubMed 25. ↵ Rutter M , Tizard J , Whitmore K. Education, health, and behaviour. (No Title) . 1970 . 26. ↵ McElroy E , Tibber M , Fearon P , Patalay P , Ploubidis GB . Socioeconomic and sex inequalities in parent-reported adolescent mental ill-health: time trends in four British birth cohorts . J Child Psychol Psychiatry . 2023 ; 64 ( 5 ): 758 – 67 . OpenUrl CrossRef PubMed 27. ↵ Dodgeon B , Morris T , Crawford C , Parsons S , Vignoles A , Oldfield Z. CLOSER work package 2: Harmonised socio-economic measures user guide . London : CLOSER[Google Scholar] . 2018 . 28. ↵ Machlitt-Northen S , Keers R , Munroe PB , Howard DM , Pluess M. Polygenic risk scores for schizophrenia and major depression are associated with socio-economic indicators of adversity in two British community samples . Transl Psychiatry . 2022 ; 12 ( 1 ): 477 . OpenUrl PubMed 29. ↵ Machlitt-Northen S , Keers R , Munroe PB , Howard DM , Trubetskoy V , Pluess M. Polygenic scores for schizophrenia and major depression are associated with psychosocial risk factors in children: evidence of gene-environment correlation . J Child Psychol Psychiatry . 2022 ; 63 ( 10 ): 1140 – 52 . OpenUrl PubMed 30. ↵ Ugarteche Perez A , Berger E , Kelly-Irving M , Delpierre C , Capuron L , Castagne R. Early life stress in relation with risk of overweight, depression, and their comorbidity across adulthood: findings from a British birth cohort . Psychol Med . 2024 ; 54 ( 8 ): 1853 – 66 . OpenUrl PubMed 31. ↵ Gondek D , Patalay P , Lacey RE . 
Adverse childhood experiences and multiple mental health outcomes through adulthood: A prospective birth cohort study . SSM - Mental Health . 2021 ; 1 : 100013 . OpenUrl 32. ↵ World Health Organization . Growth reference data for 5–19 years .: 2007 . 33. ↵ Chawla NV , Bowyer KW , Hall LO , Kegelmeyer WP . SMOTE: synthetic minority over-sampling technique . Journal of artificial intelligence research . 2002 ; 16 : 321 – 57 . OpenUrl CrossRef 34. ↵ Kuhn M , Wing J , Weston S , Williams A , Keefer C , Engelhardt A , et al. Package ‘caret’ . The R Journal . 2020 ; 223 ( 7 ): 48 . OpenUrl 35. ↵ Galli C , Donos N , Calciolari E. Performance of 4 Pre-Trained Sentence Transformer Models in the Semantic Query of a Systematic Review Dataset on Peri-Implantitis . Information . 2024 ; 15 ( 2 ): 68 . OpenUrl 36. ↵ W. Härdle , L Simar , MR Fengler Härdle WK , Simar L , Fengler MR . Uniform Manifold Approximation and Projection . In: Applied Multivariate Statistical Analysis (eds W. Härdle , L Simar , MR Fengler ): 581 – 95 . Springer International Publishing , 2024 . 37. ↵ Falkai P , Adorjan K. Multimodal Clinical Evaluation and Treatment Planning . In: Tasman’s Psychiatry : 1 – 37 . Springer , 2024 . 38. ↵ D Stoyanov , B Draganski , P Brambilla , C Lamm Maggioni E , Piani MC , Bondi E , Bianchi AM , Brambilla P. Multimodal Integration in Psychiatry: Clinical Potential and Challenges . In: Computational Neuroscience (eds D Stoyanov , B Draganski , P Brambilla , C Lamm ): 235 – 56 . Springer US , 2023 . 39. ↵ Drougkas G , Bakker Erwin M , Spruit M. Multimodal machine learning for language and speech markers identification in mental health . BMC Medical Informatics and Decision Making . 2024 ; 24 ( 1 ): 354 . OpenUrl 40. Tang F , Chen J , Dodge HH , Zhou J. The Joint Effects of Acoustic and Linguistic Markers for Early Identification of Mild Cognitive Impairment . Front Digit Health . 2021 ; 3 : 702772 . OpenUrl PubMed 41. ↵ Spruit M , Verkleij S , de Schepper K , Scheepers F. 
Exploring Language Markers of Mental Health in Psychiatric Stories . Applied Sciences . 2022 ; 12 ( 4 ): 2179 . OpenUrl 42. ↵ Estinfort W , Huang J-P , Au H-K , Lin C-L , Chen Y-Y , Chao HJ , et al. Effects of prenatal subjective well-being on birth outcomes and child development: A longitudinal study . European Psychiatry . 2022 ; 65 ( 1 ): e77 . OpenUrl PubMed 43. Levendosky AA , Bogat GA , Lonstein J , Muzik M , Nuttall AK . Longitudinal prospective study examining the effects of the timing of prenatal stress on infant and child regulatory functioning: the Michigan Prenatal Stress Study protocol . BMJ Open . 2021 ; 11 ( 9 ): e054964 . OpenUrl Abstract / FREE Full Text 44. ↵ Espel EV , Glynn LM , Sandman CA , Davis EP . Longer Gestation among Children Born Full Term Influences Cognitive and Motor Development . PLOS ONE . 2014 ; 9 ( 11 ): e113758 . OpenUrl CrossRef PubMed 45. ↵ Melchior M , van der Waerden J. Parental influences on children’s mental health: the bad and the good sides of it . European Child & Adolescent Psychiatry . 2016 ; 25 ( 8 ): 805 – 7 . OpenUrl PubMed 46. Pan B , Wang Y , Xu P , Gong Y , Zhao C , Miao J , Li Y. The complex longitudinal influence of paternal and maternal parental psychological flexibility on child problem behavior: exploring the role of parenting styles . BMC Psychology . 2024 ; 12 ( 1 ): 793 . OpenUrl PubMed 47. ↵ Puglisi N , Rattaz V , Favez N , Tissot H. Father involvement and emotion regulation during early childhood: a systematic review . BMC Psychology . 2024 ; 12 ( 1 ): 675 . OpenUrl PubMed 48. ↵ Cadman T , Avraam D , Carson J , Elhakeem A , Grote V , Guerlich K , et al. Social inequalities in child mental health trajectories: a longitudinal study using birth cohort data 12 countries . BMC Public Health . 2024 ; 24 ( 1 ): 2930 . OpenUrl PubMed 49. ↵ Mirela Z , Tsvetomira D , Aaron R , Lucy B. What do we mean when we talk about socioeconomic status? 
Implications for measurement, mechanisms and interventions from a critical review on adolescent mental health . General Psychiatry . 2024 ; 37 ( 6 ): e101455 . OpenUrl Abstract / FREE Full Text 50. ↵ Scott Hoffman M , Hanson BJ , Brotherson SE , Zehnacker G. Boundaries: A Boundary Setting and Social Competence Program for Parents and Youth . Journal of Human Sciences and Extension . 2021 ; 9 ( 3 ): 15 . OpenUrl 51. ↵ Goodman R. Psychometric properties of the strengths and difficulties questionnaire . J Am Acad Child Adolesc Psychiatry . 2001 ; 40 ( 11 ): 1337 – 45 . OpenUrl CrossRef PubMed Web of Science 52. ↵ Paul SE , Colbert SMC , Gorelik AJ , Johnson EC , Hatoum AS , Baranger DAA , et al. A phenome-wide association study of cross-disorder genetic liability in youth genetically similar to individuals from European reference populations . Nature Mental Health . 2024 ; 2 ( 11 ): 1327 – 41 . OpenUrl View the discussion thread. Back to top Previous Next Posted September 12, 2025. Download PDF Supplementary Material Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Machine learning and natural language processing for the early detection of potential mental disorders among school-age children: a prospective birth cohort study Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share Machine learning and natural language processing for the early detection of potential mental disorders among school-age children: a prospective birth cohort study Shanquan Chen , Ting Dang , Mengjie Qian , Huizhi Liang , Diribsa Tsegaye Bedada , Quinette Abegail Louw , Anna Moore , Rudolf N. Cardinal , Tamsin J. Ford medRxiv 2025.09.10.25335509; doi: https://doi.org/10.1101/2025.09.10.25335509 Share This Article: Copy Citation Tools Machine learning and natural language processing for the early detection of potential mental disorders among school-age children: a prospective birth cohort study Shanquan Chen , Ting Dang , Mengjie Qian , Huizhi Liang , Diribsa Tsegaye Bedada , Quinette Abegail Louw , Anna Moore , Rudolf N. Cardinal , Tamsin J. Ford medRxiv 2025.09.10.25335509; doi: https://doi.org/10.1101/2025.09.10.25335509 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Psychiatry and Clinical Psychology Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (300) Cardiovascular Medicine (4435) Dentistry and Oral Medicine (444) Dermatology (382) Emergency Medicine (608) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1509) Epidemiology (15229) Forensic Medicine (30) Gastroenterology (1124) Genetic and Genomic Medicine (6600) Geriatric Medicine (668) Health Economics (997) Health Informatics (4536) Health Policy (1368) Health Systems and Quality Improvement (1613) Hematology (541) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15916) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (146) Nephrology (667) Neurology (6599) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1144) Occupational and Environmental Health (957) Oncology (3332) Ophthalmology (974) Orthopedics (369) Otolaryngology (420) Pain 
Medicine (436) Palliative Medicine (130) Pathology (663) Pediatrics (1693) Pharmacology and Therapeutics (691) Primary Care Research (711) Psychiatry and Clinical Psychology (5447) Public and Global Health (9232) Radiology and Imaging (2198) Rehabilitation Medicine and Physical Therapy (1370) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (712) Sports Medicine (530) Surgery (712) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a00aaed60fd2db75',t:'MTc3OTYwODc0OA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.