Full text
69,412 characters
· extracted from
preprint-html
· click to expand
Empowering digital health management with on-device large language models for glucose prediction | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Empowering digital health management with on-device large language models for glucose prediction View ORCID Profile Taiyu Zhu , View ORCID Profile Joanna Howson , Alejo Nevado-Holgado doi: https://doi.org/10.1101/2025.07.12.25331188 Taiyu Zhu 1 Department of Psychiatry, University of Oxford , UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Taiyu Zhu For correspondence: taiyu.zhu{at}psych.ox.ac.uk Joanna Howson 2 Novo Nordisk Research Centre Oxford , UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Joanna Howson Alejo Nevado-Holgado 1 Department of Psychiatry, University of Oxford , UK Find this author on Google Scholar Find this author on PubMed Search for this author on this site Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Long-term management of chronic diseases such as diabetes is increasingly based on wearable technologies, particularly continuous glucose monitoring (CGM), integrated with smartphone-based digital health systems. When combined with artificial intelligence, especially deep learning, these systems offer highly personalized decision support, including glucose prediction. Although large language models (LLMs) have demonstrated strong performance across various healthcare tasks, their integration into day-to-day digital health remains limited, primarily due to privacy concerns associated with transmitting sensitive data to remote servers. Recent advances in lightweight LLMs create new opportunities for secure and local deployment. In this study, we first evaluated the zero-shot glucose prediction performance of eight pretrained lightweight LLMs across multiple model families. None achieved clinically viable outputs, highlighting the need for domain-specific adaptation. To address this, we propose GluLLM, a multimodal adapter-based framework that enhances pretrained LLMs for on-device glucose forecasting. GluLLM integrates CGM data, daily activity logs, and electronic health records using customized encoder and decoder modules while preserving the foundational capabilities of pretrained LLMs. We trained and evaluated GluLLM on the REPLACE-BG dataset, which includes 226 individuals with type 1 diabetes, and validated it on an external cohort comprising 207 individuals with type 2 diabetes or without diabetes. Compared with 15 state-of-the-art deep learning baseline models for time series prediction, GluLLM with a LLaMA 3.2 1B backbone achieved superior prediction performance, demonstrating the lowest root mean square error and the highest sensitivity for hypoglycemia detection in both datasets. Further-more, the deployment of GluLLM on two smartphone platforms demonstrated feasible computational requirements with acceptable CPU and memory usage, highlighting its practical utility for real-world LLM-driven digital health management. Introduction Characterized by insufficient insulin production or dysfunctional beta cells in the pancreas, diabetes remains one of the most significant global health challenges [ 1 ]. It is estimated that over half a billion people worldwide are living with diabetes [ 2 ], with approximately 90% diagnosed with type 2 diabetes (T2D) and around 10% with type 1 diabetes (T1D). Effective glucose regulation, or glycemic control, poses a substantial daily burden for individuals with T1D or insulin-dependent T2D, which requires frequent glucose monitoring and multiple insulin interventions. Failure to maintain glucose levels within a therapeutic range can lead to a range of complications [ 3 ], such as cardiovascular disease, neuropathy, and retinopathy. Therefore, consistent glucose monitoring and precise interventions are essential for effective diabetes management. Significant advances in wearable and mobile technologies in the past decade have facilitated the widespread adoption of continuous glucose monitoring (CGM) systems [ 4 ]. CGM measures interstitial glucose levels, which represent the glucose present in the fluid between cells, using a minimally invasive sensor inserted under the skin, typically on the abdomen or upper arm. These readings are then converted into real-time estimates of blood glucose levels, generally updated every five minutes, providing users with a dynamic and continuous view of glycemic trends. Multiple large-scale clinical studies have demonstrated that CGM technology substantially improves patient outcomes, reduces the burden of daily management, and improves quality of life through more comprehensive glycemic monitoring and reduced measurement invasiveness in both T1D and T2D [ 5 – 8 ]. Despite advances in diabetes management technologies, the effectiveness of real-world interventions remains limited by several physiological and temporal constraints. There is a notable time delay of five to ten minutes between changes in blood glucose levels and their detection by interstitial CGM sensors [ 9 ]. Insulin therapy introduces additional delays due to its phar-macokinetics, as most formulations begin acting about 15 minutes after administration and may continue to exert effects for several hours [ 10 ]. Dietary glucose absorption also varies significantly across individuals and food types, adding further complexity. One particularly important challenge is the detection of hypoglycemia, which often occurs without noticeable symptoms and can escalate to life-threatening episodes, such as seizures, loss of consciousness, or stroke [ 11 ]. These combined temporal factors emphasize the importance of proactive diabetes management strategies, with accurate glucose prediction serving as a crucial foundation for timely and effective intervention. However, achieving reliable glucose prediction is inherently difficult due to substantial inter- and intra-individual variability [ 12 , 13 ]. Various daily activities, such as food intake, insulin administration, and physical exercise, affect glucose levels, while individual characteristics, such as HbA1c and body mass index (BMI), are also closely associated with glucose dynamics [ 14 ]. Recent advances in artificial intelligence (AI), particularly deep learning [ 15 ], have enabled the effective use of CGM time series by leveraging their complex and heterogeneous nature. In the literature, a variety of deep neural network architectures designed for sequence modeling have been proposed to address this task, including convolutional neural networks (CNNs) [ 16 ], recurrent neural networks (RNNs) such as long short-term memory (LSTM) [ 17 ] and gated recurrent unit (GRU) [ 18 – 20 ] models, and more recently Transformer-based models [ 21 ]. These deep learning algorithms are capable of predicting glucose levels 30 to 60 minutes in advance, allowing individuals sufficient time to implement proactive interventions to mitigate adverse glycemic events, which have demonstrated superior predictive accuracy, as measured by metrics including the root mean square error (RMSE) [ 15 ]. Among these approaches, a critical consideration is the trade-off between personalized models tailored to individuals and population models applicable across diverse users. Although most of the literature focuses on personalized models to capture individual glycemic patterns [ 17 , 19 , 20 , 22 ], these approaches face substantial barriers to clinical deployment, including time-intensive data collection processes and complex regulatory requirements. Population models present a favorable alternative for real-world applications; however, the substantial inter-subject variability poses notable challenges for model generalization. Furthermore, many studies have adopted single prediction horizon frameworks (i.e., sequence-to-one) [ 15 , 17 , 19 , 20 , 22 ], which limit their effectiveness in supporting decision-making processes [ 23 ]. To address these challenges simultaneously, recently proposed large language models (LLMs) offer promising new directions, particularly for integrating complex clinical knowledge [ 24 , 25 ]. First, their generalization ability enables a trained model to provide accurate predictions for previously unseen individuals without requiring further fine-tuning. Second, their sequence-to-sequence architecture allows the model to predict a sequence of future glucose levels based on historical glucose measurements and relevant contextual information. The potential of LLMs for time series applications has been demonstrated across numerous recent studies [ 26 – 30 ]. These approaches leverage learned token transitions and language-based embeddings to encode both instructions and temporal data, consistently achieving state-of-the-art performance on diverse prediction benchmarks. Despite this success, the application of LLMs to glucose management remains relatively unexplored. Only two studies have ventured into this area: one evaluated TimeLLM [ 26 ], an existing LLM-based forecasting model, for personalized glucose prediction [ 31 ], while another employed GPT-4 to analyze CGM profiles [ 32 ]. However, a key challenge in the implementation of LLM in healthcare is to address the security and privacy concerns associated with processing sensitive data on remote platforms based on the Internet [ 25 ]. This challenge has sparked a promising trend toward smaller and more efficient LLMs, which is already becoming a reality for local deployment on personal devices. Apple Intelligence launched on-device LLMs on the latest iPhones in July 2024 providing API access to app developers in June 2025. This was followed by Meta’s release of LlaMA 3.2 edge AI models in September 2024. This evolution is perfectly in line with the practices of the CGM system, where smartphone apps already play a central role in facilitating data visualization and clinical decision support [ 33 ], and have demonstrated the ability to effectively run deep learning algorithms for glucose prediction [ 19 ]. Therefore, smartphones represent a natural choice to implement LLM-based clinical decision support in glucose management applications. In this paper, we first evaluate the zero-shot learning performance of lightweight LLMs for glucose prediction. Subsequently, after identifying the current limitations in their performance, we propose GluLLM, a novel modelagnostic adapter framework that significantly enhances the glucose prediction capabilities of pretrained LLMs, as shown in Fig. 1 . This framework leverages demographic information and temporal dependencies through a mixed embedding strategy for personalized prediction, which consistently outperforms 15 state-of-the-art deep learning-based time series prediction methods, including models based on Transformer, multilayer perceptrons (MLPs), CNNs, and RNNs. Finally, we implement GluLLM on two smartphone platforms and analyze CPU and memory consumption. This enables a new paradigm in smartphone-based LLM-driven digital health systems with strong potential for adaptation across a broad spectrum of chronic disease management applications. Download figure Open in new tab Fig. 1: Integration of LLMs in chronic disease management through adaptive frameworks. a , Implementation of GluLLM on smartphone platforms that seamlessly integrate with digital health ecosystems for comprehensive glucose management. The system processes CGM data transmitted via Bluetooth, captures daily activities through event logs, and accesses electronic health records (EHRs) stored locally. All data streams are processed through specialized components of the GluLLM architecture, including EHR prompt, time dependent interpreter (TDI), and wearable data encoder, to deliver personalized insights and predictions. In the schematic, ice icons indicate preserved pretrained LLM elements that remain frozen, while fire icons denote adapter components requiring targeted training. b Extensibility of the framework through multiple specialized adapter (ADP), each equipped with dedicated encoder and decoder modules for handling diverse data types. This design enables management of multiple chronic conditions through a unified on-device LLM platform. Results Zero-short glucose prediction performance Our initial experiment evaluated the zero-shot learning performance of LLMs for glucose prediction, which refers to the ability of a pretrained model to perform a specialized task without any additional fine-tuning or training. We constructed the following prompts that included comprehensive demographic information (age, sex, BMI, and HbA1c) alongside six hours of historical CGM readings. These structured prompts instructed the model to generate glucose predictions in a standardized format, allowing us to assess the baseline performance of pretrained LLMs. “A [age]-year-old [sex] individual has an HbA1c of [HbA1c]% and a BMI of [BMI]. Based on their continuous glucose monitoring (CGM) data recorded every 5 minutes over the past 6 hours [historical_readings], predict their glucose levels for the next hour at 5-minute intervals.” To ensure practical smartphone deployment of digital clinical decision support systems, we selected compact open-source models from leading LLM families, including LLaMA (Meta) [ 34 ], Mistral (Mistral AI) [ 35 ], Gemma (Google) [ 36 ], Phi (Microsoft) [ 37 ], OpenELM (Apple) [ 38 ], and GPT (OpenAI) [ 39 ]. Table 1 presents detailed information on model versions, parameter sizes, and response characteristics. View this table: View inline View popup Download powerpoint Table 1: Comparison of LLM performance on zero-shot glucose level prediction Our evaluation revealed a significant performance gap: none of these LLMs demonstrated the capability to produce clinically viable glucose predictions. Rather than generating the requested numerical predictions, most models produced theoretical explanations of time series prediction methodologies (e.g., linear regression and RNNs) accompanied by Python implementations. These responses represent a substantial deviation from the required clinical functionality, requiring technical expertise beyond that possessed by most users and healthcare providers, and demonstrate a fundamental misalignment between the models’ generative tendencies and the specific requirements of prediction tasks. These findings highlight the critical limitations of applying general-purpose LLMs directly to specialized clinical tasks without appropriate adaptations. Enhancing glucose prediction in pretrained LLMs with GluLLM To address the performance limitations of pretrained LLMs in glucose prediction, we propose the GluLLM frame-work (see Methods), which enhances predictive accuracy through targeted adaptation. Our results demonstrate that when augmented with GluLLM, all evaluated LLMs successfully provided viable glucose predictions with a one-hour forecast horizon. We conducted comprehensive performance comparisons between GluLLM and a diverse array of existing deep learning time series forecasting approaches. This comparison included ten state-of-the-art methods that, to our knowledge, are being applied to glucose prediction for the first time in this study. The baseline methods encompass three architectural categories: Transformer-based models including Autoformer [ 40 ], Crossformer [ 41 ], FEDformer [ 42 ], Informer [ 43 ], iTransformer [ 44 ], non-stationary Transformer (NST) [ 45 ], PatchTST [ 46 ], and vanilla Transformer [ 47 ]; MLP architectures such as N-Beats [ 48 ], DLinear [ 49 ], and TiDE [ 50 ]; convolutional neural networks including TCNs [ 16 ] and TimesNet [ 51 ]; and RNN architectures including LSTM [ 17 ] and GRU [ 19 ]. This extensive evaluation enables robust assessment of GluLLM’s performance relative to current methodological standards in time series forecasting. To ensure a fair evaluation, all models underwent consistent training and assessment using the REPLACE-BG dataset [ 52 ]. We employed five-fold cross-validation for hyperparameter tuning and model training using a development set ( n = 180), followed by performance evaluation on separate hold-out testing sets ( n = 46). Dataset partitioning was conducted on an individual basis to prevent data leakage and mimic real-world scenarios with heterogeneous populations. To enhance the generalization capabilities of non-LLM models, we applied a model-agnostic metalearning framework for domain generalization [ 53 ], which has effectively mitigated the challenges of high inter-individual variability and improved glucose prediction performance as evidenced in our recent work [ 21 ]. Fig. 2 presents the global RMSE values comparing predicted sequences with actual CGM measurements over the next 60 minutes. The results demonstrate that LLaMA 3.2 models achieved superior performance, with the 1B version ranking at the top. Other LLMs in our evaluation exhibited performance comparable to the average level of deep learning baseline methods. Among the tested LLMs, OpenELM delivered the lowest performance, which may be attributed to its significantly smaller parameter count compared to other architectures. Download figure Open in new tab Fig. 2: Global RMSE for glucose time series across different model architectures with a 60-minute prediction horizon. The color coding represents distinct architectural categories: violet for LLM-based models, light red for Transformer-based models, light yellow for MLP-based models, blue for CNN-based models, and green for RNN-based architectures. Black asterisk markers indicate models previously explored in glucose prediction literature, while the gold star highlights the best-performing model in our evaluation. Following the global RMSE evaluation, we selected the top-performing model from each architecture family for detailed comparison. Table 2 presents the results of four regression metrics at 30- and 60-minute prediction horizons, following standard evaluation settings in glucose prediction tasks [ 15 ]. Specifically, we report RMSE, mean absolute error (MAE), mean absolute relative difference (MARD), and glucose-specific RMSE (gRMSE). Lower values across all metrics indicate better predictive performance. Among the evaluated LLMs, we selected the LLaMA 3.2 1B model as the core architecture of GluLLM for all subsequent experiments due to its strong performance and efficient resource usage. It is worth noting that GluLLM achieved the best performance at both prediction horizons across all regression metrics when compared with the top models from each deep learning family. Furthermore, when evaluated on the external Móstoles dataset [ 54 ], GluLLM consistently ranked first across all metrics and both time horizons, demonstrating strong generalizability and robustness to different populations and clinical contexts. View this table: View inline View popup Download powerpoint Table 2: Regression metrics for glucose level prediction performance (Mean ± STD) on hold-out REPLACE-BG testing set and external Móstoles dataset at 30-minute and 60-minute prediction horizons. View this table: View inline View popup Download powerpoint Table 3: Hypoglycemic event prediction performance on hold-out REPLACE-BG testing set and external Móstoles dataset. Further evaluation was conducted to assess whether the glucose predictions could effectively indicate adverse glycemic events, particularly hypoglycemia, which can lead to severe complications in a short period of time. In this analysis, we used classification metrics to determine whether the predicted glucose time series could accurately detect hypoglycemic events, defined as glucose levels below 70 mg/dL Model performance was evaluated using standard classification metrics: accuracy (ACC), precision (PREC), sensitivity (SEN), and specificity (SPEC). In addition, we included three balanced metrics commonly used in imbalanced classification tasks: area under the receiver operating characteristic curve (AUC), F1 score, and Matthews correlation coefficient (MCC)—to account for the relatively low prevalence of hypoglycemia (Supplementary Table 4). For these metrics, higher values uniformly indicate better predictive performance. We chose to report overall performance across all testing data rather than individual-level results because some participants experienced no hypoglycemic events during the monitoring period, making certain classification metrics incalculable for these individuals. View this table: View inline View popup Download powerpoint Table 4: Demographic characteristics (Mean ± SD) of the individuals in the clinical datasets GluLLM achieved the highest scores for AUC, F1, and MCC, indicating the best overall classification performance among all models. Notably, GluLLM also significantly improved sensitivity, detecting a greater proportion of hypoglycemic events on both datasets: an improvement of 11.2% on the REPLACE-BG dataset and 10.2% on the Móstoles dataset compared with the second-best models, Crossformer and GRU, respectively. These gains suggest that implementing GluLLM could potentially lead to a significant reduction in the occurrence of hypoglycemic events. Memory and CPU footprint of GluLLM on smartphone platforms Smartphone platforms are widely used in modern glucose management systems, typically paired with CGMs via Bluetooth. However, implementing LLMs on smartphones presents significant challenges due to the substantial size of these models and the quadratic computational complexity of Transformer blocks. Despite these constraints, on-device and offline LLMs are essential in healthcare applications, where data privacy and continuous availability are critical. To demonstrate feasibility, we implemented GluLLM on two smartphone devices: the iPhone 14 Pro Max (representing a mainstream high-end device) and the iPhone SE 2020 (representing a resource-constrained platform). Fig. 3 shows the peak CPU and memory usage during GluLLM inference on both devices. Peak values are reported because sudden computational spikes can exceed device capabilities and risk freezing or crashing the system. The iPhone 14 Pro Max consumed a maximum of 55.5% CPU and 453.9 MB of memory (7.4% of the device’s total 6GB memory capacity), while the iPhone SE 2020 consumed a maximum of 85.8% CPU and 512.2 MB of memory (16.7% of the device’s total 3GB memory capacity). When examining the computational demands of individual components, we found that the encoder and decoder modules required substantially fewer resources compared with the time dependent interpreter (TDI) module and LLM body, consuming a maximum of 24.7% CPU and 10.2 MB of memory on the iPhone 14 Pro Max, and 36.6% CPU and 19.7 MB of memory on the iPhone SE 2020. Download figure Open in new tab Fig. 3: Peak CPU and memory usage of GluLLM across smartphone platforms. Blue and orange bars represent the performance of the iPhone 14 Pro Max (14PM) and iPhone SE 2020 (SE), respectively. Error bars indicate the mean ± standard deviation across 10 independent runs. The average memory consumption for running GluLLM was 139.1 MB (2.5%) on the iPhone 14 Pro Max and 178.1 MB (5.8%) on the iPhone SE, both exhibiting low energy impact. All operations were completed within 20 and 35 seconds on the 14 Pro Max and SE, respectively. These results demonstrate that deploying GluLLM on smartphone platforms for real-time clinical decision support is feasible. Discussion In this work, we proposed GluLLM, an adapter framework that enhances pretrained LLMs for digital health management, with a specific focus on glucose prediction. To our knowledge, this is the first study to introduce an on-device LLM framework that integrates multimodal data, including wearables, daily event logs, and electronic health records (EHRs), to support comprehensive and personalized care. We conducted a thorough evaluation by benchmarking against 15 state-of-the-art deep learning approaches for time series forecasting. The proposed GluLLM with LlaMA 3.2 1B model achieved the best regression performance and superior results in hypoglycemia detection, significantly increasing sensitivity to these adverse events. Further implementation on smartphone platforms demonstrated the feasibility of real-world deployment to enhance clinical decision support systems with minimal resource requirements. The performance gains achieved by GluLLM can be attributed to several key design elements. First, incorporating contextual information significantly enhances LLM performance in time series forecasting, as shown in prior work such as TimeLLM [ 26 ]. In our case, demographic characteristics closely related to an individual’s metabolic status, such as age, sex, BMI, and HbA1c, substantially influence glucose dynamics. Integrating this information enables the personalization of a population-level model, allowing it to generalize effectively to unseen individuals without the need for fine-tuning or post-training, as demonstrated in our ablation study on the Móstoles dataset (Supplementary Fig. 6). Second, the autoregressive architecture and next-token prediction framework allow GluLLM to translate token transitions learned during pretraining into meaningful glucose trajectories over time, which leverages the core strengths of language models. Finally, the TDI module enables real-time adaptation by integrating textual inputs describing daily activities. This allows the model to account for transient glucose fluctuations, such as those resulting from insulin interventions, thereby improving predictive accuracy (Supplementary Fig. 6). With this setup, GluLLM enables largescale deployment of personalized models across diverse population cohorts, without the need for individual-specific training. It can be readily integrated into a variety of clinical settings to deliver robust and generalizable performance. Moreover, it is straightforward to incorporate data from multiple wearable sensors through additional encoder modules, such as combining wristband sensors with CGMs [ 19 ], to enable more comprehensive physiological monitoring and further enhance predictive accuracy. The proposed framework is also extensible to other health condition management tasks, as well as biomarker and vital signs monitoring. For example, it could be applied to wearable-enabled Parkinson’s disease care [ 55 ] by incorporating data from wearable accelerometers and EHRs to support holistic and personalized care. Although LLM-driven approaches may appear computationally intensive for traditional forecasting tasks, they are particularly well suited to biomedical time series, which are often accompanied by rich textual information that directly influences prediction. This synergy between structured signals, such as wearable data, and unstructured context, such as EHR notes, makes LLMs especially powerful. These capabilities are further enhanced by the highly efficient design of GluLLM. The training complexity is substantially lower than that of full large language model fine-tuning or even many parameter-efficient fine-tuning (PEFT) approaches [ 56 ], such as LoRA [ 57 ]. We chose to use neural network–based adapters rather than PEFT methods, because the primary goal of our task is to improve how the model interprets input data from wearable devices, rather than to enhance the generation of natural language responses. In our context, the quality of the input representation is more critical than producing fluent textual outputs. Future work will explore integrating LoRA into GluLLM to investigate whether further improvements can be obtained. In particular, if predicted values are to be used for clinical decision support, such as suggesting diagnoses or feasible treatment plans based on model predictions, LoRA will be employed to fine-tune response generation. Such text outputs could then be evaluated in collaboration with clinicians to assess their utility in real-world healthcare settings. Moreover, generating explanatory text alongside time series predictions may provide reasoning and insight into why the model produces certain forecasts [ 58 ], representing an important direction for our future investigation in model interpretability. Beyond training efficiency, we also demonstrated the feasibility of deploying GluLLM in resource-constrained smartphones. This is particularly important for preserving the privacy of sensitive health data, as it ensures that personal information remains on the user’s own device. On-device deployment is a particularly viable approach for LLMs in healthcare applications that require continuous, high-demand management tasks. In addition, clinical prediction and decision support are inherently time sensitive, and people cannot afford service interruptions, especially in scenarios where Internet connectivity is limited or unstable. This limitation is a common issue in cloud-based LLM deployments, further highlighting the importance of enabling robust, offline-capable inference on edge devices. As shown in Fig. 3 , LLM body and the TDI module are the primary contributors to CPU, memory, and power consumption during model execution. However, it is important to note that if the contextual information processed by TDI were instead fed directly into the LLM, the significantly longer input sequence would substantially increase computational demands. This is due to the quadratic complexity of Transformer layers with respect to input length. By separating daily contextual information into the TDI module, we not only preserve modeling flexibility but also improve both training and inference efficiency. Furthermore, in many cases, particularly when no insulin has been administered in the last hour, the embedding of TDI remains unchanged over days. In such scenarios, the embedding can be precomputed and stored on the device, eliminating the need for repeated inference and further reducing computational overhead. The total consumption of GluLLM resources on a smartphone is feasible, demonstrating its potential for long-term clinical decision support on devices. Moreover, the lightweight nature of the encoder and decoder modules, consuming only minimal system resources, makes it practical to deploy multiple neural network-based adapters for managing various health conditions in a unified digital health system ( Fig. 1 ). A further step toward advancing model deployment is to perform parameter quantization [ 59 ], which compresses the model and accelerates inference by reducing storage and computational demands. This approach would enable LLM-driven health management to be implemented on devices with more resources constraints and could also allow the use of models with larger parameter sizes for more powerful capability under limited hardware conditions. Methods Multimodal data preprocessing In this work, we used two primary clinical datasets with varied patient conditions. The first is REPLACE-BG [ 52 ], containing data from 226 participants with T1D at 14 US sites. Participants were equipped with Dexcom G4 CGM during a 26-week trial period. The second dataset was collected by University Hospital of Móstoles [ 54 ] in Madrid, comprising data from 207 participants with previous diagnoses of essential hypertension and explicit exclusion of prior diabetes diagnoses or antidiabetic medication use. These participants were monitored using Medtronic MiniMed iPro devices for periods ranging from 24 hours to 3 days, with a 33-month follow-up identifying 17 new cases of T2D. Both datasets are publicly available and provide comprehensive demographic information on participants, with glucose readings sampled at five-minute intervals. In particular, REPLACE-BG includes insulin bolus data, which is absent from the Móstoles dataset, since insulin treatment was not used for participants without diabetes. Supplementary Table 4 summarizes the demographics of these datasets. It is worth highlighting the significant differences between them, particularly in mean glucose levels and time in target range (TIR), underscoring the importance of model generalization capabilities for effective cross-dataset performance. To provide comprehensive prediction, we utilized all available information from this complex health management scenario. This includes CGM data as a time series, self-reported or device-logged daily events, and EHRs. CGM data was first scaled using standardization. Although both datasets maintain high quality with minimal missing CGM readings, we apply a mixed interpolation and extrapolation strategy, as used in our previous work [ 21 ]. Specifically, interpolation is applied when missing values occur within the middle of an input sequence, while extrapolation is used when gaps appear at the end, thereby avoiding data leakage. Demographic information was extracted and formatted for input to the EHR prompt module as: “Characteristics: - Age: {age}, - Sex: {sex}, - BMI: {bmi}, - HbA1c: {hba1c}, Predict next value based on historical glucose embedding:”. Timestamp data from CGM was formatted in date format and combined with insulin bolus information (when available) as input to the TDI module, for instance: “Historical data from 13:30 to 14:30. 5 units of bolus insulin delivered.” In the context of glucose prediction, we utilize historical CGM data within a defined temporal window to forecast future glycemic trajectories. Specifically, at any timestep t , we process a sequence of historical glucose readings G t = [ g t − L +1 , g t − L +2 , …, g t ] ∈ ℝ L , where L represents the fixed observation window. This historical data, combined with relevant clinical and contextual information, serves as input to predict the corresponding future glucose sequence Ĝ t = [ ĝ t +1 , ĝ t +2 , …, ĝ t + τ ] ∈ ℝ τ where τ denotes the prediction horizon. This mathematical formulation establishes the foundation for our sequence-to-sequence modeling approach. GluLLM for multimodal data integration We first utilized the tokenizer and embedding modules from the original LLM architecture to convert the EHR prompt into embedding representations, following standard natural language processing procedures. The parameters of the LLMs are frozen and no training is needed at this stage. We denote this embedding output as EE . It is important to note that demographic information is considered consistent for each individual in our experiments, allowing this embedding operation to be performed just once per individual, which improves computational efficiency. This prefix-prompt approach was motivated by recent advances in prompt engineering that have demonstrated significant improvements in LLM generalization performance, particularly for medical applications that may not have been encountered during model training [ 60 ]. This technique also represents a core module in TimeLLM [ 26 ], which has shown a substantial reduction in error for time series forecasting across various scenarios. Subsequently, we converted historical glucose readings G t into representations similar to language embeddings to enable compatibility with the LLM architecture. Various methods exist in the literature for this dimension alignment, ranging from simple linear layers [ 61 ] to more complex multi-head attention mechanisms [ 26 ]. Inspired by a recent efficient approach proposed in AutoTimes [ 27 ], which effectively captures rich chronological information, we divided historical CGM data into segments, each represented as a token. These tokens were then converted by the GluLLM encoder module to embeddings with dimensions that match the embedding vectors of the language tokens. We implemented fully connected neural networks to map this dimensional relationship. Although the length of the segments could be arbitrary, we used the prediction length τ as the segment length to naturally reformulate the problem into next-token prediction, which is a standard training objective for decoder-only LLMs [ 34 , 39 , 62 ]. This approach allowed us to leverage the powerful predictive capabilities and token transition inherent in these language models. Additionally, we assumed that the length of historical CGM data L was N multiples of τ to ensure consistent and uniform segmentation throughout the analysis. In this case, the input and output in glucose prediction can be rewritten as , where k n = g t +( n − N −1) τ +1 , g t +( n − N −1) τ +2 , g t +( n − N −1) τ + τ . Here, we set L = 72 and τ = 12, corresponding to six hours of historical CGM readings and a one-hour prediction horizon. The six-hour window captures the typical duration of insulin action, as the effects of a meal insulin bolus generally subside within this period. A one-hour prediction horizon is widely adopted in glucose prediction applications [ 15 ], which allows individuals sufficient opportunity to take proactive interventions in response to anticipated glycemic excursions. After converting each token k in the GluLLM encoder, we obtain the glucose embedding; where d denotes the dimensionality of the LLM’s embedding, which varies across the different models considered in this work. To fully leverage temporal information, we also divided the timestamp and insulin data into the same segments used for the CGM data, and input them into the TDI. As illustrated in Fig. 1 , TDI comprises a tokenizer, an embedding layer, and the LLM body from the original LLM. This design allows the model to process temporal information and associated daily activities, producing a corresponding language embedding. Specifically, we selected the last hidden state vector from the output embedding of each segment, as this position typically aggregates the most relevant contextual information from the input text and is commonly used in LLMs to predict the probability of the next token. By concatenating these vectors, we obtain . Aligning the temporal embedding vectors with the glucose embedding vectors in a one-to-one fashion, similar to positional encoding, and prepending the EHR prompt embedding at the beginning, we construct the final input embedding for the LLM. The output embedding is then obtained by processing this input through the LLM body: where the addition GE t + TE t is performed element-wise. In this regard, the next-hour of glucose levels can be obtained by projecting the last hidden hidden state using GluLLM decoder, which is also implemented as a fully connected neural network, and we have The flexibility of next-token prediction in this model also enables us to extend the prediction horizon beyond one hour. For example, to achieve a two-hour prediction horizon, we simply append the previously predicted glucose token to the input and perform an additional round of next-token prediction. Preliminary results, shown in Supplementary Fig. 5, demonstrate that this approach achieves over 90% clinically acceptable predictions at the extended two-hour horizon in both datasets. Model training and evaluation To train a population-level glucose prediction model, we partitioned the data on a per-individual basis to ensure subject-level separation. Given the substantial volume of data in the REPLACE-BG dataset (approximately 12 million CGM readings), we used it for model training, validation, and hold-out testing, while the Móstoles dataset served as an external validation cohort. Specifically, the REPLACE-BG dataset was first divided into a development set comprising 180 individuals and a hold-out testing set of 46 individuals. We then performed five-fold cross-validation on the development set to fine-tune the hyperparameters of GluLLM. In each fold, data from 135 individuals were used for training, while the remaining 45 individuals were used for validation. This process was repeated across folds to ensure robust hyperparameter selection (Supplementary Fig. 4). The performance of the model was evaluated primarily using the global RMSE over the prediction horizon τ . In the detailed comparison, RMSE is the primary metric reflecting overall prediction error at a specific time point. MAE is also widely used and is known to exhibit strong discriminative ability for glycemic variability [ 63 ]. MARD is a well-established accuracy metric in the CGM industry [ 64 ]. Finally, gRMSE is a customized RMSE that incorporates a penalty function inspired by the Clarke Error Grid [ 65 ], emphasizing clinical relevance by penalizing the prediction errors that could lead to harmful therapeutic interventions. The detailed definitions and equations for these metrics are provided in Supplementary Information. All deep learning models were implemented in PyTorch and accelerated using NVIDIA A10 Tensor Core GPUs. Hyperparameter optimization was conducted using Optuna to ensure efficient and systematic tuning. We performed a maximum of 10 optimization trials per model, exploring a search space that included learning rate, batch size, hidden layer dimensions, dropout rate, and optimizer parameters. After identifying optimal hyperparameters, we trained the model on the entire development set and evaluated its performance on the independent hold-out testing set of 46 individuals and the external validation cohort from the Móstoles dataset, which includes 207 individuals. Notably, hyperparameter tuning was performed for all baseline models to ensure fair comparisons. We employed the Adam optimizer and the mean square error loss function to optimize the model parameters. The trainable modules in GluLLM consist only of the encoder and decoder components, which comprise stacks of fully connected neural network layers. These components have a significantly smaller number of trainable parameters compared to full LLM fine-tuning or even PEFT approaches [ 56 ]. The computational complexity of this design is comparable to that of an extreme case of LoRA-based adapters [ 57 ], where a low-rank matrix with rank 1 is applied to a single Transformer layer. However, even with this efficient design, training GluLLM with larger models such as LLaMA 8B and Mistral 7B remains challenging on a GPU with 24 GB of RAM, even with a single batch of data, due to the significant computational demands of LLM inference. To address this, we distributed the Transformer layers of these two models across 8 NVIDIA A10 Tensor Core GPUs, the maximum number of GPUs available on a single node of our high-performance computing cluster, to enable model training. For smaller LLMs, inference can be performed on a single GPU; thus, we employed standard data parallelism by distributing batches across multiple GPUs to accelerate training. Smartphone platform implementation One of the most challenging aspects of deploying LLMs for on-device health management is determining the feasibility of implementation. To address this concern, we conducted a study implementing GluLLM on smartphone platforms. We selected the iOS platform specifically, as on-device LLMs will be available in their next generation of iPhones. For development, we utilized Swift 5 and Xcode 16.2 as the integrated development environment. It should be noted that we focused solely on evaluating memory and CPU usage of the proposed LLM-driven health application rather than developing a complete user interface. We implemented LLaMA 3.2 1B model using the GGUF format from Huggingface [ 66 ] with Full F16 weights and without quantization. The LLM inference was handled using the llama.cpp library [ 67 ]. For integration, the GluLLM encoder and decoder components were converted to Core ML format to facilitate efficient interaction with the LLaMA model. Real-time CPU and memory usage were assessed using Xcode’s profiling tools. The evaluation was conducted on an iPhone 14 Pro Max, equipped with an Apple A16 Bionic chip, a 6-core CPU, and 6 GB of RAM, as well as on an iPhone SE (2020), featuring an A13 Bionic chip, a 6-core CPU, and 3 GB of RAM. Data availability Datasets used in this study are publicly available. The REPLACE-BG dataset was obtained from Aleppo et al . (2017), “REPLACE-BG RCT Protocol 10-27-15 v2.0,” available at https://public.jaeb.org/dataset/546 . The analyses, interpretations, and conclusions presented herein are solely the responsibility of the authors and have not been reviewed or endorsed by Aleppo et al . The Móstoles dataset was obtained from Colás et al . (2019), available at https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0225817 . Code availability This study was implemented using the free and open-source programming language Python (version 3.11.8) along with the PyTorch (version 2.1.2) deep learning framework. The source code for both the deep learning model and the smartphone app is available from the corresponding authors upon reasonable request. Competing Interests J.M.M.H. is an employee of Novo Nordisk Research Centre Oxford and also a shareholder of Novo Nordisk. Supplementary Participant demographic profiles in the two clinical datasets Table 4 presents the demographic characteristics of participants in the REPLACE-BG and Móstoles datasets. Cohort stratification and cross-validation framework Fig. 4 illustrates the cohort stratification used in the five-fold cross-validation framework, as well as the procedures for model training and hold-out testing. Download figure Open in new tab Fig. 4: Data partitioning strategy. The REPLACE-BG dataset was first divided into a development set, which was used for training and validation in five-fold cross-validation, and a hold-out testing set, with splitting performed on an individual basis to prevent data leakage. The Móstoles dataset served as an independent external validation cohort to assess generalization performance. Definition of regression metrics Three widely used regression metrics for glucoselevel prediction were empolyed: RMSE, MAE, and MARD, defined as: where L T denotes the total number of testing samples, G t is the ground truth BG measurement at time t , and Ĝ t is the corresponding predicted value. To assess clinical relevance, we further used gRMSE [ 65 ] as follows: where P is a penalty function inspired by the Clarke Error Grid (CEG) [ 68 ], designed to penalize overestimation in hypoglycemia and underestimation in hyperglycemia. Definitions of the sigmoid-like functions , σ , and the corresponding parameters [ α L , β L , γ L , T L , α H , β H , γ H , T H ] can be found in the literature [ 65 ]. Clinically-relevant analysis at extended two-hour prediction horizon The percentage of predictions falling within zones A+B, which indicate clinically acceptable accuracy, was very high across both datasets. For the REPLACE-BG cohort, GluLLM achieved 92.26% accuracy within these zones, while for the Móstoles dataset, it reached an exceptional 99.09% accuracy as shown in Fig. 5 . These results demonstrate the strong clinical utility of our predictions even at extended forecast horizons. We did not provide a CEG plot for the REPLACE-BG dataset due to the excessive density of data points, which would render the visualization difficult to interpret. Download figure Open in new tab Fig. 5: Clark error grid (CEG) analysis comparing predicted glucose values against actual measurements at two-hour prediction horizon. This clinically-relevant visualization categorizes prediction errors into zones reflecting varying degrees of clinical significance, with zones A and B representing clinically acceptable predictions and zones C-E indicating potentially dangerous errors that could lead to inappropriate treatment decisions. Ablation study We conducted an ablation study to investigate the contribution of each central module in GluLLM, as shown in Fig. 6 . The results show that each module effectively reduces the global RMSE across both datasets. Notably, the improvement contributed by the TDI module is smaller in the Móstoles dataset compared to the REPLACE-BG dataset. This is primarily because the TDI input in REPLACE-BG includes insulin information, reflecting the intensive management for individuals with T1D, whereas such information is not available in the Móstoles dataset. Download figure Open in new tab Fig. 6: Ablation study evaluating the contributions of the EHR prompt and TDI modules to glucose prediction performance. Global RMSE over a 60-minute prediction horizon is used as the evaluation metric. Each dot represents the RMSE for an individual participant in the respective dataset. In the box plots, the lower and upper hinges correspond to the first (Q1) and third (Q3) quartiles, respectively; the central line indicates the median; and the whiskers extend to 1.5 times the interquartile range (IQR). Circles denote statistical outliers. Acknowledgements Taiyu Zhu is fully funded by Novo Nordisk and is a Novo Nordisk Postdoctoral Fellowship run in partnership with the University of Oxford. References [1]. ↵ Zhou , B. et al. Worldwide trends in diabetes since 1980: a pooled analysis of 751 population-based studies with 4· 4 million participants . The Lancet 387 , 1513 – 1530 ( 2016 ). OpenUrl CrossRef [2]. ↵ Sun , H. et al. Idf diabetes atlas: Global, regional and country-level diabetes prevalence estimates for 2021 and projections for 2045 . Diabetes Research and Clinical Practice 183 , 109119 ( 2022 ). OpenUrl CrossRef PubMed [3]. ↵ Gregg , E. W. , Sattar , N. & Ali , M. K. The changing face of diabetes complications . The Lancet Diabetes & Endocrinology 4 , 537 – 547 ( 2016 ). OpenUrl PubMed [4]. ↵ Chan , J. C. et al. The lancet commission on diabetes: using data to transform diabetes care and patient lives . The Lancet 396 , 2019 – 2082 ( 2020 ). OpenUrl [5]. ↵ Pickup , J. C. , Freeman , S. C. & Sutton , A. J. Glycaemic control in type 1 diabetes during real time continuous glucose monitoring compared with self monitoring of blood glucose: meta-analysis of randomised controlled trials using individual patient data . BMJ 343 ( 2011 ). [6]. Heinemann , L. et al. Real-time continuous glucose monitoring in adults with type 1 diabetes and impaired hypoglycaemia awareness or severe hypoglycaemia treated with multiple daily insulin injections (HypoDE): a multicentre, randomised controlled trial . The Lancet 391 , 1367 – 1377 ( 2018 ). OpenUrl [7]. Lind , M. et al. Continuous glucose monitoring vs conventional therapy for glycemic control in adults with type 1 diabetes treated with multiple daily insulin injections: the GOLD randomized clinical trial . JAMA 317 , 379 – 387 ( 2017 ). OpenUrl CrossRef PubMed [8]. ↵ Martens , T. et al. Effect of continuous glucose monitoring on glycemic control in patients with type 2 diabetes treated with basal insulin: a randomized clinical trial . JAMA 325 , 2262 – 2272 ( 2021 ). OpenUrl CrossRef PubMed [9]. ↵ Sinha , M. et al. A comparison of time delay in three continuous glucose monitors for adolescents and adults . Journal of Diabetes Science and Technology 11 , 1132 – 1137 ( 2017 ). OpenUrl [10]. ↵ Mathieu , C. , Gillard , P. & Benhalima , K. Insulin analogues in type 1 diabetes mellitus: getting better all the time . Nature Reviews Endocrinology 13 , 385 – 399 ( 2017 ). OpenUrl PubMed [11]. ↵ Frier , B. M. Hypoglycaemia in diabetes mellitus: epidemiology and clinical implications . Nature Reviews Endocrinology 10 , 711 – 722 ( 2014 ). OpenUrl PubMed [12]. ↵ Monnier , L. & Colette , C. Glycemic variability: should we and can we prevent it? Diabetes Care 31 , S150 – S154 ( 2008 ). OpenUrl Abstract / FREE Full Text [13]. ↵ Ceriello , A. , Monnier , L. & Owens , D. Glycaemic variability in diabetes: clinical and therapeutic implications . The Lancet Diabetes & Endocrinologyy 7 , 221 – 230 ( 2019 ). OpenUrl [14]. ↵ Hásková , A. et al. Real-time CGM is superior to flash glucose monitoring for glucose control in type 1 diabetes: the corrida randomized controlled trial . Diabetes Care 43 , 2744 – 2750 ( 2020 ). OpenUrl Abstract / FREE Full Text [15]. ↵ Zhu , T. , Li , K. , Herrero , P. & Georgiou , P. Deep learning for diabetes: A systematic review . IEEE Journal of Biomedical and Health Informatics 25 , 2744 – 2757 ( 2021 ). OpenUrl [16]. ↵ Li , K. , Liu , C. , Zhu , T. , Herrero , P. & Georgiou , P. GluNet: A deep learning framework for accurate glucose forecasting . IEEE Journal of Biomedical and Health Informatics 24 , 414 – 423 ( 2020 ). OpenUrl [17]. ↵ Shuvo , M. M. H. & Islam , S. K. Deep multitask learning by stacked long short-term memory for predicting personalized blood glucose concentration . IEEE Journal of Biomedical and Health Informatics 27 , 1612 – 1623 ( 2023 ). OpenUrl [18]. ↵ Zhu , T. , Li , K. , Herrero , P. & Georgiou , P. Personalized blood glucose prediction for type 1 diabetes using evidential deep learning and meta-learning . IEEE Transactions on Biomedical Engineering 70 , 193 – 204 ( 2023 ). OpenUrl PubMed [19]. ↵ Zhu , T. et al. Enhancing self-management in type 1 diabetes with wearables and deep learning . npj Digital Medicine 5 , 78 ( 2022 ). OpenUrl PubMed [20]. ↵ Deng , Y. et al. Deep transfer learning and data augmentation improve glucose levels prediction in type 2 diabetes patients . npj Digital Medicine 4 , 109 ( 2021 ). OpenUrl PubMed [21]. ↵ Zhu , T. et al. Multi-horizon glucose prediction across populations with deep domain generalization . IEEE Journal of Biomedical and Health Informatics ( 2024 ). [22]. ↵ Zhu , T. , Li , K. , Chen , J. , Herrero , P. & Georgiou , P. Dilated recurrent neural networks for glucose forecasting in type 1 diabetes . Journal of Healthcare Informatics Research 4 , 308 – 324 ( 2020 ). OpenUrl PubMed [23]. ↵ Fan , C. et al. Multi-horizon time series forecasting with temporal attention learning . In Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining , 2527 – 2535 ( 2019 ). [24]. ↵ Singhal , K. et al. Large language models encode clinical knowledge . Nature 620 , 172 – 180 ( 2023 ). OpenUrl CrossRef PubMed [25]. ↵ Thirunavukarasu , A. J. et al. Large language models in medicine . Nature Medicine 29 , 1930 – 1940 ( 2023 ). OpenUrl CrossRef PubMed [26]. ↵ Jin , M. et al. Time-llm: Time series forecasting by reprogramming large language models . In The Twelfth International Conference on Learning Representations ( 2024 ). [27]. ↵ Liu , Y. , Qin , G. , Huang , X. , Wang , J. & Long , M. Autotimes: Autoregressive time series forecasters via large language models . Advances in Neural Information Processing Systems 37 , 122154 – 122184 ( 2025 ). OpenUrl [28]. Gruver , N. , Finzi , M. , Qiu , S. & Wilson , A. G. Large language models are zero-shot time series forecasters . Advances in Neural Information Processing Systems 36 , 19622 – 19635 ( 2023 ). OpenUrl [29]. Sun , C. , Li , H. , Li , Y. & Hong , S. Test: Text prototype aligned embedding to activate llm’s ability for time series . In The Twelfth International Conference on Learning Representations ( 2024 ). [30]. ↵ Xue , H. & Salim , F. D. Promptcast: A new prompt-based learning paradigm for time series forecasting . IEEE Transactions on Knowledge and Data Engineering 36 , 6851 – 6864 ( 2023 ). OpenUrl [31]. ↵ Lara-Abelenda , F. J. et al. Personalized glucose forecasting for people with type 1 diabetes using large language models . Computer Methods and Programs in Biomedicine 265 , 108737 ( 2025 ). OpenUrl PubMed [32]. ↵ Healey , E. , Tan , A. L. M. , Flint , K. L. , Ruiz , J. L. & Kohane , I. A case study on using a large language model to analyze continuous glucose monitoring data . Scientific Reports 15 , 1143 ( 2025 ). OpenUrl PubMed [33]. ↵ Galindo , R. J. & Aleppo , G. Continuous glucose monitoring: the achievement of 100 years of innovation in diabetes technology . Diabetes Research and Clinical Practice 170 , 108502 ( 2020 ). OpenUrl PubMed [34]. ↵ Touvron , H. et al. Llama: Open and efficient foundation language models . CoRR abs/2302.13971 ( 2023 ). [35]. ↵ Jiang , A. Q. et al. Mistral 7b . CoRR abs/2310.06825 ( 2023 ). [36]. ↵ Rivière , M. et al. Gemma 2: Improving open language models at a practical size . CoRR abs/2408.00118 ( 2024 ). [37]. ↵ Abdin , M. et al. Phi-3 technical report: A highly capable language model locally on your phone . CoRR abs/2404.14219 ( 2024 ). [38]. ↵ Mehta , S. et al. Openelm: An efficient language model family with open training and inference framework . In Workshop on Efficient Systems for Foundation Models II@ ICML2024 ( 2024 ). [39]. ↵ Radford , A. et al. Language models are unsupervised multitask learners . OpenAI blog 1 , 9 ( 2019 ). OpenUrl [40]. ↵ Wu , H. , Xu , J. , Wang , J. & Long , M. Autoformer: Decomposition transformers with auto-correlation for long-term series forecasting . Advances in Neural Information Processing Systems 34 , 22419 – 22430 ( 2021 ). OpenUrl [41]. ↵ Zhang , Y. & Yan , J. Crossformer: Transformer utilizing cross-dimension dependency for multivariate time series forecasting . In The Eleventh International Conference on Learning Representations ( 2023 ). [42]. ↵ Zhou , T. et al. Fedformer: Frequency enhanced decomposed transformer for long-term series forecasting . In International Conference on Machine Learning , 27268 – 27286 ( PMLR , 2022 ). [43]. ↵ Zhou , H. et al. Informer: Beyond efficient transformer for long sequence time-series forecasting . Proceedings of the AAAI Conference on Artificial Intelligence 35 , 11106 – 11115 ( 2021 ). OpenUrl [44]. ↵ Liu , Y. et al. itransformer: Inverted transformers are effective for time series forecasting . In The Twelfth International Conference on Learning Representations ( 2024 ). [45]. ↵ Liu , Y. , Wu , H. , Wang , J. & Long , M. Non-stationary transformers: Exploring the stationarity in time series forecasting . Advances in Neural Information Processing Systems 35 , 9881 – 9893 ( 2022 ). OpenUrl [46]. ↵ Nie , Y. , Nguyen , N. H. , Sinthong , P. & Kalagnanam , J. A time series is worth 64 words: Long-term forecasting with transformers . In The Eleventh International Conference on Learning Representations ( 2023 ). [47]. ↵ Vaswani , A. et al. Attention is all you need . Advances in Neural Information Processing Systems 30 ( 2017 ). [48]. ↵ Oreshkin , B. N. , Carpov , D. , Chapados , N. & Bengio , Y. N-beats: Neural basis expansion analysis for interpretable time series forecasting . In International Conference on Learning Representations ( 2020 ). [49]. ↵ Zeng , A. , Chen , M. , Zhang , L. & Xu , Q. Are transformers effective for time series forecasting? Proceedings of the AAAI Conference on Artificial Intelligence 37 , 11121 – 11128 ( 2023 ). OpenUrl [50]. ↵ Das , A. et al. Long-term forecasting with tide: Time-series dense encoder . Transactions on Machine Learning Research ( 2023 ). [51]. ↵ Wu , H. et al. Timesnet: Temporal 2d-variation modeling for general time series analysis . In The Eleventh International Conference on Learning Representations ( 2023 ). [52]. ↵ Aleppo , G. et al. REPLACE-BG: a randomized trial comparing continuous glucose monitoring with and without routine blood glucose monitoring in adults with well-controlled type 1 diabetes . Diabetes Care 40 , 538 – 545 ( 2017 ). OpenUrl Abstract / FREE Full Text [53]. ↵ Li , D. , Yang , Y. Song , Y.-Z. & Hospedales , T. Learning to generalize: Meta-learning for domain generalization . Proceedings of the AAAI Conference on Artificial Intelligence 32 ( 2018 ). [54]. ↵ Colás , A. , Vigil , L. , Vargas , B. , Cuesta-Frau , D. & Varela , M. Detrended fluctuation analysis in the prediction of type 2 diabetes mellitus in patients at risk: Model optimization and comparison with other metrics . PloS One 14 , e0225817 ( 2019 ). OpenUrl PubMed [55]. ↵ Tam , W. , Alajlani , M. & Abd-Alrazaq , A. An exploration of wearable device features used in uk hospital parkinson disease care: Scoping review . Journal of Medical Internet Research 25 , e42950 ( 2023 ). OpenUrl PubMed [56]. ↵ Han , Z. , Gao , C. , Liu , J. , Zhang , J. & Zhang , S. Q. Parameter-efficient fine-tuning for large models: A comprehensive survey . Transactions on Machine Learning Research ( 2024 ). URL https://openreview.net/forum?id=lIsCS8b6zj . [57]. ↵ Hu , E. J. et al. LoRA: Low-rank adaptation of large language models . In The tenth International Conference on Learning Representations ( 2022 ). [58]. ↵ Chow , W. , Gardiner , L. , Hallgrímsson , H. T. , Xu , M. A. & Ren , S. Y. Towards time-series reasoning with llms . In NeurIPS 2024 Workshop on Time Series in the Age of Large Models . ( 2024 ). [59]. ↵ Lin , J. et al. Awq: Activation-aware weight quantization for on-device llm compression and acceleration . Proceedings of Machine Learning and Systems 6 , 87 – 100 ( 2024 ). OpenUrl [60]. ↵ Zaghir , J. et al. Prompt engineering paradigms for medical applications: Scoping review . Journal of Medical Internet Research 26 , e60501 ( 2024 ). OpenUrl CrossRef PubMed [61]. ↵ Zhou , T. , Niu , P. , Sun , L. , Jin , R. et al. One fits all: Power general time series analysis by pretrained LM . Advances in Neural Information Processing Systems 36 , 43322 – 43355 ( 2023 ). OpenUrl [62]. ↵ Brown , T. et al. Language models are few-shot learners . Advances in Neural Information Processing Systems 33 , 1877 – 1901 ( 2020 ). OpenUrl [63]. ↵ Moscardó , V. et al. Assessment of glucose control metrics by discriminant ratio . Diabetes Technology & Therapeutics 22 , 719 – 726 ( 2020 ). OpenUrl PubMed [64]. ↵ Battelino , T. et al. Continuous glucose monitoring and metrics for clinical trials: an international consensus statement . The Lancet Diabetes & Endocrinology 11 , 42 – 57 ( 2023 ). OpenUrl PubMed [65]. ↵ Del Favero , S. , Facchinetti , A. & Cobelli , C. A glucose-specific metric to assess predictors and identify models . IEEE Transactions on Biomedical Engineering 59 , 1281 – 1290 ( 2012 ). OpenUrl PubMed [66]. ↵ Kealty , C. Llama-3.2-1b-instruct-gguf . https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF ( 2024 ). Accessed: 2025-04-09 . [67]. ↵ Gerganov , G. llama.cpp: Llm inference in c/c++ . https://github.com/ggml-org/llama.cpp ( 2023 ). xAccessed: 2025-04-09 . [68]. ↵ Clarke , W. L. , Cox , D. , Gonder-Frederick , L. A. , Carter , W. & Pohl , S. L. Evaluating clinical accuracy of systems for self-monitoring of blood glucose . Diabetes Care 10 , 622 – 628 ( 1987 ). OpenUrl Abstract / FREE Full Text View the discussion thread. Back to top Previous Next Posted July 14, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Empowering digital health management with on-device large language models for glucose prediction Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Empowering digital health management with on-device large language models for glucose prediction Taiyu Zhu , Joanna Howson , Alejo Nevado-Holgado medRxiv 2025.07.12.25331188; doi: https://doi.org/10.1101/2025.07.12.25331188 Share This Article: Copy Citation Tools Empowering digital health management with on-device large language models for glucose prediction Taiyu Zhu , Joanna Howson , Alejo Nevado-Holgado medRxiv 2025.07.12.25331188; doi: https://doi.org/10.1101/2025.07.12.25331188 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4425) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15221) Forensic Medicine (30) Gastroenterology (1123) Genetic and Genomic Medicine (6588) Geriatric Medicine (667) Health Economics (997) Health Informatics (4524) Health Policy (1368) Health Systems and Quality Improvement (1612) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15910) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (145) Nephrology (667) Neurology (6588) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3331) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5440) Public and Global Health (9220) Radiology and Imaging (2195) Rehabilitation Medicine and Physical Therapy (1369) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (710) Sports Medicine (529) Surgery (710) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ffd87d12cca58d3',t:'MTc3OTQ3MDgzNQ=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();
Text is read by the "Ask this paper" AI Q&A widget below.
Extraction quality varies by source — PMC NXML preserves structure
cleanly, OA-HTML may include some navigation residue, and OA-PDF can
have broken hyphenation. The publisher copy
(via DOI)
is the canonical version.