Assessing Supervised Natural Language Processing (NLP) Classification of Violent Death Narratives: Development and Assessment of a Compact Large Language Model (LLM) Approach

doi:10.1101/2025.01.16.25320680

Assessing Supervised Natural Language Processing (NLP) Classification of Violent Death Narratives: Development and Assessment of a Compact Large Language Model (LLM) Approach

2025 · doi:10.1101/2025.01.16.25320680

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 41,819 characters · extracted from preprint-html · click to expand

Assessing Supervised Natural Language Processing (NLP) Classification of Violent Death Narratives: Development and Assessment of a Compact Large Language Model (LLM) Approach | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Assessing Supervised Natural Language Processing (NLP) Classification of Violent Death Narratives: Development and Assessment of a Compact Large Language Model (LLM) Approach Susan T. Parker doi: https://doi.org/10.1101/2025.01.16.25320680 Susan T. 
Parker 1 Research Assistant Professor Northwestern University Feinberg School of Medicine Find this author on Google Scholar Find this author on PubMed Search for this author on this site For correspondence: susan.parker{at}northwestern.edu Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Objective The recent availability of law enforcement and coroner/medical examiner reports for nearly every violent death in the US expands the potential for natural language processing (NLP) research into violence. The objective of this work is to assess applications of supervised NLP to unstructured narrative data in the National Violent Death Reporting System (NVDRS). Materials and Methods This analysis applied distilBERT, a compact LLM, to unstructured narrative data to simulate the impacts of pre-processing, volume and composition of training data on model performance, evaluated by F1-scores, precision, recall and the false negative rate. Model performance was evaluated for bias by race, ethnicity, and sex by comparing F1-scores across subgroups. Results A minimum training set of 1,500 cases was necessary to achieve an F1-score of 0.6 and a false negative rate of .01-.05 with a compact LLM. Replacement of domain-specific jargon improved model performance while oversampling positive class cases to address class imbalance did not substantially improve F1 scores. Between racial and ethnic groups, F1-score disparities ranged from 0.2 to 0.25, and between male and female victims differences ranged from 0.12 to 0.2. Discussion Findings demonstrate that compact LLMs with sufficient training data can be applied to supervised NLP tasks to events with class imbalance in NVDRS unstructured police and coroner/medical examiner reports. Conclusion Simulations of supervised text classification across the model-fitting process of pre-processing and training a compact LLM informed NLP applications to unstructured death narrative data. 
INTRODUCTION Violent injuries are among the leading causes of death in the United States for individuals under the age of 44, and are leading causes for young people aged 10-34. 1 The most comprehensive and detailed source of data on violent deaths in the United States is the National Violent Death Reporting System (NVDRS), aggregating information from death certificates, coroner/medical examiner reports, and law enforcement reports to characterize violent deaths. 2 Researchers have used structured data from NVDRS extensively to characterize the epidemiology of violent deaths including homicides, 3 – 6 suicides, 7 – 10 and those that result from legal intervention (police shootings). 11 While NVDRS has been widely used for its structured data, which captures information such as victim characteristics, weapons, circumstances, and suspect information, 12 far less attention has been given to the vast amounts of unstructured text data embedded within the narrative reports. Narratives provide rich details about the incident not necessarily captured in structured variables, such as nuanced descriptions of precipitating events and other contextual factors that are difficult to quantify. Despite the rich information these narratives provide, use of NVDRS narratives in research has been limited. Studies using narratives have, with few exceptions, 13 , 14 mostly relied on labor-intensive manual review and qualitative coding methods to analyze narrative content. 15 – 19 Machine learning techniques designed to analyze unstructured text, known as natural language processing (NLP), have the potential to enable researchers to use NVDRS narrative data more efficiently, and thus to take on research questions that might otherwise be resource intensive. Applications of NLP to a related text narrative type, clinical notes from medical providers, have identified patient self-harm 20 – 25 and violence-related 26 – 29 outcomes. 
Developing applications of NLP to NVDRS is particularly important because the volume of NVDRS data will substantially increase over time. NVDRS has gathered data on over 500,000 deaths since 2003 and will grow by approximately 100,000 records annually moving forward as additional states and counties participate. Although large language models (LLMs) have generally performed better than other NLP approaches to narrative data in medical informatics domains, few applications of LLMs to NVDRS exist. 13 , 14 In part, researchers and practitioners may face particular challenges applying LLMs to NVDRS. First, many outcomes of interest are likely to be infrequent or rare events that can present classification challenges due to sparse information about the outcome. 30 – 34 Second, NVDRS narratives are composed of police and coroner reports which contain domain-specific language, or jargon, such as use of International Classification of Diseases (ICD) codes. 35 – 40 Third, NVDRS data restrictions on sensitive data do not permit narratives to be stored in the cloud, thus limiting access to computing resources that are often used to train or fine-tune LLMs. Fourth, researchers have documented racial disparities in narratives alongside gendered text differences in NVDRS. 41 – 44 Narratives involving victims from marginalized populations tend to be significantly shorter in length and are more likely to be missing altogether. These differences in data quality may result in models that generate predictions with similar patterns of subgroup bias. To address these challenges, this paper conducts simulations of supervised text classification that span the machine learning pipeline, from data preprocessing and model training to the evaluation of predictions for potential racial or gender bias.
Text classification outcomes with class imbalance were selected, as this setting is likely of most use to NVDRS applications, and models were fit using a compact LLM to reflect settings where computing resources are limited. By conducting simulations, this analysis aims to inform future applications of supervised classification using LLMs to NVDRS by establishing concrete benchmarks for understanding training data quantity, pre-processing needs, and to what extent NLP results in predictions reflecting existing racial and/or gender bias in narratives. METHODS Data This analysis used violent death records from NVDRS for 2015-2020. The National Violent Death Reporting System gathers information about violent deaths including homicides, suicides, and deaths caused by law enforcement. NVDRS combines data from death certificates, coroner/medical examiner reports and law enforcement reports, providing context about violent deaths including information about mental health conditions, toxicology results, and other circumstances in addition to detail about victim characteristics. Trained abstractors code information about violent deaths into the over 600 variables that comprise the NVDRS surveillance system. 12 To obtain labeled outcomes for use as target outcomes in simulations, this analysis constructed measures from existing coded NVDRS variables that abstractors label. Because a substantial proportion of coded NVDRS fields group together case outcomes that are negative with those that are not known, this analysis instead relied on multinomial fields or combined separate NVDRS coded variables to obtain target outcomes for simulations. For instance, for case outcomes such as a mental health crisis or drug involvement, outcomes are coded as “Yes” or as “No, Not Available, Unknown,” which would not constitute a labeled outcome. These constructed outcomes include four binary outcomes likely to be recorded accurately when known.
The first outcome is whether or not a homicide is a legal intervention homicide, meaning the shooter was a law enforcement officer. Literature suggests that these homicides are well-recorded in NVDRS and less subject to noisy labeling or measurement error. 11 The second outcome is whether or not a homicide is classified as a drive-by shooting. The third outcome is whether a homicide occurred at home or not, and the fourth outcome is whether or not additional victims were non-fatally shot in the course of a homicide event. The sample was constrained to cases where the weapon type is listed as a firearm and the abstractor-assigned manner of death is homicide. Taken together, these constructed outcomes represent a range of language complexity and frequency and are less subject to label noise. Statistical Analysis This analysis compared model performance across four configurations of training data and text composition using a compact large language model (LLM). The configurations examined included pre-processing of text data as well as the amount and composition of the training data. Specifically, the analysis first varied the amount of training data that the model was fitted on to inform how much randomly sampled training data must be annotated to train an LLM to predict NVDRS outcomes. Second, because positive class cases were often infrequent, the analysis simulated oversampling of positive class cases in training data. Specifically, oversampling included a larger proportion of additional positive class cases, holding the negative class cases constant, to inform what composition of training data was most effective to include as training data. This analysis additionally simulated different preprocessing techniques for unstructured text data. NVDRS text may be domain-specific as it comprises police and coroner reports which use both jargon and abbreviations. To simulate the impacts of clarifying common abbreviations, this analysis replaced NVDRS abbreviations with unabbreviated text.
For example, often when NVDRS abstractors referred to victims and suspects in the report narratives, the abbreviations “v” for victim and “s” for suspect appeared rather than the full word. Abbreviations referring to victims, suspects, police, and gunshot wounds were replaced (see Appendix Table 1 ). View this table: View inline View popup Table 1: Sample Descriptive Statistics, Characteristics by Outcome Finally, the analysis simulated omitting coroner report text from the training data. Coroner reports may contain extraneous text such as toxicology reports that may be noisy in the context of prediction focused on criminal justice outcomes. Further, compact LLMs have limited token lengths which constrain the number of words in an input narrative and the combination of coroner and homicide reports can exceed the token length in some LLM applications. Because our outcomes are law enforcement focused, the analysis simulated omission of potentially extraneous narrative information. The analysis began by pre-processing the coroner and police narrative by removing special characters including numbers, punctuation and capitalization as is standard. Police and coroner report narratives were combined into a single field in order to use information available in both narratives (with the exception of the law enforcement narrative-only simulation). Next, the analysis turned to creating simulated data. First, a test set on which the model outputs were to be evaluated was randomly selected. The test set consisted of a random sample of 30 percent of each outcome’s records, which was then held out from any selection into the training data. To vary the amounts of training data, the analysis used different training data record counts, each with a different amount of training data. These splits ranged from a minimum of 100 cases, increasing in increments to 200, 500, 1,000, 1,500, and up to 2,000 cases. 
Each split was randomly sampled from the full dataset specified for each outcome, so that each training split maintained a proportion of positive and negative cases that approximates the true proportion. The prior sample was included in the next iteration to isolate the impact of adding additional training data, not adding different training data. For instance, to obtain 500 cases, first, the prior 200 cases were preserved and an additional 300 were sampled to comprise 500 cases. To simulate the impacts of language replacement and law enforcement-only text, the analysis followed the procedure outlined above to randomly select training data in the same 100, 200, 500, 1,000, 1,500, and 2,000 increments. In the second configuration of training data, the composition of positive class cases was altered from the true proportion in the training data. Instead of randomly sampling cases, the proportion of positive class cases was increased in the training data by adding additional positive class cases to the negative class cases. The positive class cases were incrementally increased until they comprised 10, 20, 30, 40, and up to 50 percent of the training data, starting from a baseline of 1,000 cases, as lower amounts of training data were not performant in this application. For instance, to obtain training data composed of 10 percent positive class cases for legal intervention homicide, the process started with randomly sampled training data with 1,000 records, of which 54 were legal intervention homicides and 940 were not. To the 940 negative class cases, 59 additional positive class cases were added so that the total number of positive class cases was 113 (54+59) and the total was 1,059 cases, of which approximately 10 percent (113 / 1,059) were legal intervention homicides. For each of the configurations described above, distilBERT, an LLM with fewer parameters but comparable accuracy to large-scale LLMs, was used.
45 Compact LLMs in this context were selected to better allow for simulated iteration with fewer computational needs and because data protections do not permit cloud storage and computing applications. The distilBERT models were fine-tuned on training data to select model parameters. Parameters were selected in initial fine-tuning using two outcomes (legal intervention and drive-by). Because model parameters in each fine-tuned model were identical, these parameters were applied to each training data configuration (see Appendix Table 2 ). Because the target outcomes are imbalanced, a weighted trainer was added to account for class imbalance. View this table: View inline View popup Table 2: Narrative Descriptive Statistics, Characteristics by Outcome Classification performance was measured using learning curves, which plot performance metrics relative to differing splits of labeled training data to evaluate classifier model performance. Binary classification model metrics including precision and recall in addition to metrics considered useful for imbalanced class problems, including an F1 score, were used. Finally, to analyze classification performance by subgroup, learning curves were created for sex, race, and ethnicity subgroups. RESULTS Classification outcomes differed by the proportion of positive to negative cases in each outcome ( Table 1 ). The rarest positive class outcome was a police shooting (5.9%) followed by drive-by shootings (9.2%) and shootings where additional victims were non-fatally shot (15%) in the course of the homicide. The most prevalent outcome was whether a victim was shot in their home (25%) relative to another location outside the home. Victims of homicide in the sample tended to be male (84-85%), Black or African American (58-60%), and young, with the most frequent age range between 25-34 years of age ( Table 1 ).
Intimate partner violence characterizes over a tenth of homicides overall but within cases where a victim is injured at home, intimate partner violence and preceding arguments occurred in over a quarter of cases (26% and 29%). Legal intervention homicides were most likely associated with mental health problems and alcohol use. Circumstances were known for almost all cases of legal intervention and drive-by shootings (98 and 100%) but less information was known about the circumstances of homicide where additional victims were shot or when victims were injured at home ( Table 2 ). Circumstances were known in 71% of homicides of Black victims in contrast to 83-84% among Hispanic and non-Hispanic white victims. The median number of words in a narrative for a law enforcement narrative was 81-83 words whereas CME narratives ranged from 88-91 words in length. Legal intervention homicides had the most lengthy narratives (115 for LE and 120 for CME). Narrative length differed by race and sex. Among law enforcement narratives, median length for Black victims was 98 words but 132 for non-Hispanic white victims. Narrative length differed among male and female victims. Female victims had longer narratives for each homicide outcome. Female victims shot at home had a median narrative length of 124 words in contrast to male victims shot at home with a length of 92 words. Table 3 displays classification performance by F1 score for each model type. Training data of approximately 1,500 cases achieved an F1 score of at least .6 for each outcome, though at 1,000 cases the majority of outcomes were at or exceeding .6. The exception was the number non-fatally shot. Figure 1 plots learning curves by F1 score in Table 3 . Replacement language models tended to perform best ( Table 3 , Figure 1 ) with the highest F1 score in all save six model interactions. 
In particular, language replacement models consistently obtained the highest F1 score for legal intervention homicides ( Table 3 , Figure 1 ). Omitting coroner/ medical examiner reports performed worse across outcomes. Language replacement models trained on 1,500-2,000 narratives obtained low false negative rates ranging from 1-5% of true cases resulting in a misclassified outcome ( Figure 1 , Appendix Table 4 ). View this table: View inline View popup Download powerpoint Table 3: F1 Scores by Model Outcome, Training Data, and Model Type Download figure Open in new tab Download figure Open in new tab Download figure Open in new tab Download figure Open in new tab Figure 1: Learning Curve by Outcome, Model Type Panel A: F1 Score View this table: View inline View popup Table 4: Classification Performance for Language Replacement Models by Outcome by Subgroup Oversampling positive class cases was negligibly helpful in improving F1 scores ( Figure 2 ). For instance, oversampling for legal intervention homicide to be composed of 20 percent positive class cases resulted in the addition of 580 positive class cases added to training data and an F1 score of .795 (Appendix Table 4 ; Figure 2 ). Relative to adding 500 randomly sampled cases which would result in an F1 score of .771 (Appendix Table 4 ), the gain from oversampling was 0.024 (.795 - .771) and therefore modest. Download figure Open in new tab Figure 2: F1 Learning Curve for Oversampled Positive Class Cases vs. Baseline Language Replacement Model Notes: F1 scores are plotted for distilBERT models fit with language replacement for both randomly sampled training data and oversampled training data. Oversampled training data corresponds to an increment of a 10 percent increase in the proportion of positive class cases included in training data. Exact training data set counts are in Appendix Table 4. Random train data is plotted at n=1,000, 1,500 and 2,000 randomly sampled training data records for reference. 
Figure 3 plots F1 scores of distilBERT language replacement models as these models tended to perform best overall and may capture linguistic differences most accurately across subgroups. Predictions differ by race/ethnicity and sex across models. Legal intervention homicide victims who were white or Hispanic were most often correctly classified as such, and Black victims were least likely to be correctly classified ( Figure 3 , panel A). The prediction difference is substantial for legal intervention victims with lower amounts of training data, though the gap persisted with higher volumes of training data. White victims shot at home were most often correctly predicted while Black and Hispanic victims were least likely. Female victims were less likely to be correctly predicted than male victims in all instances save if they were shot at home. Among models with at least 1,500 records of training data, F1-score disparities ranged from 0.2 to 0.25 by race and ethnicity, and between male and female victims with differences ranging from 0.12 to 0.2 ( Table 4 ). Download figure Open in new tab Download figure Open in new tab Figure 3: F1 Learning Curves for distilBERT+language models by Subgroup DISCUSSION This analysis simulated the NLP model-fitting process to demonstrate how different training and pre-processing decisions impact model performance in NLP applications of text classification of violent death homicide and coroner narratives. Fine tuning compact LLMs on NVDRS text requires approximately 1,000-1,500 training data records to achieve an F1 score of at least .6. Results suggested that compact LLMs are less useful in few shot learning applications with limited training data. Oversampling the positive class cases in training data does not increase prediction accuracy substantially over randomly sampled training data. Predictions differed by race, ethnicity and sex. 
Differential prediction by subgroup is not explainable by outcome frequency or narrative length alone. For instance, white victims of police shootings are less prevalent than Black victims in the sample but are more often classified correctly. Similarly, female victims have longer median narratives for all outcomes, but are less likely to be correctly classified. Further research should characterize sources of differential prediction, whether input narratives or exacerbation by the NLP classifier, and examine fairness-aware models, particularly if the prediction is used for decision-making or resource allocation in public health settings. These findings may inform a range of researchers and practitioners in health informatics and public health. The finding that compact LLMs with simple text changes can effectively predict rare NVDRS outcomes is of use to researchers considering the use of supervised machine learning to expand what is known about violent deaths beyond existing coded fields. For researchers seeking annotated training data, random sampling and labeling a sufficient number of cases (approximately 1,000) combined with a weighting layer is an effective strategy. Further, NVDRS applications to rare events do not require prohibitive amounts of manually annotated training data, which can be costly to produce. For instance, if labeling a narrative requires approximately 2-5 minutes of annotator time, labeling 1,000 narratives would require approximately 33-83 hours of annotation time for one annotator. While it is likely that privacy-compliant sophisticated LLMs will become more accessible to researchers working with sensitive data, in the interim, this analysis provides useful baselines for researchers considering similar undertakings. Finally, this approach may assist state-level violent death reporting systems in the lengthy process of abstraction in NVDRS, where manual abstractor annotation of violent death narratives results in long delays to data access.
This research is subject to several limitations. First, results from a compact LLM may not fully generalize to new LLMs with additional sophistication or to different language contexts beyond NVDRS. Label noise from NVDRS annotators may mean that results understate the performance of compact LLMs, which is consistent with police shootings tending to be the outcome type that is most accurately predicted. The potential for differential prediction by subgroup raises concerns about fairness and equity in model performance. Further investigations into the sources of this differential prediction are needed to ensure that NLP applications do not exacerbate existing disparities. Conclusion This study conducted a comprehensive analysis of the model-fitting process for supervised binary classification of infrequent violent death outcomes using natural language processing (NLP) techniques on National Violent Death Reporting System (NVDRS) narrative data. Through simulations, this study examined the impacts of pre-processing, the quantity and composition of labeled training data on model performance, as well as by race, ethnicity, and sex subgroups. Data Availability All data produced in the study are available for permissioned researchers by applying for restricted access National Violent Death Reporting System Data. https://www.cdc.gov/nvdrs/about/nvdrs-data-access.html Competing Interest None Funding and all other required statements This work was funded by APHA AWARD # 2023-0011. Acknowledgments I thank Matthew Miller and Deb Azrael for comments on a prior draft of this article. I thank Daniel Bowen and Stephen Sumner for valuable discussion in development of this article. REFERENCES 1. ↵ WISQARS Leading Causes of Death Visualization Tool . Centers for Disease Control and Prevention https://wisqars.cdc.gov/lcd/ . 2. ↵ CDC. About The National Violent Death Reporting System . National Violent Death Reporting System (NVDRS) https://www.cdc.gov/nvdrs/about/index.html ( 2024 ). 
3. ↵ Chatfield , S. L. , DeBois , K. A. & Evans , S. D . Mixed Methods Secondary Analysis of Older Adult Homicide-Suicides from National Violent Death Reporting System (NVDRS) Data . Am. J. Qual. Res . 6 , 115 – 132 ( 2022 ). OpenUrl 4. Fowler , K. A. , Leavitt , R. A. , Betz , C. J. , Yuan , K. & Dahlberg , L. L . Examining differences between mass, multiple, and single-victim homicides to inform prevention: findings from the National Violent Death Reporting System . Inj. Epidemiol . 8 , 49 ( 2021 ). 5. Rogers , E. M. & Davis , J . The Research Utility of the National Violent Death Reporting System for Understanding Homicide Trends . J. Contemp. Crim. Justice 40 , 26 – 47 ( 2024 ). OpenUrl 6. ↵ Adhia , A. , Austin , S. B. , Fitzmaurice , G. M. & Hemenway , D . The Role of Intimate Partner Violence in Homicides of Children Aged 2-14 Years . Am. J. Prev. Med . 56 , 38 – 46 ( 2019 ). OpenUrl PubMed 7. ↵ Anglemyer , A. , Horvath , T. & Rutherford , G . The Accessibility of Firearms and Risk for Suicide and Homicide Victimization Among Household Members: A Systematic Review and Meta-analysis . Ann. Intern. Med . 160 , 101 – 110 ( 2014 ). OpenUrl CrossRef PubMed 8. Azrael , D ., et al. Identifying and Tracking Gas Suicides in the U.S. Using the National Violent Death Reporting System, 2005–2012 . Am. J. Prev. Med . 51 , S219 – S225 ( 2016 ). OpenUrl PubMed 9. Barber , C. , Azrael , D. , Miller , M. & Hemenway , D . Who owned the gun in firearm suicides of men, women, and youth in five US states? Prev. Med . 164 , 107066 ( 2022 ). 10. ↵ Barber , C. , Walters , H. , Brown , T. & Hemenway , D . Suicides at Shooting Ranges . Crisis 42 , 13 – 19 ( 2021 ). OpenUrl PubMed 11. ↵ Conner , A. , Azrael , D. , Lyons , V. H. , Barber , C. & Miller , M . Validating the National Violent Death Reporting System as a Source of Data on Fatal Shootings of Civilians by Law Enforcement Officers . Am. J. Public Health 109 , 578 – 584 ( 2019 ). OpenUrl CrossRef PubMed 12. ↵ CDC. 
National Violent Death Reporting System Web Coding Manual, 6.0 . ( 2022 ). 13. ↵ Zhou , W. , Prater , L. C. , Goldstein , E. V. & Mooney , S. J . Identifying Rare Circumstances Preceding Female Firearm Suicides: Validating A Large Language Model Approach . JMIR Ment. Health 10 , e49359 ( 2023 ). OpenUrl 14. ↵ Wang , S. et al. An NLP approach to identify SDoH-related circumstance and suicide crisis from death investigation narratives . J. Am. Med. Inform. Assoc. JAMIA 30 , 1408 – 1417 ( 2023 ). OpenUrl PubMed 15. ↵ Graham , L. M. et al. Intimate Partner Violence–Related Fatalities Among U.S. Youth Aged 0–24 Years, 2014–2018 . Am. J. Prev. Med . 62 , 529 – 537 ( 2022 ). OpenUrl PubMed 16. Mays , V. M. et al. Identifying Witnessed Suicides in National Violent Death Reporting System Narratives . Healthcare 12 , 209 ( 2024 ). 17. Kafka , J. M. et al. Intimate Partner Violence Circumstances for Fatal Violence in the US. JAMA Netw . Open 6 , e2312768 ( 2023 ). OpenUrl 18. Mezuk , B. , Ko , T. M. , Kalesnikava , V. A. & Jurgens , D. Suicide Among Older Adults Living in or Transitioning to Residential Long-term Care, 2003 to 2015 . JAMA Netw. Open 2 , e195627 ( 2019 ). OpenUrl 19. ↵ Ko , T. M. , Kalesnikava , V. A. , Jurgens , D. & Mezuk , B . A Data Science Approach to Estimating the Frequency of Driving Cessation Associated Suicide in the US: Evidence From the National Violent Death Reporting System . Front. Public Health 9 , 689967 ( 2021 ). 20. ↵ Workman , T. E. et al. Identifying suicide documentation in clinical notes through zero-shot learning . Health Sci. Rep . 6 , e1526 ( 2023 ). OpenUrl 21. Fernandes , A. C. et al. Identifying Suicide Ideation and Suicidal Attempts in a Psychiatric Clinical Research Database using Natural Language Processing . Sci. Rep . 8 , 7426 ( 2018 ). OpenUrl PubMed 22. Obeid , J. S. et al. Identifying and Predicting Intentional Self-Harm in Electronic Health Record Clinical Notes: Deep Learning Approach . JMIR Med. Inform .
8 , e17784 ( 2020 ). OpenUrl 23. Carson , N. J. et al. Identification of suicidal behavior among psychiatrically hospitalized adolescents using natural language processing and machine learning of electronic health records . PLOS ONE 14 , e0211116 ( 2019 ). OpenUrl PubMed 24. Levis , M. , Westgate , C. L. , Gui , J. , Watts , B. V. & Shiner , B . Natural language processing of clinical mental health notes may add predictive value to existing suicide risk models . Psychol. Med . 51 , 1382 – 1391 ( 2021 ). OpenUrl PubMed 25. ↵ Bey , R. et al. Natural language processing of multi-hospital electronic health records for public health surveillance of suicidality . Npj Ment. Health Res . 3 , 1 – 9 ( 2024 ). OpenUrl PubMed 26. ↵ Tabaie , A. , Zeidan , A. J. , Evans , D. P. , Smith , R. N. & Kamaleswaran , R . A Novel Technique to Identify Intimate Partner Violence in a Hospital Setting . West. J. Emerg. Med . 23 , 781 ( 2022 ). 27. Mason , A. J. C. et al. Applying neural network algorithms to ascertain reported experiences of violence in routine mental healthcare records and distributions of reports by diagnosis . Front. Psychiatry 15 , ( 2024 ). 28. Botelle , R. et al. Can natural language processing models extract and classify instances of interpersonal violence in mental healthcare electronic records: an applied evaluative study . BMJ Open 12 , e052911 ( 2022 ). OpenUrl Abstract / FREE Full Text 29. ↵ Parker , S. T . Estimating Nonfatal Gunshot Injury Locations With Natural Language Processing and Machine Learning Models. JAMA Netw . Open 3 , e2020664 ( 2020 ). OpenUrl 30. ↵ Ali , A. , Shamsuddin , S. M. & Ralescu , A. L . Classification with class imbalance problem: A Review . 29 . 31. Padurariu , C. & Breaban , M. E . Dealing with Data Imbalance in Text Classification . Procedia Comput. Sci . 159 , 736 – 745 ( 2019 ). OpenUrl 32. Subramanian , S. , Rahimi , A. , Baldwin , T. , Cohn , T. & Frermann , L . Fairness-aware Class Imbalanced Learning . 
Preprint at doi: 10.48550/arXiv.2109.10444 ( 2021 ). 33. Shyalika , C. , Wickramarachchi , R. & Sheth , A. P . A Comprehensive Survey on Rare Event Prediction . ACM Comput Surv ( 2024 ) doi: 10.1145/3699955 . OpenUrl CrossRef 34. ↵ Zhong , S. et al. A machine learning case study to predict rare clinical event of interest: imbalanced data, interpretability, and practical considerations . J. Biopharm. Stat . 0 , 1 – 14 . 35. ↵ Yang , R. et al. Large language models in health care: Development, applications, and challenges . Health Care Sci . 2 , 255 – 263 ( 2023 ). OpenUrl PubMed 36. Abd-alrazaq , A. , et al. Large Language Models in Medical Education: Opportunities, Challenges, and Future Directions . JMIR Med. Educ . 9 , e48291 ( 2023 ). OpenUrl 37. Jahan , I. , Laskar , M. T. R. , Peng , C. & Huang , J. X . A comprehensive evaluation of large Language models on benchmark biomedical text processing tasks . Comput. Biol. Med . 171 , 108189 ( 2024 ). 38. Guo , Y. , Ge , Y. , Yang , Y.-C. , Al-Garadi , M. A. & Sarker , A . Comparison of Pretraining Models and Strategies for Health-Related Social Media Text Classification . Healthc. Basel Switz . 10 , 1478 ( 2022 ). OpenUrl 39. Shao , Y . et al. Clinical sublanguage trend and usage analysis from a large clinical corpus . In 2020 IEEE International Conference on Big Data (Big Data) 3837 – 3845 (IEEE, 2020 ). 40. ↵ Workman , T. E. , Divita , G. & Zeng-Treitler , Q . Discovering Sublanguages in a Large Clinical Corpus through Unsupervised Machine Learning and Information Gain. in 2019 IEEE International Conference on Big Data (Big Data) 4889 – 4898 ( 2019 ). doi: 10.1109/BigData47090.2019.9006492 . OpenUrl CrossRef 41. ↵ Mezuk , B. , Kalesnikava , V. A. , Kim , J. , Ko , T. M. & Collins , C. Not discussed: Inequalities in narrative text data for suicide deaths in the National Violent Death Reporting System . PloS One 16 , e0254417 ( 2021 ). OpenUrl CrossRef PubMed 42. Arseniev-Koehler , A. , Foster , J. G. , Mays , V. M. 
, Chang , K.-W. & Cochran , S. D. Aggression, Escalation, and Other Latent Themes in Legal Intervention Deaths of Non-Hispanic Black and White Men: Results From the 2003–2017 National Violent Death Reporting System . Am. J. Public Health 111 , S107 – S115 ( 2021 ). OpenUrl PubMed 43. Arseniev-Koehler , A. , Mays , V. M. , Foster , J. G. , Chang , K.-W. & Cochran , S. D . Gendered Patterns in Manifest and Latent Mental Health Indicators Among Suicide Decedents: 2003–2020 National Violent Death Reporting System (NVDRS) . Am. J. Public Health 114 , S268 – S277 ( 2024 ). OpenUrl PubMed 44. ↵ Rahman , N. et al. Using natural language processing to improve suicide classification requires consideration of race . Suicide Life. Threat. Behav . 52 , 782 – 791 ( 2022 ). OpenUrl PubMed 45. ↵ Sanh , V. , Debut , L. , Chaumond , J. & Wolf , T . DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter . Preprint at doi: 10.48550/arXiv.1910.01108 ( 2020 ). OpenUrl CrossRef View the discussion thread. Back to top Previous Next Posted January 17, 2025. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Assessing Supervised Natural Language Processing (NLP) Classification of Violent Death Narratives: Development and Assessment of a Compact Large Language Model (LLM) Approach Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. 
Share Assessing Supervised Natural Language Processing (NLP) Classification of Violent Death Narratives: Development and Assessment of a Compact Large Language Model (LLM) Approach Susan T. Parker medRxiv 2025.01.16.25320680; doi: https://doi.org/10.1101/2025.01.16.25320680 Share This Article: Copy Citation Tools Assessing Supervised Natural Language Processing (NLP) Classification of Violent Death Narratives: Development and Assessment of a Compact Large Language Model (LLM) Approach Susan T. Parker medRxiv 2025.01.16.25320680; doi: https://doi.org/10.1101/2025.01.16.25320680 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Health Informatics Subject Areas All Articles Addiction Medicine (568) Allergy and Immunology (863) Anesthesia (299) Cardiovascular Medicine (4425) Dentistry and Oral Medicine (443) Dermatology (382) Emergency Medicine (607) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1507) Epidemiology (15221) Forensic Medicine (30) Gastroenterology (1123) Genetic and Genomic Medicine (6588) Geriatric Medicine (667) Health Economics (997) Health Informatics (4524) Health Policy (1368) Health Systems and Quality Improvement (1612) Hematology (540) HIV/AIDS (1264) Infectious Diseases (except HIV/AIDS) (15910) Intensive Care and Critical Care Medicine (1103) Medical Education (623) Medical Ethics (145) Nephrology (667) Neurology (6588) Nursing (346) Nutrition (998) Obstetrics and Gynecology (1143) Occupational and Environmental Health (956) Oncology (3331) Ophthalmology (970) Orthopedics (369) Otolaryngology (420) Pain Medicine (435) Palliative Medicine (129) Pathology (663) Pediatrics (1690) Pharmacology and Therapeutics (691) Primary Care Research (710) Psychiatry and Clinical Psychology (5440) Public and Global Health (9220) Radiology and Imaging (2195) Rehabilitation Medicine and 
Physical Therapy (1369) Respiratory Medicine (1196) Rheumatology (593) Sexual and Reproductive Health (710) Sports Medicine (529) Surgery (710) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'9ffde7fb5e79ad07',t:'MTc3OTQ3NDc3NA=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00