Methods for safely sharing dual-use genetic data

doi:10.1101/2024.11.29.24318203

Methods for safely sharing dual-use genetic data

2024 · doi:10.1101/2024.11.29.24318203

preprint OA: closed

📄 Open PDF Full text JSON View at publisher

Full text 39,397 characters · extracted from preprint-html · click to expand

Methods for safely sharing dual-use genetic data | medRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-P4HH5NV'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search Methods for safely sharing dual-use genetic data View ORCID Profile Sterling Sawaya , Chien-Chi Lo , Po-E Li , Blake Hovde , View ORCID Profile Patrick Chain doi: https://doi.org/10.1101/2024.11.29.24318203 Sterling Sawaya 1 GeneInfoSec Inc Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Sterling Sawaya For correspondence: sterlingsawaya{at}gmail.com Chien-Chi Lo 2 Los Alamos National Laboratory Find this author on Google Scholar Find this author on PubMed Search for this author on this site Po-E Li 2 Los Alamos National Laboratory Find this author on Google Scholar Find this author on PubMed Search for this author on this site Blake Hovde 2 Los Alamos National Laboratory Find this author on Google Scholar Find this author on PubMed Search for this author on this site Patrick Chain 2 Los Alamos National Laboratory Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Patrick Chain Abstract Full Text Info/History Metrics Supplementary material Data/Code Preview PDF Abstract Background Some genetic data has dual-use potential. Sharing pathogen data has shown tremendous value. For example therapeutic development and lineage tracking during the COVID pandemic. This data sharing is complicated by the fact that these data have the potential to be used for harm. The genome sequence of a pathogen can be used to enable malicious genetic engineering approaches or to recreate the pathogen from synthetic DNA. Standard data security methods can be applied to genetic data, but when data is shared between institutions, ensuring appropriate security can be difficult. Sensitive data that is shared internationally among a wide array of institutions can be especially difficult to control. Methods for securely storing and sharing genetic data with potential for dual-use are needed to mitigate this potential harm. Results Here we propose new methods that allow genetic data to be shared in a data format that prevents a nefarious actor from accessing sensitive aspects of the data. Our methods obfuscate raw sequence data by pooling reads from different samples. This approach can ensure that data is secure while stored and during electronic transfer. We demonstrate that by pooling raw sequence data from multiple samples of the same organism, the ability to fully reconstruct any individual sample is prevented. In the pooled data, most genomic information remains, but reads or mutations cannot be directly attributed to any individual sample. To further restrict access to information, regions of a genome can be removed from the reads. Conclusion Our methods obscure genomic information within raw sequence reads. This method can allow genetic data to be stored and shared while preventing a nefarious actor from being able to perfectly reconstruct an organism. Broad-scale sequence information remains, while fine scale details about specific samples are difficult or impossible to reconstruct. Background As biotechnology advances, there is an increased potential that it can be misused [ 1 ]. Today, methods in synthetic biology can facilitate the creation of novel organisms. DNA can be accurately synthesized and this DNA can be inserted into an organism using CRISPR/CAS9 [ 2 ]. Entire portions of a genome can either be directly synthesized or be rewritten to become any desired sequence. Furthermore, entire organisms can be constructed with synthetic DNA [ 3 ]. The ability to fully reconstruct a virus or bacteria with only a genome sequence has been demonstrated [ 4 ]. Although today this method is more cumbersome and expensive than simply adding or deleting genes from a genome, as synthetic biology continues to advance, these capabilities will become easier and more widespread. The ability to recreate an organism with genetic data introduces a dilemma for those generating pathogen genome data. There is a need to share genetic information of pathogens so that their origin and evolution can be understood, and also a beneficial use of the genetic information when designing countermeasures to an infectious disease [ 5 , 6 ]. However, because this data can be misused, there may be hesitance to share genetic data that has been collected [ 7 ], or perhaps even withhold publishing to avoid pressure to share the data. For genetic data from the most dangerous pathogens, sharing data may be prohibited in some countries, especially if the data is shared internationally [ 8 , 9 ]. There are international treaties for sharing specific pathogen data, such as the Pandemic Influenza Preparedness Framework [ 10 ]. This framework encourages the sharing of genomic data from influenza, but is, however, not legally binding and its enforceability has never been tested [ 11 ]. There are ongoing negotiations to expand this approach to other pathogen data, but in general the sharing of pathogen sequence data only occurs between countries through bilateral agreements [ 12 ]. The ownership of genetic data and the sharing of benefits or products from genetic data are governed by the Nagoya protocol, but this treaty requires a case-by-case bilateral negotiation to make specific arrangements [ 13 ]. The standards and regulations for data sharing during a world-wide crisis have been tested by the COVID pandemic. During the initial stages of the outbreak, sharing of pathogen sequence data was essential for tracking its spread, predicting how the virus may evolve, and developing countermeasures such as mRNA vaccines [ 14 , 15 ]. Without the prompt sharing of accurate sequence data, the world may not have been able to respond rapidly to this novel disease. Nevertheless, as data was openly shared, concerns about equity and benefit-sharing arose [ 16 , 17 ]. These and other concerns can restrict fully open pathogen genomic data sharing during disease outbreaks. The open sharing of scientific data stands as an ideal standard on which major scientific advances rely [ 18 , 19 ]. However, this data sharing approach can conflict with the data sharing restrictions in place for the genomic data of certain pathogens. Currently, there is no standard method by which critical data from dangerous pathogens can be shared that is commensurate with requisite security. The data is either siloed, so no genetic data is shared, or the data is wholly shared, allowing anyone with access to the data to recreate the pathogen [ 20 ]. New, safer methods for accessing the sequence data of pathogens are needed. The methods here are inspired by techniques in molecular cryptography, in which cryptographic protocols are applied directly to DNA molecules to ensure that sensitive information is protected before the DNA is sequenced. Molecular cryptography can be used to mix DNA samples in such a way that a key is needed to determine which read from a DNA sequencer belongs to which sample [ 21 ]. Without the key, data from the resulting pool is obfuscated. A similar approach can be done with software, in which reads from multiple files are pooled together and the meta-data is stripped. Using software, we examine the use of this approach. Here, we propose a new method that allows for sharing genetic sequence data while protecting sensitive aspects of the data. Methods We developed new methods for formatting genetic data that prevents anyone with access to the data from being able to reconstruct an entire genome ( Figure 1 ). The first proposed method involves the following ( Figure 1B ): DNA sequencing reads from a sample (or group of samples) are aligned to an incomplete reference genome. A substantial portion of this genome must be removed (e.g. removal of a full gene). Download figure Open in new tab Figure 1. Workflow for methods used to obfuscate genomic data. A. Method for pooling reads to generate a secure FASTQ file. FASTQ files were pooled and their order randomized. B. Method for removing genomic region(s) to limit information within a set of (pooled) samples. Samples, or a pool of samples from (A), are aligned to an incomplete reference genome to produce a file that lacks complete genomic information. The resulting alignment file (e.g. in SAM format) can then be processed into a variant call file (VCF) or can be shared directly. The aligned file or the VCF will be insufficient to recreate the full genome of the sample, as sensitive sequence regions that were removed are not included in the alignment of assembly files. Our second proposed method is more sophisticated, and can occur prior to the application of the first proposed method ( Figure 1A ). Here, we pool DNA sequence reads from multiple samples of the same organism (e.g. in FASTQ format) and the metadata from each read is stripped. The order of the reads in the pooled data is then randomized. The resulting file remains in FASTQ format and contains all of the reads from the data of the original samples but paired-end read information is lost. The files with pooled reads can be directly processed into VCF files by mapping reads to a reference genome and calling variants. For additional security, the pooled data can also be processed with the first proposed method (aligning to an incomplete reference genome). To examine the read pooling method, we applied it to FASTQ data from SARS-CoV-2 and from Bacillus anthracis . For SARS-CoV-2, we generated 7 independent datasets in FASTQ format, consisting of a unique mixture of 2, 5, 10, 50, 100, and 500 samples obtained from CDC (see supplementary table 1). For B. anthracis we generated 4 distinct datasets, consisting of 2, 5, 10 and 50 isolates, randomly subselected from NCBI SRA. (see supplementary table 2) with search criteria “Organism = Bacillus anthracis” and platform = Illumina” and “layout = paired”. The metadata from each sample, including the depth of coverage and strain variants/lineages, were utilized as the ground truth for the composition/abundance of each pooled dataset. For SARS-CoV-2, the EDGE COVID-19, build 20230131 [ 22 ], was used for variant calling and lineage assignment was done with Pangolin (v4.3.1)[ 23 ]. For B. anthracis , the EDGE, build 20231115 [ 24 ] was used for read-mapping to the reference genome Bacillus anthracis str. ‘Ames Ancestor’ (AE017334.2) and for variant calling. Freyja v1.4.8 [ 25 ] was used for deconvoluting the variants/lineages in the datasets from the VCF files from the pooled SARS-CoV-2 data. This program was designed to deconvolute genomes for ‘mixed’ samples, with the ability to predict the SARS-CoV-2 lineages present within the mixed samples. We used Freyja to predict the proportion of SARS-CoV-2 variants and lineages in the datasets, for both pooled data and for individual samples. To examine the pooled B. anthracis data, we generated variant frequency heatmaps using the Plotly dash-bio.Clustergram package [ 26 ]. The Euclidean pairwise distance [ 27 ] and complete linkage clustering algorithms [ 28 ] were employed to the variant calling result based on variant locations. By applying these algorithms to the variant calling results, we can identify groups of variants with similar frequency patterns across different genomic locations. This clustering approach helps to identify, visualize, and interpret the relationships between variants in the B. anthracis data, potentially revealing insights into genetic similarities or differences among the different treatments. Results By examining pooled data for larger and larger pools of samples (FASTQ reads), we tested the ability of our pooling method to obfuscate genomic data using traditional bioinformatics methods. We estimated the composition of the pooled data using these methods and compared the results to the true composition based on the samples used to generate those pools. For the SARS-CoV-2 samples, the EDGE COVID-19 workflow was used to provide majority-rule based variant calls [ 22 ]. Consequently, regardless of whether the SARS-CoV-2 reads were pooled or whether samples were analyzed separately, the same single nucleotide polymorphism variants (SNVs) and insertion/deletion (INDELs) were found. Freyja, another tool designed for SARS-CoV-2 [ 25 ], provides a population level analysis of the pooled reads. Freyja was successfully able to estimate the proportion of variants present within the pooled data ( Figure 2 ). Note that the samples used to generate the pool of 5 datasets did not contain any Delta variant, being entirely composed of the Omicron variant, so a comparison of variant calling was not possible with this group. Download figure Open in new tab Figure 2. Prediction of variants present for each sample and for the pooled datasets. For each group examined, Freyja was used to predict variants present within each individual sample (left stacked histogram in each subplot), and the proportion of each variant in the pooled sample represents the ratio of reads from that sample used in the pooled dataset. The right stacked histogram for each subplot represents the variants present within the pooled datasets as predicted by Freyja. While variants were accurately detected using the lineages identified by Freyja were not always correct ( Figure 3 ). The prediction of lineages in Freyja produced primarily false positives, even when only two samples were pooled. Freyja’s ability to predict the presence of lineages in pooled data had diminishing accuracy as more samples were pooled together. Notably, Freyja tended to overestimate the number of lineages in the pooled data, and this overestimate increased as more samples were pooled together. Download figure Open in new tab Figure 3. Prediction of lineages present within samples and within the pooled datasets. For each group examined, Freyja was used to predict lineages present within each individual sample (left stacked histogram in each subplot), and the proportion of each lineage in the pooled sample represents the ratio of reads from that sample used in the pooled dataset. The right stacked histogram for each subplot represents the lineages present within the pooled datasets as predicted by Freyja. While the community is familiar with delineating variants in viral outbreaks, the definition of a lineage within bacterial species is not as straightforward. Due to the lack of sublineage definitions, and also, since there exists no similar program to Freyja for bacteria, we took a different approach to demonstrate that our method provides obfuscation of bacterial genomes. For B. anthracis we generated 4 distinct datasets, consisting of 2, 5, 10 and 50 isolates, for a total 67 SRA samples. When the 67 individual samples of B. anthracis were individually mapped to the reference genome, an average genome coverage ranging from 97.04% to 99.06% was achieved. The variant calling analysis revealed a SNV count ranging from 8 to 2,653. To assess the pooled B. anthracis data, we compared the SNV positions in the pools with those from individual samples. Figure 4A presents a Venn diagram illustrating the SNVs found in the pooled 2-samples relative to those found in the individual samples alone (SRR5811050 with a mean depth 54x and SRR13950249 with a mean depth 203x). The pooled samples share 1,115 SNVs with the individual samples. However, the pooling of two samples introduced three false positive SNVs that were not present in the individual sample analyses. These false positives occurred because, although the alternative base ratios in the individual samples were below the detection threshold at these positions, their combined ratios in the pooled sample exceeded the threshold and were detected by the variant calling algorithm. Conversely, each individual sample contains 1,307 SNVs that were not detected in the pooled data, representing a high false negative (FN) rate in the pooled analysis. This high rate is likely due to two factors: (1) pooling increased the depth coverage of non-variant regions, diluting the signal of true variants (1,196 FN in SRR13950249 and 53 FN in SRR5811050), and (2) the pooling method removed paired-end information, preventing the mapping algorithm from rescuing reads that did not perfectly align due to SNVs (58 FN shared by two individual samples). Figure 4B depicts a cluster analysis heatmap of the pooled 2-samples data, along with data from the two individual samples. The vertical dendrogram on the left of Figure 4B illustrates the hierarchical clustering of the samples, revealing patterns of similarity. The heatmap itself shows variations in alternate base ratios across different SNV positions, with color gradients indicating the frequency of these variants. As seen in Figure 4B , although the SNV positions in the pooled samples overlap with those in the individual samples, there are differences in the alternative base ratios for other SNV positions.Notably, the top cluster highlights a “dilution” effect for false negative SNVs, where pooling reduces variant signals below detection thresholds. This result demonstrates that combining reads from different samples can obscure variant information, leading to both false positives and false negatives. Download figure Open in new tab Figure 4. Pooled 2-samples SNVs and individual sample SNVs. A. Venn diagram ( https://bioinfogp.cnb.csic.es/tools/venny/ .) The variants present within two individual samples (SRR5811050 and SRR13950249) were compared to determine which variants overlap between the different samples and also between the variants called in the pooled samples. B. Heatmap. Y-axis represents SNV position clustering. Heatmap color represents the SNVs ratio which is defined as the alternate base versus the reference base ratio. Discussion The data generated by our software has a similar format to a real-life scenario in which the read data from individual samples is not clear. Samples can contaminate each other, multiple variants of a disease can co-infect a patient, or a new, recombinant lineage can arise. The program Freyja is designed to disentangle these potential scenarios for SARS-CoV-2, making it an ideal approach by which we can test the limits of our method. Freyja was unable to accurately predict which SARS-CoV-2 lineages were present within our pooled data, indicating that our method successfully obfuscates lineage information ( Figure 3 ). Consequently, our pooling method can prevent the reconstruction of complete SARS-CoV-2 genomes, even when only a few genomes are pooled. Freyja’s ability to accurately estimate the fraction of variants in our pooled data, on the other hand, indicates that our method does not conceal lineage-specific information ( Figure 2 ). When constructing genomes, many factors can influence variant calling. We expected that the differences between the bacterial and viral genomes examined here would influence how well our pooling method would obfuscate the genomes. Bacterial genomes are larger and more structurally complex than viral genomes and contain more variation, and thus the reconstruction of their genomes can be more difficult. Furthermore, delineation of bacteria lineages can be complicated by frequent horizontal gene transfer from distant species and large-scale genome rearrangements [ 29 , 30 ]. Therefore, we expected a priori that reconstructing B. anthracis genomes from the pooled sequence data would be a bigger challenge than for SARS-CoV-2. Without a custom tool like Freyja to disentangle genomes, our analysis instead focused on a comparison of variant calling between pooled data and individual samples. For data pooled from two B. anthracis samples, the SNPs called in the pool did not always match the SNPs independently called from the two original samples ( Figure 4A, 4B ). These results demonstrate that pooling data obfuscates variant calling. Accurate variant calling is needed to fully reconstruct genomes or to predict the presence of specific bacterial lineages. This effect becomes more pronounced as additional samples are pooled (see supplementary figures), making it increasingly difficult to accurately call variants and fully reconstruct genomes. Overall, these results demonstrate that by pooling reads from the same species, the samples’ genomes become obfuscated. Not all information is concealed, however. Much of the genomic information remains: the species being sequenced, the common variants that are present in the pool, and general information about gene sequences. We also found that pooling more samples always increases genome obfuscation, and larger genomes with more variants are more difficult to accurately reconstruct. However, the limits of this method have not been tested here. Pooling genomes of distantly related organisms will facilitate the attribution of a specific read to a specific lineage, limiting obfuscation of the samples of distant relation to the rest of the pool. Furthermore, if two samples are nearly identical in their genome sequence, then pooling their sequence data will do little to change the overall composition of the pool. Additional work is required to determine exactly how sequence diversity, genome size, and the number of pooled samples affect how information is concealed with our method. To address this uncertainty, aligning reads to an incomplete reference genome ( Figure 1A ) can always be used to ensure that full reconstruction of a genome is not possible. Many other types of genetic data are widely shared. For example, human genomic data can be shared to further research on human genomics or to develop new pharmaceutical drugs [ 31 ]. The sharing of human genomic data is complicated by issues of identifiability of the data [ 32 ] and the consequent violations of privacy [ 33 , 34 , 35 ]. Human genetic data also has potential security issues that result from the fact that the people from whom the data is generated will be alive for decades and future technological advancements in biotechnology and AI may facilitate the misuse of this data [ 36 ]. Like with pathogen data, concerns about the insecure sharing of human genomic data may ultimately lead to limitations on how data is shared. The application of our data security techniques to human data represent a potential future direction of this work. Conclusion The genomes analyzed here are of importance to biosecurity, and this method can have widespread applications to genomic data of other pathogens. This method can also be applied to larger genomes, such as human, to obfuscate sensitive data for privacy and security. Pooling genomic data, or equally, pooling genomic material, results in a mixture that prevents full reconstruction of a genome. This method, potentially combined with methods that remove some regions of the genome, can nevertheless allow useful genomic data to be shared. The computational methods used here also have the potential to guide and optimize methods in molecular cryptography. The effectiveness of different pooling methods can be examined with this software, and the results can be used to design molecular cryptography. By sharing raw data in a way that prevents misuse, analyses can be done to ensure an accurate interpretation of the data. The use of genetic data, in pathogen surveillance and elsewhere, can be essential for biosecurity, for the development of the bioeconomy, and the advancement of medicine. This data sharing can be stifled by the threat of misuse. The availability of raw data can help ensure the accuracy, authenticity and interpretation of any genomic analyses. Even when raw data is not to be shared, securing information within raw data itself provides security for the data at rest, and ensures that any information extracted from the data remains equally secure. Data Availability All data produced are available online at the NCBI SRA repository. Declarations Ethics approval and consent to participate - Not applicable Consent for publication - Not applicable Availability of data and materials The datasets analyzed during the current study are available in the NCBI SRA repository and their accession numbers are listed in supplementary table 1 and table 2. Competing interests SS is a founding member of GeneInfoSec Inc. which produced and owns the rights to the software used to obfuscate the genetic data for this research. The software was provided at no cost. Funding The development of the software was partially supported by a grant to GeneInfoSec Inc. from the US Government: FA864922P0653, Air Force SBIR/STTR Topic No. AF211-CSO1, Proposal No. F2-15277. The analysis work was supported by eBioAID project [#R-00675-23-0] which is a DTRA CBA-funded collaboration between Los Alamos National Laboratory (LANL) and United States Army Medical Research Institute for Infectious Disease (USAMRIID). Authors Contributions SS, BH and PC designed the research plan. CCL and PL managed the data and conducted the analyses. All authors contributed to the writing of, read and approved the final manuscript. Acknowledgements Not applicable List of abbreviations DNA Deoxyribonucleic acid SARS-CoV-2 Severe acute respiratory syndrome coronavirus 2 CRISPR Clustered regularly interspaced short palindromic repeats VCF Variant Call File FN False Negative References 1. ↵ Smith JA , Sandbrink JB . Biosecurity in an age of open science . PLOS Biology . 2022 Apr 14; 20 ( 4 ): e3001600 . OpenUrl CrossRef PubMed 2. ↵ Doudna JA , Charpentier E. The New Frontier of Genome Engineering with CRISPR-Cas9 . Science . 2014 Nov 27; 346 ( 6213 ): 1258096 – 6 . OpenUrl Abstract / FREE Full Text 3. ↵ Gibson DG , Glass JI , Lartigue C , Noskov VN , Chuang RY ., Algire MA , et al. Creation of a Bacterial Cell Controlled by a Chemically Synthesized Genome . Science . 2010 May 20; 329 ( 5987 ): 52 – 6 . OpenUrl Abstract / FREE Full Text 4. ↵ Mitchell LA , Ellis T. Synthetic genome engineering gets infectious . Proceedings of the National Academy of Sciences . 2017 Oct 9; 114 ( 42 ): 11006 – 8 . OpenUrl FREE Full Text 5. ↵ Baker D , van den Beek M , Blankenberg D , Bouvier D , Chilton J , Coraor N , et al. No more business as usual: Agile and effective responses to emerging pathogen threats require open data and open analytics . Coyne CB, editor. PLOS Pathogens . 2020 Aug 13; 16 ( 8 ): e1008643 . OpenUrl 6. ↵ Neves A , Cuesta I , Hjerde E , Terje Klemetsen , Salgado D , Jacques van Helden , et al. FAIR+E pathogen data for surveillance and research: lessons from COVID-19 . Frontiers in Public Health . 2023 Nov 21; 11 : 1289945 . OpenUrl PubMed 7. ↵ dos S. Ribeiro C , Koopmans MP , Haringhuizen GB . Threats to timely sharing of pathogen sequence data . Science . 2018 Oct 26; 362 ( 6413 ): 404 – 6 . OpenUrl Abstract / FREE Full Text 8. ↵ Edelstein M , Sane J. Overcoming Barriers to Data Sharing in Public Health: A Global Perspective . Chatham House ; 2015 . 9. ↵ Edelstein M , Lee LM , Herten-Crabb A , Heymann DL , Harper DR . Strengthening Global Public Health Surveillance through Data and Benefit Sharing . Emerging Infectious Diseases . 2018 Jul ; 24 ( 7 ): 1324 – 30 . OpenUrl 10. ↵ Thompson AK , Faith K , Gibson JL , Upshur RE . Pandemic influenza preparedness: an ethical framework to guide decision-making . BMC Medical Ethics . 2006 ; 7 ( 1 ): 12 . OpenUrl 11. ↵ Rourke M , Eccleston-Turner M. The Pandemic Influenza Preparedness Framework as a “specialized international access and benefit-sharing instrument” under the Nagoya Protocol . Northern Ireland Legal Quarterly . 2021 Dec 17; 72 ( 3 ): 411 – 47 . OpenUrl 12. ↵ Rourke M. Access by Design, Benefits if Convenient: A Closer Look at the Pandemic Influenza Preparedness Framework’s Standard Material Transfer Agreements . Milbank Q . 2019 Mar 1; 97 ( 1 ): 91 – 112 . OpenUrl PubMed 13. ↵ Buck M , Hamilton C. The Nagoya Protocol on Access to Genetic Resources and the Fair and Equitable Sharing of Benefits Arising from their Utilization to the Convention on Biological Diversity . Review of European Community & International Environmental Law . 2011 Apr ; 20 ( 1 ): 47 – 61 . OpenUrl 14. ↵ Rockett RJ , Arnott A , Lam C , Sadsad R , Timms V , Gray KA , et al. Revealing COVID-19 transmission in Australia by SARS-CoV-2 genome sequencing and agent-based modeling . Nature Medicine . 2020 Jul 9; 26 : 1398 – 404 . OpenUrl PubMed 15. ↵ Chiara M , D’Erchia AM , Gissi C , Manzari C , Parisi A , Resta N , et al. Next generation sequencing of SARS-CoV-2 genomes: challenges, applications and opportunities . Briefings in Bioinformatics . 2020 Dec 7; 22 ( 2 ). 16. ↵ Pratt B , Bull S. Equitable data sharing in epidemics and pandemics . BMC Medical Ethics . 2021 Oct 6; 22 ( 1 ). 17. ↵ Shu Y , McCauley J. GISAID: Global initiative on sharing all influenza data – from vision to reality . Eurosurveillance . 2017 Mar 30; 22 ( 13 ). 18. ↵ Murray-Rust P. Open Data in Science . Nature Precedings . 2008 Jan 18; 19. ↵ Huston P , Edge V , Bernier E. Reaping the benefits of Open Data in public health . Canada Communicable Disease Report . 2019 Oct 3; 45 ( 10 ): 252 – 6 . OpenUrl CrossRef PubMed 20. ↵ Vinatzer BA , Heath LS , Almohri HMJ , Stulberg MJ , Lowe C , Li S. Cyberbiosecurity Challenges of Pathogen Genome Databases . Frontiers in Bioengineering and Biotechnology . 2019 May 15; 7 . 21. ↵ Sawaya S. Cryptography for genetic material . bioRxiv . 2017 Jun 30; 22. ↵ Alkan C Lo CC , Shakya M , Connor R , Davenport K , Flynn M , Gutiérrez AM y, et al. EDGE COVID-19: a web platform to generate submission-ready genomes from SARS-CoV-2 sequencing efforts . Alkan C , editor. Bioinformatics . 2022 Mar 24; 38 ( 10 ): 2700 – 4 . OpenUrl PubMed 23. ↵ O’Toole Á , Scher E , Underwood A , Jackson B , Hill V , McCrone JT , et al. Assignment of Epidemiological Lineages in an Emerging Pandemic Using the Pangolin Tool . Virus Evolution . 2021 Jul 5; 7 ( 2 ). 24. ↵ Li PE , Lo CC , Anderson JC , Davenport KW , Bishop-Lilly KA , Xu Y , et al. Enabling the democratization of the genomics revolution with a fully integrated web-based bioinformatics platform . Nucleic Acids Research . 2017 Jan 9; 45 ( 1 ): 67 – 80 . OpenUrl CrossRef PubMed 25. ↵ Karthikeyan S , Levy JI , De Hoff P , Humphrey G , Birmingham A , Jepsen K , et al. Wastewater sequencing reveals early cryptic SARS-CoV-2 variant transmission . Nature . 2022 Jul 7; 609 : 101 – 8 . OpenUrl CrossRef PubMed 26. ↵ Plotly Technologies Inc. Dash Bio.Clustergram . https://dash.plotly.com/dash-bio/clustergram . 2024 . 27. ↵ Dokmanic I , Parhizkar R , Ranieri J , Vetterli M. Euclidean Distance Matrices: Essential theory, algorithms, and applications . IEEE Signal Processing Magazine . 2015 Nov ; 32 ( 6 ): 12 – 30 . OpenUrl 28. ↵ Großwendt A , Röglin H. Improved Analysis of Complete-Linkage Clustering . Algorithmica . 2017 Feb 6; 78 ( 4 ): 1131 – 50 . OpenUrl 29. ↵ Power JJ , Pinheiro F , Pompei S , Kovacova V , Yüksel M , Rathmann I , et al. Adaptive evolution of hybrid bacteria by horizontal gene transfer . Proceedings of the National Academy of Sciences . 2021 Mar ; 118 ( 10 ). 30. ↵ Smith JT , Andam CP . Extensive horizontal gene transfer within and between species of coagulase-negative Staphylococcus . Dagan T, editor. Genome Biology and Evolution . 2021 Sep 8; 13 ( 9 ). 31. ↵ Alvarellos M , Sheppard HE , Knarston I , Davison C , Raine N , Seeger T , et al. Democratizing clinical-genomic data: How federated platforms can promote benefits sharing in genomics . Frontiers in Genetics . 2023 Jan 10; 13 : 1045450 . OpenUrl PubMed 32. ↵ Gymrek M , McGuire AL , Golan D , Halperin E , Erlich Y. Identifying Personal Genomes by Surname Inference . Science . 2013 Jan 17; 339 ( 6117 ): 321 – 4 . OpenUrl Abstract / FREE Full Text 33. ↵ McGuire AL . Identifiability of DNA Data: The Need for Consistent Federal Policy . Am J Bioeth . 2008 Nov 17; 8 ( 10 ): 75 – 6 . OpenUrl PubMed 34. ↵ Erlich Y , Narayanan A. Routes for breaching and protecting genetic privacy . Nature Reviews Genetics . 2014 May 8; 15 ( 6 ): 409 – 21 . OpenUrl CrossRef PubMed 35. ↵ Bonomi L , Huang Y , Ohno-Machado L. Privacy Challenges and Research Opportunities for Genomic Data Sharing . Nature genetics . 2020 Jul 1; 52 ( 7 ): 646 – 54 . OpenUrl CrossRef PubMed 36. ↵ Sawaya S , Kenneally E , Nelson D , Schumacher G. Artificial Intelligence and the Weaponization of Genetic Data . Springer eBooks . 2012 Feb 24; 265 – 78 . View the discussion thread. Back to top Previous Next Posted December 01, 2024. Download PDF Supplementary Material Data/Code Email Thank you for your interest in spreading the word about medRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. You are going to email the following Methods for safely sharing dual-use genetic data Message Subject (Your Name) has forwarded a page to you from medRxiv Message Body (Your Name) thought you would like to see this page from the medRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share Methods for safely sharing dual-use genetic data Sterling Sawaya , Chien-Chi Lo , Po-E Li , Blake Hovde , Patrick Chain medRxiv 2024.11.29.24318203; doi: https://doi.org/10.1101/2024.11.29.24318203 Share This Article: Copy Citation Tools Methods for safely sharing dual-use genetic data Sterling Sawaya , Chien-Chi Lo , Po-E Li , Blake Hovde , Patrick Chain medRxiv 2024.11.29.24318203; doi: https://doi.org/10.1101/2024.11.29.24318203 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Epidemiology Subject Areas All Articles Addiction Medicine (573) Allergy and Immunology (865) Anesthesia (302) Cardiovascular Medicine (4453) Dentistry and Oral Medicine (444) Dermatology (383) Emergency Medicine (609) Endocrinology (including Diabetes Mellitus and Metabolic Disease) (1515) Epidemiology (15242) Forensic Medicine (30) Gastroenterology (1131) Genetic and Genomic Medicine (6615) Geriatric Medicine (669) Health Economics (1001) Health Informatics (4552) Health Policy (1372) Health Systems and Quality Improvement (1614) Hematology (543) HIV/AIDS (1270) Infectious Diseases (except HIV/AIDS) (15929) Intensive Care and Critical Care Medicine (1106) Medical Education (624) Medical Ethics (147) Nephrology (670) Neurology (6625) Nursing (346) Nutrition (999) Obstetrics and Gynecology (1148) Occupational and Environmental Health (957) Oncology (3344) Ophthalmology (979) Orthopedics (369) Otolaryngology (421) Pain Medicine (436) Palliative Medicine (130) Pathology (665) Pediatrics (1696) Pharmacology and Therapeutics (693) Primary Care Research (714) Psychiatry and Clinical Psychology (5461) Public and Global Health (9252) Radiology and Imaging (2207) Rehabilitation Medicine and Physical Therapy (1371) Respiratory Medicine (1197) Rheumatology (597) Sexual and Reproductive Health (715) Sports Medicine (530) Surgery (714) Toxicology (99) Transplantation (289) Urology (265) (function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'a02efc08f9498e2e',t:'MTc3OTk4OTM5Nw=='};var a=document.createElement('script');a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';document.getElementsByTagName('head')[0].appendChild(a);";b.getElementsByTagName('head')[0].appendChild(d)}}if(document.body){var a=document.createElement('iframe');a.height=1;a.width=1;a.style.position='absolute';a.style.top=0;a.style.left=0;a.style.border='none';a.style.visibility='hidden';document.body.appendChild(a);if('loading'!==document.readyState)c();else if(window.addEventListener)document.addEventListener('DOMContentLoaded',c);else{var e=document.onreadystatechange||function(){};document.onreadystatechange=function(b){e(b);'loading'!==document.readyState&&(document.onreadystatechange=e,c())}}}})();

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

⚙ Ask this paper AI returns verbatim quotes from the full text · source: preprint-html ⓘ

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2024) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc: last seen: 2026-05-20T01:45:00.602351+00:00