ProteoParc: A tool to generate protein reference databases for ancient and non-model organisms

preprint OA: gold CC-BY-NC-ND-4.0
📄 Open PDF Full text JSON View at publisher
Full text 44,414 characters · extracted from preprint-html · click to expand
ProteoParc: A tool to generate protein reference databases for ancient and non-model organisms | bioRxiv /* */ /* */ <!-- <!-- /*! * yepnope1.5.4 * (c) WTFPL, GPLv2 */ (function(a,b,c){function d(a){return"[object Function]"==o.call(a)}function e(a){return"string"==typeof a}function f(){}function g(a){return!a||"loaded"==a||"complete"==a||"uninitialized"==a}function h(){var a=p.shift();q=1,a?a.t?m(function(){("c"==a.t?B.injectCss:B.injectJs)(a.s,0,a.a,a.x,a.e,1)},0):(a(),h()):q=0}function i(a,c,d,e,f,i,j){function k(b){if(!o&&g(l.readyState)&&(u.r=o=1,!q&&h(),l.onload=l.onreadystatechange=null,b)){"img"!=a&&m(function(){t.removeChild(l)},50);for(var d in y[c])y[c].hasOwnProperty(d)&&y[c][d].onload()}}var j=j||B.errorTimeout,l=b.createElement(a),o=0,r=0,u={t:d,s:c,e:f,a:i,x:j};1===y[c]&&(r=1,y[c]=[]),"object"==a?l.data=c:(l.src=c,l.type=a),l.width=l.height="0",l.onerror=l.onload=l.onreadystatechange=function(){k.call(this,r)},p.splice(e,0,u),"img"!=a&&(r||2===y[c]?(t.insertBefore(l,s?null:n),m(k,j)):y[c].push(l))}function j(a,b,c,d,f){return q=0,b=b||"j",e(a)?i("c"==b?v:u,a,b,this.i++,c,d,f):(p.splice(this.i++,0,a),1==p.length&&h()),this}function k(){var a=B;return a.loader={load:j,i:0},a}var l=b.documentElement,m=a.setTimeout,n=b.getElementsByTagName("script")[0],o={}.toString,p=[],q=0,r="MozAppearance"in l.style,s=r&&!!b.createRange().compareNode,t=s?l:n.parentNode,l=a.opera&&"[object Opera]"==o.call(a.opera),l=!!b.attachEvent&&!l,u=r?"object":l?"script":"img",v=l?"script":u,w=Array.isArray||function(a){return"[object Array]"==o.call(a)},x=[],y={},z={timeout:function(a,b){return b.length&&(a.timeout=b[0]),a}},A,B;B=function(a){function b(a){var a=a.split("!"),b=x.length,c=a.pop(),d=a.length,c={url:c,origUrl:c,prefixes:a},e,f,g;for(f=0;f<d;f++)g=a[f].split("="),(e=z[g.shift()])&&(c=e(c,g));for(f=0;f<b;f++)c=x[f](c);return c}function g(a,e,f,g,h){var 
i=b(a),j=i.autoCallback;i.url.split(".").pop().split("?").shift(),i.bypass||(e&&(e=d(e)?e:e[a]||e[g]||e[a.split("/").pop().split("?")[0]]),i.instead?i.instead(a,e,f,g,h):(y[i.url]?i.noexec=!0:y[i.url]=1,f.load(i.url,i.forceCSS||!i.forceJS&&"css"==i.url.split(".").pop().split("?").shift()?"c":c,i.noexec,i.attrs,i.timeout),(d(e)||d(j))&&f.load(function(){k(),e&&e(i.origUrl,h,g),j&&j(i.origUrl,h,g),y[i.url]=2})))}function h(a,b){function c(a,c){if(a){if(e(a))c||(j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}),g(a,j,b,0,h);else if(Object(a)===a)for(n in m=function(){var b=0,c;for(c in a)a.hasOwnProperty(c)&&b++;return b}(),a)a.hasOwnProperty(n)&&(!c&&!--m&&(d(j)?j=function(){var a=[].slice.call(arguments);k.apply(this,a),l()}:j[n]=function(a){return function(){var b=[].slice.call(arguments);a&&a.apply(this,b),l()}}(k[n])),g(a[n],j,b,n,h))}else!c&&l()}var h=!!a.test,i=a.load||a.both,j=a.callback||f,k=j,l=a.complete||f,m,n;c(h?a.yep:a.nope,!!i),i&&c(i)}var i,j,l=this.yepnope.loader;if(e(a))g(a,0,l,0);else if(w(a))for(i=0;i (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0];var j=d.createElement(s);var dl=l!='dataLayer'?'&l='+l:'';j.src='//www.googletagmanager.com/gtm.js?id='+i+dl;j.type='text/javascript';j.async=true;f.parentNode.insertBefore(j,f);})(window,document,'script','dataLayer','GTM-M677548'); Skip to main content Home About Submit ALERTS / RSS Search for this keyword Advanced Search New Results ProteoParc: A tool to generate protein reference databases for ancient and non-model organisms View ORCID Profile Guillermo Carrillo-Martin , View ORCID Profile Johanna Krueger , View ORCID Profile Tomas Marques-Bonet , View ORCID Profile Esther Lizano doi: https://doi.org/10.1101/2025.07.31.667843 Guillermo Carrillo-Martin 1 Departament de Medicina i Ciències de la Vida, Institut de Biologia Evolutiva (CSIC-UPF), Universitat Pompeu Fabra , Barcelona, Spain Find this author 
on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Guillermo Carrillo-Martin For correspondence: guillermo.carrillo{at}upf.edu esther.lizano{at}upf.edu tomas.marques{at}upf.edu Johanna Krueger 1 Departament de Medicina i Ciències de la Vida, Institut de Biologia Evolutiva (CSIC-UPF), Universitat Pompeu Fabra , Barcelona, Spain Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Johanna Krueger Tomas Marques-Bonet 1 Departament de Medicina i Ciències de la Vida, Institut de Biologia Evolutiva (CSIC-UPF), Universitat Pompeu Fabra , Barcelona, Spain 2 Institut Català de Paleontologia Miquel Crusafont (ICP-CERCA), Universitat Autònoma de Barcelona, Cerdanyola del Vallès , Barcelona, Spain 4 Catalan Institution of Research and Advanced Studies (ICREA), Passeig de Lluís Companys , Barcelona, Spain 5 CNAG, Centro Nacional de Analisis Genomico , Barcelona, Spain Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Tomas Marques-Bonet For correspondence: guillermo.carrillo{at}upf.edu esther.lizano{at}upf.edu tomas.marques{at}upf.edu Esther Lizano 1 Departament de Medicina i Ciències de la Vida, Institut de Biologia Evolutiva (CSIC-UPF), Universitat Pompeu Fabra , Barcelona, Spain 2 Institut Català de Paleontologia Miquel Crusafont (ICP-CERCA), Universitat Autònoma de Barcelona, Cerdanyola del Vallès , Barcelona, Spain 3 Unidad de Paleobiología, ICP-CERCA, Unidad Asociada al CSIC por el IBE UPF-CSIC, Cerdanyola del Vallès , Barcelona, Spain Find this author on Google Scholar Find this author on PubMed Search for this author on this site ORCID record for Esther Lizano For correspondence: guillermo.carrillo{at}upf.edu esther.lizano{at}upf.edu tomas.marques{at}upf.edu Abstract Full Text Info/History Metrics Data/Code Preview PDF Abstract Over the last few years, the increasing interest in analysing the 
proteome of extinct and non-model organisms has generated a new field of research expanding the scope of proteomics. The lack of curated databases and/or molecular data from these organisms forces researchers to manually search in different public repositories for related protein sequences, either for MS/MS peptide identification or ZooMS marker annotation. This can lead to format incongruences and hinder reproducibility between studies. To address this issue, we introduce ProteoParc, a user-friendly software that generates reference databases by systematically downloading and processing protein sequences from the most widely used public repositories. The pipeline’s output is a non-redundant protein database, formatted to be interpreted by typical peptide identification software. Moreover, the user can adjust the database dimension and composition by applying different criteria to include only a certain number of genes or species. Thus, ProteoParc is an easy and fast, custom-made bioinformatic tool useful for future paleoproteomics analysis in ancient samples related to understudied organisms. Introduction It is well known that biomolecules can remain in the ecosphere for up to millions of years ( Cappellini et al., 2018 ) . Although palaeogenetics has been proven to be a productive discipline for studying the past ( Dalén et al., 2023 ) , DNA is unstable and rapidly degraded or banished in post-mortem conditions by temperature and microbial communities ( Brunson & Reich, 2019 ) . By contrast, proteins are generally smaller and more compact, reducing their susceptibility to chemical degradation ( Warinner et al., 2022 ) . While the oldest sequenced aDNA belongs to exceptionally well-preserved sediment samples dated approximately 2 million years (Ma) ( Kjær et al., 2022 ) , recent studies have identified ancient protein sequences dating back to 21–24 Ma ( Paterson et al., 2025 ) . 
In this sense, paleoproteomics has emerged as a molecular science discipline that aims to recover, identify, and analyse peptides in ancient tissues or biological remains ( Hendy et al., 2018 ) . In the last decade, protein analysis through shotgun tandem mass spectrometry (MS/MS) helped to unravel some aspects of long-established evolutionary areas, such as palaeontology or archaeology, e.g. to solve the phylogeny of extinct species ( Cappellini et al., 2019 ; Presslee et al., 2019 ; Welker et al., 2019 , 2020) , perform chromosomal sexing on fossil individuals ( Mikšík et al., 2023 ) , or study ancient human behaviour, from Palaeolithic (M. Ma et al., 2024 ) to recent historical events ( Di Gianvincenzo et al., 2023 ; Mackie et al., 2018 ) . Additionally, the identification of species through zooarchaeology by mass spectrometry (ZooMS), a collagen fingerprinting technique, has been established as a powerful workflow for taxonomic assessment in bone remains ( Richter et al., 2022 ) . The growth of genomics and the price reduction of genome sequencing have also enabled the possibility of analysing the proteome of non-model organisms, that is, the vast majority of species that have not been systematically studied. The applications of this horizon range from biomedical advances to new biological discoveries ( Heck & Neely, 2020 ) . Reference protein databases are a key aspect of mass-spectrometry-aided peptide analyses, as they serve as the source for: 1) in ZooMS, identifying the sequence behind a specific marker for its annotation in the standard nomenclature ( Brown et al., 2021 ) ; and 2) in shotgun MS/MS proteomics, generating theoretical mass-to-charge spectra to match experimental data and infer amino acid sequences ( Steen & Mann, 2004 ) . In MS/MS tissue-specific analyses, the size and composition of these databases heavily affect the identification output. 
Large-database approaches, covering sequences of many species or tissues, raise the computational cost and likelihood of identifying false positives when using peptide identification software ( Rodriguez Palomo et al., 2024 ) –e.g. MASCOT ( Perkins et al., 1999 ) , SEQUEST ( MacCoss et al., 2002 ) , PEAKS (B. Ma et al., 2003 ) , pFind ( Li et al., 2005 ) , MaxQuant ( Cox & Mann, 2008 ) , Byonic (Bern et al., 2012) , MsFragger ( Kong et al., 2017 ) , MetaMorpheus ( Solntsev et al., 2018 ) , or Proteome Discoverer ( Orsburn, 2021 ) –. Thus, narrow databases identify peptides with higher statistical confidence, but sequences not present in the database will most likely not be detected, resulting in a set of unidentified experimental spectra ( Welker, 2018 ) . In shotgun palaeoproteomic analyses, this is the norm more than the exception, as ancient peptides are potentially different to the sequences present in reference databases due to unpredictable evolutionary changes. Particularly, a recent meta-analysis revealed that more than 94% of the experimental spectra remain unidentified in this context ( Chiang, Welker, et al., 2024 ). Nowadays, narrow databases for ancient and non-model proteomics contain proteins known to be expressed in the tissue sample type. In the specific case of paleoproteomics, sequences are taken from an extant sister clade of the sampled extinct species ( Taurozzi et al., 2024 ) . As many repositories must be consulted (e.g. UniProt ( The UniProt Consortium, 2025 ) or RefSeq ( O’Leary et al., 2016 ) ), the manual process of building a database is repetitive and time-consuming, leading to format incongruences and missing data; these issues complicate any downstream analysis and may lead to errors in interpretation. Moreover, this process must be repeated each time a new species and tissue combination is analysed or even within the same study to optimise the reference database composition. 
The increased interest in ancient and non-model protein studies makes it likely that more taxa will be studied under these methodologies. Thus, a solution to automatically generate custom and reproducible reference protein multi-FASTA files is essential for efficiency and consistency. To address this issue, we propose ProteoParc, an easy-to-use software tool to generate optimised reference databases by systematically downloading and processing protein sequences from the most widely used public repositories. Methods Software Overview ProteoParc is coded as a bioinformatic pipeline, concatenating scripts written in the Python and R programming languages ( github.com/guillecarrillo/proteoparc ). Its execution relies on a Python script (proteoparc.py), which any Linux-like terminal can execute. In short, the protein sequence download mechanism is based on iteratively searching records at UniParc ( uniprot.org/help/uniparc ), a non-redundant archive that stores peptide sequences from more than 20 repositories. The search scope is focused on a particular clade, specified by its NCBI TaxID. This ID is unique to each described taxon and shared between public sequence repositories. Alternatively, the search can be more specific if a gene list is input, which results in downloading only the proteins annotated under the genes in the list. This way, the database specificity increases, reducing downstream computational times and increasing the statistical confidence of peptide identifications in MS/MS analysis. After the database is built, records might be processed under two optional operations: a) remove redundant sequences to reduce extra comparisons by MS/MS identification software, or b) generate an aligned version of the recovered proteins to easily detect incomplete and low-quality sequences in manual inspection by the user. 
To summarise the metadata information, ProteoParc parses the final database version; results are output in a collection of tables, showing the number of genes, species, and repositories represented. Furthermore, a set of plots is generated to visually inspect for missing proteins or species not present in the repositories. Although its use can be diverse, ProteoParc is intended to generate reference databases for non-model organisms and ancient protein analysis. As previously discussed in this context, it is crucial to include protein information from various extant and extinct species. This relates to either generating input databases for MS/MS identification software or building collagen alignments to analyse ZooMS markers among different species. Workflow Design ProteoParc’s execution is divided into three main steps: Download, Processing and Metadata ( Figure 1A ) . Each of them is executed consecutively once the previous step has finished. They can also be run separately by executing the scripts in individual runs, allowing manual curation between steps. Two inputs, Project name and TaxID, are mandatory, while a list of genes is optional. Without a gene list, the whole clade proteome is downloaded. The output location may be modified, with the working directory being the default output path. The user can also deactivate the Remove Redundancy or Alignment processes before the execution to reduce computational time (see Execution and Time Performance ). Download figure Open in new tab Figure 1. A) ProteoParc’s workflow overview. (1) Protein records are downloaded from UniParc under a specific TaxID and an optional list of genes, then written into a multi-FASTA file. (2) Duplicated and fragmentary sequences can be filtered out, and an aligned version of each protein in the multi-FASTA can be generated for manual curation purposes. (3) Finally, metadata information is collected from the multi-FASTA database and presented as a set of tables and plots. 
Optional inputs or processes are marked with an asterisk. B) Output display of a run with all the processes being activated: (1) the multi-FASTA protein database (database.fasta); (2.1) a folder containing the removed redundant records and the unfiltered database (fasta_remove_redundancy); (2.2) a folder containing the protein sequence alignments (alignment_per_gene); (3) a folder containing the metadata information as a set of tables and plots (metadata). 1. Download step A multi-FASTA database is built from all protein isoforms and variants that fulfil each search requirement, i.e. the combination of a gene name and/or a TaxID. Proteins are downloaded through an informatics procedure, i.e. an application programming interface (API), which extracts information from a software component, in this case, UniParc ( Ahmad et al., 2025 ) . The query search parameters are detailed within a URL; if a gene list is input, the search process is repeated, including a different gene name within the URL query for each iteration. As a sequence can be associated with more than one species and repository, three JSON files are generated, annotating the species, TaxID and source repositories of each record. After the download, all headers are rearranged into the same format, including information about the source repository, UniParc’s ID, last update, protein name, species name, species TaxID, gene name and sequence version. 2. Processing step Two optional processes might be performed using the multi-FASTA database as input: a) Remove redundancy : Although UniParc is a non-redundant archive, proteins can be downloaded twice if they are annotated under two different gene names present in the gene list. Thus, redundant proteins present in two or more different records are removed until only one copy remains. This process also removes sequences that are fragments (substrings) of other records in the database, for instance, short protein isoforms or incomplete sequences. 
b) Alignment : An aligned version of the database is generated for each annotated gene. Proteins are first sorted based on their gene name and then aligned using mafft v7.525 ( Katoh & Standley, 2013 ) under the --auto argument, which automatically selects the optimal strategy according to the sequence size. Records without a gene name will not be aligned through this process. 3. Metadata step Metadata information is extracted from the multi-FASTA record headers and presented as a set of tables, text files, and plots. This information is completed with the JSON dictionaries for those records that are associated with more than one species or source repository. However, this option can be disabled to just rely on the metadata within the multi-FASTA. In any case, JSON files are removed after the metadata has been extracted. The metadata tables summarise the number of retrieved genes, species and repositories, in addition to the number of genes retrieved per species. On the other hand, plots are generated to visualise the content of the database easily. All information related to the database name, download date, and metadata is condensed into a summary text file. Warning messages are printed there when records have missing information or if genes present in the gene list are not found. Results Output & Formatting ProteoParc’s output is structured in three categories regarding the steps underlying the pipeline ( Figure 1B ) : 1) multi-FASTA protein database - Download step ; All the records are parsed and formatted similarly, based on the UniProt structure. The metadata present in each header consists of information about the source origin, UniParc ID, last update, protein name, species, TaxID, and gene name. The protein name and gene name might be omitted in cases when a gene list is not input, as some proteins in the whole species proteome might not be properly annotated. 
Commas (,), semicolons (;) or colons (:) are removed, if present, to avoid interference with CSV formatting. 2.1) Redundancy FASTA files (optional) - Processing step ; A filtered multi-FASTA database (without redundant sequences) replaces the original database version. Both the unfiltered database (unfiltered_database.fasta) and a file storing the removed records (redundant_records.fasta) are output in the fasta_remove_redundancy directory. 2.2) Protein sequence alignments (optional) - Processing step ; All proteins in the database annotated under the same gene will be aligned and output as a multi-FASTA file with hyphens (“-”) representing gaps. All the alignments are stored in the alignment_per_gene directory, under the name x_aligned.fasta ( x being a specific gene name). 3) Metadata information - Metadata process ; six metadata files are output displaying: I) count of retrieved genes (genes_retrieved.csv), II) count of retrieved species (species_retrieved.csv), III) count of repositories from which the records stem (repositories_employed.csv), IV) count of retrieved genes per species (species_genes.csv), V) parsed information of each record (records_info.csv), and VI) summarized information about all the metadata metrics, including warnings if records have missing information. In those tables, since each record can be associated with multiple species and/or repositories, the count of these features may exceed the total count of records. An extra text file (VII) is also generated if a gene list is input to build the database, with all targeted genes that were not found after the download step (genes_not_retrieved.txt). Additionally, some metadata information is plotted to visualise the presence (species_per_gene_grid.png) and abundance (species_per_gene_barplot.png) of each gene per species ( Figure 2 ) . Databases with more than 15 species or 35 genes (arbitrary criteria) might overload the plots, making them unintelligible. 
In this case, a warning message is printed in the terminal. Download figure Open in new tab Figure 2. Metadata output plots to visualise the protein content and counts of a Proboscidea enamelome database (TaxID: 9779; gene list: documentation/example/enamelome.txt). A) Grid plot showing the presence or absence of a gene for each species. B) Barplot showing the number of different protein records (i.e. isoforms or protein variants) per species and gene. If a record is associated with multiple species, it is counted more than once. As a consequence, counts between species might overlap some records, eliminating the one-to-one correspondence between records and species per gene counts. A step-by-step tutorial can be found in ProteoParc’s ‘documentation’ folder on GitHub. Additionally, two demo output results are stored there, consisting of a whole proteome download of the Mammuthus genus and an enamel tissue-specific (enamelome) download of the Proboscidea order. The enamelome gene list (example/enamelome.txt), employed to target the Proboscidea download, contains 15 usually detected enamel and dentine proteins reported in Taurozzi et al., 2024 . Execution and Time Performance Five scenarios were selected to test the performance of ProteoParc, covering different orders of magnitude in terms of records downloaded: Proboscidea enamel proteome (59 records ∼ $10^{1}$, TaxID: 9779); human enamel proteome (240 records ∼ $10^{2}$, TaxID: 9606); Mammuthus whole proteome (1249 records ∼ $10^{3}$, TaxID: 37348); mammalian enamel proteome (15544 records ∼ $10^{4}$, TaxID: 40674); and Proboscidea whole proteome (133166 records ∼ $10^{5}$, TaxID: 9779). For each scenario, all processes – Download, Remove redundancy, Align and Metadata – were consecutively run on a cluster node with 2 x AMD EPYC 7H12 64-Core processors and a mean download speed of 50 MB/s. Each test was run 20 times to compute the mean real (wall-clock) execution time using the bash time command. 
All the “enamel proteome” tests were generated by inputting an enamelome gene list. ProteoParc’s execution time significantly increases with the number of records downloaded. This time increment is mostly due to the Remove redundancy and Align processes ( Figure 3 ) , which recursively compare all the sequences present in the database. Specifically, the Align process is the main driver of computational time, while the Remove redundancy process influences the entire execution time only when downloading more than $10^{5}$ records. The execution time for gene-targeted downloads (scenarios $10^{1}$, $10^{2}$, and $10^{4}$) and small proteomes (scenario $10^{3}$) took less than 5 minutes in each process. Those results reflect realistic scenarios as the pipeline is intended to build tissue-specific (narrow) protein databases. Even though the $10^{5}$ scenario run took more than four hours to finish, the execution time is still feasible for an HPC job execution. Thus, the Align step might be omitted to remarkably reduce computational time in download scenarios with more than $10^{5}$ records, also considering that manually checking that many records is unfeasible. The Remove redundancy step might be deactivated as well, but then repetitive sequences will be present in the database, possibly compromising downstream analysis. Download figure Open in new tab Figure 3. ProteoParc’s execution time in five different download scenarios: Proboscidea enamel proteome (59 records ∼ $10^{1}$, TaxID: 9779); human enamel proteome (240 records ∼ $10^{2}$, TaxID: 9606); Mammuthus whole proteome (1249 records ∼ $10^{3}$, TaxID: 37348); mammalian enamel proteome (15544 records ∼ $10^{4}$, TaxID: 40674); and Proboscidea whole proteome (133166 records ∼ $10^{5}$, TaxID: 9779). All processes were consecutively run 20 times to calculate the mean execution time on a node with 2 x AMD EPYC 7H12 64-Core processors and a mean download speed of 50 MB/s. 
The standard deviation range is plotted at all points, but it only shows when the values are high enough to appear in the plot scale. The left plot is a zoom-in of the right plot (dashed area), removing the $10^{5}$ scenario. The mean execution time grows exponentially with the number of proteins downloaded, and the Align process is the most time-consuming under all scenarios. In the $10^{5}$ scenario, the Remove redundancy process also plays a significant role in the whole computational time. Conclusion Current bioinformatic tools employed in ancient and non-model organism studies have been designed to perform high-quality sample proteomics, but modified in a certain way to fit low-quality scenarios. Therefore, specific software still needs to be developed to assist in quality control, validation, and analysis of ancient and non-model proteomics. Software and pipelines such as deamiDATE ( Ramsøe et al., 2020 ) , PaleoProPhyler ( Patramanis et al., 2023 ) or Anubis ( Chiang, Nair, et al., 2024 ) are good examples of helpful bioinformatic tools in these contexts. ProteoParc follows this trend, providing the community with a user-friendly and versatile tool to build reference databases for palaeoproteomic and non-model organism studies. It also removes format incompatibilities and provides metadata, allowing for database testing within different analyses. The option to build non-redundant and gene-specific databases will reduce computational costs and yield statistically more confident peptide sequences in MS/MS identification software. Moreover, custom databases can also behave as a starting point to be extended using newly translated sequences from genomic data. Thus, ProteoParc aims to help in one of the first critical steps of ancient protein data analysis, making research more reproducible and comparable in the newly developed field of paleoproteomics and non-model organisms analyses. 
Data Availability ProteoParc: github.com/guillecarrillo/proteoparc Execution time data: github.com/guillecarrillo/execution_time_proteoparc Author Contributions Conceptualisation: E.L. and G.C. with assistance and guidance from all co-authors. Programming and analyses: G.C. Writing: G.C. with input from all co-authors. Conflicts of Interest The authors declare no conflict of interest. Acknowledgements G.C.M. is supported by the programa predoctoral AGAUR-FI ajuts (2024 FI-1 00211) Joan Oró from Secretaria d’Universitats i Recerca, Departament de Recerca i Universitats de la Generalitat de Catalunya and the European Social Fund Plus. J.K. is supported by the European Union’s Horizon 2020 research and innovation program under the Marie Sklodowska-Curie “PUSHH” training network, grant agreement No. 861389. T.M.B. is supported by funding from the European Research Council (ERC) under the European Union’s Horizon 2020 research and innovation programme, grant agreement No. 864203 and PID2021-126004NB-100 (MICIIN/FEDER, UE). This work is part of R+D+I projects PID2020-116908GB-I00 and PID2020-117289GB-I00, funded by the Agencia Estatal de Investigación of the Spanish Ministerio de Ciencia e Innovación (MCIN/AEI/10.13039/501100011033/). Research has also been supported by the Agència de Gestió d’Ajuts Universitaris i de Recerca of the Generalitat de Catalunya (2001 SGR 00620). We thank Ricardo Fong-Zazueta, Amanda Gutierrez, Joseph D. Orkin, Joanna L. Kelley, and Luis Ferrández for their feedback throughout the development stage. Funder Information Declared programa predoctoral AGAUR-FI ajuts Joan Oró from Secretaria d’Universitats i Recerca, Departament de Recerca i Universitats de la Generalitat de Catalunya and the European Social Fund Plus , 2024 FI-1 00211 European Union’s Horizon 2020 research and innovation program under the Marie Sklodowska-Curie “PUSHH” training network , grant agreement No. 
861389 European Research Council (ERC) under the European Union’s Horizon 2020 research and innovation programme , grant agreement No. 864203 and PID2021-126004NB-100 (MICIIN/FEDER, UE). Agencia Estatal de Investigación of the Spanish Ministerio de Ciencia e Innovación , PID2020-116908GB-I00 and PID2020-117289GB-I00 Agència de Gestió d’Ajuts Universitaris i de Recerca of the Generalitat de Catalunya , 2001 SGR 00620 Footnotes http://github.com/guillecarrillo/proteoparc http://github.com/guillecarrillo/execution_time_proteoparc References ↵ Ahmad , S. , Jose da Costa Gonzales , L. , Bowler-Barnett , E. H. , Rice , D. L. , Kim , M. , Wijerathne , S. , Luciani , A. , Kandasaamy , S. , Luo , J. , Watkins , X. , Turner , E. , Martin , M. J. , & the UniProt Consortium . ( 2025 ). The UniProt website API: facilitating programmatic access to protein knowledge . Nucleic Acids Research, gkaf394 . doi: 10.1093/nar/gkaf394 OpenUrl CrossRef ↵ Brown , S. , Douka , K. , Collins , M. J. , & Richter , K. K. ( 2021 ). On the standardization of ZooMS nomenclature . Journal of Proteomics , 235 , 104041 . doi: 10.1016/j.jprot.2020.104041 OpenUrl CrossRef PubMed ↵ Brunson , K. , & Reich , D. ( 2019 ). The Promise of Paleogenomics Beyond Our Own Species . Trends in Genetics , 35 ( 5 ), 319 – 329 . doi: 10.1016/j.tig.2019.02.006 OpenUrl CrossRef PubMed ↵ Cappellini , E. , Prohaska , A. , Racimo , F. , Welker , F. , Pedersen , M. W. , Allentoft , M. E. , de Barros Damgaard , P. , Gutenbrunner , P. , Dunne , J. , Hammann , S. , Roffet-Salque , M. , Ilardo , M. , Moreno-Mayar , J. V. , Wang , Y. , Sikora , M. , Vinner , L. , Cox , J. , Evershed , R. P. , & Willerslev , E. ( 2018 ). Ancient Biomolecules and Evolutionary Inference . In Annual Review of Biochemistry (Vol. 87 , Issue Volume 87 , 2018, pp. 1029 – 1060 ). Annual Reviews. doi: 10.1146/annurev-biochem-062917-012002 OpenUrl CrossRef PubMed ↵ Cappellini , E. , Welker , F. , Pandolfi , L. , Ramos-Madrigal , J. , Samodova , D. 
, Rüther , P. L. , Fotakis , A. K. , Lyon , D. , Moreno-Mayar , J. V. , Bukhsianidze , M. , Rakownikow Jersie-Christensen , R. , Mackie , M. , Ginolhac , A. , Ferring , R. , Tappen , M. , Palkopoulou , E. , Dickinson , M. R. , Stafford , T. W. , Chan , Y. L. , … Willerslev , E. ( 2019 ). Early Pleistocene enamel proteome from Dmanisi resolves Stephanorhinus phylogeny . Nature , 574 ( 7776 ), 103 – 107 . doi: 10.1038/s41586-019-1555-y OpenUrl CrossRef PubMed ↵ Chiang , Y. , Nair , B. A. B. , Ramsøe , M. E. E. , Ravnsborg , T. , Jensen , O. N. , & Collins , M. J. ( 2024 ). Anubis: A multi-level authentication scale for ancient proteins using random forest classification . bioRxiv , 2024.11.15.623824. doi: 10.1101/2024.11.15.623824 OpenUrl Abstract / FREE Full Text ↵ Chiang , Y. , Welker , F. , & Collins , M. ( 2024 ). Spectra without stories: Reporting 94% dark and unidentified ancient proteomes . Open Research Europe , 4 ( 71 ). doi: 10.12688/openreseurope.17225.1 OpenUrl CrossRef PubMed ↵ Cox , J. , & Mann , M. ( 2008 ). MaxQuant enables high peptide identification rates, individualized p.p.b.-range mass accuracies and proteome-wide protein quantification . Nature Biotechnology , 26 ( 12 ), 1367 – 1372 . doi: 10.1038/nbt.1511 OpenUrl CrossRef PubMed Web of Science ↵ Dalén , L. , Heintzman , P. D. , Kapp , J. D. , & Shapiro , B. ( 2023 ). Deep-time paleogenomics and the limits of DNA survival . Science , 382 ( 6666 ), 48 – 53 . doi: 10.1126/science.adh7943 OpenUrl CrossRef ↵ Di Gianvincenzo , F. , Andersen , C. K. , Filtenborg , T. , Mackie , M. , Ernst , M. , Ramos Madrigal , J. , Olsen , J. V. , Wadum , J. , & Cappellini , E. ( 2023 ). Proteomic identification of beer brewing products in the ground layer of Danish Golden Age paintings . Science Advances , 9 ( 21 ), eade7686 . doi: 10.1126/sciadv.ade7686 OpenUrl CrossRef PubMed ↵ Heck , M. , & Neely , B. A. ( 2020 ). Proteomics in Non-model Organisms: A New Analytical Frontier . 
Journal of Proteome Research , 19 ( 9 ), 3595 – 3606 . doi: 10.1021/acs.jproteome.0c00448 OpenUrl CrossRef PubMed ↵ Hendy , J. , Welker , F. , Demarchi , B. , Speller , C. , Warinner , C. , & Collins , M. J. ( 2018 ). A guide to ancient protein studies . Nature Ecology & Evolution , 2 ( 5 ), 791 – 799 . doi: 10.1038/s41559-018-0510-x OpenUrl CrossRef PubMed ↵ Katoh , K. , & Standley , D. M. ( 2013 ). MAFFT Multiple Sequence Alignment Software Version 7: Improvements in Performance and Usability . Molecular Biology and Evolution , 30 ( 4 ), 772 – 780 . doi: 10.1093/molbev/mst010 OpenUrl CrossRef PubMed Web of Science ↵ Kjær , K. H. , Winther Pedersen , M. , De Sanctis , B. , De Cahsan , B. , Korneliussen , T. S. , Michelsen , C. S. , Sand , K. K. , Jelavić , S. , Ruter , A. H. , Schmidt , A. M. A. , Kjeldsen , K. K. , Tesakov , A. S. , Snowball , I. , Gosse , J. C. , Alsos , I. G. , Wang , Y. , Dockter , C. , Rasmussen , M. , Jørgensen , M. E. , … PhyloNorway Consortium . ( 2022 ). A 2-million-year-old ecosystem in Greenland uncovered by environmental DNA . Nature , 612 ( 7939 ), 283 – 291 . doi: 10.1038/s41586-022-05453-y OpenUrl CrossRef PubMed ↵ Kong , A. T. , Leprevost , F. V. , Avtonomov , D. M. , Mellacheruvu , D. , & Nesvizhskii , A. I. ( 2017 ). MSFragger: Ultrafast and comprehensive peptide identification in mass spectrometry–based proteomics . Nature Methods , 14 ( 5 ), 513 – 520 . doi: 10.1038/nmeth.4256 OpenUrl CrossRef PubMed ↵ Li , D. , Fu , Y. , Sun , R. , Ling , C. X. , Wei , Y. , Zhou , H. , Zeng , R. , Yang , Q. , He , S. , & Gao , W. ( 2005 ). pFind: A novel database-searching software system for automated peptide and protein identification via tandem mass spectrometry . Bioinformatics , 21 ( 13 ), 3049 – 3050 . doi: 10.1093/bioinformatics/bti439 OpenUrl CrossRef PubMed Web of Science ↵ Ma , B. , Zhang , K. , Hendrie , C. , Liang , C. , Li , M. , Doherty-Kirby , A. , & Lajoie , G. ( 2003 ). 
PEAKS: powerful software for peptide de novo sequencing by tandem mass spectrometry . Rapid Communications in Mass Spectrometry , 17 ( 20 ), 2337 – 2342 . doi: 10.1002/rcm.1196 OpenUrl CrossRef PubMed Web of Science ↵ Ma , M. , Lu , M. , Sun , R. , Zhu , Z. , Fuller , D. Q. , Guo , J. , He , G. , Yang , X. , Tan , L. , Lu , Y. , Dong , J. , Liu , R. , Yang , J. , Li , B. , Guo , T. , Li , X. , Zhao , D. , Zhang , Y. , Wang , C.-C. , & Dong , G. ( 2024 ). Forager-farmer transition at the crossroads of East and Southeast Asia 4900 years ago . Science Bulletin , 69 ( 1 ), 103 – 113 . doi: 10.1016/j.scib.2023.10.015 OpenUrl CrossRef PubMed ↵ MacCoss , M. J. , Wu , C. C. , & Yates , J. R. ( 2002 ). Probability-Based Validation of Protein Identifications Using a Modified SEQUEST Algorithm . Analytical Chemistry , 74 ( 21 ), 5593 – 5599 . doi: 10.1021/ac025826t OpenUrl CrossRef PubMed ↵ Mackie , M. , Rüther , P. , Samodova , D. , Di Gianvincenzo , F. , Granzotto , C. , Lyon , D. , Peggie , D. A. , Howard , H. , Harrison , L. , Jensen , L. J. , Olsen , J. V. , & Cappellini , E. ( 2018 ). Palaeoproteomic Profiling of Conservation Layers on a 14th Century Italian Wall Painting . Angewandte Chemie International Edition , 57 ( 25 ), 7369 – 7374 . doi: 10.1002/anie.201713020 OpenUrl CrossRef ↵ Mikšík , I. , Morvan , M. , & Brůžek , J. ( 2023 ). Peptide analysis of tooth enamel – A sex estimation tool for archaeological, anthropological, or forensic research . Journal of Separation Science , 46 ( 15 ), 2300183 . doi: 10.1002/jssc.202300183 OpenUrl CrossRef ↵ O’Leary , N. A. , Wright , M. W. , Brister , J. R. , Ciufo , S. , Haddad , D. , McVeigh , R. , Rajput , B. , Robbertse , B. , Smith-White , B. , Ako-Adjei , D. , Astashyn , A. , Badretdin , A. , Bao , Y. , Blinkova , O. , Brover , V. , Chetvernin , V. , Choi , J. , Cox , E. , Ermolaeva , O. , … Pruitt , K. D. ( 2016 ). 
Reference sequence (RefSeq) database at NCBI: current status, taxonomic expansion, and functional annotation . Nucleic Acids Research , 44 ( D1 ), D733 – D745 . doi: 10.1093/nar/gkv1189 OpenUrl CrossRef PubMed ↵ Orsburn , B. C. ( 2021 ). Proteome Discoverer—A Community Enhanced Data Processing Suite for Protein Informatics . Proteomes , 9 ( 1 ). doi: 10.3390/proteomes9010015 OpenUrl CrossRef PubMed ↵ Paterson , R. S. , Mackie , M. , Capobianco , A. , Heckeberg , N. S. , Fraser , D. , Demarchi , B. , Munir , F. , Patramanis , I. , Ramos-Madrigal , J. , Liu , S. , Ramsøe , A. D. , Dickinson , M. R. , Baldreki , C. , Gilbert , M. , Sardella , R. , Bellucci , L. , Scorrano , G. , Leonardi , M. , Manica , A. , … Cappellini , E. ( 2025 ). Phylogenetically informative proteins from an Early Miocene rhinocerotid . Nature . doi: 10.1038/s41586-025-09231-4 OpenUrl CrossRef ↵ Patramanis , I. , Ramos-Madrigal , J. , Cappellini , E. , & Racimo , F. ( 2023 ). PaleoProPhyler: A reproducible pipeline for phylogenetic inference using ancient proteins . Peer Community Journal , 3 . doi: 10.24072/pcjournal.344 OpenUrl CrossRef ↵ Perkins , D. N. , Pappin , D. J. C. , Creasy , D. M. , & Cottrell , J. S. ( 1999 ). Probability-based protein identification by searching sequence databases using mass spectrometry data . Electrophoresis , 20 ( 18 ), 3551 – 3567 . doi: 10.1002/(SICI)1522-2683(19991201)20:18<3551::AID-ELPS3551>3.0.CO;2-2 OpenUrl CrossRef PubMed Web of Science ↵ Presslee , S. , Slater , G. J. , Pujos , F. , Forasiepi , A. M. , Fischer , R. , Molloy , K. , Mackie , M. , Olsen , J. V. , Kramarz , A. , Taglioretti , M. , Scaglia , F. , Lezcano , M. , Lanata , J. L. , Southon , J. , Feranec , R. , Bloch , J. , Hajduk , A. , Martin , F. M. , Salas Gismondi , R. , … MacPhee , R. D. E. ( 2019 ). Palaeoproteomics resolves sloth relationships . Nature Ecology & Evolution , 3 ( 7 ), 1121 – 1130 . doi: 10.1038/s41559-019-0909-z OpenUrl CrossRef ↵ Ramsøe , A. , van Heekeren , V. , Ponce , P. , Fischer , R. 
, Barnes , I. , Speller , C. , & Collins , M. J. ( 2020 ). DeamiDATE 1.0: Site-specific deamidation as a tool to assess authenticity of members of ancient proteomes . Journal of Archaeological Science , 115 , 105080 . doi: 10.1016/j.jas.2020.105080 OpenUrl CrossRef ↵ Richter , K. K. , Codlin , M. C. , Seabrook , M. , & Warinner , C. ( 2022 ). A primer for ZooMS applications in archaeology . Proceedings of the National Academy of Sciences , 119 ( 20 ), e2109323119 . doi: 10.1073/pnas.2109323119 OpenUrl CrossRef PubMed ↵ Rodriguez Palomo , I. , Nair , B. , Chiang , Y. , Dekker , J. , Dartigues , B. , Mackie , M. , Evans , M. , Macleod , R. , Olsen , J. V. , & Collins , M. J. ( 2024 ). Benchmarking the identification of a single degraded protein to explore optimal search strategies for ancient proteins . Peer Community Journal , 4 . doi: 10.24072/pcjournal.491 OpenUrl CrossRef ↵ Solntsev , S. K. , Shortreed , M. R. , Frey , B. L. , & Smith , L. M. ( 2018 ). Enhanced Global Post-translational Modification Discovery with MetaMorpheus . Journal of Proteome Research , 17 ( 5 ), 1844 – 1851 . doi: 10.1021/acs.jproteome.7b00873 OpenUrl CrossRef PubMed ↵ Steen , H. , & Mann , M. ( 2004 ). The abc’s (and xyz’s) of peptide sequencing . Nature Reviews Molecular Cell Biology , 5 ( 9 ), 699 – 711 . doi: 10.1038/nrm1468 OpenUrl CrossRef PubMed Web of Science ↵ Taurozzi , A. J. , Rüther , P. L. , Patramanis , I. , Koenig , C. , Sinclair Paterson , R. , Madupe , P. P. , Harking , F. S. , Welker , F. , Mackie , M. , Ramos-Madrigal , J. , Olsen , J. V. , & Cappellini , E. ( 2024 ). Deep-time phylogenetic inference by paleoproteomic analysis of dental enamel . Nature Protocols , 19 ( 7 ), 2085 – 2116 . doi: 10.1038/s41596-024-00975-3 OpenUrl CrossRef ↵ The UniProt Consortium . ( 2025 ). UniProt: The Universal Protein Knowledgebase in 2025 . Nucleic Acids Research , 53 ( D1 ), D609 – D617 . doi: 10.1093/nar/gkae1010 OpenUrl CrossRef PubMed ↵ Warinner , C. , Korzow Richter , K. 
, & Collins , M. J. ( 2022 ). Paleoproteomics . Chemical Reviews , 122 ( 16 ), 13401 – 13446 . doi: 10.1021/acs.chemrev.1c00703 OpenUrl CrossRef PubMed ↵ Welker , F. ( 2018 ). Elucidation of cross-species proteomic effects in human and hominin bone proteome identification through a bioinformatics experiment . BMC Evolutionary Biology , 18 ( 1 ), 23 . doi: 10.1186/s12862-018-1141-1 OpenUrl CrossRef PubMed Welker , F. , Ramos-Madrigal , J. , Gutenbrunner , P. , Mackie , M. , Tiwary , S. , Rakownikow Jersie-Christensen , R. , Chiva , C. , Dickinson , M. R. , Kuhlwilm , M. , de Manuel , M. , Gelabert , P. , Martinón-Torres , M. , Margvelashvili , A. , Arsuaga , J. L. , Carbonell , E. , Marques-Bonet , T. , Penkman , K. , Sabidó , E. , Cox , J. , … Cappellini , E. ( 2020 ). The dental proteome of Homo antecessor . Nature , 580 ( 7802 ), 235 – 238 . doi: 10.1038/s41586-020-2153-8 OpenUrl CrossRef ↵ Welker , F. , Ramos-Madrigal , J. , Kuhlwilm , M. , Liao , W. , Gutenbrunner , P. , de Manuel , M. , Samodova , D. , Mackie , M. , Allentoft , M. E. , Bacon , A.-M. , Collins , M. J. , Cox , J. , Lalueza-Fox , C. , Olsen , J. V. , Demeter , F. , Wang , W. , Marques-Bonet , T. , & Cappellini , E. ( 2019 ). Enamel proteome shows that Gigantopithecus was an early diverging pongine . Nature , 576 ( 7786 ), 262 – 265 . doi: 10.1038/s41586-019-1728-8 OpenUrl CrossRef PubMed View the discussion thread. Back to top Previous Next Posted August 02, 2025. Download PDF Data/Code Email Thank you for your interest in spreading the word about bioRxiv. NOTE: Your email address is requested solely to identify you as the sender of this article. Your Email * Your Name * Send To * Enter multiple addresses on separate lines or separate them with commas. 
You are going to email the following ProteoParc: A tool to generate protein reference databases for ancient and non-model organisms Message Subject (Your Name) has forwarded a page to you from bioRxiv Message Body (Your Name) thought you would like to see this page from the bioRxiv website. Your Personal Message CAPTCHA This question is for testing whether or not you are a human visitor and to prevent automated spam submissions. Share ProteoParc: A tool to generate protein reference databases for ancient and non-model organisms Guillermo Carrillo-Martin , Johanna Krueger , Tomas Marques-Bonet , Esther Lizano bioRxiv 2025.07.31.667843; doi: https://doi.org/10.1101/2025.07.31.667843 Share This Article: Copy Citation Tools ProteoParc: A tool to generate protein reference databases for ancient and non-model organisms Guillermo Carrillo-Martin , Johanna Krueger , Tomas Marques-Bonet , Esther Lizano bioRxiv 2025.07.31.667843; doi: https://doi.org/10.1101/2025.07.31.667843 Citation Manager Formats BibTeX Bookends EasyBib EndNote (tagged) EndNote 8 (xml) Medlars Mendeley Papers RefWorks Tagged Ref Manager RIS Zotero Tweet Widget Facebook Like Google Plus One Subject Area Bioinformatics Subject Areas All Articles Animal Behavior and Cognition (7624) Biochemistry (17651) Bioengineering (13871) Bioinformatics (41884) Biophysics (21424) Cancer Biology (18566) Cell Biology (25463) Clinical Trials (138) Developmental Biology (13365) Ecology (19867) Epidemiology (2067) Evolutionary Biology (24290) Genetics (15590) Genomics (22477) Immunology (17714) Microbiology (40331) Molecular Biology (17148) Neuroscience (88487) Paleontology (666) Pathology (2828) Pharmacology and Toxicology (4817) Physiology (7635) Plant Biology (15114) Scientific Communication and Education (2044) Synthetic Biology (4286) Systems Biology (9815) Zoology (2268)

Text is read by the "Ask this paper" AI Q&A widget below. Extraction quality varies by source — PMC NXML preserves structure cleanly, OA-HTML may include some navigation residue, and OA-PDF can have broken hyphenation. The publisher copy (via DOI) is the canonical version.

My notes (saved in your browser only)

Ask this paper AI returns verbatim quotes from the full text · source: preprint-html

Answers must be backed by verbatim quotes from this paper's full text. Hallucinated quotes are dropped automatically; if no verbatim passage answers the question, we say so. How this works

Citation neighborhood (no data yet)

We don't have any in-corpus citations linked to this paper yet. This is a recent paper (2025) — citers typically take a year or two to land, and the OpenAlex reference graph may still be filling in.

Source provenance

europepmc
last seen: 2026-05-20T01:45:00.602351+00:00
unpaywall
last seen: 2026-05-21T05:10:58.409756+00:00
License: CC-BY-NC-ND-4.0