diff --git a/joss_paper/flowchart.png b/joss_paper/flowchart.png deleted file mode 100644 index 6338044..0000000 Binary files a/joss_paper/flowchart.png and /dev/null differ diff --git a/joss_paper/paper.bib b/joss_paper/paper.bib deleted file mode 100644 index 4b6de7e..0000000 --- a/joss_paper/paper.bib +++ /dev/null @@ -1,182 +0,0 @@ -@misc{DIF, - author = {O. Lindemann and F. Krause}, - title = {Data Integrity Fingerprint (DIF) - A proposal for a human-readable fingerprint of scientific datasets that allows verifying their integrity}, - year = {2021}, - publisher = {GitHub}, - journal = {GitHub repository}, - url = {https://github.com/expyriment/dataintegrityfingerprint} -} - -@online{EU, - author = {{European Parliament, Council of the European Union}}, - title = {Directive (EU) 2019/1024 of the European Parliament and of the Council of 20 June 2019 on open data and the re-use of public sector information}, - year = {2019}, - url = {https://eur-lex.europa.eu/eli/dir/2019/1024/oj}, - urldate = {2021-12-06} -} - - -@article{Wilkinson, - title = {The {FAIR} {Guiding} {Principles} for scientific data management and stewardship}, - volume = {3}, - copyright = {2016 The Author(s)}, - issn = {2052-4463}, - url = {https://www.nature.com/articles/sdata201618}, - doi = {10.1038/sdata.2016.18}, - abstract = {There is an urgent need to improve the infrastructure supporting the reuse of scholarly data. A diverse set of stakeholders—representing academia, industry, funding agencies, and scholarly publishers—have come together to design and jointly endorse a concise and measureable set of principles that we refer to as the FAIR Data Principles. The intent is that these may act as a guideline for those wishing to enhance the reusability of their data holdings. Distinct from peer initiatives that focus on the human scholar, the FAIR Principles put specific emphasis on enhancing the ability of machines to automatically find and use the data, in addition to supporting its reuse by individuals. This Comment is the first formal publication of the FAIR Principles, and includes the rationale behind them, and some exemplar implementations in the community.}, - language = {en}, - number = {1}, - urldate = {2021-12-06}, - journal = {Scientific Data}, - author = {Wilkinson, Mark D. and Dumontier, Michel and Aalbersberg, IJsbrand Jan and Appleton, Gabrielle and Axton, Myles and Baak, Arie and Blomberg, Niklas and Boiten, Jan-Willem and da Silva Santos, Luiz Bonino and Bourne, Philip E. and Bouwman, Jildau and Brookes, Anthony J. and Clark, Tim and Crosas, Mercè and Dillo, Ingrid and Dumon, Olivier and Edmunds, Scott and Evelo, Chris T. and Finkers, Richard and Gonzalez-Beltran, Alejandra and Gray, Alasdair J. G. and Groth, Paul and Goble, Carole and Grethe, Jeffrey S. and Heringa, Jaap and ’t Hoen, Peter A. C. and Hooft, Rob and Kuhn, Tobias and Kok, Ruben and Kok, Joost and Lusher, Scott J. and Martone, Maryann E. and Mons, Albert and Packer, Abel L. and Persson, Bengt and Rocca-Serra, Philippe and Roos, Marco and van Schaik, Rene and Sansone, Susanna-Assunta and Schultes, Erik and Sengstag, Thierry and Slater, Ted and Strawn, George and Swertz, Morris A. and Thompson, Mark and van der Lei, Johan and van Mulligen, Erik and Velterop, Jan and Waagmeester, Andra and Wittenburg, Peter and Wolstencroft, Katherine and Zhao, Jun and Mons, Barend}, - month = mar, - year = {2016}, - note = {Bandiera\_abtest: a -Cg\_type: Nature Research Journals -Number: 1 -Primary\_atype: Comments \& Opinion -Publisher: Nature Publishing Group -Subject\_term: Publication characteristics;Research data -Subject\_term\_id: publication-characteristics;research-data}, - pages = {160018}, -} - - -@techreport{Tan, - type = {preprint}, - title = {Research data repositories chosen by researchers across broad range of disciplines, from an analysis of 145,000 data availability statements}, - url = {https://www.authorea.com/users/260319/articles/436166-research-data-repositories-chosen-by-researchers-across-broad-range-of-disciplines-from-an-analysis-of-145-000-data-availability-statements?commit=05d5052a56e7ac63dd07d17fb88c47430b194874}, - urldate = {2021-12-06}, - institution = {Preprints}, - author = {Tan, Serena C and Flanagan, Dave and Morris, Elisha and Graf, Chris}, - month = jul, - year = {2020}, - doi = {10.22541/au.159422974.49069472}, -} - - -@article{Liu, - title = {Digital {Object} {Identifier} ({DOI}) and {DOI} {Services}: {An} {Overview}}, - volume = {71}, - issn = {1865-8423}, - shorttitle = {Digital {Object} {Identifier} ({DOI}) and {DOI} {Services}}, - url = {https://www.degruyter.com/document/doi/10.1515/libri-2020-0018/html}, - doi = {10.1515/libri-2020-0018}, - abstract = {In the establishing anniversary of the two biggest Digital Object Identifier (DOI) registration agencies all over the world, Crossref and DataCite, the paper intends to provide an overview of the development and approaches and of DOI and DOI services, from which scholarly communication has benefited greatly. At first, the author explores the initiation of DOI and differences of DOI from other persistent identifiers. After that, DOIs for different kinds of objects and DOIs’ value in enhancing scholarly communication is discussed; then, in the second part, DOI services at different levels in a pyramid and those particularly in Germany are described. The active involvement of the library world are also introduced here; finally, the current situation and prospects as well as some issues dealing with DOIs and DOI services are investigated in the last part of the paper.}, - language = {en}, - number = {4}, - urldate = {2021-12-06}, - journal = {Libri}, - author = {Liu, Jia}, - month = dec, - year = {2021}, - note = {Publisher: De Gruyter Saur}, - pages = {349--360}, -} - - -@article{Lin, - title = {The {TRUST} {Principles} for digital repositories}, - volume = {7}, - copyright = {2020 This is a U.S. government work and not under copyright protection in the U.S.; foreign copyright protection may apply}, - issn = {2052-4463}, - url = {https://www.nature.com/articles/s41597-020-0486-7}, - doi = {10.1038/s41597-020-0486-7}, - abstract = {As information and communication technology has become pervasive in our society, we are increasingly dependent on both digital data and repositories that provide access to and enable the use of such resources. Repositories must earn the trust of the communities they intend to serve and demonstrate that they are reliable and capable of appropriately managing the data they hold.}, - language = {en}, - number = {1}, - urldate = {2021-12-06}, - journal = {Scientific Data}, - author = {Lin, Dawei and Crabtree, Jonathan and Dillo, Ingrid and Downs, Robert R. and Edmunds, Rorie and Giaretta, David and De Giusti, Marisa and L’Hours, Hervé and Hugo, Wim and Jenkyns, Reyna and Khodiyar, Varsha and Martone, Maryann E. and Mokrane, Mustapha and Navale, Vivek and Petters, Jonathan and Sierman, Barbara and Sokolova, Dina V. and Stockhause, Martina and Westbrook, John}, - month = may, - year = {2020}, - note = {Bandiera\_abtest: a -Cc\_license\_type: cc\_by -Cg\_type: Nature Research Journals -Number: 1 -Primary\_atype: Comments \& Opinion -Publisher: Nature Publishing Group -Subject\_term: Genetic databases;Policy -Subject\_term\_id: genetic-databases;policy}, - pages = {144}, -} - - -@misc{DirHash, - author = {M. IDRASSI}, - title = {DirHash - Windows command line utility to compute hash of directories and files}, - year = {2021}, - publisher = {GitHub}, - journal = {GitHub repository}, - url = {https://github.com/idrassi/DirHash} -} - - -@misc{checksum, - author = {H. Weickert}, - title = {checksum - Creates checksums for files and directories}, - year = {2019}, - publisher = {GitHub}, - journal = {GitHub repository}, - url = {https://github.com/hweickert/checksum} -} - - -@misc{Dirtools, - author = {T. Sileo}, - title = {Dirtools - Exclude/ignore files in a directory (using .gitignore like syntax), compute hash, search projects for an entire directory tree, gzip compression and track changes in a directory over time}, - year = {2014}, - publisher = {GitHub}, - journal = {GitHub repository}, - url = {https://github.com/tsileo/dirtools} -} - - -@misc{checksumdir, - author = {T. McCarthy}, - title = {checksumdir - Simple package to compute a single deterministic hash of the file contents of a directory}, - year = {2020}, - publisher = {GitHub}, - journal = {GitHub repository}, - url = {https://github.com/cakepietoast/checksumdir} -} - - -@misc{hashdir, - author = {F. Civaner}, - title = {hashdir - A command line tool to calculate hashes of directory trees using various hash algorithms}, - year = {2021}, - publisher = {GitHub}, - journal = {GitHub repository}, - url = {https://github.com/fcivaner/hashdir} -} - - -@misc{dirhash-python, - author = {A. Huss}, - title = {dirhash-python - Python module and CLI for hashing of file system directories based on the Dirhash Standard}, - year = {2020}, - publisher = {GitHub}, - journal = {GitHub repository}, - url = {https://github.com/andhus/dirhash-python} -} - - -@misc{filehash, - author = {L.T. Saguisag Jr.}, - title = {filehash - Python module that wraps around hashlib and zlib to facilitate generating checksums / hashes of files and directories}, - year = {2021}, - publisher = {GitHub}, - journal = {GitHub repository}, - url = {https://github.com/leonidessaguisagjr/filehash} -} - - -@misc{dirhash, - author = {A. Huss}, - title = {The Dirhash Standard: A formal procedure for hashing of a filesystem directory}, - year = {2020}, - publisher = {GitHub}, - journal = {GitHub repository}, - url = {https://github.com/andhus/dirhash} -} diff --git a/joss_paper/paper.md b/joss_paper/paper.md deleted file mode 100644 index a62f7ea..0000000 --- a/joss_paper/paper.md +++ /dev/null @@ -1,140 +0,0 @@ ---- -title: 'Data Integrity Fingerprint (DIF) - A reference implementation in Python' -tags: - - Python - - Open Data - - datasets - - data integrity - - checksum - - hash -authors: - - name: Oliver Lindemann^[corresponding author] - orcid: 0000-0003-3789-5373 - affiliation: 1 - - name: Florian Krause - orcid: 0000-0002-2754-3692 - affiliation: "2, 3" -affiliations: - - name: Department of Psychology, Education and Child Studies, Erasmus University Rotterdam, The Netherlands - index: 1 - - name: Donders Institute for Brain, Cognition and Behaviour, Radboud University Medical Center, Nijmegen, The Netherlands - index: 2 - - name: Department of Psychiatry, Brain+Nerve Centre, Maastricht University Medical Center, The Netherlands - index: 3 -date: 06 December 2021 -bibliography: paper.bib ---- - - -# Summary - -We hereby present the reference implementation of the _Data Integrity -Fingerprint (DIF)_ - a proposal for a human-readable fingerprint of scientific -datasets [@DIF]. The software can be used via the command line, via a graphical -user interface, or as a Python library for embedding in other software. In -either case, the user has the choice of calculating the DIF based on a variety -of (cryptographic) hash algorithms using serial (single CPU core) or parallel -(multiple CPU cores) computing. In addition, a checksums file with fingerprints -of individual files in a dataset can be created. These files can also serve as -the basis for calculating the DIF and, in addition, can be compared against a -dataset in order to reveal content differences in case a DIF could not be -verified. - - -# Statement of need - -In recent years, sharing scientific datasets has become good research practice -[@Wilkinson] and the concept of _Open Data_ has been incorporated into -international policies [@EU]. Making the scientific data that corresponds to -a published journal article publicly available calls for a reliable procedure to -unmistakenly and indefinitely link the data to the article. Currently, this is -typically implemented through storage [e.g. GitHub, Dryad, Open Science -Framework; @Tan] and identification/linking providers -[e.g. Digital Object Identifier; @Liu] that need to be constantly maintained [@Lin]. -This dependency on the availability of such (mostly commerical) third-party services, -however, is fragile and inappropriate for a persistent and sustainable Open Data -practice. - -The DIF provides a simple alternative solution that does not rely on a third -party by extending the concept of file verification to multi-file datasets -(see also \autoref{fig:Fig1}): - -* The author of a journal article calculates checksums of all the files in the - dataset the article relates to - -* From these checksums the author calculates a single "master checksum" (the - DIF) that uniquly identifies the entire dataset - -* The author reports the DIF in the journal article - -* A reader of the journal article who obtained a copy of the dataset (from - either the author or any other source) calculates the DIF of their copy of - the dataset and compares it to the correct DIF as stated in the article - -* If the list of checksums of individual files in the original dataset is - available, the author can furthermore investigate in detail the differences - between the datasets, in case of a DIF mismatch - -![Schematic overview of verifying the integrity of a dataset using the DIF.\label{fig:Fig1}](flowchart.png) - -Notably, previous efforts to solve this problem outside of the scientific -domain suffer from several shortcomings. Their implementations are either not -available cross-platform [i.e. Windows, MacOS, Linux; @DirHash], lack a -command line [@checksum; @Dirtools] or graphical [@checksumdir; @hashdir; -@checksum; @Dirtools; @DirHash; @dirhash-python; @filehash] user interface, or -are not meant to be used as a programming library [@checksum]. The here -presented software offers all of these features. - -More importantly, however, previous efforts are incompatible with each other, -due to a lack of a formal specification of how to calculate the fingerprint. -The (to our knowledge) only attempt at defining a standard procedure of how to -calculate a fingerprint of a directory [@dirhash] is specifically designed to -be extendable and requires a user to make a priori decicions on variety of -options, which all affect the calculation (and the result) and hence also need -to be know by anyone wanting to verify the fingerprint. We believe that this -amount of degrees of freedom (and potential error) are not a good fit for a -scientific application, and is at odds with our goal of a simple human-readable -fingerprint that can be printed in a journal article. -The DIF, on the other hand, has only a single degree of freedom, which is the -hash algorithm chosen to base all calculations on. While we recommend to -use SHA-256, having a algorithm-independent DIF is crucial for being able to -adapt to future developments in the domain of cryptography and computer -security. - -We chose Python as the reference implementation for the DIF, because Python -(a) is open source, (b) is available cross-platform, (c) offers very good code -readability, (d) is widely-used in the scientific community, and (e) allowed -us to implement the underlying calculation as well as command line and -graphical user interfaces using only the built-in standard libary, without -relying on a set of complex external dependencies. Not only does this make the -reference implementation easy to understand and a good basis for other -implementations, but it also simplifies maintainability of the software and -ensures its long-term availability. - - -# Specification - -The procedure for calculating the DIF is: - -1. Choose a (cryptographic) hash function `Hash` (e.g. SHA-256) - -2. For every file `f` in the (potentially nested) subtree under the dataset root directory (with symbolic links being followed), - - * calculate the checksum `c` as the hexadecimal digest (lower case letters) of `Hash(f)` (i.e. the hashed _binary contents_ of the file) - - * get the file path `p` as the UTF-8 encoded relative path in Unix notation (i.e. U+002F slash character as separator) from the dataset root directory to `f` - - * create the string `cp` (i.e the concatenation of `c` and `p`) - - * add `cp` to a list `l` - -3. Sort `l` in ascending Unicode code point order (i.e., byte- wise sorting, NOT based on the Unicode collation algorithm) - -4. Create the string `l[0]l[1]...l[n]` (i.e. the concatenation of all elements of `l`) - -5. Retrieve the DIF as the hexadecimal digest of `Hash(l[0]l[1]...l[n])` - -Optionally, checksums of individual files and their file paths can be saved as a checksums file with lines of `c␣␣p` for each `f` (i.e. `c` followed by two U+0020 whitespace characters followed by `p`). - - -# References diff --git a/src/dataintegrityfingerprint/__init__.py b/src/dataintegrityfingerprint/__init__.py index 3f87f71..318ff11 100644 --- a/src/dataintegrityfingerprint/__init__.py +++ b/src/dataintegrityfingerprint/__init__.py @@ -9,7 +9,7 @@ __author__ = 'Oliver Lindemann , ' \ 'Florian Krause ' -__version__ = '0.7.5' +__version__ = '0.7.6' from .dif import DataIntegrityFingerprint