From 1949d8fe66d0fb3ba858c4e96650d12fb0dde6c7 Mon Sep 17 00:00:00 2001 From: Jonathan de Bruin Date: Fri, 15 Sep 2023 01:14:47 +0200 Subject: [PATCH] Add support for arXiv (#61) --- datahugger/config.py | 2 ++ datahugger/services.py | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/datahugger/config.py b/datahugger/config.py index 10e0a2e..b873ada 100644 --- a/datahugger/config.py +++ b/datahugger/config.py @@ -1,3 +1,4 @@ +from datahugger.services import ArXivDataset from datahugger.services import DataDryadDataset from datahugger.services import DataOneDataset from datahugger.services import DataverseDataset @@ -12,6 +13,7 @@ # fast lookup SERVICES_NETLOC = { + "arxiv.org": ArXivDataset, "zenodo.org": ZenodoDataset, "github.com": GitHubDataset, "datadryad.org": DataDryadDataset, diff --git a/datahugger/services.py b/datahugger/services.py index 672f71a..05a5164 100644 --- a/datahugger/services.py +++ b/datahugger/services.py @@ -323,3 +323,21 @@ def _get( def files(self): # at the moment, .files is not available for HuggingFace raise AttributeError("'files' is not available for HuggingFace") + + +class ArXivDataset(DatasetDownloader, DatasetResult): + """Downloader for ArXiv publication.""" + + REGEXP_ID = r"https://arxiv\.org/abs/(?P<record_id>.*)" + + @property + def files(self): + return [ + { + "link": f"https://arxiv.org/pdf/{self._params['record_id']}.pdf", + "name": self._params["record_id"].split("/")[-1] + ".pdf", + "size": None, + "hash": None, + "hash_type": None, + } + ]