Skip to content

Commit

Permalink
Add support for arXiv (#61)
Browse files Browse the repository at this point in the history
  • Loading branch information
J535D165 committed Sep 14, 2023
1 parent 49def2c commit 1949d8f
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 0 deletions.
2 changes: 2 additions & 0 deletions datahugger/config.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from datahugger.services import ArXivDataset
from datahugger.services import DataDryadDataset
from datahugger.services import DataOneDataset
from datahugger.services import DataverseDataset
Expand All @@ -12,6 +13,7 @@

# fast lookup
SERVICES_NETLOC = {
"arxiv.org": ArXivDataset,
"zenodo.org": ZenodoDataset,
"github.com": GitHubDataset,
"datadryad.org": DataDryadDataset,
Expand Down
18 changes: 18 additions & 0 deletions datahugger/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,3 +323,21 @@ def _get(
def files(self):
    """Reject access to the file listing.

    Raises:
        AttributeError: always; a file listing is not offered for
            HuggingFace at the moment.
    """
    # Build the message first so the raise statement stays short.
    reason = "'files' is not available for HuggingFace"
    raise AttributeError(reason)


class ArXivDataset(DatasetDownloader, DatasetResult):
    """Downloader for an arXiv publication.

    An arXiv record is exposed as a single file: the PDF of the paper
    identified by the ``record_id`` taken from an ``/abs/`` URL.
    """

    # Everything after ``/abs/`` is captured greedily as ``record_id``, so
    # the raw capture may still carry a query string, a fragment or a
    # trailing slash. ``files`` cleans that up before building the link.
    REGEXP_ID = r"https://arxiv\.org/abs/(?P<record_id>.*)"

    @property
    def files(self):
        """Return the file listing for this record.

        Returns:
            A one-element list holding a dict with the keys ``link`` (URL of
            the PDF), ``name`` (local file name), ``size``, ``hash`` and
            ``hash_type``. The last three are always ``None``.
        """
        # NOTE(review): ``_params`` is presumably filled by the base class
        # from the REGEXP_ID match -- confirm against DatasetDownloader.
        raw_id = self._params["record_id"]

        # Drop any URL fragment / query string and a trailing slash so that
        # e.g. "2301.00001?context=cs" does not end up in the PDF link or in
        # the file name written to disk.
        record_id = raw_id.partition("#")[0].partition("?")[0].rstrip("/")

        # Old-style identifiers look like "hep-th/9901001"; only the last
        # path segment is used for the file name.
        file_name = record_id.split("/")[-1] + ".pdf"

        return [
            {
                "link": f"https://arxiv.org/pdf/{record_id}.pdf",
                "name": file_name,
                "size": None,
                "hash": None,
                "hash_type": None,
            }
        ]

0 comments on commit 1949d8f

Please sign in to comment.