Add support for DOIs pointing to single files (#51)
J535D165 committed Sep 10, 2023
1 parent 5520eb4 commit e0f44da
Showing 3 changed files with 81 additions and 57 deletions.
78 changes: 49 additions & 29 deletions datahugger/base.py
@@ -201,23 +201,11 @@ def _parse_url(self, url):
if not isinstance(url, str) or not _is_url(url):
raise ValueError("Not a valid URL.")

# first try to parse with version number
if hasattr(self, "REGEXP_ID_AND_VERSION"):
match = re.search(self.REGEXP_ID_AND_VERSION, url)

if match and match.group(1):
if match.group(2) == "":
return match.group(1), None
return match.group(1), match.group(2)

# then try to parse without version number
if hasattr(self, "REGEXP_ID"):
try:
match = re.search(self.REGEXP_ID, url)

if match and match.group(1):
return match.group(1), None

raise ValueError(f"Failed to parse record identifier from URL '{url}'")
return match.groupdict()
except Exception as err:
raise ValueError(f"Failed to parse URL '{url}'") from err

def _unpack_single_folder(self, zip_url, output_folder):
r = requests.get(zip_url)
@@ -230,16 +218,16 @@ def _unpack_single_folder(self, zip_url, output_folder):
z.extract(zip_info, output_folder)

@property
def api_record_id(self):
if hasattr(self, "_api_record_id"):
return self._api_record_id
def _params(self):
if hasattr(self, "__params"):
return self.__params

if isinstance(self.url, str) and _is_url(self.url):
self._api_record_id, self.version = self._parse_url(self.url)
self.__params = self._parse_url(self.url)
else:
self._api_record_id, self.version = self.url, self.version
self.__params = {"record_id": self.url, "version": None}

return self._api_record_id
return self.__params

def _pre_files(self):
pass
@@ -296,21 +284,53 @@ def _get_files_recursive(self, url, folder_name=None):

return result

def _get_single_file(self, url, folder_name=None):
if not isinstance(url, str):
ValueError(f"Expected url to be string type, got {type(url)}")

# get the data from URL
res = requests.get(url)
response = res.json()

# find path to raw files
if hasattr(self, "META_FILES_SINGLE_JSONPATH"):
jsonpath_expression = parse(self.META_FILES_SINGLE_JSONPATH)
file_raw = jsonpath_expression.find(response)[0].value

if folder_name is None:
f_path = self._get_attr_name(file_raw)
else:
f_path = str(Path(folder_name, self._get_attr_name(file_raw)))

return [
{
"link": self._get_attr_link(file_raw),
"name": f_path,
"size": self._get_attr_size(file_raw),
"hash": self._get_attr_hash(file_raw),
"hash_type": self._get_attr_hash_type(file_raw),
}
]

@property
def files(self):
if hasattr(self, "_files"):
return self._files

self._pre_files()

self._files = self._get_files_recursive(
self.API_URL_META.format(
api_url=self.API_URL,
api_record_id=self.api_record_id,
version=self.version,
base_url=self.base_url,
if hasattr(self, "is_singleton") and self.is_singleton:
self._files = self._get_single_file(
self.API_URL_META_SINGLE.format(
api_url=self.API_URL, base_url=self.base_url, **self._params
)
)
else:
self._files = self._get_files_recursive(
self.API_URL_META.format(
api_url=self.API_URL, base_url=self.base_url, **self._params
)
)
)

return self._files

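A note on the parsing change above: _parse_url now returns match.groupdict(), so every service regex must use named groups, and the new _params property caches that dict and splats it into the URL templates. A minimal standalone sketch of the flow (an illustration with a made-up FigShare-style URL, not datahugger itself):

import re

# Pattern shape from this commit: record_id is required, version is an
# optional trailing path segment.
REGEXP_ID = r"articles\/.*?\/.*?\/(?P<record_id>\d+)(?:\/(?P<version>\d+)|)"

for url in (
    "https://figshare.com/articles/dataset/example/8851784/1",
    "https://figshare.com/articles/dataset/example/8851784",
):
    params = re.search(REGEXP_ID, url).groupdict()
    # groupdict() fills non-participating groups with None, so the second
    # URL yields {'record_id': '8851784', 'version': None}.
    print(params)

    # str.format ignores unused keys, so the same dict works for templates
    # with or without a {version} placeholder.
    print("{api_url}/articles/{record_id}/files".format(
        api_url="https://api.figshare.com/v2", **params
    ))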
58 changes: 30 additions & 28 deletions datahugger/services.py
@@ -18,13 +18,13 @@ class ZenodoDataset(DatasetDownloader, DatasetResult):
For Zenodo records, new versions have new identifiers.
"""

REGEXP_ID = r"zenodo\.org\/record\/(\d+).*"
REGEXP_ID = r"zenodo\.org\/record\/(?P<record_id>\d+).*"

# the base entry point of the REST API
API_URL = "https://zenodo.org/api/"

# the files and metadata about the dataset
API_URL_META = "{api_url}records/{api_record_id}"
API_URL_META = "{api_url}records/{record_id}"
META_FILES_JSONPATH = "files"

# paths to file attributes
@@ -43,12 +43,15 @@ def _get_attr_hash_type(self, record):
class DataverseDataset(DatasetDownloader, DatasetResult):
"""Downloader for Dataverse repository."""

REGEXP_ID = r"dataset\.xhtml\?persistentId=(.*)"
REGEXP_ID = r"(?P<type>dataset|file)\.xhtml\?persistentId=(?P<record_id>.*)"

# the files and metadata about the dataset
API_URL_META = "{base_url}/api/datasets/:persistentId/?persistentId={api_record_id}"
API_URL_META = "{base_url}/api/datasets/:persistentId/?persistentId={record_id}"
META_FILES_JSONPATH = "data.latestVersion.files"

API_URL_META_SINGLE = "{base_url}/api/files/:persistentId/?persistentId={record_id}"
META_FILES_SINGLE_JSONPATH = "data"

# paths to file attributes
ATTR_NAME_JSONPATH = "dataFile.filename"
ATTR_SIZE_JSONPATH = "dataFile.filesize"
@@ -60,18 +63,21 @@ def _get_attr_link(self, record):
self.base_url, record["dataFile"]["id"]
)

def _pre_files(self):
if "type" in self._params and self._params["type"] == "file":
self.is_singleton = True


class FigShareDataset(DatasetDownloader, DatasetResult):
"""Downloader for FigShare repository."""

REGEXP_ID_AND_VERSION = r"articles\/.*\/.*\/(\d+)\/(\d+)"
REGEXP_ID = r"articles\/.*\/.*\/(\d+)"
REGEXP_ID = r"articles\/.*?\/.*?\/(?P<record_id>\d+)(?:\/(?P<version>\d+)|)"

# the base entry point of the REST API
API_URL = "https://api.figshare.com/v2"

# the files and metadata about the dataset
API_URL_META = "{api_url}/articles/{api_record_id}/files"
API_URL_META = "{api_url}/articles/{record_id}/files"

# paths to file attributes
ATTR_FILE_LINK_JSONPATH = "download_url"
@@ -84,8 +90,7 @@ class FigShareDataset(DatasetDownloader, DatasetResult):
class Djehuty(FigShareDataset):
"""Downloader for Djehuty repository."""

REGEXP_ID_AND_VERSION = r"articles\/.*\/(\d+)\/(\d+)"
REGEXP_ID = r"articles\/.*\/(\d+)"
REGEXP_ID = r"articles\/.*?\/(?P<record_id>\d+)(?:\/(?P<version>\d+)|)"

# the base entry point of the REST API
API_URL = "https://data.4tu.nl/v2"
@@ -94,13 +99,13 @@ class Djehuty(FigShareDataset):
class OSFDataset(DatasetDownloader, DatasetResult):
"""Downloader for OSF repository."""

REGEXP_ID = r"osf\.io\/(.*)/"
REGEXP_ID = r"osf\.io\/(?P<record_id>.*)/"

# the base entry point of the REST API
API_URL = "https://api.osf.io/v2/registrations/"

# the files and metadata about the dataset
API_URL_META = "{api_url}{api_record_id}/files/osfstorage/?format=jsonapi"
API_URL_META = "{api_url}{record_id}/files/osfstorage/?format=jsonapi"
META_FILES_JSONPATH = "data"

PAGINATION_JSONPATH = "links.next"
@@ -120,7 +125,7 @@ class OSFDataset(DatasetDownloader, DatasetResult):
class DataDryadDataset(DatasetDownloader, DatasetResult):
"""Downloader for DataDryad repository."""

REGEXP_ID = r"datadryad\.org[\:]*[43]{0,3}\/stash\/dataset\/doi:(.*)"
REGEXP_ID = r"datadryad\.org[\:]*[43]{0,3}\/stash\/dataset\/doi:(?P<record_id>.*)"

# the base entry point of the REST API
API_URL = "https://datadryad.org/api/v2"
@@ -134,7 +139,7 @@ def files(self):
if hasattr(self, "_files"):
return self._files

doi_safe = quote(f"doi:{self.api_record_id}", safe="")
doi_safe = quote(f"doi:{self._params['record_id']}", safe="")
dataset_metadata_url = self.API_URL + "/datasets/" + doi_safe

res = requests.get(dataset_metadata_url)
@@ -178,7 +183,7 @@ def _get_attr_link(self, record):
class DataOneDataset(DatasetDownloader, DatasetResult):
"""Downloader for DataOne repositories."""

REGEXP_ID = r"view/doi:(.*)"
REGEXP_ID = r"view/doi:(?P<record_id>.*)"

# the base entry point of the REST API
API_URL = "https://cn.dataone.org/cn/v2/object/"
@@ -188,7 +193,7 @@ def files(self):
if hasattr(self, "_files"):
return self._files

doi_safe = quote(f"doi:{self.api_record_id}", safe="")
doi_safe = quote(f"doi:{self._params['record_id']}", safe="")

res = requests.get(self.API_URL + doi_safe)
res.raise_for_status()
@@ -216,7 +221,7 @@ def files(self):
class DSpaceDataset(DatasetDownloader, DatasetResult):
"""Downloader for DSpaceDataset repositories."""

REGEXP_ID = r"handle/(\d+\/\d+)"
REGEXP_ID = r"handle/(?P<record_id>\d+\/\d+)"

# paths to file attributes
ATTR_KIND_JSONPATH = "attributes.kind"
@@ -232,7 +237,7 @@ def _get_attr_link(self, record):
return self.base_url + record["retrieveLink"]

def _pre_files(self):
handle_id_url = f"{self.base_url}/rest/handle/{self.api_record_id}"
handle_id_url = f"{self.base_url}/rest/handle/{self._params['record_id']}"
res = requests.get(handle_id_url)
res.raise_for_status()

@@ -243,18 +248,17 @@ def _pre_files(self):
class MendeleyDataset(DatasetDownloader, DatasetResult):
"""Downloader for Mendeley repository."""

REGEXP_ID_WITH_VERSION = r"data\.mendeley\.com\/datasets\/([0-9a-z]+)\/(\d+)"
REGEXP_ID = r"data\.mendeley\.com\/datasets\/([0-9a-z]+)"
REGEXP_ID = r"data\.mendeley\.com\/datasets\/(?P<record_id>[0-9a-z]+)(?:\/(?P<version>\d+)|)" # noqa

# the base entry point of the REST API
API_URL = "https://data.mendeley.com/public-api/"

# version url
API_URL_VERSION = "{api_url}datasets/{api_record_id}/versions"
API_URL_VERSION = "{api_url}datasets/{record_id}/versions"

# the files and metadata about the dataset
API_URL_META = (
"{api_url}datasets/{api_record_id}/files?folder_id=root&version={version}"
"{api_url}datasets/{record_id}/files?folder_id=root&version={version}"
)

# paths to file attributes
@@ -267,9 +271,7 @@ class MendeleyDataset(DatasetDownloader, DatasetResult):
def _pre_files(self):
if self.version is None:
r_version = requests.get(
self.API_URL_VERSION.format(
api_url=self.API_URL, api_record_id=self.api_record_id
)
self.API_URL_VERSION.format(api_url=self.API_URL, **self._params)
)
r_version.raise_for_status()
self.version = r_version.json()[-1]["version"]
@@ -279,11 +281,11 @@ class GitHubDataset(DatasetDownloader, DatasetResult):
"""Downloader for GitHub repository."""

API_URL = "https://github.com/"
REGEXP_ID = r"github\.com\/([a-zA-Z0-9]+\/[a-zA-Z0-9]+)[\/]*.*"
REGEXP_ID = r"github\.com\/(?P<record_id>[a-zA-Z0-9]+\/[a-zA-Z0-9]+)[\/]*.*"

def _get(self, output_folder: Union[Path, str], *args, **kwargs):
res = requests.get(
f"{self.API_URL}{self.api_record_id}/archive/refs/heads/master.zip"
f"{self.API_URL}{self._params['record_id']}/archive/refs/heads/master.zip"
)
z = zipfile.ZipFile(io.BytesIO(res.content))
z.extractall(output_folder)
@@ -292,7 +294,7 @@ def _get(self, output_folder: Union[Path, str], *args, **kwargs):
class HuggingFaceDataset(DatasetDownloader, DatasetResult):
"""Downloader for Huggingface repository."""

REGEXP_ID = r"huggingface.co/datasets/(.*)"
REGEXP_ID = r"huggingface.co/datasets/(?P<record_id>.*)"

def _get(
self,
@@ -307,4 +309,4 @@ def _get(
" or use 'pip install datahugger[all]'"
) from err

load_dataset(self.api_record_id, cache_dir=output_folder, **kwargs)
load_dataset(self._params["record_id"], cache_dir=output_folder, **kwargs)
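The Dataverse changes above carry the new single-file behavior: the <type> named group records whether a persistent-ID link uses dataset.xhtml or file.xhtml, _pre_files sets is_singleton for the latter, and the files property then calls the per-file metadata endpoint. A simplified sketch of that dispatch (assumed from the class above, using the DOIs from the test suite):

import re

# Regex and URL templates as added to DataverseDataset in this commit.
REGEXP_ID = r"(?P<type>dataset|file)\.xhtml\?persistentId=(?P<record_id>.*)"
DATASET_META = "{base_url}/api/datasets/:persistentId/?persistentId={record_id}"
FILE_META = "{base_url}/api/files/:persistentId/?persistentId={record_id}"

for url in (
    "https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/KBHLOD",
    "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/HZBYG7/RQ26H2",
):
    params = re.search(REGEXP_ID, url).groupdict()
    # file.xhtml links mark the record as a singleton and switch to the
    # per-file endpoint; str.format simply ignores the extra 'type' key.
    template = FILE_META if params["type"] == "file" else DATASET_META
    print(template.format(base_url="https://dataverse.harvard.edu", **params))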
2 changes: 2 additions & 0 deletions tests/test_repositories.py
@@ -14,6 +14,8 @@
"tutorial1.py",
),
("https://doi.org/10.7910/DVN/KBHLOD", "tutorial1.py"),
# Dataverse single file
("10.7910/DVN/HZBYG7/RQ26H2", "Table 2.do"),
# Figshare
("https://doi.org/10.6084/m9.figshare.8851784.v1", "cross_year_data2.csv"),
(

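The new test case exercises the single-file path end to end. Assuming datahugger's top-level get() helper, the equivalent usage would be (a sketch; the expected file name comes from the test above):

import datahugger

# A DOI resolving to a single Dataverse file should now download just that
# file ("Table 2.do") instead of a whole dataset.
datahugger.get("10.7910/DVN/HZBYG7/RQ26H2", "output_folder")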