Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify single file DOI workflow for Dataverse #68

Merged
merged 4 commits on Sep 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 8 additions & 43 deletions datahugger/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,8 @@ def _get_files_recursive(self, url, folder_name=None, base_url=None):
jsonpath_expression = parse(self.META_FILES_JSONPATH)
else:
jsonpath_expression = self.META_FILES_JSONPATH
files_raw = jsonpath_expression.find(response)[0].value

files_raw = [x.value for x in jsonpath_expression.find(response)]
else:
files_raw = response

Expand Down Expand Up @@ -259,34 +260,6 @@ def _get_files_recursive(self, url, folder_name=None, base_url=None):

return result

def _get_single_file(self, url, folder_name=None, base_url=None):
if not isinstance(url, str):
ValueError(f"Expected url to be string type, got {type(url)}")

# get the data from URL
res = requests.get(url)
response = res.json()

# find path to raw files
if hasattr(self, "META_FILES_SINGLE_JSONPATH"):
jsonpath_expression = parse(self.META_FILES_SINGLE_JSONPATH)
file_raw = jsonpath_expression.find(response)[0].value

if folder_name is None:
f_path = self._get_attr_name(file_raw)
else:
f_path = str(Path(folder_name, self._get_attr_name(file_raw)))

return [
{
"link": self._get_attr_link(file_raw, base_url=base_url),
"name": f_path,
"size": self._get_attr_size(file_raw),
"hash": self._get_attr_hash(file_raw),
"hash_type": self._get_attr_hash_type(file_raw),
}
]

@property
def _params(self):
"""Params including url params."""
Expand Down Expand Up @@ -315,20 +288,12 @@ def files(self):
uri = urlparse(url)
base_url = uri.scheme + "://" + uri.netloc

if hasattr(self, "is_singleton") and self.is_singleton:
self._files = self._get_single_file(
self.API_URL_META_SINGLE.format(
api_url=self.API_URL, base_url=base_url, **self._params
),
base_url=base_url,
)
else:
self._files = self._get_files_recursive(
self.API_URL_META.format(
api_url=self.API_URL, base_url=base_url, **self._params
),
base_url=base_url,
)
self._files = self._get_files_recursive(
self.API_URL_META.format(
api_url=self.API_URL, base_url=base_url, **self._params
),
base_url=base_url,
)

return self._files

Expand Down
51 changes: 27 additions & 24 deletions datahugger/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import requests
from jsonpath_ng.jsonpath import Fields
from jsonpath_ng.jsonpath import Slice

from datahugger.base import DatasetDownloader
from datahugger.utils import _get_url
Expand All @@ -26,7 +27,7 @@ class ZenodoDataset(DatasetDownloader):

# the files and metadata about the dataset
API_URL_META = "{api_url}records/{record_id}"
META_FILES_JSONPATH = "files"
META_FILES_JSONPATH = "files[*]"

# paths to file attributes
ATTR_NAME_JSONPATH = "key"
Expand All @@ -46,36 +47,36 @@ class DataverseDataset(DatasetDownloader):

REGEXP_ID = r"(?P<type>dataset|file)\.xhtml\?persistentId=(?P<record_id>.*)"

# the files and metadata about the dataset
META_FILES_JSONPATH = "data.files"

API_URL_META_SINGLE = "{base_url}/api/files/:persistentId/?persistentId={record_id}"
META_FILES_SINGLE_JSONPATH = "data"

# paths to file attributes
ATTR_NAME_JSONPATH = "dataFile.filename"
ATTR_SIZE_JSONPATH = "dataFile.filesize"
ATTR_HASH_JSONPATH = "dataFile.md5"
ATTR_NAME_JSONPATH = "filename"
ATTR_SIZE_JSONPATH = "filesize"
ATTR_HASH_JSONPATH = "md5"
ATTR_HASH_TYPE_VALUE = "md5"

def _get_attr_link(self, record, base_url=None):
return "{}/api/access/datafile/{}".format(base_url, record["dataFile"]["id"])

def _pre_files(self):
if "type" in self._params and self._params["type"] == "file":
self.is_singleton = True

@property
def API_URL_META(self):
if self._params.get("version", None):
v = self._params["version"]
else:
v = ":latest-published"

return (
"{base_url}/api/datasets/:persistentId/versions/"
f"{v}/?persistentId={{record_id}}"
)
if self._params.get("type", None) == "file":
return "{base_url}/api/files/:persistentId/?persistentId={record_id}"
else:
return (
"{base_url}/api/datasets/:persistentId/versions/"
f"{v}/?persistentId={{record_id}}"
)

@property
def META_FILES_JSONPATH(self):
if self._params.get("type", None) == "file":
return "data.dataFile"
else:
return "data.files[*].dataFile"

def _get_attr_link(self, record, base_url=None):
return f"{base_url}/api/access/datafile/{record['id']}"


class FigShareDataset(DatasetDownloader):
Expand All @@ -87,7 +88,7 @@ class FigShareDataset(DatasetDownloader):
API_URL = "https://api.figshare.com/v2"

# the files and metadata about the dataset
META_FILES_JSONPATH = "files"
META_FILES_JSONPATH = "files[*]"

# paths to file attributes
ATTR_FILE_LINK_JSONPATH = "download_url"
Expand Down Expand Up @@ -125,7 +126,7 @@ class OSFDataset(DatasetDownloader):

# the files and metadata about the dataset
API_URL_META = "{api_url}{record_id}/files/osfstorage/?format=jsonapi"
META_FILES_JSONPATH = "data"
META_FILES_JSONPATH = "data[*]"

PAGINATION_JSONPATH = "links.next"

Expand All @@ -151,7 +152,9 @@ class DataDryadDataset(DatasetDownloader):

# the files and metadata about the dataset
API_URL_META = "{api_url}{record_id}/files/osfstorage/?format=jsonapi"
META_FILES_JSONPATH = Fields("_embedded").child(Fields("stash:files"))
META_FILES_JSONPATH = (
Fields("_embedded").child(Fields("stash:files")).child(Slice())
)

# paths to file attributes
ATTR_NAME_JSONPATH = "path"
Expand Down