Skip to content

Commit

Permalink
Simplify single file DOI workflow for Dataverse (#68)
Browse files Browse the repository at this point in the history
  • Loading branch information
J535D165 committed Sep 20, 2023
1 parent 3198b6f commit c66fb32
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 67 deletions.
51 changes: 8 additions & 43 deletions datahugger/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,8 @@ def _get_files_recursive(self, url, folder_name=None, base_url=None):
jsonpath_expression = parse(self.META_FILES_JSONPATH)
else:
jsonpath_expression = self.META_FILES_JSONPATH
files_raw = jsonpath_expression.find(response)[0].value

files_raw = [x.value for x in jsonpath_expression.find(response)]
else:
files_raw = response

Expand Down Expand Up @@ -259,34 +260,6 @@ def _get_files_recursive(self, url, folder_name=None, base_url=None):

return result

def _get_single_file(self, url, folder_name=None, base_url=None):
    """Retrieve the metadata of a single file and return its descriptor.

    Arguments
    ---------
    url: str
        The metadata endpoint for the single file.
    folder_name: str, optional
        Folder name to prefix the file name with.
    base_url: str, optional
        Base URL used to construct the download link.

    Returns
    -------
    list of dict
        One-element list with the file's link, name, size, hash,
        and hash type.

    Raises
    ------
    ValueError
        If ``url`` is not a string.
    """
    if not isinstance(url, str):
        # Bug fix: the exception was previously constructed but never
        # raised, so non-string URLs silently fell through.
        raise ValueError(f"Expected url to be string type, got {type(url)}")

    # get the data from URL
    res = requests.get(url)
    response = res.json()

    # find path to the raw file record; fall back to the raw response
    # when no JSONPath is configured (consistent with
    # _get_files_recursive, and avoids a NameError on file_raw below)
    if hasattr(self, "META_FILES_SINGLE_JSONPATH"):
        jsonpath_expression = parse(self.META_FILES_SINGLE_JSONPATH)
        file_raw = jsonpath_expression.find(response)[0].value
    else:
        file_raw = response

    if folder_name is None:
        f_path = self._get_attr_name(file_raw)
    else:
        f_path = str(Path(folder_name, self._get_attr_name(file_raw)))

    return [
        {
            "link": self._get_attr_link(file_raw, base_url=base_url),
            "name": f_path,
            "size": self._get_attr_size(file_raw),
            "hash": self._get_attr_hash(file_raw),
            "hash_type": self._get_attr_hash_type(file_raw),
        }
    ]

@property
def _params(self):
"""Params including url params."""
Expand Down Expand Up @@ -315,20 +288,12 @@ def files(self):
uri = urlparse(url)
base_url = uri.scheme + "://" + uri.netloc

if hasattr(self, "is_singleton") and self.is_singleton:
self._files = self._get_single_file(
self.API_URL_META_SINGLE.format(
api_url=self.API_URL, base_url=base_url, **self._params
),
base_url=base_url,
)
else:
self._files = self._get_files_recursive(
self.API_URL_META.format(
api_url=self.API_URL, base_url=base_url, **self._params
),
base_url=base_url,
)
self._files = self._get_files_recursive(
self.API_URL_META.format(
api_url=self.API_URL, base_url=base_url, **self._params
),
base_url=base_url,
)

return self._files

Expand Down
51 changes: 27 additions & 24 deletions datahugger/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import requests
from jsonpath_ng.jsonpath import Fields
from jsonpath_ng.jsonpath import Slice

from datahugger.base import DatasetDownloader
from datahugger.utils import _get_url
Expand All @@ -26,7 +27,7 @@ class ZenodoDataset(DatasetDownloader):

# the files and metadata about the dataset
API_URL_META = "{api_url}records/{record_id}"
META_FILES_JSONPATH = "files"
META_FILES_JSONPATH = "files[*]"

# paths to file attributes
ATTR_NAME_JSONPATH = "key"
Expand All @@ -46,36 +47,36 @@ class DataverseDataset(DatasetDownloader):

REGEXP_ID = r"(?P<type>dataset|file)\.xhtml\?persistentId=(?P<record_id>.*)"

# the files and metadata about the dataset
META_FILES_JSONPATH = "data.files"

API_URL_META_SINGLE = "{base_url}/api/files/:persistentId/?persistentId={record_id}"
META_FILES_SINGLE_JSONPATH = "data"

# paths to file attributes
ATTR_NAME_JSONPATH = "dataFile.filename"
ATTR_SIZE_JSONPATH = "dataFile.filesize"
ATTR_HASH_JSONPATH = "dataFile.md5"
ATTR_NAME_JSONPATH = "filename"
ATTR_SIZE_JSONPATH = "filesize"
ATTR_HASH_JSONPATH = "md5"
ATTR_HASH_TYPE_VALUE = "md5"

def _get_attr_link(self, record, base_url=None):
return "{}/api/access/datafile/{}".format(base_url, record["dataFile"]["id"])

def _pre_files(self):
if "type" in self._params and self._params["type"] == "file":
self.is_singleton = True

@property
def API_URL_META(self):
if self._params.get("version", None):
v = self._params["version"]
else:
v = ":latest-published"

return (
"{base_url}/api/datasets/:persistentId/versions/"
f"{v}/?persistentId={{record_id}}"
)
if self._params.get("type", None) == "file":
return "{base_url}/api/files/:persistentId/?persistentId={record_id}"
else:
return (
"{base_url}/api/datasets/:persistentId/versions/"
f"{v}/?persistentId={{record_id}}"
)

@property
def META_FILES_JSONPATH(self):
    """JSONPath to the raw file record(s), depending on the DOI type.

    A file DOI yields a single record under ``data.dataFile``; a
    dataset DOI yields one record per file under ``data.files``.
    """
    is_single_file = self._params.get("type") == "file"
    return "data.dataFile" if is_single_file else "data.files[*].dataFile"

def _get_attr_link(self, record, base_url=None):
return f"{base_url}/api/access/datafile/{record['id']}"


class FigShareDataset(DatasetDownloader):
Expand All @@ -87,7 +88,7 @@ class FigShareDataset(DatasetDownloader):
API_URL = "https://api.figshare.com/v2"

# the files and metadata about the dataset
META_FILES_JSONPATH = "files"
META_FILES_JSONPATH = "files[*]"

# paths to file attributes
ATTR_FILE_LINK_JSONPATH = "download_url"
Expand Down Expand Up @@ -125,7 +126,7 @@ class OSFDataset(DatasetDownloader):

# the files and metadata about the dataset
API_URL_META = "{api_url}{record_id}/files/osfstorage/?format=jsonapi"
META_FILES_JSONPATH = "data"
META_FILES_JSONPATH = "data[*]"

PAGINATION_JSONPATH = "links.next"

Expand All @@ -151,7 +152,9 @@ class DataDryadDataset(DatasetDownloader):

# the files and metadata about the dataset
API_URL_META = "{api_url}{record_id}/files/osfstorage/?format=jsonapi"
META_FILES_JSONPATH = Fields("_embedded").child(Fields("stash:files"))
META_FILES_JSONPATH = (
Fields("_embedded").child(Fields("stash:files")).child(Slice())
)

# paths to file attributes
ATTR_NAME_JSONPATH = "path"
Expand Down

0 comments on commit c66fb32

Please sign in to comment.