Add support for DOIs pointing to single files (#51)
J535D165 committed Sep 10, 2023
1 parent 5520eb4 commit e0f44da
Showing 3 changed files with 81 additions and 57 deletions.
78 changes: 49 additions & 29 deletions datahugger/base.py
@@ -201,23 +201,11 @@ def _parse_url(self, url):
if not isinstance(url, str) or not _is_url(url):
raise ValueError("Not a valid URL.")

# first try to parse with version number
if hasattr(self, "REGEXP_ID_AND_VERSION"):
match = re.search(self.REGEXP_ID_AND_VERSION, url)

if match and match.group(1):
if match.group(2) == "":
return match.group(1), None
return match.group(1), match.group(2)

# then try to parse without version number
if hasattr(self, "REGEXP_ID"):
try:
match = re.search(self.REGEXP_ID, url)

if match and match.group(1):
return match.group(1), None

raise ValueError(f"Failed to parse record identifier from URL '{url}'")
return match.groupdict()
except Exception as err:
raise ValueError(f"Failed to parse URL '{url}'") from err

def _unpack_single_folder(self, zip_url, output_folder):
r = requests.get(zip_url)
@@ -230,16 +218,16 @@ def _unpack_single_folder(self, zip_url, output_folder):
z.extract(zip_info, output_folder)

@property
def api_record_id(self):
if hasattr(self, "_api_record_id"):
return self._api_record_id
def _params(self):
if hasattr(self, "__params"):
return self.__params

if isinstance(self.url, str) and _is_url(self.url):
self._api_record_id, self.version = self._parse_url(self.url)
self.__params = self._parse_url(self.url)
else:
self._api_record_id, self.version = self.url, self.version
self.__params = {"record_id": self.url, "version": None}

return self._api_record_id
return self.__params

def _pre_files(self):
pass
@@ -296,21 +284,53 @@ def _get_files_recursive(self, url, folder_name=None):

return result

def _get_single_file(self, url, folder_name=None):
if not isinstance(url, str):
ValueError(f"Expected url to be string type, got {type(url)}")

# get the data from URL
res = requests.get(url)
response = res.json()

# find path to raw files
if hasattr(self, "META_FILES_SINGLE_JSONPATH"):
jsonpath_expression = parse(self.META_FILES_SINGLE_JSONPATH)
file_raw = jsonpath_expression.find(response)[0].value

if folder_name is None:
f_path = self._get_attr_name(file_raw)
else:
f_path = str(Path(folder_name, self._get_attr_name(file_raw)))

return [
{
"link": self._get_attr_link(file_raw),
"name": f_path,
"size": self._get_attr_size(file_raw),
"hash": self._get_attr_hash(file_raw),
"hash_type": self._get_attr_hash_type(file_raw),
}
]

@property
def files(self):
if hasattr(self, "_files"):
return self._files

self._pre_files()

self._files = self._get_files_recursive(
self.API_URL_META.format(
api_url=self.API_URL,
api_record_id=self.api_record_id,
version=self.version,
base_url=self.base_url,
if hasattr(self, "is_singleton") and self.is_singleton:
self._files = self._get_single_file(
self.API_URL_META_SINGLE.format(
api_url=self.API_URL, base_url=self.base_url, **self._params
)
)
else:
self._files = self._get_files_recursive(
self.API_URL_META.format(
api_url=self.API_URL, base_url=self.base_url, **self._params
)
)
)

return self._files

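A note on the parsing change above: _parse_url now returns match.groupdict(), so every service regex must use named groups, and the new _params property caches that dict and splats it into the URL templates. A minimal standalone sketch of the flow (an illustration with a made-up FigShare-style URL, not datahugger itself):

import re

# Pattern shape from this commit: record_id is required, version is an
# optional trailing path segment.
REGEXP_ID = r"articles\/.*?\/.*?\/(?P<record_id>\d+)(?:\/(?P<version>\d+)|)"

for url in (
    "https://figshare.com/articles/dataset/example/8851784/1",
    "https://figshare.com/articles/dataset/example/8851784",
):
    params = re.search(REGEXP_ID, url).groupdict()
    # groupdict() fills non-participating groups with None, so the second
    # URL yields {'record_id': '8851784', 'version': None}.
    print(params)

    # str.format ignores unused keys, so the same dict works for templates
    # with or without a {version} placeholder.
    print("{api_url}/articles/{record_id}/files".format(
        api_url="https://api.figshare.com/v2", **params
    ))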
58 changes: 30 additions & 28 deletions datahugger/services.py
@@ -18,13 +18,13 @@ class ZenodoDataset(DatasetDownloader, DatasetResult):
For Zenodo records, new versions have new identifiers.
"""

REGEXP_ID = r"zenodo\.org\/record\/(\d+).*"
REGEXP_ID = r"zenodo\.org\/record\/(?P<record_id>\d+).*"

# the base entry point of the REST API
API_URL = "https://zenodo.org/api/"

# the files and metadata about the dataset
API_URL_META = "{api_url}records/{api_record_id}"
API_URL_META = "{api_url}records/{record_id}"
META_FILES_JSONPATH = "files"

# paths to file attributes
@@ -43,12 +43,15 @@ def _get_attr_hash_type(self, record):
class DataverseDataset(DatasetDownloader, DatasetResult):
"""Downloader for Dataverse repository."""

REGEXP_ID = r"dataset\.xhtml\?persistentId=(.*)"
REGEXP_ID = r"(?P<type>dataset|file)\.xhtml\?persistentId=(?P<record_id>.*)"

# the files and metadata about the dataset
API_URL_META = "{base_url}/api/datasets/:persistentId/?persistentId={api_record_id}"
API_URL_META = "{base_url}/api/datasets/:persistentId/?persistentId={record_id}"
META_FILES_JSONPATH = "data.latestVersion.files"

API_URL_META_SINGLE = "{base_url}/api/files/:persistentId/?persistentId={record_id}"
META_FILES_SINGLE_JSONPATH = "data"

# paths to file attributes
ATTR_NAME_JSONPATH = "dataFile.filename"
ATTR_SIZE_JSONPATH = "dataFile.filesize"
@@ -60,18 +63,21 @@ def _get_attr_link(self, record):
self.base_url, record["dataFile"]["id"]
)

def _pre_files(self):
if "type" in self._params and self._params["type"] == "file":
self.is_singleton = True


class FigShareDataset(DatasetDownloader, DatasetResult):
"""Downloader for FigShare repository."""

REGEXP_ID_AND_VERSION = r"articles\/.*\/.*\/(\d+)\/(\d+)"
REGEXP_ID = r"articles\/.*\/.*\/(\d+)"
REGEXP_ID = r"articles\/.*?\/.*?\/(?P<record_id>\d+)(?:\/(?P<version>\d+)|)"

# the base entry point of the REST API
API_URL = "https://api.figshare.com/v2"

# the files and metadata about the dataset
API_URL_META = "{api_url}/articles/{api_record_id}/files"
API_URL_META = "{api_url}/articles/{record_id}/files"

# paths to file attributes
ATTR_FILE_LINK_JSONPATH = "download_url"
@@ -84,8 +90,7 @@ class FigShareDataset(DatasetDownloader, DatasetResult):
class Djehuty(FigShareDataset):
"""Downloader for Djehuty repository."""

REGEXP_ID_AND_VERSION = r"articles\/.*\/(\d+)\/(\d+)"
REGEXP_ID = r"articles\/.*\/(\d+)"
REGEXP_ID = r"articles\/.*?\/(?P<record_id>\d+)(?:\/(?P<version>\d+)|)"

# the base entry point of the REST API
API_URL = "https://data.4tu.nl/v2"
@@ -94,13 +99,13 @@ class Djehuty(FigShareDataset):
class OSFDataset(DatasetDownloader, DatasetResult):
"""Downloader for OSF repository."""

REGEXP_ID = r"osf\.io\/(.*)/"
REGEXP_ID = r"osf\.io\/(?P<record_id>.*)/"

# the base entry point of the REST API
API_URL = "https://api.osf.io/v2/registrations/"

# the files and metadata about the dataset
API_URL_META = "{api_url}{api_record_id}/files/osfstorage/?format=jsonapi"
API_URL_META = "{api_url}{record_id}/files/osfstorage/?format=jsonapi"
META_FILES_JSONPATH = "data"

PAGINATION_JSONPATH = "links.next"
@@ -120,7 +125,7 @@ class OSFDataset(DatasetDownloader, DatasetResult):
class DataDryadDataset(DatasetDownloader, DatasetResult):
"""Downloader for DataDryad repository."""

REGEXP_ID = r"datadryad\.org[\:]*[43]{0,3}\/stash\/dataset\/doi:(.*)"
REGEXP_ID = r"datadryad\.org[\:]*[43]{0,3}\/stash\/dataset\/doi:(?P<record_id>.*)"

# the base entry point of the REST API
API_URL = "https://datadryad.org/api/v2"
@@ -134,7 +139,7 @@ def files(self):
if hasattr(self, "_files"):
return self._files

doi_safe = quote(f"doi:{self.api_record_id}", safe="")
doi_safe = quote(f"doi:{self._params['record_id']}", safe="")
dataset_metadata_url = self.API_URL + "/datasets/" + doi_safe

res = requests.get(dataset_metadata_url)
@@ -178,7 +183,7 @@ def _get_attr_link(self, record):
class DataOneDataset(DatasetDownloader, DatasetResult):
"""Downloader for DataOne repositories."""

REGEXP_ID = r"view/doi:(.*)"
REGEXP_ID = r"view/doi:(?P<record_id>.*)"

# the base entry point of the REST API
API_URL = "https://cn.dataone.org/cn/v2/object/"
@@ -188,7 +193,7 @@ def files(self):
if hasattr(self, "_files"):
return self._files

doi_safe = quote(f"doi:{self.api_record_id}", safe="")
doi_safe = quote(f"doi:{self._params['record_id']}", safe="")

res = requests.get(self.API_URL + doi_safe)
res.raise_for_status()
@@ -216,7 +221,7 @@ def files(self):
class DSpaceDataset(DatasetDownloader, DatasetResult):
"""Downloader for DSpaceDataset repositories."""

REGEXP_ID = r"handle/(\d+\/\d+)"
REGEXP_ID = r"handle/(?P<record_id>\d+\/\d+)"

# paths to file attributes
ATTR_KIND_JSONPATH = "attributes.kind"
@@ -232,7 +237,7 @@ def _get_attr_link(self, record):
return self.base_url + record["retrieveLink"]

def _pre_files(self):
handle_id_url = f"{self.base_url}/rest/handle/{self.api_record_id}"
handle_id_url = f"{self.base_url}/rest/handle/{self._params['record_id']}"
res = requests.get(handle_id_url)
res.raise_for_status()

@@ -243,18 +248,17 @@ def _pre_files(self):
class MendeleyDataset(DatasetDownloader, DatasetResult):
"""Downloader for Mendeley repository."""

REGEXP_ID_WITH_VERSION = r"data\.mendeley\.com\/datasets\/([0-9a-z]+)\/(\d+)"
REGEXP_ID = r"data\.mendeley\.com\/datasets\/([0-9a-z]+)"
REGEXP_ID = r"data\.mendeley\.com\/datasets\/(?P<record_id>[0-9a-z]+)(?:\/(?P<version>\d+)|)" # noqa

# the base entry point of the REST API
API_URL = "https://data.mendeley.com/public-api/"

# version url
API_URL_VERSION = "{api_url}datasets/{api_record_id}/versions"
API_URL_VERSION = "{api_url}datasets/{record_id}/versions"

# the files and metadata about the dataset
API_URL_META = (
"{api_url}datasets/{api_record_id}/files?folder_id=root&version={version}"
"{api_url}datasets/{record_id}/files?folder_id=root&version={version}"
)

# paths to file attributes
@@ -267,9 +271,7 @@ class MendeleyDataset(DatasetDownloader, DatasetResult):
def _pre_files(self):
if self.version is None:
r_version = requests.get(
self.API_URL_VERSION.format(
api_url=self.API_URL, api_record_id=self.api_record_id
)
self.API_URL_VERSION.format(api_url=self.API_URL, **self._params)
)
r_version.raise_for_status()
self.version = r_version.json()[-1]["version"]
@@ -279,11 +281,11 @@ class GitHubDataset(DatasetDownloader, DatasetResult):
"""Downloader for GitHub repository."""

API_URL = "https://github.com/"
REGEXP_ID = r"github\.com\/([a-zA-Z0-9]+\/[a-zA-Z0-9]+)[\/]*.*"
REGEXP_ID = r"github\.com\/(?P<record_id>[a-zA-Z0-9]+\/[a-zA-Z0-9]+)[\/]*.*"

def _get(self, output_folder: Union[Path, str], *args, **kwargs):
res = requests.get(
f"{self.API_URL}{self.api_record_id}/archive/refs/heads/master.zip"
f"{self.API_URL}{self._params['record_id']}/archive/refs/heads/master.zip"
)
z = zipfile.ZipFile(io.BytesIO(res.content))
z.extractall(output_folder)
@@ -292,7 +294,7 @@ def _get(self, output_folder: Union[Path, str], *args, **kwargs):
class HuggingFaceDataset(DatasetDownloader, DatasetResult):
"""Downloader for Huggingface repository."""

REGEXP_ID = r"huggingface.co/datasets/(.*)"
REGEXP_ID = r"huggingface.co/datasets/(?P<record_id>.*)"

def _get(
self,
@@ -307,4 +309,4 @@ def _get(
" or use 'pip install datahugger[all]'"
) from err

load_dataset(self.api_record_id, cache_dir=output_folder, **kwargs)
load_dataset(self._params["record_id"], cache_dir=output_folder, **kwargs)
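The Dataverse changes above carry the new single-file behavior: the <type> named group records whether a persistent-ID link uses dataset.xhtml or file.xhtml, _pre_files sets is_singleton for the latter, and the files property then calls the per-file metadata endpoint. A simplified sketch of that dispatch (assumed from the class above, using the DOIs from the test suite):

import re

# Regex and URL templates as added to DataverseDataset in this commit.
REGEXP_ID = r"(?P<type>dataset|file)\.xhtml\?persistentId=(?P<record_id>.*)"
DATASET_META = "{base_url}/api/datasets/:persistentId/?persistentId={record_id}"
FILE_META = "{base_url}/api/files/:persistentId/?persistentId={record_id}"

for url in (
    "https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/KBHLOD",
    "https://dataverse.harvard.edu/file.xhtml?persistentId=doi:10.7910/DVN/HZBYG7/RQ26H2",
):
    params = re.search(REGEXP_ID, url).groupdict()
    # file.xhtml links mark the record as a singleton and switch to the
    # per-file endpoint; str.format simply ignores the extra 'type' key.
    template = FILE_META if params["type"] == "file" else DATASET_META
    print(template.format(base_url="https://dataverse.harvard.edu", **params))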
2 changes: 2 additions & 0 deletions tests/test_repositories.py
@@ -14,6 +14,8 @@
"tutorial1.py",
),
("https://doi.org/10.7910/DVN/KBHLOD", "tutorial1.py"),
# Dataverse single file
("10.7910/DVN/HZBYG7/RQ26H2", "Table 2.do"),
# Figshare
("https://doi.org/10.6084/m9.figshare.8851784.v1", "cross_year_data2.csv"),
(

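The new test case exercises the single-file path end to end. Assuming datahugger's top-level get() helper, the equivalent usage would be (a sketch; the expected file name comes from the test above):

import datahugger

# A DOI resolving to a single Dataverse file should now download just that
# file ("Table 2.do") instead of a whole dataset.
datahugger.get("10.7910/DVN/HZBYG7/RQ26H2", "output_folder")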