nasa · frankinspace · Apr 23, 2024 · Apr 22, 2024 · Apr 23, 2024
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -23,7 +23,7 @@ jobs:
       fail-fast: false
       matrix:
         python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
-        poetry-version: ["1.7.1"]
+        poetry-version: ["1.8.2"]
         os: [ubuntu-latest]
     runs-on: ${{ matrix.os }}
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,9 +10,17 @@ adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ### Changed
 
-- [issues/35](https://github.com/nasa/python_cmr/issues/35) Eliminated accommodation for
-  Python versions older than 3.8 and updated CI build to test against Python versions
-  3.8 through 3.12.  Also, fixed all flake8 warnings.
+- [issues/35](https://github.com/nasa/python_cmr/issues/35) Eliminated
+  accommodation for Python versions older than 3.8 and updated CI build to test
+  against Python versions 3.8 through 3.12.  Also, fixed all flake8 warnings.
+
+### Fixed
+
+- [issues/42](https://github.com/nasa/python_cmr/issues/42) Fixed bug where a
+  `KeyError` was thrown from `Query.get` when the query format was a supported
+  format other than `"json"`.  Further, in such cases, too many items would be
+  fetched from the CMR due to a bug in how items were counted.  Now, no more
+  than `limit` items are fetched.
 
 ## [0.10.0]
 

diff --git a/cmr/queries.py b/cmr/queries.py
@@ -44,38 +44,35 @@ def get(self, limit=2000):
         :returns: query results as a list
         """
 
-        page_size = min(limit, 2000)
         url = self._build_url()
 
         results = []
+        headers = self.headers.copy() if self.headers else {}
         more_results = True
+        n_results = 0
 
         while more_results:
-            # Only get what we need
-            page_size = min(limit - len(results), page_size)
-            response = get(url, headers=self.headers, params={'page_size': page_size})
-            if self.headers is None:
-                self.headers = {}
-            self.headers['cmr-search-after'] = response.headers.get('cmr-search-after')
-
-            try:
-                response.raise_for_status()
-            except exceptions.HTTPError as ex:
-                raise RuntimeError(ex.response.text)
+            # Only get what we need on the last page.
+            page_size = min(limit - n_results, 2000)
+            response = get(url, headers=headers, params={"page_size": page_size})
+            response.raise_for_status()
 
-            if self._format == "json":
-                latest = response.json()['feed']['entry']
-            else:
-                latest = [response.text]
+            # Explicitly track the number of results we have because the length
+            # of the results list will only match the number of entries fetched
+            # when the format is JSON.  Otherwise, the length of the results
+            # list is the number of *pages* fetched, not the number of *items*.
+            n_results += page_size
 
-            results.extend(latest)
+            results.extend(
+                response.json()["feed"]["entry"]
+                if self._format == "json"
+                else [response.text]
+            )
 
-            if page_size > len(response.json()['feed']['entry']) or len(results) >= limit:
-                more_results = False
+            if cmr_search_after := response.headers.get("cmr-search-after"):
+                headers["cmr-search-after"] = cmr_search_after
 
-        # This header is transient. We need to get rid of it before we do another different query
-        if self.headers['cmr-search-after']:
-            del self.headers['cmr-search-after']
+            more_results = n_results < limit and cmr_search_after is not None
 
         return results