diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index c2cb5b3d..767a691e 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -6,10 +6,10 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Setup up Python 3.10 + - name: Setup up Python 3.11 uses: actions/setup-python@v2 with: - python-version: "3.10" + python-version: "3.11" - name: Update pip run: python -m pip install -U pip @@ -26,15 +26,15 @@ jobs: strategy: matrix: include: - - {python: '3.7', os: ubuntu-20.04} - {python: '3.8', os: ubuntu-20.04} - {python: '3.9', os: ubuntu-20.04} - {python: '3.10', os: ubuntu-20.04} + - {python: '3.11', os: ubuntu-20.04} - - {python: '3.7', os: windows-2019} - {python: '3.8', os: windows-2019} - {python: '3.9', os: windows-2019} - {python: '3.10', os: windows-2019} + - {python: '3.11', os: windows-2019} steps: - uses: actions/checkout@v2 @@ -63,10 +63,10 @@ jobs: strategy: matrix: include: - - {python: '3.7', os: ubuntu-20.04} - {python: '3.8', os: ubuntu-20.04} - {python: '3.9', os: ubuntu-20.04} - {python: '3.10', os: ubuntu-20.04} + - {python: '3.11', os: ubuntu-20.04} # # Some of the doctests don't pass on Windows because of Windows-specific @@ -105,10 +105,10 @@ jobs: strategy: matrix: include: - - {python: '3.7', os: ubuntu-20.04, moto_server: true} - {python: '3.8', os: ubuntu-20.04} - {python: '3.9', os: ubuntu-20.04} - {python: '3.10', os: ubuntu-20.04} + - {python: '3.11', os: ubuntu-20.04} # Not sure why we exclude these, perhaps for historical reasons? # @@ -159,10 +159,10 @@ jobs: strategy: matrix: include: - - {python: '3.7', os: ubuntu-20.04} - {python: '3.8', os: ubuntu-20.04} - {python: '3.9', os: ubuntu-20.04} - {python: '3.10', os: ubuntu-20.04} + - {python: '3.11', os: ubuntu-20.04} # - {python: '3.7', os: windows-2019} # - {python: '3.8', os: windows-2019} diff --git a/CHANGELOG.md b/CHANGELOG.md index eb914a2c..01671d47 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,15 @@ # Unreleased +## 6.4.0, 2023-09-07 + +* Ignore S3 seeks to the current position (PR [#782](https://github.com/RaRe-Technologies/smart_open/pull/782), [@beck3905](https://github.com/beck3905)) +* Set binary mode prior to FTP write (PR [#781](https://github.com/RaRe-Technologies/smart_open/pull/781), [@beck3905](https://github.com/beck3905)) +* Improve S3 URI Parsing for URIs with "@", "/", and ":" (PR [#776](https://github.com/RaRe-Technologies/smart_open/pull/776), [@rileypeterson](https://github.com/rileypeterson)) +* Add python 3.11 to setup.py (PR [#775](https://github.com/RaRe-Technologies/smart_open/pull/775), [@tooptoop4](https://github.com/tooptoop4)) +* Fix retrieving empty but existing object from S3 (PR [#771](https://github.com/RaRe-Technologies/smart_open/pull/771), [@Darkheir](https://github.com/Darkheir)) +* Avoid overfilling buffer when reading from Azure (PR [#767](https://github.com/RaRe-Technologies/smart_open/pull/767), [@ronreiter](https://github.com/ronreiter)) +* Add required import for example to work (PR [#756](https://github.com/RaRe-Technologies/smart_open/pull/756), [@jensenbox](https://github.com/jensenbox)) + ## 6.3.0, 2022-12-12 * Refactor Google Cloud Storage to use blob.open (__[ddelange](https://github.com/ddelange)__, [#744](https://github.com/RaRe-Technologies/smart_open/pull/744)) diff --git a/README.rst b/README.rst index 64435232..79b45844 100644 --- a/README.rst +++ b/README.rst @@ -151,6 +151,7 @@ For the sake of simplicity, the examples below assume you have all the dependenc .. code-block:: python >>> import os, boto3 + >>> from smart_open import open >>> >>> # stream content *into* S3 (write mode) using a custom session >>> session = boto3.Session( diff --git a/setup.py b/setup.py index 57ad9e66..8c08b8e9 100644 --- a/setup.py +++ b/setup.py @@ -95,6 +95,7 @@ def read(fname): 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', 'Topic :: System :: Distributed Computing', 'Topic :: Database :: Front-Ends', ], diff --git a/smart_open/azure.py b/smart_open/azure.py index 96f944a1..ccc19059 100644 --- a/smart_open/azure.py +++ b/smart_open/azure.py @@ -306,7 +306,7 @@ def read(self, size=-1): if self._position == self._size: return self._read_from_buffer() - self._fill_buffer() + self._fill_buffer(size) return self._read_from_buffer(size) def read1(self, size=-1): diff --git a/smart_open/ftp.py b/smart_open/ftp.py index 3dbe26f1..7d4a5ad5 100644 --- a/smart_open/ftp.py +++ b/smart_open/ftp.py @@ -14,6 +14,7 @@ import smart_open.utils from ftplib import FTP, FTP_TLS, error_reply import types + logger = logging.getLogger(__name__) SCHEMES = ("ftp", "ftps") @@ -55,8 +56,13 @@ def open_uri(uri, mode, transport_params): uri_path = parsed_uri.pop("uri_path") scheme = parsed_uri.pop("scheme") secure_conn = True if scheme == "ftps" else False - return open(uri_path, mode, secure_connection=secure_conn, - transport_params=transport_params, **parsed_uri) + return open( + uri_path, + mode, + secure_connection=secure_conn, + transport_params=transport_params, + **parsed_uri, + ) def convert_transport_params_to_args(transport_params): @@ -90,7 +96,9 @@ def _connect(hostname, username, port, password, secure_connection, transport_pa try: ftp.login(username, password) except error_reply as e: - logger.error("Unable to login to FTP server: try checking the username and password!") + logger.error( + "Unable to login to FTP server: try checking the username and password!" + ) raise e if secure_connection: ftp.prot_p() @@ -99,7 +107,7 @@ def _connect(hostname, username, port, password, secure_connection, transport_pa def open( path, - mode="r", + mode="rb", host=None, user=None, password=None, @@ -146,6 +154,7 @@ def open( except KeyError: raise ValueError(f"unsupported mode: {mode!r}") ftp_mode, file_obj_mode = mode_to_ftp_cmds[mode] + conn.voidcmd("TYPE I") socket = conn.transfercmd(f"{ftp_mode} {path}") fobj = socket.makefile(file_obj_mode) @@ -153,6 +162,7 @@ def full_close(self): self.orig_close() self.socket.close() self.conn.close() + fobj.orig_close = fobj.close fobj.socket = socket fobj.conn = conn diff --git a/smart_open/s3.py b/smart_open/s3.py index d8ce60ce..32c55202 100644 --- a/smart_open/s3.py +++ b/smart_open/s3.py @@ -335,10 +335,13 @@ def open( def _get(client, bucket, key, version, range_string): try: + params = dict(Bucket=bucket, Key=key) if version: - return client.get_object(Bucket=bucket, Key=key, VersionId=version, Range=range_string) - else: - return client.get_object(Bucket=bucket, Key=key, Range=range_string) + params["VersionId"] = version + if range_string: + params["Range"] = range_string + + return client.get_object(**params) except botocore.client.ClientError as error: wrapped_error = IOError( 'unable to access bucket: %r key: %r version: %r error: %s' % ( @@ -458,8 +461,19 @@ def _open_body(self, start=None, stop=None): error_response = _unwrap_ioerror(ioe) if error_response is None or error_response.get('Code') != _OUT_OF_RANGE: raise - self._position = self._content_length = int(error_response['ActualObjectSize']) - self._body = io.BytesIO() + try: + self._position = self._content_length = int(error_response['ActualObjectSize']) + self._body = io.BytesIO() + except KeyError: + response = _get( + self._client, + self._bucket, + self._key, + self._version_id, + None, + ) + self._position = self._content_length = response["ContentLength"] + self._body = response["Body"] else: # # Keep track of how many times boto3's built-in retry mechanism @@ -472,7 +486,7 @@ def _open_body(self, start=None, stop=None): self, response['ResponseMetadata']['RetryAttempts'], ) - units, start, stop, length = smart_open.utils.parse_content_range(response['ContentRange']) + _, start, stop, length = smart_open.utils.parse_content_range(response['ContentRange']) self._content_length = length self._position = start self._body = response['Body'] @@ -575,6 +589,7 @@ def __init__( self._buffer = smart_open.bytebuffer.ByteBuffer(buffer_size) self._eof = False self._line_terminator = line_terminator + self._seek_initialized = False # # This member is part of the io.BufferedIOBase interface. @@ -674,10 +689,16 @@ def seek(self, offset, whence=constants.WHENCE_START): whence = constants.WHENCE_START offset += self._current_pos - self._current_pos = self._raw_reader.seek(offset, whence) + if not self._seek_initialized or not ( + whence == constants.WHENCE_START and offset == self._current_pos + ): + self._current_pos = self._raw_reader.seek(offset, whence) + + self._buffer.empty() - self._buffer.empty() self._eof = self._current_pos == self._raw_reader._content_length + + self._seek_initialized = True return self._current_pos def tell(self): diff --git a/smart_open/tests/test_s3.py b/smart_open/tests/test_s3.py index a91a731e..fa907101 100644 --- a/smart_open/tests/test_s3.py +++ b/smart_open/tests/test_s3.py @@ -73,6 +73,8 @@ def mock_get(*args, **kwargs): error_response['ActualObjectSize'] = actual_size error_response['Code'] = 'InvalidRange' error_response['Message'] = 'The requested range is not satisfiable' + if actual_size is None: + error_response.pop('ActualObjectSize', None) raise with mock.patch('smart_open.s3._get', new=mock_get): @@ -399,6 +401,15 @@ def test_read_empty_file(self): self.assertEqual(data, b'') + def test_read_empty_file_no_actual_size(self): + _resource('s3').Object(BUCKET_NAME, KEY_NAME).put(Body=b'') + + with self.assertApiCalls(GetObject=2), patch_invalid_range_response(None): + with smart_open.s3.Reader(BUCKET_NAME, KEY_NAME) as fin: + data = fin.read() + + self.assertEqual(data, b'') + @moto.mock_s3 class MultipartWriterTest(unittest.TestCase): diff --git a/smart_open/version.py b/smart_open/version.py index 676f190a..15d95919 100644 --- a/smart_open/version.py +++ b/smart_open/version.py @@ -1,4 +1,4 @@ -__version__ = '6.3.0' +__version__ = '6.4.0' if __name__ == '__main__':