Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Perfstress][Storage] Added Datalake perf tests #15861

Merged
merged 7 commits into from
Mar 3, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for license information.
# --------------------------------------------------------------------------------------------

import os
import uuid

from azure_devtools.perfstress_tests import PerfStressTest

from azure.core.exceptions import ResourceNotFoundError
from azure.storage.filedatalake import DataLakeServiceClient as SyncDataLakeServiceClient
from azure.storage.filedatalake.aio import DataLakeServiceClient as AsyncDataLakeServiceClient


class _ServiceTest(PerfStressTest):
    """Base perf test holding a sync and an async Data Lake service client.

    The most recently built client pair is cached on the class. A new pair is
    built only when none is cached yet or when ``--no-client-share`` is given;
    otherwise each instance picks up the cached pair.
    """

    # Class-level cache of the service clients shared between instances.
    service_client = None
    async_service_client = None

    def __init__(self, arguments):
        super().__init__(arguments)
        conn_str = self.get_from_env("AZURE_STORAGE_CONNECTION_STRING")
        cache = _ServiceTest
        if not cache.service_client or self.args.no_client_share:
            cache.service_client = SyncDataLakeServiceClient.from_connection_string(
                conn_str=conn_str
            )
            cache.async_service_client = AsyncDataLakeServiceClient.from_connection_string(
                conn_str=conn_str
            )
        # Bind whatever pair is currently cached to this instance.
        self.service_client = cache.service_client
        self.async_service_client = cache.async_service_client

    async def close(self):
        """Close the async service client, then run the base-class close."""
        await self.async_service_client.close()
        await super().close()

    @staticmethod
    def add_arguments(parser):
        """Register the base-class options plus the storage-specific ones."""
        super(_ServiceTest, _ServiceTest).add_arguments(parser)
        parser.add_argument(
            '-c', '--max-concurrency',
            nargs='?',
            type=int,
            default=1,
            help='Maximum number of concurrent threads used for data transfer. Defaults to 1',
        )
        parser.add_argument(
            '-s', '--size',
            nargs='?',
            type=int,
            default=10240,
            help='Size of data to transfer. Default is 10240.',
        )
        parser.add_argument(
            '--no-client-share',
            action='store_true',
            default=False,
            help='Create one ServiceClient per test instance. Default is to share a single ServiceClient.',
        )


class _FileSystemTest(_ServiceTest):
    """Base perf test that works against one uniquely named file system.

    The file system is created in ``global_setup`` and deleted again in
    ``global_cleanup``.
    """

    # Evaluated once when the class body runs, so every instance created from
    # this class definition uses the same file system name.
    fs_name = "perfstress-" + str(uuid.uuid4())

    def __init__(self, arguments):
        super().__init__(arguments)
        name = self.fs_name
        self.fs_client = self.service_client.get_file_system_client(name)
        self.async_fs_client = self.async_service_client.get_file_system_client(name)

    async def global_setup(self):
        """Run the base-class global setup, then create the file system."""
        await super().global_setup()
        await self.async_fs_client.create_file_system()

    async def global_cleanup(self):
        """Delete the file system, then run the base-class global cleanup."""
        await self.async_fs_client.delete_file_system()
        await super().global_cleanup()

    async def close(self):
        """Close the async file system client before the inherited close."""
        await self.async_fs_client.close()
        await super().close()


class _FileTest(_FileSystemTest):
    """Base perf test that targets a single, randomly named file.

    The file is deleted (if it exists) both after the file system has been set
    up and before it is torn down.
    """

    def __init__(self, arguments):
        super().__init__(arguments)
        name = "sharefiletest-" + str(uuid.uuid4())
        self.file_client = self.fs_client.get_file_client(name)
        self.async_file_client = self.async_fs_client.get_file_client(name)

    async def _delete_file_if_present(self):
        """Delete the file through the async client; a missing file is fine."""
        try:
            await self.async_file_client.delete_file()
        except ResourceNotFoundError:
            pass

    async def global_setup(self):
        """Run the inherited global setup, then clear out any existing file."""
        await super().global_setup()
        await self._delete_file_if_present()

    async def global_cleanup(self):
        """Remove the file (if any) before the inherited global cleanup."""
        await self._delete_file_if_present()
        await super().global_cleanup()

    async def close(self):
        """Close the async file client before the inherited close."""
        await self.async_file_client.close()
        await super().close()
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for license information.
# --------------------------------------------------------------------------------------------

import uuid

from azure_devtools.perfstress_tests import RandomStream, AsyncRandomStream

from ._test_base import _FileSystemTest


class AppendTest(_FileSystemTest):
    """Perf test that repeatedly appends ``--size`` bytes to one file.

    Every iteration appends at offset 0. According to the review discussion
    on this test, re-appending with the same offset and length rewrites the
    previously appended data (the last append wins), so repeated runs are not
    expected to cause problems. The same discussion quotes the maximum file
    size as 4.75 TiB (100 MiB x 50,000 blocks) for service version 2016-05-31
    and later, and 195 GiB (4 MiB x 50,000 blocks) for older versions.
    """

    def __init__(self, arguments):
        super().__init__(arguments)
        name = "filetest-" + str(uuid.uuid4())
        self.file_client = self.fs_client.get_file_client(name)
        self.async_file_client = self.async_fs_client.get_file_client(name)

    async def setup(self):
        """Create the file that the append calls will target."""
        await self.async_file_client.create_file()

    def run_sync(self):
        """Append one freshly built random stream with the sync client."""
        size = self.args.size
        payload = RandomStream(size)
        self.file_client.append_data(payload, length=size, offset=0)

    async def run_async(self):
        """Append one freshly built random stream with the async client."""
        size = self.args.size
        payload = AsyncRandomStream(size)
        await self.async_file_client.append_data(payload, length=size, offset=0)
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for license information.
# --------------------------------------------------------------------------------------------

from azure_devtools.perfstress_tests import get_random_bytes, WriteStream

from ._test_base import _FileSystemTest


class DownloadTest(_FileSystemTest):
    """Perf test that downloads a pre-uploaded file into a ``WriteStream``."""

    def __init__(self, arguments):
        super().__init__(arguments)
        name = "downloadtest"
        self.file_client = self.fs_client.get_file_client(name)
        self.async_file_client = self.async_fs_client.get_file_client(name)

    async def global_setup(self):
        """Create the file and fill it with ``--size`` random bytes."""
        await super().global_setup()
        payload = get_random_bytes(self.args.size)
        await self.async_file_client.create_file()
        await self.async_file_client.upload_data(payload, overwrite=True)

    def run_sync(self):
        """Download the file with the sync client into a fresh sink."""
        sink = WriteStream()
        downloader = self.file_client.download_file(
            max_concurrency=self.args.max_concurrency
        )
        downloader.readinto(sink)

    async def run_async(self):
        """Download the file with the async client into a fresh sink."""
        sink = WriteStream()
        downloader = await self.async_file_client.download_file(
            max_concurrency=self.args.max_concurrency
        )
        await downloader.readinto(sink)

    async def close(self):
        """Close the async file client before the inherited close."""
        await self.async_file_client.close()
        await super().close()
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for license information.
# --------------------------------------------------------------------------------------------

from ._test_base import _FileTest

from azure_devtools.perfstress_tests import RandomStream
from azure_devtools.perfstress_tests import AsyncRandomStream


class UploadTest(_FileTest):
    """Perf test that uploads ``--size`` bytes from a random stream."""

    def run_sync(self):
        """Upload one freshly built random stream with the sync client."""
        size = self.args.size
        options = dict(
            length=size,
            overwrite=True,
            max_concurrency=self.args.max_concurrency,
        )
        self.file_client.upload_data(RandomStream(size), **options)

    async def run_async(self):
        """Upload one freshly built random stream with the async client."""
        size = self.args.size
        options = dict(
            length=size,
            overwrite=True,
            max_concurrency=self.args.max_concurrency,
        )
        await self.async_file_client.upload_data(AsyncRandomStream(size), **options)
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for license information.
# --------------------------------------------------------------------------------------------

import os
import tempfile

from azure_devtools.perfstress_tests import get_random_bytes

from ._test_base import _FileTest


class UploadFromFileTest(_FileTest):
    """Perf test that uploads the contents of a local temporary file.

    ``global_setup`` writes ``--size`` random bytes to a temp file whose path
    is stored on the class; each run re-opens that file and uploads it.
    """

    # Path of the local payload file; stays None until global_setup creates it.
    temp_file = None

    async def global_setup(self):
        """Run the inherited setup, then write the payload to a temp file."""
        await super().global_setup()
        data = get_random_bytes(self.args.size)
        # delete=False: the file must outlive this ``with`` so the runs can
        # re-open it by name; global_cleanup removes it.
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            UploadFromFileTest.temp_file = temp_file.name
            temp_file.write(data)

    async def global_cleanup(self):
        """Remove the local temp file and always run the inherited cleanup.

        The inherited cleanup runs in a ``finally`` so that a failure while
        removing the local file does not skip it. Nothing is removed when no
        temp file was ever created.
        """
        try:
            path = UploadFromFileTest.temp_file
            if path is not None:
                os.remove(path)
                UploadFromFileTest.temp_file = None
        finally:
            await super().global_cleanup()

    def run_sync(self):
        """Upload the temp file's contents with the sync client."""
        with open(UploadFromFileTest.temp_file, 'rb') as fp:
            self.file_client.upload_data(
                fp,
                overwrite=True,
                max_concurrency=self.args.max_concurrency)

    async def run_async(self):
        """Upload the temp file's contents with the async client."""
        with open(UploadFromFileTest.temp_file, 'rb') as fp:
            await self.async_file_client.upload_data(
                fp,
                overwrite=True,
                max_concurrency=self.args.max_concurrency)
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@

from .perf_stress_runner import PerfStressRunner
from .perf_stress_test import PerfStressTest
from .random_stream import RandomStream, get_random_bytes
from .random_stream import RandomStream, WriteStream, get_random_bytes
from .async_random_stream import AsyncRandomStream

__all__ = [
"PerfStressRunner",
"PerfStressTest",
"RandomStream",
"WriteStream",
"AsyncRandomStream",
"get_random_bytes"
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,20 @@ def read(self, size=None):
self._base_data = get_random_bytes(e)
self._base_data_length = e
self._remaining = self._remaining - e
self._position += e
return self._base_data[:e]

def seek(self, index, whence=0):
    """Move the reported stream position and return the new position.

    :param index: Offset to apply, interpreted according to ``whence``.
    :param whence: 0 = absolute, 1 = relative to the current position,
        2 = relative to the end of the stream (``self._length``).
    :returns: The new position.
    :raises ValueError: If ``whence`` is not 0, 1 or 2.

    Only ``self._position`` is changed here; ``self._remaining`` is left
    untouched.
    """
    if whence == 0:
        self._position = index
    elif whence == 1:
        self._position += index
    elif whence == 2:
        # End-relative seeks are measured from the stream length itself, so
        # seek(0, 2) followed by tell() reports the full length.
        self._position = self._length + index
    else:
        raise ValueError(f"Unsupported whence value: {whence!r}")
    return self._position

def tell(self):
    """Return the current value of the position counter."""
    return self._position

def remaining(self):
    """Return the ``_remaining`` counter that ``read`` decrements."""
    return self._remaining

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@ def __init__(self, length, initial_buffer_length=1024*1024):
self._base_data_length = initial_buffer_length
self._position = 0
self._remaining = length
self._length = length

def read(self, size=None):
if self._remaining == 0:
return None
return b""

if size is None:
e = self._base_data_length
Expand All @@ -29,7 +30,39 @@ def read(self, size=None):
self._base_data = get_random_bytes(e)
self._base_data_length = e
self._remaining = self._remaining - e
self._position += e
return self._base_data[:e]

def tell(self):
    """Return the current value of the position counter."""
    return self._position

def seek(self, index, whence=0):
    """Move the reported stream position and return the new position.

    :param index: Offset to apply, interpreted according to ``whence``.
    :param whence: 0 = absolute, 1 = relative to the current position,
        2 = relative to the end of the stream (``self._length``).
    :returns: The new position.
    :raises ValueError: If ``whence`` is not 0, 1 or 2.

    Only ``self._position`` is changed here; ``self._remaining`` is left
    untouched.
    """
    if whence == 0:
        self._position = index
    elif whence == 1:
        self._position += index
    elif whence == 2:
        # End-relative seeks are measured from the stream length itself, so
        # seek(0, 2) followed by tell() reports the full length.
        self._position = self._length + index
    else:
        raise ValueError(f"Unsupported whence value: {whence!r}")
    return self._position

def remaining(self):
    """Return the ``_remaining`` counter that ``read`` decrements."""
    return self._remaining


class WriteStream:
    """Write-only sink that discards its input and only tracks a position.

    ``write`` never stores the content; it just advances the position by the
    content length.
    """

    def __init__(self):
        self._position = 0

    def write(self, content):
        """Advance the position by ``len(content)`` and return that length."""
        length = len(content)
        self._position += length
        return length

    def seek(self, index, whence=0):
        """Set the position and return it.

        :param index: Offset to apply, interpreted according to ``whence``.
        :param whence: 0 (default) = absolute, 1 = relative to the current
            position. End-relative seeks are rejected because the sink keeps
            no record of an end.
        :returns: The new position.
        :raises ValueError: If ``whence`` is neither 0 nor 1.
        """
        if whence == 0:
            self._position = index
        elif whence == 1:
            self._position += index
        else:
            raise ValueError(f"Unsupported whence value: {whence!r}")
        return self._position

    def seekable(self):
        """Report that ``seek`` may be called on this sink."""
        return True

    def tell(self):
        """Return the current position."""
        return self._position