From f232cf1da1a9f798a0802d4ea880d17bf908d376 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 12 Jul 2022 16:18:56 +0200 Subject: [PATCH 01/79] SIgnificant simplification of baseparticlefile by using ds.to_zarr() --- parcels/particlefile/baseparticlefile.py | 236 ++++++----------------- 1 file changed, 55 insertions(+), 181 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index c31fe4287..678a3e811 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -1,13 +1,9 @@ -"""Module controlling the writing of ParticleSets to NetCDF file""" -import os -import random -import shutil -import string +"""Module controlling the writing of ParticleSets to Zarr file""" from abc import ABC from abc import abstractmethod -import gzip - +import os import numpy as np +import xarray as xr try: from mpi4py import MPI @@ -17,12 +13,6 @@ from parcels._version import version as parcels_version except: raise EnvironmentError('Parcels version can not be retrieved. Have you run ''python setup.py install''?') -try: - from os import getuid -except: - # Windows does not have getuid(), so define to simply return 'tmp' - def getuid(): - return 'tmp' __all__ = ['BaseParticleFile'] @@ -45,17 +35,11 @@ class BaseParticleFile(ABC): It is either a timedelta object or a positive double. :param write_ondelete: Boolean to write particle data only when they are deleted. Default is False :param convert_at_end: Boolean to convert npy files to netcdf at end of run. Default is True - :param tempwritedir: directories to write temporary files to during executing. - Default is out-XXXXXX where Xs are random capitals. Files for individual - processors are written to subdirectories 0, 1, 2 etc under tempwritedir - :param pset_info: dictionary of info on the ParticleSet, stored in tempwritedir/XX/pset_info.npy, - used to create NetCDF file from npy-files. 
""" write_ondelete = None convert_at_end = None outputdt = None lasttime_written = None - dataset = None name = None particleset = None parcels_mesh = None @@ -63,74 +47,58 @@ class BaseParticleFile(ABC): lonlatdepth_dtype = None var_names = None var_dtypes = None - file_list = None var_names_once = None var_dtypes_once = None - file_list_once = None maxid_written = -1 - tempwritedir_base = None - tempwritedir = None - def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, convert_at_end=True, - tempwritedir=None, pset_info=None): + def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, convert_at_end=True): self.write_ondelete = write_ondelete - self.convert_at_end = convert_at_end self.outputdt = outputdt self.lasttime_written = None # variable to check if time has been written already - self.dataset = None - if pset_info: - for v in pset_info.keys(): - setattr(self, v, pset_info[v]) - else: - self.name = name - self.particleset = particleset - self.parcels_mesh = 'spherical' - if self.particleset.fieldset is not None: - self.parcels_mesh = self.particleset.fieldset.gridset.grids[0].mesh - self.time_origin = self.particleset.time_origin - self.lonlatdepth_dtype = self.particleset.collection.lonlatdepth_dtype - self.var_names = [] - self.var_dtypes = [] - self.var_names_once = [] - self.var_dtypes_once = [] - for v in self.particleset.collection.ptype.variables: - if v.to_write == 'once': - self.var_names_once += [v.name] - self.var_dtypes_once += [v.dtype] - elif v.to_write is True: - self.var_names += [v.name] - self.var_dtypes += [v.dtype] - if len(self.var_names_once) > 0: - self.written_once = [] - self.file_list_once = [] - - self.file_list = [] + self.name = name + self.particleset = particleset + self.parcels_mesh = 'spherical' + if self.particleset.fieldset is not None: + self.parcels_mesh = self.particleset.fieldset.gridset.grids[0].mesh + self.time_origin = self.particleset.time_origin + self.lonlatdepth_dtype = self.particleset.collection.lonlatdepth_dtype + self.var_names = [] + self.var_dtypes = [] + self.var_names_once = [] + self.var_dtypes_once = [] + for v in self.particleset.collection.ptype.variables: + if v.to_write == 'once': + self.var_names_once += [v.name] + self.var_dtypes_once += [v.dtype] + elif v.to_write is True: + self.var_names += [v.name] + self.var_dtypes += [v.dtype] + if len(self.var_names_once) > 0: + self.written_once = [] + self.written_first = False self.metadata = {"feature_type": "trajectory", "Conventions": "CF-1.6/CF-1.7", "ncei_template_version": "NCEI_NetCDF_Trajectory_Template_v2.0", "parcels_version": parcels_version, "parcels_mesh": self.parcels_mesh} - tmp_dir = tempwritedir - if tempwritedir is None: - tmp_dir = os.path.join(os.path.dirname(str(self.name)), "out-%s" % ''.join(random.choice(string.ascii_uppercase) for _ in range(8))) - else: - tmp_dir = tempwritedir - - if MPI: - mpi_rank = MPI.COMM_WORLD.Get_rank() - self.tempwritedir_base = MPI.COMM_WORLD.bcast(tmp_dir, root=0) - else: - self.tempwritedir_base = tmp_dir - mpi_rank = 0 - self.tempwritedir = os.path.join(self.tempwritedir_base, "%d" % mpi_rank) + # Create dictionary to translate datatypes and fill_values + self.fmt_map = {np.float16: 'f2', np.float32: 'f4', np.float64: 'f8', + np.bool_: 'i1', np.int8: 'i1', np.int16: 'i2', + np.int32: 'i4', np.int64: 'i8', np.uint8: 'u1', + np.uint16: 'u2', np.uint32: 'u4', np.uint64: 'u8'} + self.fill_value_map = {np.float16: np.nan, np.float32: np.nan, np.float64: np.nan, + np.bool_: 
np.iinfo(np.int8).max, np.int8: np.iinfo(np.int8).max, + np.int16: np.iinfo(np.int16).max, np.int32: np.iinfo(np.int32).max, + np.int64: np.iinfo(np.int64).max, np.uint8: np.iinfo(np.uint8).max, + np.uint16: np.iinfo(np.uint16).max, np.uint32: np.iinfo(np.uint32).max, + np.uint64: np.iinfo(np.uint64).max} - if not os.path.exists(self.tempwritedir): - os.makedirs(self.tempwritedir) - elif pset_info is None: - raise IOError("output directory %s already exists. Please remove the directory." % self.tempwritedir) + extension = os.path.splitext(str(self.name))[1] + self.fname = self.name if extension in ['.nc', '.nc4', '.zarr'] else "%s.zarr" % self.name + self.outputformat = extension @abstractmethod def _reserved_var_names(self): @@ -139,26 +107,6 @@ def _reserved_var_names(self): """ pass - def open_output_file(self, data_shape): - """Initialise file for trajectory output. - The output follows the format outlined in the Discrete Sampling Geometries - section of the CF-conventions: - http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#discrete-sampling-geometries - The current implementation is based on the NCEI template: - http://www.nodc.noaa.gov/data/formats/netcdf/v2.0/trajectoryIncomplete.cdl - - :param data_shape: shape of the variables in the output file - """ - extension = os.path.splitext(str(self.name))[1] - self.fname = self.name if extension in ['.nc', '.nc4', '.zarr'] else "%s.nc" % self.name - self.outputformat = extension - if os.path.exists(str(self.fname)): - if 'zarr' in self.outputformat: - shutil.rmtree(str(self.fname)) - else: - os.remove(str(self.fname)) - self.attrs = self._create_variables_attribute_dict() - def _create_variables_attribute_dict(self): """ creates the dictionary with variable attributes. @@ -208,18 +156,10 @@ def _create_variables_attribute_dict(self): return attrs def __del__(self): - if self.convert_at_end: - self.close() + self.close() def close(self, delete_tempfiles=True): - """Close the ParticleFile object by exporting and then deleting - the temporary npy files""" - self.export() - mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 - if mpi_rank == 0: - if delete_tempfiles: - self.delete_tempwritedir(tempwritedir=self.tempwritedir_base) - self.convert_at_end = False + pass def add_metadata(self, name, message): """Add metadata to :class:`parcels.particleset.ParticleSet` @@ -229,91 +169,25 @@ def add_metadata(self, name, message): """ self.metadata[name] = message - def dump_dict_to_npy(self, data_dict, data_dict_once): - """Buffer data to set of temporary numpy files, using np.save""" - - if not os.path.exists(self.tempwritedir): - os.makedirs(self.tempwritedir) - - if len(data_dict) > 0: - tmpfilename = os.path.join(self.tempwritedir, str(len(self.file_list)) + ".npy.gz") - with gzip.open(tmpfilename, 'wb') as f: - np.save(f, data_dict) - self.file_list.append(tmpfilename) - - if len(data_dict_once) > 0: - tmpfilename = os.path.join(self.tempwritedir, str(len(self.file_list)) + '_once.npy.gz') - with gzip.open(tmpfilename, 'wb') as f: - np.save(f, data_dict_once) - self.file_list_once.append(tmpfilename) - - @abstractmethod - def get_pset_info_attributes(self): - """ - returns the main attributes of the pset_info.npy file. - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - """ - return None - - def dump_psetinfo_to_npy(self): - """ - function writes the major attributes and values to a pset information file (*.npy). 
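The fmt_map and fill_value_map introduced above pair each numpy dtype with a sentinel for missing observations: NaN for floating-point variables and the dtype's maximum value for boolean and integer variables, since NaN cannot be stored in integer arrays. A minimal sketch of how such a map can pre-fill an output buffer follows; the helper name and array sizes are illustrative only and not part of the patch.

    import numpy as np

    fill_value_map = {np.float32: np.nan, np.float64: np.nan,
                      np.int32: np.iinfo(np.int32).max,
                      np.int64: np.iinfo(np.int64).max}

    def empty_buffer(n_traj, n_obs, dtype):
        # Pre-fill with the dtype-appropriate sentinel so unwritten
        # (trajectory, observation) slots are recognisable as missing.
        return np.full((n_traj, n_obs), fill_value_map[dtype], dtype=dtype)

    lon = empty_buffer(4, 1, np.float64)   # filled with NaN
    ids = empty_buffer(4, 1, np.int64)     # filled with iinfo(int64).max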
- """ - pset_info = {} - attrs_to_dump = self.get_pset_info_attributes() - if attrs_to_dump is None: - return - for a in attrs_to_dump: - if hasattr(self, a): - pset_info[a] = getattr(self, a) - with open(os.path.join(self.tempwritedir, 'pset_info.npy'), 'wb') as f: - np.save(f, pset_info) - def write(self, pset, time, deleted_only=False): - """Write all data from one time step to a temporary npy-file - using a python dictionary. The data is saved in the folder 'out'. + """Write all data from one time step to the zarr file :param pset: ParticleSet object to write :param time: Time at which to write ParticleSet :param deleted_only: Flag to write only the deleted Particles """ - data_dict, data_dict_once = pset.to_dict(self, time, deleted_only=deleted_only) - self.dump_dict_to_npy(data_dict, data_dict_once) - self.dump_psetinfo_to_npy() + ds = xr.Dataset(attrs=self.metadata) + attrs = self._create_variables_attribute_dict() - @abstractmethod - def read_from_npy(self, file_list, time_steps, var): - """ - Read NPY-files for one variable using a loop over all files. - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - - :param file_list: List that contains all file names in the output directory - :param time_steps: Number of time steps that were written in out directory - :param var: name of the variable to read - """ - return None - - @abstractmethod - def export(self): - """ - Exports outputs in temporary NPY-files to NetCDF file - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - """ - pass - - def delete_tempwritedir(self, tempwritedir=None): - """Deleted all temporary npy files - - :param tempwritedir Optional path of the directory to delete - """ - if tempwritedir is None: - tempwritedir = self.tempwritedir - if os.path.exists(tempwritedir): - shutil.rmtree(tempwritedir) + for var, dtype in zip(self.var_names, self.var_dtypes): + varout = 'z' if var == 'depth' else var + varout = 'trajectory' if varout == 'id' else varout + ds[varout] = xr.DataArray(data=[getattr(pset, var)], dims=["obs", "traj"], attrs=attrs[varout]) + if self.written_first and "_FillValue" in ds[varout].attrs: + del ds[varout].attrs["_FillValue"] + if not self.written_first: + ds.to_zarr(self.fname, mode='w') + self.written_first = True + else: + ds.to_zarr(self.fname, mode='a', append_dim='obs') From f08fea65b4cdd231f1227ceef2526c3af05071e2 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 13 Jul 2022 14:50:41 +0200 Subject: [PATCH 02/79] Updating particlefile.write() to support deleting particles --- parcels/particlefile/baseparticlefile.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 678a3e811..71da9b089 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -49,7 +49,6 @@ class BaseParticleFile(ABC): var_dtypes = None var_names_once = None var_dtypes_once = None - maxid_written = -1 def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, convert_at_end=True): @@ -179,11 +178,15 @@ def write(self, pset, time, deleted_only=False): ds = xr.Dataset(attrs=self.metadata) attrs = self._create_variables_attribute_dict() + datalen = max(pset.id) + 1 + data = np.nan * np.ones((datalen, 1)) for var, dtype in zip(self.var_names, self.var_dtypes): varout = 'z' if var == 'depth' else var varout = 
'trajectory' if varout == 'id' else varout - ds[varout] = xr.DataArray(data=[getattr(pset, var)], dims=["obs", "traj"], attrs=attrs[varout]) + + data[pset.id, 0] = getattr(pset, var) + ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=attrs[varout]) if self.written_first and "_FillValue" in ds[varout].attrs: del ds[varout].attrs["_FillValue"] if not self.written_first: From f354d1af02a58e5ae33f3924f6474bb9c2a8ca49 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 13 Jul 2022 14:51:20 +0200 Subject: [PATCH 03/79] Updating particlefilesoa and -aos to new simpler writing --- parcels/particlefile/particlefileaos.py | 15 ++++----------- parcels/particlefile/particlefilesoa.py | 15 ++++----------- 2 files changed, 8 insertions(+), 22 deletions(-) diff --git a/parcels/particlefile/particlefileaos.py b/parcels/particlefile/particlefileaos.py index bcff77290..77cbf8a03 100644 --- a/parcels/particlefile/particlefileaos.py +++ b/parcels/particlefile/particlefileaos.py @@ -24,19 +24,12 @@ class ParticleFileAOS(BaseParticleFile): while ParticleFile is given as an argument of ParticleSet.execute() It is either a timedelta object or a positive double. :param write_ondelete: Boolean to write particle data only when they are deleted. Default is False - :param convert_at_end: Boolean to convert npy files to netcdf at end of run. Default is True - :param tempwritedir: directories to write temporary files to during executing. - Default is out-XXXXXX where Xs are random capitals. Files for individual - processors are written to subdirectories 0, 1, 2 etc under tempwritedir - :param pset_info: dictionary of info on the ParticleSet, stored in tempwritedir/XX/pset_info.npy, - used to create NetCDF file from npy-files. + :param convert_at_end: Boolean to convert zarr file to netcdf at end of run. Default is False """ - def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, convert_at_end=True, - tempwritedir=None, pset_info=None): + def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, convert_at_end=False): super(ParticleFileAOS, self).__init__(name=name, particleset=particleset, outputdt=outputdt, - write_ondelete=write_ondelete, convert_at_end=convert_at_end, - tempwritedir=tempwritedir, pset_info=pset_info) + write_ondelete=write_ondelete, convert_at_end=convert_at_end) def __del__(self): super(ParticleFileAOS, self).__del__() @@ -160,7 +153,7 @@ def export(self): for var, dtype in zip(self.var_names, self.var_dtypes): data = self.read_from_npy(global_file_list, n_timesteps, var, dtype) if var == self.var_names[0]: - self.open_output_file(data.shape) + self.open_output_file() varout = 'z' if var == 'depth' else var varout = 'trajectory' if varout == 'id' else varout ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=self.attrs[varout]) diff --git a/parcels/particlefile/particlefilesoa.py b/parcels/particlefile/particlefilesoa.py index 2180f0b44..3f59f16d1 100644 --- a/parcels/particlefile/particlefilesoa.py +++ b/parcels/particlefile/particlefilesoa.py @@ -24,19 +24,12 @@ class ParticleFileSOA(BaseParticleFile): while ParticleFile is given as an argument of ParticleSet.execute() It is either a timedelta object or a positive double. :param write_ondelete: Boolean to write particle data only when they are deleted. Default is False - :param convert_at_end: Boolean to convert npy files to netcdf at end of run. Default is True - :param tempwritedir: directories to write temporary files to during executing. 
- Default is out-XXXXXX where Xs are random capitals. Files for individual - processors are written to subdirectories 0, 1, 2 etc under tempwritedir - :param pset_info: dictionary of info on the ParticleSet, stored in tempwritedir/XX/pset_info.npy, - used to create NetCDF file from npy-files. + :param convert_at_end: Boolean to convert zarr file to netcdf at end of run. Default is False """ - def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, convert_at_end=True, - tempwritedir=None, pset_info=None): + def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, convert_at_end=False): super(ParticleFileSOA, self).__init__(name=name, particleset=particleset, outputdt=outputdt, - write_ondelete=write_ondelete, convert_at_end=convert_at_end, - tempwritedir=tempwritedir, pset_info=pset_info) + write_ondelete=write_ondelete, convert_at_end=convert_at_end) def __del__(self): super(ParticleFileSOA, self).__del__() @@ -161,7 +154,7 @@ def export(self): for var, dtype in zip(self.var_names, self.var_dtypes): data = self.read_from_npy(global_file_list, n_timesteps, var, dtype) if var == self.var_names[0]: - self.open_output_file(data.shape) + self.open_output_file() varout = 'z' if var == 'depth' else var varout = 'trajectory' if varout == 'id' else varout ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=self.attrs[varout]) From 831ca7c69a5b0ef39f51dc47f3afcd41172f046d Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 13 Jul 2022 14:51:46 +0200 Subject: [PATCH 04/79] Updating output-logger message in pset.execute() --- parcels/particleset/baseparticleset.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/parcels/particleset/baseparticleset.py b/parcels/particleset/baseparticleset.py index 9e5c99807..50e60b368 100644 --- a/parcels/particleset/baseparticleset.py +++ b/parcels/particleset/baseparticleset.py @@ -436,9 +436,7 @@ def execute(self, pyfunc=AdvectionRK4, pyfunc_inter=None, endtime=None, runtime= if verbose_progress is None and time_module.time() - walltime_start > 10: # Showing progressbar if runtime > 10 seconds if output_file: - logger.info('Temporary output files are stored in %s.' % output_file.tempwritedir_base) - logger.info('You can use "parcels_convert_npydir_to_netcdf %s" to convert these ' - 'to a NetCDF file during the run.' % output_file.tempwritedir_base) + logger.info('Output files are stored in %s.' 
% output_file.fname) pbar = self.__create_progressbar(_starttime, endtime) verbose_progress = True From 684d9cc9aefa8f45fa74255a0a990d364ef10155 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 13 Jul 2022 14:52:38 +0200 Subject: [PATCH 05/79] Updating test_particle_file to zarr dumping (first part) --- tests/test_particle_file.py | 59 +++++++++---------------------------- 1 file changed, 14 insertions(+), 45 deletions(-) diff --git a/tests/test_particle_file.py b/tests/test_particle_file.py index 8ab28b576..4ddb86a4d 100644 --- a/tests/test_particle_file.py +++ b/tests/test_particle_file.py @@ -33,39 +33,6 @@ def fieldset_ficture(xdim=40, ydim=100): return fieldset(xdim=xdim, ydim=ydim) -def close_and_compare_netcdffiles(filepath, ofile, assystemcall=False): - if assystemcall: - os.system('parcels_convert_npydir_to_netcdf %s' % ofile.tempwritedir_base) - else: - import parcels.scripts.convert_npydir_to_netcdf as convert - convert.convert_npydir_to_netcdf(ofile.tempwritedir_base, pfile_class=ofile.__class__) - - engine = 'zarr' if 'zarr' in str(filepath) else 'netcdf4' - ncfile1 = xr.open_dataset(filepath, engine=engine) - - ofile.name = filepath + 'b.nc' - ofile.export() - - if engine == 'zarr': - assert os.path.getsize(filepath) < os.path.getsize(ofile.name) # zarr expected to be smaller filesize - else: - assert os.path.getsize(filepath) == os.path.getsize(ofile.name) - - ncfile2 = xr.open_dataset(filepath + 'b.nc') - for v in ncfile2.keys(): - if v == 'time': - assert np.allclose(ncfile1[v].values, ncfile2[v].values, atol=np.timedelta64(1, 's'), equal_nan=True) - else: - assert np.allclose(ncfile1[v].values, ncfile2[v].values, equal_nan=True) - - for a in ncfile2.attrs: - if a != 'parcels_version': - assert getattr(ncfile1, a) == getattr(ncfile2, a) - - ncfile2.close() - return ncfile1 - - @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) def test_pfile_array_remove_particles(fieldset, pset_mode, mode, tmpdir, npart=10): @@ -79,11 +46,11 @@ def test_pfile_array_remove_particles(fieldset, pset_mode, mode, tmpdir, npart=1 for p in pset: p.time = 1 pfile.write(pset, 1) - ncfile = close_and_compare_netcdffiles(filepath, pfile) - timearr = ncfile.variables['time'][:] - assert (np.isnat(timearr[3, 1])) and (np.isfinite(timearr[3, 0])) - ncfile.close() + ds = xr.open_zarr(filepath) + timearr = ds['time'][:] + assert (np.isnat(timearr[3, 1])) and (np.isfinite(timearr[3, 0])) + ds.close() @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) @@ -100,11 +67,12 @@ def Update_lon(particle, fieldset, time): particle.lon += 0.1 pset.execute(Update_lon, runtime=10, output_file=pfile) - ncfile = close_and_compare_netcdffiles(filepath, pfile) - assert 'time' in ncfile.variables - assert 'depth' not in ncfile.variables - assert 'lat' not in ncfile.variables - ncfile.close() + + ds = xr.open_zarr(filepath) + assert 'time' in ds + assert 'depth' not in ds + assert 'lat' not in ds + ds.close() # For pytest purposes, we need to reset to original status pset.set_variable_write_status('depth', True) @@ -125,9 +93,10 @@ def test_pfile_array_remove_all_particles(fieldset, pset_mode, mode, tmpdir, npa pset.remove_indices(-1) pfile.write(pset, 1) pfile.write(pset, 2) - ncfile = close_and_compare_netcdffiles(filepath, pfile) - assert ncfile.variables['time'][:].shape == (npart, 1) - ncfile.close() + + ds = xr.open_zarr(filepath) + assert ds['time'][:].shape == (npart, 1) + ds.close() 
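The tests above replace the old NetCDF round-trip helper with direct reads of the Zarr store through xarray. A minimal sketch of that read pattern is shown below; the store path is a placeholder, not a file produced by the test suite.

    import numpy as np
    import xarray as xr

    # Open the Zarr store written by ParticleFile; open_zarr reads it lazily.
    ds = xr.open_zarr("particle_output.zarr")

    # Variables are laid out as (traj, obs); decoded 'time' uses NaT for missing slots.
    time = ds["time"].values
    print(time.shape)
    print(np.isnat(time).sum(), "missing observations")

    ds.close()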
@pytest.mark.parametrize('pset_mode', pset_modes) From 084a68d103487fe8a249140e6b113509d97c8f6d Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Fri, 15 Jul 2022 16:16:09 +0200 Subject: [PATCH 06/79] Update baseparticlefile.py --- parcels/particlefile/baseparticlefile.py | 34 ++++++++++-------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 71da9b089..acc1b739b 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -45,10 +45,6 @@ class BaseParticleFile(ABC): parcels_mesh = None time_origin = None lonlatdepth_dtype = None - var_names = None - var_dtypes = None - var_names_once = None - var_dtypes_once = None def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, convert_at_end=True): @@ -63,19 +59,15 @@ def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, c self.parcels_mesh = self.particleset.fieldset.gridset.grids[0].mesh self.time_origin = self.particleset.time_origin self.lonlatdepth_dtype = self.particleset.collection.lonlatdepth_dtype - self.var_names = [] - self.var_dtypes = [] - self.var_names_once = [] - self.var_dtypes_once = [] + self.vars_to_write = {} + self.vars_to_write_once = {} for v in self.particleset.collection.ptype.variables: if v.to_write == 'once': - self.var_names_once += [v.name] - self.var_dtypes_once += [v.dtype] + self.vars_to_write_once[v.name] = v.dtype elif v.to_write is True: - self.var_names += [v.name] - self.var_dtypes += [v.dtype] - if len(self.var_names_once) > 0: - self.written_once = [] + self.vars_to_write[v.name] = v.dtype + # if len(self.var_names_once) > 0: + # self.written_once = [] self.written_first = False self.metadata = {"feature_type": "trajectory", "Conventions": "CF-1.6/CF-1.7", @@ -97,6 +89,8 @@ def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, c extension = os.path.splitext(str(self.name))[1] self.fname = self.name if extension in ['.nc', '.nc4', '.zarr'] else "%s.zarr" % self.name + if extension == '': + extension = '.zarr' self.outputformat = extension @abstractmethod @@ -139,15 +133,15 @@ def _create_variables_attribute_dict(self): attrs['time']['units'] = "seconds since " + str(self.time_origin) attrs['time']['calendar'] = 'standard' if self.time_origin.calendar == 'np_datetime64' else self.time_origin.calendar - for vname, dtype in zip(self.var_names, self.var_dtypes): + for vname in self.vars_to_write: if vname not in self._reserved_var_names(): - attrs[vname] = {"_FillValue": self.fill_value_map[dtype], + attrs[vname] = {"_FillValue": self.fill_value_map[self.vars_to_write[vname]], "long_name": "", "standard_name": vname, "units": "unknown"} - for vname, dtype in zip(self.var_names_once, self.var_dtypes_once): - attrs[vname] = {"_FillValue": self.fill_value_map[dtype], + for vname in self.vars_to_write_once: + attrs[vname] = {"_FillValue": self.fill_value_map[self.vars_to_write_once[vname]], "long_name": "", "standard_name": vname, "units": "unknown"} @@ -179,12 +173,12 @@ def write(self, pset, time, deleted_only=False): ds = xr.Dataset(attrs=self.metadata) attrs = self._create_variables_attribute_dict() datalen = max(pset.id) + 1 - data = np.nan * np.ones((datalen, 1)) - for var, dtype in zip(self.var_names, self.var_dtypes): + for var in self.vars_to_write: varout = 'z' if var == 'depth' else var varout = 'trajectory' if varout == 'id' else varout + data = np.ones((datalen, 1), 
dtype=self.vars_to_write[var]) data[pset.id, 0] = getattr(pset, var) ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=attrs[varout]) if self.written_first and "_FillValue" in ds[varout].attrs: From 8101eb8f82dd4527f6494b2d69f75495bde49665 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Fri, 15 Jul 2022 16:16:29 +0200 Subject: [PATCH 07/79] Further additions to test_particle_file --- tests/test_particle_file.py | 41 +++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/tests/test_particle_file.py b/tests/test_particle_file.py index 4ddb86a4d..3bd24282b 100644 --- a/tests/test_particle_file.py +++ b/tests/test_particle_file.py @@ -6,7 +6,6 @@ import numpy as np import pytest import os -from netCDF4 import Dataset import cftime import random as py_random import xarray as xr @@ -36,7 +35,7 @@ def fieldset_ficture(xdim=40, ydim=100): @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) def test_pfile_array_remove_particles(fieldset, pset_mode, mode, tmpdir, npart=10): - filepath = tmpdir.join("pfile_array_remove_particles.nc") + filepath = tmpdir.join("pfile_array_remove_particles.zarr") pset = pset_type[pset_mode]['pset'](fieldset, pclass=ptype[mode], lon=np.linspace(0, 1, npart), lat=0.5*np.ones(npart), time=0) @@ -52,10 +51,11 @@ def test_pfile_array_remove_particles(fieldset, pset_mode, mode, tmpdir, npart=1 assert (np.isnat(timearr[3, 1])) and (np.isfinite(timearr[3, 0])) ds.close() + @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) def test_pfile_set_towrite_False(fieldset, pset_mode, mode, tmpdir, npart=10): - filepath = tmpdir.join("pfile_set_towrite_False.nc") + filepath = tmpdir.join("pfile_set_towrite_False.zarr") pset = pset_type[pset_mode]['pset'](fieldset, pclass=ptype[mode], lon=np.linspace(0, 1, npart), lat=0.5*np.ones(npart)) @@ -83,7 +83,7 @@ def Update_lon(particle, fieldset, time): @pytest.mark.parametrize('mode', ['scipy', 'jit']) def test_pfile_array_remove_all_particles(fieldset, pset_mode, mode, tmpdir, npart=10): - filepath = tmpdir.join("pfile_array_remove_particles.nc") + filepath = tmpdir.join("pfile_array_remove_particles.zarr") pset = pset_type[pset_mode]['pset'](fieldset, pclass=ptype[mode], lon=np.linspace(0, 1, npart), lat=0.5*np.ones(npart), time=0) @@ -101,9 +101,8 @@ def test_pfile_array_remove_all_particles(fieldset, pset_mode, mode, tmpdir, npa @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) -@pytest.mark.parametrize('assystemcall', [True, False]) -def test_variable_written_ondelete(fieldset, pset_mode, mode, tmpdir, assystemcall, npart=3): - filepath = tmpdir.join("pfile_on_delete_written_variables.nc") +def test_variable_written_ondelete(fieldset, pset_mode, mode, tmpdir, npart=3): + filepath = tmpdir.join("pfile_on_delete_written_variables.zarr") def move_west(particle, fieldset, time): tmp1, tmp2 = fieldset.UV[time, particle.depth, particle.lat, particle.lon] # to trigger out-of-bounds error @@ -126,17 +125,18 @@ def DeleteP(particle, fieldset, time): pset.execute(move_west, runtime=runtime, dt=dt, output_file=outfile, recovery={ErrorCode.ErrorOutOfBounds: DeleteP}) - ncfile = close_and_compare_netcdffiles(filepath, outfile, assystemcall=assystemcall) - assert ncfile.runtime == runtime - lon = ncfile.variables['lon'][:] + ds = xr.open_zarr(filepath) + assert ds.runtime == runtime + lon = ds['lon'][:] assert (lon.size == noutside) - 
ncfile.close() + ds.close() +# test_variable_written_ondelete(fieldset(), 'aos', 'jit', '') @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) def test_variable_write_double(fieldset, pset_mode, mode, tmpdir): - filepath = tmpdir.join("pfile_variable_write_double.nc") + filepath = tmpdir.join("pfile_variable_write_double.zarr") def Update_lon(particle, fieldset, time): particle.lon += 0.1 @@ -145,16 +145,16 @@ def Update_lon(particle, fieldset, time): ofile = pset.ParticleFile(name=filepath, outputdt=0.00001) pset.execute(pset.Kernel(Update_lon), endtime=0.001, dt=0.00001, output_file=ofile) - ncfile = close_and_compare_netcdffiles(filepath, ofile) - lons = ncfile.variables['lon'][:] + ds = xr.open_zarr(filepath) + lons = ds['lon'][:] assert (isinstance(lons.values[0, 0], np.float64)) - ncfile.close() + ds.close() @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) -def test_write_dtypes_pfile(fieldset, mode, pset_mode, tmpdir): - filepath = tmpdir.join("pfile_dtypes.nc") +def test_write_dtypes_pfile(fieldset, pset_mode, mode, tmpdir): + filepath = tmpdir.join("pfile_dtypes.zarr") dtypes = ['float32', 'float64', 'int32', 'uint32', 'int64', 'uint64'] if mode == 'scipy' or pset_mode == 'soa': @@ -168,12 +168,13 @@ class MyParticle(ptype[mode]): pset = pset_type[pset_mode]['pset'](fieldset, pclass=MyParticle, lon=0, lat=0) pfile = pset.ParticleFile(name=filepath, outputdt=1) pfile.write(pset, 0) - pfile.close() - ncfile = Dataset(filepath, 'r', 'NETCDF4') # using netCDF4.Dataset here because xarray does not observe all dtypes correctly + + ds = xr.open_zarr(filepath, mask_and_scale=False) # Note masking issue at https://stackoverflow.com/questions/68460507/xarray-loading-int-data-as-float for d in dtypes: nc_fmt = d if d != 'bool_' else 'i1' - assert ncfile.variables[f'v_{d}'].dtype == nc_fmt + assert ds[f'v_{d}'].dtype == nc_fmt +test_write_dtypes_pfile(fieldset(), 'aos', 'jit', '') @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) From 762a4ea3069615b13b6bc9da8b040ce235712367 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 19 Jul 2022 17:37:09 +0200 Subject: [PATCH 08/79] Updating dump_to_zarr to support adding and removing particles --- parcels/collection/collectionaos.py | 6 +- parcels/collection/collectionsoa.py | 6 +- parcels/particlefile/baseparticlefile.py | 111 +++++++++++++++++++---- tests/test_particle_file.py | 16 ++-- 4 files changed, 108 insertions(+), 31 deletions(-) diff --git a/parcels/collection/collectionaos.py b/parcels/collection/collectionaos.py index adddb8fe7..97ad0d328 100644 --- a/parcels/collection/collectionaos.py +++ b/parcels/collection/collectionaos.py @@ -925,7 +925,7 @@ def toDictionary(self, pfile, time, deleted_only=False): else: indices_to_write = _to_write_particles(self._data, time) if len(indices_to_write) > 0: - for var in pfile.var_names: + for var in pfile.vars_to_write: if 'id' in var: data_dict[var] = np.array([np.int64(getattr(p, var)) for p in self._data[indices_to_write]]) else: @@ -935,12 +935,12 @@ def toDictionary(self, pfile, time, deleted_only=False): for p in pset_errs: logger.warning_once('time argument in pfile.write() is %g, but a particle has time % g.' 
% (time, p.time)) - if len(pfile.var_names_once) > 0: + if len(pfile.vars_to_write_once) > 0: # _to_write_particles(self._data, time) first_write = [p for p in self._data if _is_particle_started_yet(p, time) and (np.int64(p.id) not in pfile.written_once)] if np.any(first_write): data_dict_once['id'] = np.array([p.id for p in first_write]).astype(dtype=np.int64) - for var in pfile.var_names_once: + for var in pfile.vars_to_write_once: data_dict_once[var] = np.array([getattr(p, var) for p in first_write]) pfile.written_once.extend(np.array(data_dict_once['id']).astype(dtype=np.int64).tolist()) diff --git a/parcels/collection/collectionsoa.py b/parcels/collection/collectionsoa.py index 3d24c0867..191729c04 100644 --- a/parcels/collection/collectionsoa.py +++ b/parcels/collection/collectionsoa.py @@ -848,18 +848,18 @@ def toDictionary(self, pfile, time, deleted_only=False): else: indices_to_write = _to_write_particles(self._data, time) if np.any(indices_to_write): - for var in pfile.var_names: + for var in pfile.vars_to_write: data_dict[var] = self._data[var][indices_to_write] pset_errs = ((self._data['state'][indices_to_write] != OperationCode.Delete) & np.greater(np.abs(time - self._data['time'][indices_to_write]), 1e-3, where=np.isfinite(self._data['time'][indices_to_write]))) if np.count_nonzero(pset_errs) > 0: logger.warning_once('time argument in pfile.write() is {}, but particles have time {}'.format(time, self._data['time'][pset_errs])) - if len(pfile.var_names_once) > 0: + if len(pfile.vars_to_write_once) > 0: first_write = (_to_write_particles(self._data, time) & _is_particle_started_yet(self._data, time) & np.isin(self._data['id'], pfile.written_once, invert=True)) if np.any(first_write): data_dict_once['id'] = np.array(self._data['id'][first_write]).astype(dtype=np.int64) - for var in pfile.var_names_once: + for var in pfile.vars_to_write_once: data_dict_once[var] = self._data[var][first_write] pfile.written_once.extend(np.array(self._data['id'][first_write]).astype(dtype=np.int64).tolist()) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index acc1b739b..bddb5c338 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -4,6 +4,7 @@ import os import numpy as np import xarray as xr +import zarr try: from mpi4py import MPI @@ -68,6 +69,8 @@ def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, c self.vars_to_write[v.name] = v.dtype # if len(self.var_names_once) > 0: # self.written_once = [] + self.IDs_written = {} + self.maxobs = {} self.written_first = False self.metadata = {"feature_type": "trajectory", "Conventions": "CF-1.6/CF-1.7", @@ -169,22 +172,94 @@ def write(self, pset, time, deleted_only=False): :param time: Time at which to write ParticleSet :param deleted_only: Flag to write only the deleted Particles """ + data_dict, data_dict_once = pset.to_dict(self, time, deleted_only=deleted_only) - ds = xr.Dataset(attrs=self.metadata) - attrs = self._create_variables_attribute_dict() - datalen = max(pset.id) + 1 - - for var in self.vars_to_write: - varout = 'z' if var == 'depth' else var - varout = 'trajectory' if varout == 'id' else varout - - data = np.ones((datalen, 1), dtype=self.vars_to_write[var]) - data[pset.id, 0] = getattr(pset, var) - ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=attrs[varout]) - if self.written_first and "_FillValue" in ds[varout].attrs: - del ds[varout].attrs["_FillValue"] - if not self.written_first: - 
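The write() above moves from whole-dataset appends to direct updates of the Zarr arrays: the (traj, obs) arrays are grown with append() whenever a new observation column or newly released particles appear, and individual slots are filled through vindex coordinate indexing. A stand-alone sketch of that mechanism follows, with a made-up store path, sizes and values rather than code from the patch.

    import numpy as np
    import zarr

    store = zarr.DirectoryStore("example_output.zarr")
    root = zarr.group(store=store, overwrite=True)

    # One (traj, obs) array, pre-filled with NaN for missing observations.
    lon = root.create_dataset("lon", shape=(3, 1), dtype="f8", fill_value=np.nan)
    lon[:, 0] = [0.0, 1.0, 2.0]

    # A later time step: grow the obs dimension by one column of NaN ...
    lon.append(np.full((lon.shape[0], 1), np.nan), axis=1)
    # ... and a new particle was released: grow the traj dimension by one row.
    lon.append(np.full((1, lon.shape[1]), np.nan), axis=0)

    # Write only the particles present at this time step via coordinate indexing.
    traj_index = np.array([0, 2, 3])   # rows of the particles being written
    obs_index = np.array([1, 1, 1])    # current observation column per particle
    lon.vindex[traj_index, obs_index] = [0.1, 2.1, 9.9]

    zarr.consolidate_metadata(store)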
ds.to_zarr(self.fname, mode='w') - self.written_first = True - else: - ds.to_zarr(self.fname, mode='a', append_dim='obs') + maxtraj = len(self.IDs_written) + if len(data_dict) > 0: + for i in data_dict['id']: + if i not in self.IDs_written: + self.IDs_written[i] = maxtraj + self.maxobs[i] = 0 + maxtraj += 1 + else: + self.maxobs[i] += 1 + + if len(data_dict) > 0: + if not self.written_first: + ds = xr.Dataset(attrs=self.metadata) + attrs = self._create_variables_attribute_dict() + ids = [self.IDs_written[i] for i in data_dict['id']] + for var in data_dict: + varout = 'z' if var == 'depth' else var + varout = 'trajectory' if varout == 'id' else varout + data = np.full((maxtraj, 1), np.nan, dtype=self.vars_to_write[var]) + data[ids, 0] = data_dict[var] + ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=attrs[varout]) + ds.to_zarr(self.fname, mode='w') + self.written_first = True + else: + store = zarr.DirectoryStore(self.fname) + Z = zarr.group(store=store, overwrite=False) + ids = [self.IDs_written[i] for i in data_dict['id']] + maxobs = [self.maxobs[i] for i in data_dict['id']] + + for var in data_dict: + varout = 'z' if var == 'depth' else var + varout = 'trajectory' if varout == 'id' else varout + for i, t, v in zip(ids, maxobs, data_dict[var]): + if t >= Z[varout].shape[1]: + a = np.full((Z[varout].shape[0], 1), np.nan, dtype=self.vars_to_write[var]) + Z[varout].append(a, axis=1) + zarr.consolidate_metadata(store) + if i >= Z[varout].shape[0]: + a = np.full((maxtraj-Z[varout].shape[0], Z[varout].shape[1]), np.nan, dtype=self.vars_to_write[var]) + Z[varout].append(a, axis=0) + zarr.consolidate_metadata(store) + + Z[varout][i, t] = v + + # if expanded_trajs and self.written_first: + # for z in Z: + # zin = 'id' if z == 'trajectory' else z + # zin = 'depth' if zin == 'z' else zin + # + # if Z[z].ndim == 2: + # a = np.full((expanded_trajs, Z[z].shape[1]), np.nan, dtype=self.vars_to_write[zin]) + # else: + # a = np.full((expanded_trajs,), np.nan, dtype=self.vars_to_write_once[zin]) + # Z[z].append(a) + # + # + # if len(data_dict) > 0: + # ids = [self.IDs_written[i] for i in data_dict['id']] + # for var in data_dict: + # varout = 'z' if var == 'depth' else var + # varout = 'trajectory' if varout == 'id' else varout + # data = np.full((datalen, 1), np.nan, dtype=self.vars_to_write[var]) + # data[ids, 0] = data_dict[var] + # ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=attrs[varout]) + # if self.written_first and "_FillValue" in ds[varout].attrs: + # del ds[varout].attrs["_FillValue"] + # + # if len(data_dict_once) > 0: + # ids = [self.IDs_written[i] for i in data_dict_once['id']] + # if self.written_first: + # ds_in = xr.open_zarr(self.name) + # for var in data_dict_once: + # if var != 'id': + # if self.written_first: + # data = ds_in[var].values + # print(data, ids) + # else: + # data = np.full((datalen,), np.nan, dtype=self.vars_to_write_once[var]) + # data[ids] = data_dict_once[var] + # ds[var] = xr.DataArray(data=data, dims=["traj"], attrs=attrs[var]) + # if self.written_first and "_FillValue" in ds[var].attrs: + # del ds[var].attrs["_FillValue"] + # + # if len(ds) > 0: + # if not self.written_first: + # ds.to_zarr(self.fname, mode='w') + # self.written_first = True + # else: + # ds.to_zarr(self.fname, mode='a', append_dim='obs') diff --git a/tests/test_particle_file.py b/tests/test_particle_file.py index 3bd24282b..b44d098ba 100644 --- a/tests/test_particle_file.py +++ b/tests/test_particle_file.py @@ -51,6 +51,7 @@ def 
test_pfile_array_remove_particles(fieldset, pset_mode, mode, tmpdir, npart=1 assert (np.isnat(timearr[3, 1])) and (np.isfinite(timearr[3, 0])) ds.close() +# test_pfile_array_remove_particles(fieldset(), 'soa', 'jit', '') @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) @@ -174,7 +175,6 @@ class MyParticle(ptype[mode]): nc_fmt = d if d != 'bool_' else 'i1' assert ds[f'v_{d}'].dtype == nc_fmt -test_write_dtypes_pfile(fieldset(), 'aos', 'jit', '') @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) @@ -197,12 +197,13 @@ class MyParticle(ptype[mode]): pset.execute(pset.Kernel(Update_v), endtime=1, dt=0.1, output_file=ofile) assert np.allclose(pset.v_once - time - pset.age*10, 0, atol=1e-5) - ncfile = close_and_compare_netcdffiles(filepath, ofile) - vfile = np.ma.filled(ncfile.variables['v_once'][:], np.nan) + ds = xr.open_zarr(filepath) + vfile = np.ma.filled(ds['v_once'][:], np.nan) assert (vfile.shape == (npart, )) assert np.allclose(vfile, time) - ncfile.close() + ds.close() +# test_variable_written_once(fieldset(), 'soa', 'jit', '', 10) @pytest.mark.parametrize('type', ['repeatdt', 'timearr']) @pytest.mark.parametrize('pset_mode', pset_modes) @@ -232,8 +233,8 @@ def IncrLon(particle, fieldset, time): for i in range(runtime): pset.execute(IncrLon, dt=dt, runtime=1., output_file=pfile) - ncfile = close_and_compare_netcdffiles(outfilepath, pfile) - samplevar = ncfile.variables['sample_var'][:] + ds = xr.open_zarr(outfilepath) + samplevar = ds['sample_var'][:] if type == 'repeatdt': assert samplevar.shape == (runtime // repeatdt+1, min(maxvar+1, runtime)+1) assert np.allclose(pset.sample_var, np.arange(maxvar, -1, -repeatdt)) @@ -244,8 +245,9 @@ def IncrLon(particle, fieldset, time): assert np.allclose([p for p in samplevar[:, k] if np.isfinite(p)], k) filesize = os.path.getsize(str(outfilepath)) assert filesize < 1024 * 65 # test that chunking leads to filesize less than 65KB - ncfile.close() + ds.close() +# test_pset_repeated_release_delayed_adding_deleting('repeatdt', fieldset(), 'soa', 'jit', 2, '', 1, 10) @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) From d5554a959ca717d0c69733033d00827f297e5ca5 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 09:08:18 +0200 Subject: [PATCH 09/79] dump_to_zarr passing all tests in test_particle_file --- parcels/particlefile/baseparticlefile.py | 28 ++++++++++++++++-- tests/test_particle_file.py | 37 ++++++------------------ 2 files changed, 34 insertions(+), 31 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index bddb5c338..3fa27e5a9 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -67,8 +67,8 @@ def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, c self.vars_to_write_once[v.name] = v.dtype elif v.to_write is True: self.vars_to_write[v.name] = v.dtype - # if len(self.var_names_once) > 0: - # self.written_once = [] + if len(self.vars_to_write_once) > 0: + self.written_once = [] self.IDs_written = {} self.maxobs = {} self.written_first = False @@ -184,6 +184,13 @@ def write(self, pset, time, deleted_only=False): else: self.maxobs[i] += 1 + if len(data_dict_once) > 0: + for i in data_dict_once['id']: + if i not in self.IDs_written: + self.IDs_written[i] = maxtraj + self.maxobs[i] = -1 + maxtraj += 1 + if len(data_dict) > 0: if not 
self.written_first: ds = xr.Dataset(attrs=self.metadata) @@ -195,6 +202,11 @@ def write(self, pset, time, deleted_only=False): data = np.full((maxtraj, 1), np.nan, dtype=self.vars_to_write[var]) data[ids, 0] = data_dict[var] ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=attrs[varout]) + for var in data_dict_once: + if var != 'id': # TODO check if needed + data = np.full((maxtraj,), np.nan, dtype=self.vars_to_write_once[var]) + data[ids] = data_dict_once[var] + ds[var] = xr.DataArray(data=data, dims=["traj"], attrs=attrs[var]) ds.to_zarr(self.fname, mode='w') self.written_first = True else: @@ -215,8 +227,18 @@ def write(self, pset, time, deleted_only=False): a = np.full((maxtraj-Z[varout].shape[0], Z[varout].shape[1]), np.nan, dtype=self.vars_to_write[var]) Z[varout].append(a, axis=0) zarr.consolidate_metadata(store) - Z[varout][i, t] = v + if len(data_dict_once) > 0: + ids = [self.IDs_written[i] for i in data_dict_once['id']] + for var in data_dict_once: + if var != 'id': # TODO check if needed + for i, v in zip(ids, data_dict_once[var]): + if i >= Z[var].shape[0]: + a = np.full((maxtraj - Z[var].shape[0],), np.nan, + dtype=self.vars_to_write_once[var]) + Z[var].append(a, axis=0) + zarr.consolidate_metadata(store) + Z[var][i] = v # if expanded_trajs and self.written_first: # for z in Z: diff --git a/tests/test_particle_file.py b/tests/test_particle_file.py index b44d098ba..b1060ea22 100644 --- a/tests/test_particle_file.py +++ b/tests/test_particle_file.py @@ -158,7 +158,7 @@ def test_write_dtypes_pfile(fieldset, pset_mode, mode, tmpdir): filepath = tmpdir.join("pfile_dtypes.zarr") dtypes = ['float32', 'float64', 'int32', 'uint32', 'int64', 'uint64'] - if mode == 'scipy' or pset_mode == 'soa': + if mode == 'scipy': dtypes.extend(['bool_', 'int8', 'uint8', 'int16', 'uint16']) # Not implemented in AoS JIT class MyParticle(ptype[mode]): @@ -166,15 +166,15 @@ class MyParticle(ptype[mode]): # need an exec() here because we need to dynamically set the variable name exec(f'v_{d} = Variable("v_{d}", dtype=np.{d}, initial=0.)') - pset = pset_type[pset_mode]['pset'](fieldset, pclass=MyParticle, lon=0, lat=0) + pset = pset_type[pset_mode]['pset'](fieldset, pclass=MyParticle, lon=0, lat=0, time=0) pfile = pset.ParticleFile(name=filepath, outputdt=1) pfile.write(pset, 0) ds = xr.open_zarr(filepath, mask_and_scale=False) # Note masking issue at https://stackoverflow.com/questions/68460507/xarray-loading-int-data-as-float for d in dtypes: - nc_fmt = d if d != 'bool_' else 'i1' - assert ds[f'v_{d}'].dtype == nc_fmt + assert ds[f'v_{d}'].dtype == d +# test_write_dtypes_pfile(fieldset(), 'soa', 'scipy', '') @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) @@ -247,7 +247,7 @@ def IncrLon(particle, fieldset, time): assert filesize < 1024 * 65 # test that chunking leads to filesize less than 65KB ds.close() -# test_pset_repeated_release_delayed_adding_deleting('repeatdt', fieldset(), 'soa', 'jit', 2, '', 1, 10) +# test_pset_repeated_release_delayed_adding_deleting('timearr', fieldset(), 'soa', 'jit', 1, '', 1, 4) @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) @@ -262,9 +262,10 @@ def Update_lon(particle, fieldset, time): pfile = pset.ParticleFile(name=outfilepath, outputdt=1.) 
pset.execute(pset.Kernel(Update_lon), runtime=4, dt=-1., output_file=pfile) - ncfile = close_and_compare_netcdffiles(outfilepath, pfile) - trajs = ncfile.variables['trajectory'][:, 0] - assert np.all(np.diff(trajs) > 0) # all particles written in order of traj ID + ds = xr.open_zarr(outfilepath) + trajs = ds['trajectory'][:, 0] + assert np.all(np.diff(trajs.values) < 0) # all particles written in order of start time + ds.close() def test_set_calendar(): @@ -274,26 +275,6 @@ def test_set_calendar(): assert _set_calendar('np_datetime64') == 'standard' -@pytest.mark.parametrize('pset_mode', pset_modes) -def test_error_duplicate_outputdir(fieldset, tmpdir, pset_mode): - outfilepath = tmpdir.join("error_duplicate_outputdir.nc") - pset1 = pset_type[pset_mode]['pset'](fieldset, pclass=JITParticle, lat=0, lon=0) - pset2 = pset_type[pset_mode]['pset'](fieldset, pclass=JITParticle, lat=0, lon=0) - - py_random.seed(1234) - pfile1 = pset1.ParticleFile(name=outfilepath, outputdt=1., convert_at_end=False) - - py_random.seed(1234) - error_thrown = False - try: - pset2.ParticleFile(name=outfilepath, outputdt=1., convert_at_end=False) - except IOError: - error_thrown = True - assert error_thrown - - pfile1.delete_tempwritedir() - - @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) def test_reset_dt(fieldset, pset_mode, mode, tmpdir): From 97a2950426faa10ea0cf432fc94e1c105a7cc39e Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 09:26:23 +0200 Subject: [PATCH 10/79] Vecorising the dump_to_zarr --- parcels/particlefile/baseparticlefile.py | 34 ++++++++++++------------ 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 3fa27e5a9..f98577ac1 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -218,27 +218,27 @@ def write(self, pset, time, deleted_only=False): for var in data_dict: varout = 'z' if var == 'depth' else var varout = 'trajectory' if varout == 'id' else varout - for i, t, v in zip(ids, maxobs, data_dict[var]): - if t >= Z[varout].shape[1]: - a = np.full((Z[varout].shape[0], 1), np.nan, dtype=self.vars_to_write[var]) - Z[varout].append(a, axis=1) - zarr.consolidate_metadata(store) - if i >= Z[varout].shape[0]: - a = np.full((maxtraj-Z[varout].shape[0], Z[varout].shape[1]), np.nan, dtype=self.vars_to_write[var]) - Z[varout].append(a, axis=0) - zarr.consolidate_metadata(store) - Z[varout][i, t] = v + if max(maxobs) >= Z[varout].shape[1]: + a = np.full((Z[varout].shape[0], 1), np.nan, + dtype=self.vars_to_write[var]) + Z[varout].append(a, axis=1) + zarr.consolidate_metadata(store) + if max(ids) >= Z[varout].shape[0]: + a = np.full((maxtraj-Z[varout].shape[0], Z[varout].shape[1]), np.nan, + dtype=self.vars_to_write[var]) + Z[varout].append(a, axis=0) + zarr.consolidate_metadata(store) + Z[varout].vindex[ids, maxobs] = data_dict[var] if len(data_dict_once) > 0: ids = [self.IDs_written[i] for i in data_dict_once['id']] for var in data_dict_once: if var != 'id': # TODO check if needed - for i, v in zip(ids, data_dict_once[var]): - if i >= Z[var].shape[0]: - a = np.full((maxtraj - Z[var].shape[0],), np.nan, - dtype=self.vars_to_write_once[var]) - Z[var].append(a, axis=0) - zarr.consolidate_metadata(store) - Z[var][i] = v + if max(ids) >= Z[var].shape[0]: + a = np.full((maxtraj - Z[var].shape[0],), np.nan, + dtype=self.vars_to_write_once[var]) + Z[var].append(a, axis=0) + 
zarr.consolidate_metadata(store) + Z[var].vindex[ids] = data_dict_once[var] # if expanded_trajs and self.written_first: # for z in Z: From a40c1100af518f0094a9fc70a640c66e9c44fe7a Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 10:03:44 +0200 Subject: [PATCH 11/79] Removing convert_npydir_to_netcdf.py script as not needed with dump_to_zarr --- parcels/scripts/convert_npydir_to_netcdf.py | 55 --------------------- 1 file changed, 55 deletions(-) delete mode 100644 parcels/scripts/convert_npydir_to_netcdf.py diff --git a/parcels/scripts/convert_npydir_to_netcdf.py b/parcels/scripts/convert_npydir_to_netcdf.py deleted file mode 100644 index 53d0c8a78..000000000 --- a/parcels/scripts/convert_npydir_to_netcdf.py +++ /dev/null @@ -1,55 +0,0 @@ -from argparse import ArgumentParser -from glob import glob -from os import path - -import numpy as np - -# == here those classes need to be impported to parse available ParticleFile classes and create the type from its name == # -from parcels import ParticleFile, ParticleFileSOA, ParticleFileAOS # NOQA - - -def convert_npydir_to_netcdf(tempwritedir_base, delete_tempfiles=False, pfile_class=None): - """Convert npy files in tempwritedir to a NetCDF file - :param tempwritedir_base: directory where the directories for temporary npy files - are stored (can be obtained from ParticleFile.tempwritedir_base attribute) - """ - - tempwritedir = sorted(glob(path.join("%s" % tempwritedir_base, "*")), - key=lambda x: int(path.basename(x)))[0] - pyset_file = path.join(tempwritedir, 'pset_info.npy') - if not path.isdir(tempwritedir): - raise ValueError('Output directory "%s" does not exist' % tempwritedir) - if not path.isfile(pyset_file): - raise ValueError('Output directory "%s" does not contain a pset_info.npy file' % tempwritedir) - - pset_info = np.load(pyset_file, allow_pickle=True).item() - pfconstructor = ParticleFile if pfile_class is None else pfile_class - pfile = pfconstructor(None, None, pset_info=pset_info, tempwritedir=tempwritedir_base, convert_at_end=False) - pfile.close(delete_tempfiles) - - -def main(tempwritedir_base=None, delete_tempfiles=False): - if tempwritedir_base is None: - p = ArgumentParser(description="""Script to convert temporary npy output files to NetCDF""") - p.add_argument('tempwritedir', help='Name of directory where temporary npy files are stored ' - '(not including numbered subdirectories)') - p.add_argument('-d', '--delete_tempfiles', default=False, - help='Flag to delete temporary files at end of call (default False)') - p.add_argument('-c', '--pfclass_name', default='ParticleFileSOA', - help='Class name of the stored particle file (default ParticleFileSOA)') - args = p.parse_args() - tempwritedir_base = args.tempwritedir - pfclass = ParticleFile - if hasattr(args, 'delete_tempfiles'): - delete_tempfiles = args.delete_tempfiles - if hasattr(args, 'pfclass_name'): - try: - pfclass = locals()[args.pfclass_name] - except: - pfclass = ParticleFile - - convert_npydir_to_netcdf(tempwritedir_base, delete_tempfiles, pfile_class=pfclass) - - -if __name__ == "__main__": - main() From 6a4f2c4831f4af56c6108bc77e7e134d7f1f1687 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 10:08:02 +0200 Subject: [PATCH 12/79] Cleanign up particlefilesoa.py and particlefileaos.py --- parcels/particlefile/particlefileaos.py | 145 +---------------------- parcels/particlefile/particlefilesoa.py | 148 +----------------------- 2 files changed, 5 insertions(+), 288 deletions(-) diff --git 
a/parcels/particlefile/particlefileaos.py b/parcels/particlefile/particlefileaos.py index 77cbf8a03..3ae0ce993 100644 --- a/parcels/particlefile/particlefileaos.py +++ b/parcels/particlefile/particlefileaos.py @@ -1,14 +1,5 @@ """Module controlling the writing of ParticleSets to NetCDF file""" -import os -from glob import glob import numpy as np -import xarray as xr -import gzip - -try: - from mpi4py import MPI -except: - MPI = None from parcels.particlefile.baseparticlefile import BaseParticleFile @@ -24,12 +15,11 @@ class ParticleFileAOS(BaseParticleFile): while ParticleFile is given as an argument of ParticleSet.execute() It is either a timedelta object or a positive double. :param write_ondelete: Boolean to write particle data only when they are deleted. Default is False - :param convert_at_end: Boolean to convert zarr file to netcdf at end of run. Default is False """ - def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, convert_at_end=False): + def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False): super(ParticleFileAOS, self).__init__(name=name, particleset=particleset, outputdt=outputdt, - write_ondelete=write_ondelete, convert_at_end=convert_at_end) + write_ondelete=write_ondelete) def __del__(self): super(ParticleFileAOS, self).__del__() @@ -39,134 +29,3 @@ def _reserved_var_names(self): returns the reserved dimension names not to be written just once. """ return ['time', 'lat', 'lon', 'depth', 'id'] - - def _create_trajectory_records(self, coords): - super(ParticleFileAOS, self)._create_trajectory_records(coords=coords) - - def get_pset_info_attributes(self): - """ - returns the main attributes of the pset_info.npy file. - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - """ - attributes = ['name', 'var_names', 'var_dtypes', 'var_names_once', 'var_dtypes_once', - 'time_origin', 'lonlatdepth_dtype', 'file_list', 'file_list_once', - 'parcels_mesh', 'metadata'] - return attributes - - def read_from_npy(self, file_list, n_timesteps, var, dtype): - """ - Read NPY-files for one variable using a loop over all files. - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - - :param file_list: List that contains all file names in the output directory - :param n_timesteps: Dictionary with (for each particle) number of time steps that were written in out directory - :param var: name of the variable to read - """ - max_timesteps = max(n_timesteps.values()) if n_timesteps.keys() else 0 - fill_value = self.fill_value_map[dtype] - data = fill_value * np.ones((len(n_timesteps), max_timesteps), dtype=dtype) - time_index = np.zeros(len(n_timesteps)) - id_index = {} - count = 0 - for i in sorted(n_timesteps.keys()): - id_index[i] = count - count += 1 - - # loop over all files - for npyfile in file_list: - try: - with gzip.open(npyfile, 'rb') as f: - data_dict = np.load(f, allow_pickle=True).item() - except NameError: - raise RuntimeError('Cannot combine npy files into netcdf file because your ParticleFile is ' - 'still open on interpreter shutdown.\nYou can use ' - '"parcels_convert_npydir_to_netcdf %s" to convert these to ' - 'a NetCDF file yourself.\nTo avoid this error, make sure you ' - 'close() your ParticleFile at the end of your script.' 
% self.tempwritedir) - for ii, i in enumerate(data_dict["id"]): - id_ind = id_index[i] - t_ind = int(time_index[id_ind]) if 'once' not in file_list[0] else 0 - data[id_ind, t_ind] = data_dict[var][ii] - time_index[id_ind] = time_index[id_ind] + 1 - - if dtype == np.bool_: - data = data.astype(np.bool_) - # remove rows and columns that are completely filled with nan values - return data[time_index > 0, :] - - def export(self): - """ - Exports outputs in temporary NPY-files to output file (either netcdf or zarr) - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - """ - if MPI: - # The export can only start when all threads are done. - MPI.COMM_WORLD.Barrier() - if MPI.COMM_WORLD.Get_rank() > 0: - return # export only on threat 0 - - # Create dictionary to translate datatypes and fill_values - self.fmt_map = {np.float16: 'f2', np.float32: 'f4', np.float64: 'f8', - np.bool_: 'i1', np.int8: 'i1', np.int16: 'i2', - np.int32: 'i4', np.int64: 'i8', np.uint8: 'u1', - np.uint16: 'u2', np.uint32: 'u4', np.uint64: 'u8'} - self.fill_value_map = {np.float16: np.nan, np.float32: np.nan, np.float64: np.nan, - np.bool_: np.iinfo(np.int8).max, np.int8: np.iinfo(np.int8).max, - np.int16: np.iinfo(np.int16).max, np.int32: np.iinfo(np.int32).max, - np.int64: np.iinfo(np.int64).max, np.uint8: np.iinfo(np.uint8).max, - np.uint16: np.iinfo(np.uint16).max, np.uint32: np.iinfo(np.uint32).max, - np.uint64: np.iinfo(np.uint64).max} - - # Retrieve all temporary writing directories and sort them in numerical order - temp_names = sorted(glob(os.path.join("%s" % self.tempwritedir_base, "*")), - key=lambda x: int(os.path.basename(x))) - - if len(temp_names) == 0: - raise RuntimeError("No npy files found in %s" % self.tempwritedir_base) - - n_timesteps = {} - global_file_list = [] - if len(self.var_names_once) > 0: - global_file_list_once = [] - for tempwritedir in temp_names: - if os.path.exists(tempwritedir): - pset_info_local = np.load(os.path.join(tempwritedir, 'pset_info.npy'), allow_pickle=True).item() - for npyfile in pset_info_local['file_list']: - with gzip.open(npyfile, 'rb') as f: - tmp_dict = np.load(f, allow_pickle=True).item() - for i in tmp_dict['id']: - if i in n_timesteps: - n_timesteps[i] += 1 - else: - n_timesteps[i] = 1 - global_file_list += pset_info_local['file_list'] - if len(self.var_names_once) > 0: - global_file_list_once += pset_info_local['file_list_once'] - - ds = xr.Dataset(attrs=pset_info_local['metadata']) - for var, dtype in zip(self.var_names, self.var_dtypes): - data = self.read_from_npy(global_file_list, n_timesteps, var, dtype) - if var == self.var_names[0]: - self.open_output_file() - varout = 'z' if var == 'depth' else var - varout = 'trajectory' if varout == 'id' else varout - ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=self.attrs[varout]) - - if len(self.var_names_once) > 0: - n_timesteps_once = {} - for i in n_timesteps: - n_timesteps_once[i] = 1 - for var in self.var_names_once: - data = self.read_from_npy(global_file_list_once, n_timesteps_once, var, dtype) - ds[var] = xr.DataArray(data=data.flatten(), dims=["traj"], attrs=self.attrs[var]) - - if 'zarr' in self.outputformat: - ds.to_zarr(self.fname) - else: - ds.to_netcdf(self.fname) diff --git a/parcels/particlefile/particlefilesoa.py b/parcels/particlefile/particlefilesoa.py index 3f59f16d1..b7fe33f0e 100644 --- a/parcels/particlefile/particlefilesoa.py +++ b/parcels/particlefile/particlefilesoa.py @@ -1,14 +1,5 @@ """Module controlling 
the writing of ParticleSets to NetCDF file""" -import os -from glob import glob import numpy as np -import xarray as xr -import gzip - -try: - from mpi4py import MPI -except: - MPI = None from parcels.particlefile.baseparticlefile import BaseParticleFile @@ -24,12 +15,11 @@ class ParticleFileSOA(BaseParticleFile): while ParticleFile is given as an argument of ParticleSet.execute() It is either a timedelta object or a positive double. :param write_ondelete: Boolean to write particle data only when they are deleted. Default is False - :param convert_at_end: Boolean to convert zarr file to netcdf at end of run. Default is False """ - def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, convert_at_end=False): + def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False): super(ParticleFileSOA, self).__init__(name=name, particleset=particleset, outputdt=outputdt, - write_ondelete=write_ondelete, convert_at_end=convert_at_end) + write_ondelete=write_ondelete) def __del__(self): super(ParticleFileSOA, self).__del__() @@ -38,136 +28,4 @@ def _reserved_var_names(self): """ returns the reserved dimension names not to be written just once. """ - return ['time', 'lat', 'lon', 'depth', 'id'] # , 'index' - - def _create_trajectory_records(self, coords): - super(ParticleFileSOA, self)._create_trajectory_records(coords=coords) - - def get_pset_info_attributes(self): - """ - returns the main attributes of the pset_info.npy file. - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - """ - attributes = ['name', 'var_names', 'var_dtypes', 'var_names_once', 'var_dtypes_once', - 'time_origin', 'lonlatdepth_dtype', 'file_list', 'file_list_once', - 'parcels_mesh', 'metadata'] - return attributes - - def read_from_npy(self, file_list, n_timesteps, var, dtype): - """ - Read NPY-files for one variable using a loop over all files. - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - - :param file_list: List that contains all file names in the output directory - :param n_timesteps: Dictionary with (for each particle) number of time steps that were written in out directory - :param var: name of the variable to read - """ - max_timesteps = max(n_timesteps.values()) if n_timesteps.keys() else 0 - fill_value = self.fill_value_map[dtype] - data = fill_value * np.ones((len(n_timesteps), max_timesteps), dtype=dtype) - time_index = np.zeros(len(n_timesteps)) - id_index = {} - count = 0 - for i in sorted(n_timesteps.keys()): - id_index[i] = count - count += 1 - - # loop over all files - for npyfile in file_list: - try: - with gzip.open(npyfile, 'rb') as f: - data_dict = np.load(f, allow_pickle=True).item() - except NameError: - raise RuntimeError('Cannot combine npy files into netcdf file because your ParticleFile is ' - 'still open on interpreter shutdown.\nYou can use ' - '"parcels_convert_npydir_to_netcdf %s" to convert these to ' - 'a NetCDF file yourself.\nTo avoid this error, make sure you ' - 'close() your ParticleFile at the end of your script.' 
% self.tempwritedir) - for ii, i in enumerate(data_dict["id"]): - id_ind = id_index[i] - t_ind = int(time_index[id_ind]) if 'once' not in file_list[0] else 0 - data[id_ind, t_ind] = data_dict[var][ii] - time_index[id_ind] = time_index[id_ind] + 1 - - if dtype == np.bool_: - data = data.astype(np.bool_) - # remove rows and columns that are completely filled with nan values - return data[time_index > 0, :] - - def export(self): - """ - Exports outputs in temporary NPY-files to output file (either netcdf or zarr) - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - """ - - if MPI: - # The export can only start when all threads are done. - MPI.COMM_WORLD.Barrier() - if MPI.COMM_WORLD.Get_rank() > 0: - return # export only on threat 0 - - # Create dictionary to translate datatypes and fill_values - self.fmt_map = {np.float16: 'f2', np.float32: 'f4', np.float64: 'f8', - np.bool_: 'i1', np.int8: 'i1', np.int16: 'i2', - np.int32: 'i4', np.int64: 'i8', np.uint8: 'u1', - np.uint16: 'u2', np.uint32: 'u4', np.uint64: 'u8'} - self.fill_value_map = {np.float16: np.nan, np.float32: np.nan, np.float64: np.nan, - np.bool_: np.iinfo(np.int8).max, np.int8: np.iinfo(np.int8).max, - np.int16: np.iinfo(np.int16).max, np.int32: np.iinfo(np.int32).max, - np.int64: np.iinfo(np.int64).max, np.uint8: np.iinfo(np.uint8).max, - np.uint16: np.iinfo(np.uint16).max, np.uint32: np.iinfo(np.uint32).max, - np.uint64: np.iinfo(np.uint64).max} - - # Retrieve all temporary writing directories and sort them in numerical order - temp_names = sorted(glob(os.path.join("%s" % self.tempwritedir_base, "*")), - key=lambda x: int(os.path.basename(x))) - - if len(temp_names) == 0: - raise RuntimeError("No npy files found in %s" % self.tempwritedir_base) - - n_timesteps = {} - global_file_list = [] - if len(self.var_names_once) > 0: - global_file_list_once = [] - for tempwritedir in temp_names: - if os.path.exists(tempwritedir): - pset_info_local = np.load(os.path.join(tempwritedir, 'pset_info.npy'), allow_pickle=True).item() - for npyfile in pset_info_local['file_list']: - with gzip.open(npyfile, 'rb') as f: - tmp_dict = np.load(f, allow_pickle=True).item() - for i in tmp_dict['id']: - if i in n_timesteps: - n_timesteps[i] += 1 - else: - n_timesteps[i] = 1 - global_file_list += pset_info_local['file_list'] - if len(self.var_names_once) > 0: - global_file_list_once += pset_info_local['file_list_once'] - - ds = xr.Dataset(attrs=pset_info_local['metadata']) - for var, dtype in zip(self.var_names, self.var_dtypes): - data = self.read_from_npy(global_file_list, n_timesteps, var, dtype) - if var == self.var_names[0]: - self.open_output_file() - varout = 'z' if var == 'depth' else var - varout = 'trajectory' if varout == 'id' else varout - ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=self.attrs[varout]) - - if len(self.var_names_once) > 0: - n_timesteps_once = {} - for i in n_timesteps: - n_timesteps_once[i] = 1 - for var in self.var_names_once: - data = self.read_from_npy(global_file_list_once, n_timesteps_once, var, dtype) - ds[var] = xr.DataArray(data=data.flatten(), dims=["traj"], attrs=self.attrs[var]) - - if 'zarr' in self.outputformat: - ds.to_zarr(self.fname) - else: - ds.to_netcdf(self.fname) + return ['time', 'lat', 'lon', 'depth', 'id'] From 4ff05edaa87b5f9a46d6d89854ee7f545efff212 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 10:09:20 +0200 Subject: [PATCH 13/79] Removing old dump_to_zarr attempt via xarray DataSets 
appending Did not work because xarray can't append in two dimensions at the same time --- parcels/particlefile/baseparticlefile.py | 46 ------------------------ 1 file changed, 46 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index f98577ac1..b51cfc3b6 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -239,49 +239,3 @@ def write(self, pset, time, deleted_only=False): Z[var].append(a, axis=0) zarr.consolidate_metadata(store) Z[var].vindex[ids] = data_dict_once[var] - - # if expanded_trajs and self.written_first: - # for z in Z: - # zin = 'id' if z == 'trajectory' else z - # zin = 'depth' if zin == 'z' else zin - # - # if Z[z].ndim == 2: - # a = np.full((expanded_trajs, Z[z].shape[1]), np.nan, dtype=self.vars_to_write[zin]) - # else: - # a = np.full((expanded_trajs,), np.nan, dtype=self.vars_to_write_once[zin]) - # Z[z].append(a) - # - # - # if len(data_dict) > 0: - # ids = [self.IDs_written[i] for i in data_dict['id']] - # for var in data_dict: - # varout = 'z' if var == 'depth' else var - # varout = 'trajectory' if varout == 'id' else varout - # data = np.full((datalen, 1), np.nan, dtype=self.vars_to_write[var]) - # data[ids, 0] = data_dict[var] - # ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=attrs[varout]) - # if self.written_first and "_FillValue" in ds[varout].attrs: - # del ds[varout].attrs["_FillValue"] - # - # if len(data_dict_once) > 0: - # ids = [self.IDs_written[i] for i in data_dict_once['id']] - # if self.written_first: - # ds_in = xr.open_zarr(self.name) - # for var in data_dict_once: - # if var != 'id': - # if self.written_first: - # data = ds_in[var].values - # print(data, ids) - # else: - # data = np.full((datalen,), np.nan, dtype=self.vars_to_write_once[var]) - # data[ids] = data_dict_once[var] - # ds[var] = xr.DataArray(data=data, dims=["traj"], attrs=attrs[var]) - # if self.written_first and "_FillValue" in ds[var].attrs: - # del ds[var].attrs["_FillValue"] - # - # if len(ds) > 0: - # if not self.written_first: - # ds.to_zarr(self.fname, mode='w') - # self.written_first = True - # else: - # ds.to_zarr(self.fname, mode='a', append_dim='obs') From 8cb7a2dc37df0f47a28146e3baac45488aed0f1d Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 10:09:51 +0200 Subject: [PATCH 14/79] Removing convert_at_end keyword from baseparticlefile (as not needed with zarr output) --- parcels/particlefile/baseparticlefile.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index b51cfc3b6..a6581b80a 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -35,10 +35,8 @@ class BaseParticleFile(ABC): while ParticleFile is given as an argument of ParticleSet.execute() It is either a timedelta object or a positive double. :param write_ondelete: Boolean to write particle data only when they are deleted. Default is False - :param convert_at_end: Boolean to convert npy files to netcdf at end of run. 
Default is True """ write_ondelete = None - convert_at_end = None outputdt = None lasttime_written = None name = None @@ -47,7 +45,7 @@ class BaseParticleFile(ABC): time_origin = None lonlatdepth_dtype = None - def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, convert_at_end=True): + def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False): self.write_ondelete = write_ondelete self.outputdt = outputdt From 0e6f5d762ba513f3b87cf9232b27d73558000660 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 10:10:54 +0200 Subject: [PATCH 15/79] Cleaning up test_particle_file --- tests/test_particle_file.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_particle_file.py b/tests/test_particle_file.py index b1060ea22..4d53d827c 100644 --- a/tests/test_particle_file.py +++ b/tests/test_particle_file.py @@ -7,7 +7,6 @@ import pytest import os import cftime -import random as py_random import xarray as xr pset_modes = ['soa', 'aos'] @@ -51,7 +50,6 @@ def test_pfile_array_remove_particles(fieldset, pset_mode, mode, tmpdir, npart=1 assert (np.isnat(timearr[3, 1])) and (np.isfinite(timearr[3, 0])) ds.close() -# test_pfile_array_remove_particles(fieldset(), 'soa', 'jit', '') @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) @@ -132,7 +130,6 @@ def DeleteP(particle, fieldset, time): assert (lon.size == noutside) ds.close() -# test_variable_written_ondelete(fieldset(), 'aos', 'jit', '') @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) @@ -174,7 +171,6 @@ class MyParticle(ptype[mode]): for d in dtypes: assert ds[f'v_{d}'].dtype == d -# test_write_dtypes_pfile(fieldset(), 'soa', 'scipy', '') @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) @@ -203,7 +199,6 @@ class MyParticle(ptype[mode]): assert np.allclose(vfile, time) ds.close() -# test_variable_written_once(fieldset(), 'soa', 'jit', '', 10) @pytest.mark.parametrize('type', ['repeatdt', 'timearr']) @pytest.mark.parametrize('pset_mode', pset_modes) @@ -247,7 +242,6 @@ def IncrLon(particle, fieldset, time): assert filesize < 1024 * 65 # test that chunking leads to filesize less than 65KB ds.close() -# test_pset_repeated_release_delayed_adding_deleting('timearr', fieldset(), 'soa', 'jit', 1, '', 1, 4) @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) From 19090473b70cf0b9c96b822e726352747c8d0cc4 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 11:19:41 +0200 Subject: [PATCH 16/79] Fixing dump_to_zarr in unit tests in other function than test_particle_file --- parcels/particleset/particlesetaos.py | 11 +++++++---- parcels/particleset/particlesetsoa.py | 6 +++--- parcels/scripts/plottrajectoriesfile.py | 4 ++-- tests/test_advection.py | 17 +++++++---------- tests/test_particle_sets.py | 2 +- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/parcels/particleset/particlesetaos.py b/parcels/particleset/particlesetaos.py index 6897f0b98..74debd624 100644 --- a/parcels/particleset/particlesetaos.py +++ b/parcels/particleset/particlesetaos.py @@ -507,9 +507,9 @@ def monte_carlo_sample(cls, start_field, size, mode='monte_carlo'): @classmethod def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime=None, repeatdt=None, lonlatdepth_dtype=None, **kwargs): - """Initialise the ParticleSet from a netcdf ParticleFile. 
- This creates a new ParticleSet based on the last locations and time of all particles - in the netcdf ParticleFile. Particle IDs are preserved if restart=True + """Initialise the ParticleSet from a zarr ParticleFile. + This creates a new ParticleSet based on locations of all particles written + in a zarr ParticleFile at a certain time. Particle IDs are preserved if restart=True :param fieldset: :mod:`parcels.fieldset.FieldSet` object from which to sample velocity :param pclass: mod:`parcels.particle.JITParticle` or :mod:`parcels.particle.ScipyParticle` @@ -517,6 +517,9 @@ def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime :param filename: Name of the particlefile from which to read initial conditions :param restart: Boolean to signal if pset is used for a restart (default is True). In that case, Particle IDs are preserved. + :param restarttime: time at which the Particles will be restarted. Default is the last time written. + Alternatively, restarttime could be a time value (including np.datetime64) or + a callable function such as np.nanmin. The last is useful when running with dt < 0. :param repeatdt: Optional interval (in seconds) on which to repeat the release of the ParticleSet :param lonlatdepth_dtype: Floating precision for lon, lat, depth particle coordinates. It is either np.float32 or np.float64. Default is np.float32 if fieldset.U.interp_method is 'linear' @@ -527,7 +530,7 @@ def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime 'setting a new repeatdt will start particles from the _new_ particle ' 'locations.' % filename) - pfile = xr.open_dataset(str(filename), decode_cf=True) + pfile = xr.open_zarr(str(filename)) pfile_vars = [v for v in pfile.data_vars] vars = {} diff --git a/parcels/particleset/particlesetsoa.py b/parcels/particleset/particlesetsoa.py index f1b10b728..9274ef111 100644 --- a/parcels/particleset/particlesetsoa.py +++ b/parcels/particleset/particlesetsoa.py @@ -414,9 +414,9 @@ def monte_carlo_sample(cls, start_field, size, mode='monte_carlo'): @classmethod def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime=None, repeatdt=None, lonlatdepth_dtype=None, **kwargs): - """Initialise the ParticleSet from a netcdf ParticleFile. + """Initialise the ParticleSet from a zarr ParticleFile. This creates a new ParticleSet based on locations of all particles written - in a netcdf ParticleFile at a certain time. Particle IDs are preserved if restart=True + in a zarr ParticleFile at a certain time. Particle IDs are preserved if restart=True :param fieldset: :mod:`parcels.fieldset.FieldSet` object from which to sample velocity :param pclass: mod:`parcels.particle.JITParticle` or :mod:`parcels.particle.ScipyParticle` @@ -438,7 +438,7 @@ def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime 'setting a new repeatdt will start particles from the _new_ particle ' 'locations.' 
% filename) - pfile = xr.open_dataset(str(filename), decode_cf=True) + pfile = xr.open_zarr(str(filename)) pfile_vars = [v for v in pfile.data_vars] vars = {} diff --git a/parcels/scripts/plottrajectoriesfile.py b/parcels/scripts/plottrajectoriesfile.py index 96ed6bc4b..fb6d673ad 100644 --- a/parcels/scripts/plottrajectoriesfile.py +++ b/parcels/scripts/plottrajectoriesfile.py @@ -38,9 +38,9 @@ def plotTrajectoriesFile(filename, mode='2d', tracerfile=None, tracerfield='P', environ["HDF5_USE_FILE_LOCKING"] = "FALSE" try: - pfile = xr.open_dataset(str(filename), decode_cf=True) + pfile = xr.open_zarr(str(filename), decode_cf=True) except: - pfile = xr.open_dataset(str(filename), decode_cf=False) + pfile = xr.open_zarr(str(filename), decode_cf=False) lon = np.ma.filled(pfile.variables['lon'], np.nan) lat = np.ma.filled(pfile.variables['lat'], np.nan) time = np.ma.filled(pfile.variables['time'], np.nan) diff --git a/tests/test_advection.py b/tests/test_advection.py index 6a418dfcd..cbc189197 100644 --- a/tests/test_advection.py +++ b/tests/test_advection.py @@ -6,9 +6,9 @@ import numpy as np import pytest import math -from netCDF4 import Dataset from datetime import timedelta as delta from parcels import logger +import xarray as xr pset_modes = ['soa', 'aos'] ptype = {'scipy': ScipyParticle, 'jit': JITParticle} @@ -489,21 +489,18 @@ def test_uniform_analytical(pset_mode, mode, u, v, w, direction, tmpdir): x0, y0, z0 = 6.1, 6.2, 20 pset = pset_type[pset_mode]['pset'](fieldset, pclass=ptype[mode], lon=x0, lat=y0, depth=z0) - outfile_path = tmpdir.join("uniformanalytical.nc") + outfile_path = tmpdir.join("uniformanalytical.zarr") outfile = pset.ParticleFile(name=outfile_path, outputdt=1) pset.execute(AdvectionAnalytical, runtime=4, dt=direction, output_file=outfile) - outfile.close() assert np.abs(pset.lon - x0 - 4 * u * direction) < 1e-6 assert np.abs(pset.lat - y0 - 4 * v * direction) < 1e-6 if w: assert np.abs(pset.depth - z0 - 4 * w * direction) < 1e-4 - dataset = Dataset(outfile_path, 'r', 'NETCDF4') - times = dataset.variables['time'][:] - timeref = direction * np.arange(0, 5) - logger.info("analytical - time: {}".format(times)) - logger.info("analytical - reference: {}".format(timeref)) - assert np.allclose(times, timeref) - lons = dataset.variables['lon'][:] + ds = xr.open_zarr(outfile_path, mask_and_scale=False) + times = ds['time'][:].values.astype('timedelta64[s]')[0] + timeref = direction * np.arange(0, 5).astype('timedelta64[s]') + assert np.allclose(times, timeref, atol=np.timedelta64(1, 'ms')) + lons = ds['lon'][:].values assert np.allclose(lons, x0+direction*u*np.arange(0, 5)) diff --git a/tests/test_particle_sets.py b/tests/test_particle_sets.py index 3c65b2869..43055dd4a 100644 --- a/tests/test_particle_sets.py +++ b/tests/test_particle_sets.py @@ -70,7 +70,7 @@ class MyParticle(ptype[mode]): @pytest.mark.parametrize('mode', ['scipy', 'jit']) @pytest.mark.parametrize('restart', [True, False]) def test_pset_create_fromparticlefile(fieldset, pset_mode, mode, restart, tmpdir): - filename = tmpdir.join("pset_fromparticlefile.nc") + filename = tmpdir.join("pset_fromparticlefile.zarr") lon = np.linspace(0, 1, 10, dtype=np.float32) lat = np.linspace(1, 0, 10, dtype=np.float32) From ad70153321edb193dfcfe0faaa20c28229a6a7d7 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 12:08:49 +0200 Subject: [PATCH 17/79] Update example_mitgcm.py --- parcels/examples/example_mitgcm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/parcels/examples/example_mitgcm.py b/parcels/examples/example_mitgcm.py index 854c73c7f..5118f1945 100644 --- a/parcels/examples/example_mitgcm.py +++ b/parcels/examples/example_mitgcm.py @@ -48,7 +48,7 @@ def periodicBC(particle, fieldset, time): size=10, ) pfile = ParticleFile( - "MIT_particles_" + str(mode) + ".nc", pset, outputdt=delta(days=1) + "MIT_particles_" + str(mode) + ".zarr", pset, outputdt=delta(days=1) ) kernels = AdvectionRK4 + pset.Kernel(periodicBC) pset.execute( @@ -61,8 +61,8 @@ def test_mitgcm_output_compare(): run_mitgcm_zonally_reentrant("scipy") run_mitgcm_zonally_reentrant("jit") - ds_jit = xr.open_dataset("MIT_particles_jit.nc") - ds_scipy = xr.open_dataset("MIT_particles_scipy.nc") + ds_jit = xr.open_zarr("MIT_particles_jit.zarr") + ds_scipy = xr.open_zarr("MIT_particles_scipy.zarr") np.testing.assert_allclose(ds_jit.lat.data, ds_scipy.lat.data) np.testing.assert_allclose(ds_jit.lon.data, ds_scipy.lon.data) From fe3ae3d55a455ba2f6f58491112453715c006b10 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 12:15:37 +0200 Subject: [PATCH 18/79] Update parcels_tutorial to zarr output --- parcels/examples/parcels_tutorial.ipynb | 29 +++++++++++-------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/parcels/examples/parcels_tutorial.ipynb b/parcels/examples/parcels_tutorial.ipynb index 163e2b041..8a000500f 100644 --- a/parcels/examples/parcels_tutorial.ipynb +++ b/parcels/examples/parcels_tutorial.ipynb @@ -178,7 +178,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The final step is to run (or 'execute') the `ParticelSet`. We run the particles using the `AdvectionRK4` kernel, which is a 4th order Runge-Kutte implementation that comes with Parcels. We run the particles for 6 days (using the `timedelta` function from `datetime`), at an RK4 timestep of 5 minutes. We store the trajectory information at an interval of 1 hour in a file called `EddyParticles.nc`. Because `time` was `not_yet_set`, the particles will be advected from the first date available in the `fieldset`, which is the default behaviour." + "The final step is to run (or 'execute') the `ParticelSet`. We run the particles using the `AdvectionRK4` kernel, which is a 4th order Runge-Kutte implementation that comes with Parcels. We run the particles for 6 days (using the `timedelta` function from `datetime`), at an RK4 timestep of 5 minutes. We store the trajectory information at an interval of 1 hour in a file called `EddyParticles.zarr`. Because `time` was `not_yet_set`, the particles will be advected from the first date available in the `fieldset`, which is the default behaviour." ] }, { @@ -195,7 +195,7 @@ } ], "source": [ - "output_file = pset.ParticleFile(name=\"EddyParticles.nc\", outputdt=timedelta(hours=1)) # the file name and the time step of the outputs\n", + "output_file = pset.ParticleFile(name=\"EddyParticles.zarr\", outputdt=timedelta(hours=1)) # the file name and the time step of the outputs\n", "pset.execute(AdvectionRK4, # the kernel (which defines how particles move)\n", " runtime=timedelta(days=6), # the total length of the run\n", " dt=timedelta(minutes=5), # the timestep of the kernel\n", @@ -251,7 +251,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The trajectory information of the particles can be written to the `EddyParticles.nc` file by using the `.export()` method on the output file. The trajectory can then be quickly plotted using the `plotTrajectoriesFile` function." 
+ "The trajectories in the `EddyParticles.zarr` file can be quickly plotted using the `plotTrajectoriesFile` function." ] }, { @@ -273,8 +273,7 @@ } ], "source": [ - "output_file.export()\n", - "plotTrajectoriesFile('EddyParticles.nc');" + "plotTrajectoriesFile('EddyParticles.zarr');" ] }, { @@ -1151,7 +1150,7 @@ } ], "source": [ - "plotTrajectoriesFile('EddyParticles.nc', mode='movie2d_notebook')" + "plotTrajectoriesFile('EddyParticles.zarr', mode='movie2d_notebook')" ] }, { @@ -1180,7 +1179,7 @@ } ], "source": [ - "plotTrajectoriesFile('EddyParticles.nc', mode='hist2d', bins=[30, 20]);" + "plotTrajectoriesFile('EddyParticles.zarr', mode='hist2d', bins=[30, 20]);" ] }, { @@ -1226,7 +1225,7 @@ "metadata": {}, "outputs": [], "source": [ - "output_file = pset.ParticleFile(name=\"EddyParticles_Bwd.nc\", outputdt=timedelta(hours=1)) # the file name and the time step of the outputs\n", + "output_file = pset.ParticleFile(name=\"EddyParticles_Bwd.zarr\", outputdt=timedelta(hours=1)) # the file name and the time step of the outputs\n", "pset.execute(AdvectionRK4,\n", " dt=-timedelta(minutes=5), # negative timestep for backward run\n", " runtime=timedelta(days=6), # the run time\n", @@ -1329,7 +1328,7 @@ "\n", "k_WestVel = pset.Kernel(WestVel) # casting the WestVel function to a kernel object\n", "\n", - "output_file = pset.ParticleFile(name=\"EddyParticles_WestVel.nc\", outputdt=timedelta(hours=1))\n", + "output_file = pset.ParticleFile(name=\"EddyParticles_WestVel.zarr\", outputdt=timedelta(hours=1))\n", "pset.execute(AdvectionRK4 + k_WestVel, # simply add kernels using the + operator\n", " runtime=timedelta(days=2),\n", " dt=timedelta(minutes=5),\n", @@ -1362,8 +1361,7 @@ } ], "source": [ - "output_file.export()\n", - "plotTrajectoriesFile('EddyParticles_WestVel.nc');" + "plotTrajectoriesFile('EddyParticles_WestVel.zarr');" ] }, { @@ -1492,7 +1490,7 @@ } ], "source": [ - "output_file = pset.ParticleFile(name=\"GlobCurrentParticles.nc\", outputdt=timedelta(hours=6))\n", + "output_file = pset.ParticleFile(name=\"GlobCurrentParticles.zarr\", outputdt=timedelta(hours=6))\n", "pset.execute(AdvectionRK4,\n", " runtime=timedelta(days=10),\n", " dt=timedelta(minutes=5),\n", @@ -1532,8 +1530,7 @@ } ], "source": [ - "output_file.export()\n", - "plotTrajectoriesFile('GlobCurrentParticles.nc',\n", + "plotTrajectoriesFile('GlobCurrentParticles.zarr',\n", " tracerfile='GlobCurrent_example_data/20020101000000-GLOBCURRENT-L4-CUReul_hs-ALT_SUM-v02.0-fv01.0.nc',\n", " tracerlon='lon',\n", " tracerlat='lat',\n", @@ -1825,14 +1822,14 @@ "pset.execute(AdvectionRK4 + k_dist, # Add kernels using the + operator.\n", " runtime=timedelta(days=6),\n", " dt=timedelta(minutes=5),\n", - " output_file=pset.ParticleFile(name=\"GlobCurrentParticles_Dist.nc\", outputdt=timedelta(hours=1)))" + " output_file=pset.ParticleFile(name=\"GlobCurrentParticles_Dist.zarr\", outputdt=timedelta(hours=1)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "And finally print the distance in km that each particle has travelled (note that this is also stored in the `EddyParticles_Dist.nc` file)" + "And finally print the distance in km that each particle has travelled (note that this is also stored in the `EddyParticles_Dist.zarr` file)" ] }, { From e527c009250f645fddff67545829727edc2a9645 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 12:16:08 +0200 Subject: [PATCH 19/79] Update NedetedFields tutorial to zarr output --- parcels/examples/tutorial_NestedFields.ipynb | 5 ++--- 1 file changed, 2 insertions(+), 3 
deletions(-) diff --git a/parcels/examples/tutorial_NestedFields.ipynb b/parcels/examples/tutorial_NestedFields.ipynb index f9292f22b..186638344 100644 --- a/parcels/examples/tutorial_NestedFields.ipynb +++ b/parcels/examples/tutorial_NestedFields.ipynb @@ -133,11 +133,10 @@ ], "source": [ "pset = ParticleSet(fieldset, pclass=JITParticle, lon=[0], lat=[1000])\n", - "output_file = pset.ParticleFile(name='NestedFieldParticle.nc', outputdt=50)\n", + "output_file = pset.ParticleFile(name='NestedFieldParticle.zarr', outputdt=50)\n", "pset.execute(AdvectionRK4, runtime=14000, dt=10, output_file=output_file)\n", - "output_file.export() # export the trajectory data to a netcdf file\n", "\n", - "plt = plotTrajectoriesFile('NestedFieldParticle.nc', show_plt=False)\n", + "plt = plotTrajectoriesFile('NestedFieldParticle.zarr', show_plt=False)\n", "plt.plot([0,2e3,2e3,0,0],[0,0,2e3,2e3,0], c='orange')\n", "plt.plot([-2e3,18e3,18e3,-2e3,-2e3],[-1e3,-1e3,3e3,3e3,-1e3], c='green');" ] From 8bed36e67684a3c69958dabe9e556d1533e2e0f0 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 12:17:24 +0200 Subject: [PATCH 20/79] Update tutorial_Argofloats to use zarr --- parcels/examples/tutorial_Argofloats.ipynb | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/parcels/examples/tutorial_Argofloats.ipynb b/parcels/examples/tutorial_Argofloats.ipynb index d823792da..6ad6897a9 100644 --- a/parcels/examples/tutorial_Argofloats.ipynb +++ b/parcels/examples/tutorial_Argofloats.ipynb @@ -156,17 +156,15 @@ ], "source": [ "%matplotlib inline\n", - "import netCDF4\n", + "import xarray as xr\n", "from mpl_toolkits.mplot3d import Axes3D\n", "import matplotlib.pyplot as plt\n", "\n", - "output_file.export() # export the trajectory data to a netcdf file\n", - "\n", - "nc = netCDF4.Dataset(\"argo_float.nc\")\n", - "x = nc.variables[\"lon\"][:].squeeze()\n", - "y = nc.variables[\"lat\"][:].squeeze()\n", - "z = nc.variables[\"z\"][:].squeeze()\n", - "nc.close()\n", + "ds = xr.from_zarr(\"argo_float.zarr\")\n", + "x = ds[\"lon\"][:].squeeze()\n", + "y = ds[\"lat\"][:].squeeze()\n", + "z = ds[\"z\"][:].squeeze()\n", + "ds.close()\n", "\n", "fig = plt.figure(figsize=(13,10))\n", "ax = plt.axes(projection='3d')\n", From 01cfbc4878b168fbaf3bc6ceaacf8e7ca516733c Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 12:18:58 +0200 Subject: [PATCH 21/79] Update tutorial_SummedFields.ipynb --- parcels/examples/tutorial_SummedFields.ipynb | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/parcels/examples/tutorial_SummedFields.ipynb b/parcels/examples/tutorial_SummedFields.ipynb index 63d7c04ea..55b0bb226 100644 --- a/parcels/examples/tutorial_SummedFields.ipynb +++ b/parcels/examples/tutorial_SummedFields.ipynb @@ -82,10 +82,10 @@ ], "source": [ "pset = ParticleSet(fieldset_flow, pclass=JITParticle, lon=[0], lat=[900])\n", - "output_file = pset.ParticleFile(name='SummedFieldParticle_flow.nc', outputdt=1)\n", + "output_file = pset.ParticleFile(name='SummedFieldParticle_flow.zarr', outputdt=1)\n", "pset.execute(AdvectionRK4, runtime=10, dt=1, output_file=output_file)\n", - "output_file.export() # export the trajectory data to a netcdf file\n", - "plotTrajectoriesFile('SummedFieldParticle_flow.nc');" + "\n", + "plotTrajectoriesFile('SummedFieldParticle_flow.zarr');" ] }, { @@ -148,10 +148,10 @@ ], "source": [ "pset = ParticleSet(fieldset_stokes, pclass=JITParticle, lon=[0], lat=[900])\n", - "output_file = 
pset.ParticleFile(name='SummedFieldParticle_stokes.nc', outputdt=1)\n", + "output_file = pset.ParticleFile(name='SummedFieldParticle_stokes.zarr', outputdt=1)\n", "pset.execute(AdvectionRK4, runtime=10, dt=1, output_file=output_file)\n", - "output_file.export() # export the trajectory data to a netcdf file\n", - "plotTrajectoriesFile('SummedFieldParticle_stokes.nc');" + "\n", + "plotTrajectoriesFile('SummedFieldParticle_stokes.zarr');" ] }, { @@ -202,10 +202,10 @@ ], "source": [ "pset = ParticleSet(fieldset_sum, pclass=JITParticle, lon=[0], lat=[900])\n", - "output_file = pset.ParticleFile(name='SummedFieldParticle_sum.nc', outputdt=1)\n", + "output_file = pset.ParticleFile(name='SummedFieldParticle_sum.zarr', outputdt=1)\n", "pset.execute(AdvectionRK4, runtime=10, dt=1, output_file=output_file)\n", - "output_file.export() # export the trajectory data to a netcdf file\n", - "plotTrajectoriesFile('SummedFieldParticle_sum.nc');" + "\n", + "plotTrajectoriesFile('SummedFieldParticle_sum.zarr');" ] }, { From d98efa2cc909ad70273647b319c42e7377835552 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 12:21:12 +0200 Subject: [PATCH 22/79] Update tutorial_delaystart.ipynb --- parcels/examples/tutorial_delaystart.ipynb | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/parcels/examples/tutorial_delaystart.ipynb b/parcels/examples/tutorial_delaystart.ipynb index 61831e7b5..d3eed5f51 100644 --- a/parcels/examples/tutorial_delaystart.ipynb +++ b/parcels/examples/tutorial_delaystart.ipynb @@ -102,10 +102,9 @@ } ], "source": [ - "output_file = pset.ParticleFile(name=\"DelayParticle_time.nc\", outputdt=delta(hours=1))\n", + "output_file = pset.ParticleFile(name=\"DelayParticle_time.zarr\", outputdt=delta(hours=1))\n", "pset.execute(AdvectionRK4, runtime=delta(hours=24), dt=delta(minutes=5),\n", - " output_file=output_file)\n", - "output_file.export() # export the trajectory data to a netcdf file" + " output_file=output_file)" ] }, { @@ -549,7 +548,7 @@ } ], "source": [ - "plotTrajectoriesFile('DelayParticle_time.nc', mode='movie2d_notebook')" + "plotTrajectoriesFile('DelayParticle_time.zarr', mode='movie2d_notebook')" ] }, { @@ -1162,8 +1161,7 @@ } ], "source": [ - "output_file.export() # export the trajectory data to a netcdf file\n", - "plotTrajectoriesFile('DelayParticle_releasedt.nc', mode='movie2d_notebook')" + "plotTrajectoriesFile('DelayParticle_releasedt.zarr', mode='movie2d_notebook')" ] }, { @@ -1717,8 +1715,7 @@ "pset.execute(AdvectionRK4, runtime=delta(hours=15), dt=delta(minutes=5),\n", " output_file=output_file)\n", "\n", - "output_file.export() # export the trajectory data to a netcdf file\n", - "plotTrajectoriesFile('DelayParticle_releasedt_9hrs.nc', mode='movie2d_notebook')" + "plotTrajectoriesFile('DelayParticle_releasedt_9hrs.zarr', mode='movie2d_notebook')" ] }, { @@ -1766,7 +1763,7 @@ } ], "source": [ - "outfilepath = \"DelayParticle_nonmatchingtime.nc\"\n", + "outfilepath = \"DelayParticle_nonmatchingtime.zarr\"\n", "\n", "pset = ParticleSet(fieldset=fieldset, pclass=JITParticle,\n", " lat=[3e3]*3, lon=[3e3]*3, time=[0, 1, 2])\n", From 93e4340c0493e97bb1669fd43c2c9ae10eef5ab5 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 12:49:29 +0200 Subject: [PATCH 23/79] Update tutorial_Argofloats.ipynb --- parcels/examples/tutorial_Argofloats.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parcels/examples/tutorial_Argofloats.ipynb b/parcels/examples/tutorial_Argofloats.ipynb index 
6ad6897a9..fdd3ef621 100644 --- a/parcels/examples/tutorial_Argofloats.ipynb +++ b/parcels/examples/tutorial_Argofloats.ipynb @@ -160,7 +160,7 @@ "from mpl_toolkits.mplot3d import Axes3D\n", "import matplotlib.pyplot as plt\n", "\n", - "ds = xr.from_zarr(\"argo_float.zarr\")\n", + "ds = xr.open_zarr(\"argo_float.zarr\")\n", "x = ds[\"lon\"][:].squeeze()\n", "y = ds[\"lat\"][:].squeeze()\n", "z = ds[\"z\"][:].squeeze()\n", From b97c73eb9669dd0db2cfa2872a4650ac4ad8a900 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 12:49:48 +0200 Subject: [PATCH 24/79] Update tutorial_diffusion.ipynb --- parcels/examples/tutorial_diffusion.ipynb | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/parcels/examples/tutorial_diffusion.ipynb b/parcels/examples/tutorial_diffusion.ipynb index 2d7ffaef2..c65629d06 100644 --- a/parcels/examples/tutorial_diffusion.ipynb +++ b/parcels/examples/tutorial_diffusion.ipynb @@ -225,15 +225,14 @@ "source": [ "dt = 0.001\n", "testParticles = get_test_particles()\n", - "output_file = testParticles.ParticleFile(name=\"M1_out.nc\",\n", + "output_file = testParticles.ParticleFile(name=\"M1_out.zarr\",\n", " outputdt=timedelta(seconds=dt))\n", "ParcelsRandom.seed(1636) # Random seed for reproducibility\n", "testParticles.execute(AdvectionDiffusionM1,\n", " runtime=timedelta(seconds=0.3),\n", " dt=timedelta(seconds=dt),\n", " output_file=output_file,\n", - " verbose_progress=True)\n", - "output_file.close() # to write the output to a netCDF file, since `output_file` does not close automatically when using notebooks" + " verbose_progress=True)" ] }, { @@ -242,7 +241,7 @@ "metadata": {}, "outputs": [], "source": [ - "M1_out = xr.open_dataset(\"M1_out.nc\")" + "M1_out = xr.open_zarr(\"M1_out.zarr\")" ] }, { @@ -312,15 +311,14 @@ "source": [ "dt = 0.001\n", "testParticles = get_test_particles()\n", - "output_file = testParticles.ParticleFile(name=\"EM_out.nc\",\n", + "output_file = testParticles.ParticleFile(name=\"EM_out.zarr\",\n", " outputdt=timedelta(seconds=dt))\n", "ParcelsRandom.seed(1636) # Random seed for reproducibility\n", "testParticles.execute(AdvectionDiffusionEM,\n", " runtime=timedelta(seconds=0.3),\n", " dt=timedelta(seconds=dt),\n", " output_file=output_file,\n", - " verbose_progress=True)\n", - "output_file.close() # to write the output to a netCDF file, since `output_file` does not close automatically when using notebooks" + " verbose_progress=True)" ] }, { @@ -329,7 +327,7 @@ "metadata": {}, "outputs": [], "source": [ - "EM_out = xr.open_dataset(\"EM_out.nc\")" + "EM_out = xr.open_zarr(\"EM_out.zarr\")" ] }, { @@ -537,7 +535,7 @@ ], "source": [ "kernels = pset.Kernel(AdvectionRK4) + pset.Kernel(smagdiff)\n", - "output_file = pset.ParticleFile(name='Global_smagdiff.nc', outputdt=timedelta(hours=6))\n", + "output_file = pset.ParticleFile(name='Global_smagdiff.zarr', outputdt=timedelta(hours=6))\n", "\n", "pset.execute(kernels, runtime=timedelta(days=5), dt=timedelta(minutes=5), output_file=output_file, recovery={ErrorCode.ErrorOutOfBounds: DeleteParticle})\n", "pset.show(field=fieldset.U)" @@ -578,7 +576,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Save the output file and visualise the trajectories\n" + "Visualise the trajectories" ] }, { @@ -600,8 +598,7 @@ } ], "source": [ - "output_file.export()\n", - "plotTrajectoriesFile('Global_smagdiff.nc',\n", + "plotTrajectoriesFile('Global_smagdiff.zarr',\n", " 
tracerfile='GlobCurrent_example_data/20020120000000-GLOBCURRENT-L4-CUReul_hs-ALT_SUM-v02.0-fv01.0.nc',\n", " tracerlon='lon', tracerlat='lat', tracerfield='eastward_eulerian_current_velocity');" ] From 494145e7597f62f33941d0d86b730e76360c2d0c Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 12:54:08 +0200 Subject: [PATCH 25/79] Update tutorial_interaction to zarr output --- parcels/examples/tutorial_interaction.ipynb | 26 ++++++++------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/parcels/examples/tutorial_interaction.ipynb b/parcels/examples/tutorial_interaction.ipynb index c835090a6..509893ea4 100644 --- a/parcels/examples/tutorial_interaction.ipynb +++ b/parcels/examples/tutorial_interaction.ipynb @@ -101,7 +101,7 @@ " n.lon += dlon\n", " n.depth += ddepth\n", "\n", - " mutator[n.id].append((f, d_vec)) # add mutation to the mutator\n" + " mutator[n.id].append((f, d_vec)) # add mutation to the mutator" ] }, { @@ -141,13 +141,11 @@ " interaction_distance=0.5, # note the interaction_distance argument here\n", " attractor=attractor)\n", "\n", - "output_file = pset.ParticleFile(name=\"InteractingParticles.nc\", outputdt=1)\n", + "output_file = pset.ParticleFile(name=\"InteractingParticles.zarr\", outputdt=1)\n", "\n", "pset.execute(pyfunc=DiffusionUniformKh, \n", " pyfunc_inter=Pull, # note the pyfunc_inter here\n", - " runtime=60, dt=1, output_file=output_file)\n", - "\n", - "output_file.close()" + " runtime=60, dt=1, output_file=output_file)" ] }, { @@ -158,7 +156,7 @@ "outputs": [], "source": [ "%%capture\n", - "data_xarray = xr.open_dataset('InteractingParticles.nc')\n", + "data_xarray = xr.open_zarr('InteractingParticles.zarr')\n", "data_attr = data_xarray.where(data_xarray['attractor']==1, drop=True)\n", "data_other = data_xarray.where(data_xarray['attractor']==0, drop=True)\n", "\n", @@ -31273,13 +31271,11 @@ " lon=X, lat=Y,\n", " interaction_distance=0.05) # note the interaction_distance argument here\n", "\n", - "output_file = pset.ParticleFile(name=\"MergingParticles.nc\", outputdt=1)\n", + "output_file = pset.ParticleFile(name=\"MergingParticles.zarr\", outputdt=1)\n", "\n", "pset.execute(pyfunc=DiffusionUniformKh, \n", " pyfunc_inter=pset.InteractionKernel(NearestNeighborWithinRange) + MergeWithNearestNeighbor, # note the pyfunc_inter here\n", - " runtime=60, dt=1, output_file=output_file)\n", - "\n", - "output_file.close()" + " runtime=60, dt=1, output_file=output_file)" ] }, { @@ -31290,7 +31286,7 @@ "outputs": [], "source": [ "%%capture\n", - "data_xarray = xr.open_dataset('MergingParticles.nc')\n", + "data_xarray = xr.open_zarr('MergingParticles.zarr')\n", "\n", "timerange = np.arange(np.nanmin(data_xarray['time'].values),\n", " np.nanmax(data_xarray['time'].values), np.timedelta64(1, 's')) # timerange in nanoseconds\n", @@ -62047,7 +62043,7 @@ "\n", "# The function that creates the animation\n", "def Animate():\n", - " data_xarray = xr.open_dataset('InteractingParticles.nc')\n", + " data_xarray = xr.open_zarr('InteractingParticles.zarr')\n", " data_attr = data_xarray.where(data_xarray['ptype']==1, drop=True)\n", " data_other = data_xarray.where(data_xarray['ptype']==-1, drop=True)\n", "\n", @@ -86874,11 +86870,10 @@ " lon=X, lat=Y,\n", " interaction_distance=0.7, # note the interaction_distance argument here\n", " ptype=ptype)\n", - "output_file = pset.ParticleFile(name=\"InteractingParticles.nc\", outputdt=1)\n", + "output_file = pset.ParticleFile(name=\"InteractingParticles.zarr\", outputdt=1)\n", 
"pset.execute(#pyfunc=DiffusionUniformKh, \n", " pyfunc_inter=Move, # note the pyfunc_inter here\n", " runtime=60, dt=1, output_file=output_file)\n", - "output_file.close()\n", "anim = Animate() # Create animation\n", "HTML(anim.to_jshtml())" ] @@ -111541,12 +111536,11 @@ " lon=X, lat=Y,\n", " interaction_distance=0.7, # note the interaction_distance argument here\n", " ptype=ptype)\n", - "output_file = pset.ParticleFile(name=\"InteractingParticles.nc\", outputdt=1)\n", + "output_file = pset.ParticleFile(name=\"InteractingParticles.zarr\", outputdt=1)\n", "\n", "pset.execute(#pyfunc=DiffusionUniformKh, \n", " pyfunc_inter=Move, # note the pyfunc_inter here\n", " runtime=60, dt=1, output_file=output_file)\n", - "output_file.close()\n", "anim = Animate()\n", "HTML(anim.to_jshtml())" ] From c19e11258faf78afd3f27256fc2d2a37454ac06e Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 12:54:32 +0200 Subject: [PATCH 26/79] Update tutorial_interpolation to zarr output --- parcels/examples/tutorial_interpolation.ipynb | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/parcels/examples/tutorial_interpolation.ipynb b/parcels/examples/tutorial_interpolation.ipynb index 64f6b6581..8dbf10442 100644 --- a/parcels/examples/tutorial_interpolation.ipynb +++ b/parcels/examples/tutorial_interpolation.ipynb @@ -312,9 +312,8 @@ } ], "source": [ - "pfile = pset.ParticleFile(\"interpolation_offset.nc\", outputdt=1)\n", - "pset.execute(kernels, endtime=1, dt=1, output_file=pfile)\n", - "pfile.close()" + "pfile = pset.ParticleFile(\"interpolation_offset.zarr\", outputdt=1)\n", + "pset.execute(kernels, endtime=1, dt=1, output_file=pfile)" ] }, { @@ -347,7 +346,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds = xr.open_dataset(\"interpolation_offset.nc\").isel(obs=1)\n", + "ds = xr.open_zarr(\"interpolation_offset.zarr\").isel(obs=1)\n", "for i in range(len(ds['p'])):\n", " assert np.isclose(ds['p'].values[i], calc_p(float(ds['time'].values[i])/1e9, ds['lat'].values[i], ds['lon'].values[i]))" ] @@ -358,13 +357,6 @@ "source": [ "As a bit of background for why sampling needs to be done this way: the reason is that the particles are already moved within the AdvectionRK4 kernel, but the time is not updated yet until all concatenated kernels are completed. 
" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From b172e26ff0a9caf239fa7fc2627a0d3a48ab78c3 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 12:55:49 +0200 Subject: [PATCH 27/79] Update tutorial_nemo_curvilinear to use zarr output --- parcels/examples/tutorial_nemo_curvilinear.ipynb | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/parcels/examples/tutorial_nemo_curvilinear.ipynb b/parcels/examples/tutorial_nemo_curvilinear.ipynb index c0a9864de..b0933bc93 100644 --- a/parcels/examples/tutorial_nemo_curvilinear.ipynb +++ b/parcels/examples/tutorial_nemo_curvilinear.ipynb @@ -185,8 +185,7 @@ } ], "source": [ - "pfile.export() # export the trajectory data to a netcdf file\n", - "plotTrajectoriesFile(\"nemo_particles.nc\");" + "plotTrajectoriesFile(\"nemo_particles.zarr\");" ] }, { @@ -206,13 +205,6 @@ "pset = ParticleSet.from_list(field_set, JITParticle, lon=lonp, lat=latp)\n", "pset.populate_indices()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 3837830c8994e6331fbf4d8b995a90ebb4f15894 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 13:03:29 +0200 Subject: [PATCH 28/79] Updating tutorial_plotting to default zarr output --- parcels/examples/tutorial_output.ipynb | 185 +------------------------ 1 file changed, 6 insertions(+), 179 deletions(-) diff --git a/parcels/examples/tutorial_output.ipynb b/parcels/examples/tutorial_output.ipynb index 2e9b7eb51..58999f505 100644 --- a/parcels/examples/tutorial_output.ipynb +++ b/parcels/examples/tutorial_output.ipynb @@ -58,47 +58,9 @@ "\n", "pset = ParticleSet(fieldset=fieldset, pclass=JITParticle, lon=lon, lat=lat, time=time)\n", "\n", - "output_file = pset.ParticleFile(name=\"Output.nc\", outputdt=delta(hours=2))\n", + "output_file = pset.ParticleFile(name=\"Output.zarr\", outputdt=delta(hours=2))\n", "pset.execute(AdvectionRK4, runtime=delta(hours=24), dt=delta(minutes=5),\n", - " output_file=output_file)\n", - "output_file.close() # export the trajectory data to a netcdf file" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Exporting trajectory data in `zarr` format" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Parcels can also output trajectories in [`zarr` format](https://zarr.readthedocs.io/en/stable/). Files in `zarr` are typically _much_ smaller in size than the default netcdf output, but may be slightly more challenging to handle (although `xarray` has a fairly seamless [`open_zarr()` method](https://docs.xarray.dev/en/stable/generated/xarray.open_zarr.html)). IN order to output to `zarr` format, simply make sure the extension of the output file name is `.zarr`. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "sh: None: command not found\n", - "INFO: Compiled ArrayJITParticleAdvectionRK4 ==> /var/folders/s5/gxtkk3c12yqgd7hkt1b_s0vr0000gq/T/parcels-503/libf8e8a9d3cb076076d75b2b159b1317da_0.so\n" - ] - } - ], - "source": [ - "pset = ParticleSet(fieldset=fieldset, pclass=JITParticle, lon=lon, lat=lat, time=time)\n", - "\n", - "output_file = pset.ParticleFile(name=\"Output.zarr\", outputdt=delta(hours=2)) # note .zarr extension in name!\n", - "pset.execute(AdvectionRK4, runtime=delta(hours=24), dt=delta(minutes=5),\n", - " output_file=output_file)\n", - "output_file.close() # export the trajectory data to a netcdf file" + " output_file=output_file)" ] }, { @@ -106,75 +68,8 @@ "metadata": {}, "source": [ "## Reading the output file\n", - "### Using the netCDF4 package\n", - "The [`parcels.particlefile.ParticleFile`](https://oceanparcels.org/gh-pages/html/#parcels.particlefile.ParticleFile) class creates a netCDF file to store the particle trajectories. It uses the [**`netCDF4` package**](https://unidata.github.io/netcdf4-python/netCDF4/index.html), which is also suitable to open and read the files for analysis. The [`Dataset` class](https://unidata.github.io/netcdf4-python/netCDF4/index.html#netCDF4.Dataset) opens a netCDF file in reading mode by default. Data can be accessed with the `Dataset.variables` dictionary which can return (masked) numpy arrays." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "root group (NETCDF4 data model, file format HDF5):\n", - " feature_type: trajectory\n", - " Conventions: CF-1.6/CF-1.7\n", - " ncei_template_version: NCEI_NetCDF_Trajectory_Template_v2.0\n", - " parcels_version: v2.3.0-176-gbe3424c9\n", - " parcels_mesh: flat\n", - " dimensions(sizes): traj(10), obs(13)\n", - " variables(dimensions): float64 time(traj, obs), int64 trajectory(traj, obs), float32 lon(traj, obs), float32 lat(traj, obs), float32 z(traj, obs)\n", - " groups: \n" - ] - } - ], - "source": [ - "import netCDF4\n", "\n", - "data_netcdf4 = netCDF4.Dataset('Output.nc')\n", - "print(data_netcdf4)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[0 0 0 0 0 0 0 0 0 0 0 0 0]\n", - " [1 1 1 1 1 1 1 1 1 1 1 1 --]\n", - " [2 2 2 2 2 2 2 2 2 2 2 -- --]\n", - " [3 3 3 3 3 3 3 3 3 3 -- -- --]\n", - " [4 4 4 4 4 4 4 4 4 -- -- -- --]\n", - " [5 5 5 5 5 5 5 5 -- -- -- -- --]\n", - " [6 6 6 6 6 6 6 -- -- -- -- -- --]\n", - " [7 7 7 7 7 7 -- -- -- -- -- -- --]\n", - " [8 8 8 8 8 -- -- -- -- -- -- -- --]\n", - " [9 9 9 9 -- -- -- -- -- -- -- -- --]]\n" - ] - } - ], - "source": [ - "trajectory_netcdf4 = data_netcdf4.variables['trajectory'][:]\n", - "time_netcdf4 = data_netcdf4.variables['time'][:]\n", - "lon_netcdf4 = data_netcdf4.variables['lon'][:]\n", - "lat_netcdf4 = data_netcdf4.variables['lat'][:]\n", - "print(trajectory_netcdf4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using the xarray package\n", - "An often-used alternative to netCDF4, which also comes with the parcels installation, is [**xarray**](http://xarray.pydata.org/en/stable/index.html). Its labelled arrays allow for intuitive and accessible handling of data stored in the netCDF format." 
+ "Parcels exports output trajectories in [`zarr` format](https://zarr.readthedocs.io/en/stable/). Files in `zarr` are typically _much_ smaller in size than netcdf, although may be slightly more challenging to handle (although `xarray` has a fairly seamless [`open_zarr()` method](https://docs.xarray.dev/en/stable/generated/xarray.open_zarr.html))." ] }, { @@ -207,49 +102,10 @@ "source": [ "import xarray as xr\n", "\n", - "data_xarray = xr.open_dataset('Output.nc')\n", + "data_xarray = xr.open_zarr('Output.zarr')\n", "print(data_xarray)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that opening the `.zarr` file (see [Exporting trajectory data in zarr format](#Exporting-trajectory-data-in-zarr-format)) using `xr.open_zarr()` leads to a similar object" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Dimensions: (traj: 10, obs: 13)\n", - "Dimensions without coordinates: traj, obs\n", - "Data variables:\n", - " lat (traj, obs) float32 dask.array\n", - " lon (traj, obs) float32 dask.array\n", - " time (traj, obs) timedelta64[ns] dask.array\n", - " trajectory (traj, obs) float64 dask.array\n", - " z (traj, obs) float32 dask.array\n", - "Attributes:\n", - " Conventions: CF-1.6/CF-1.7\n", - " feature_type: trajectory\n", - " ncei_template_version: NCEI_NetCDF_Trajectory_Template_v2.0\n", - " parcels_mesh: flat\n", - " parcels_version: v2.3.0-176-gbe3424c9\n" - ] - } - ], - "source": [ - "data_xarray_zarr = xr.open_zarr('Output.zarr')\n", - "print(data_xarray_zarr)" - ] - }, { "cell_type": "code", "execution_count": 8, @@ -286,7 +142,7 @@ "metadata": {}, "source": [ "## Trajectory data structure\n", - "The data in the netCDF file are organised according to the [CF-conventions](http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#discrete-sampling-geometries) implemented with the [NCEI trajectory template](http://www.nodc.noaa.gov/data/formats/netcdf/v2.0/trajectoryIncomplete.cdl). The data is stored in a **two-dimensional array** with the dimensions **`traj`** and **`obs`**. Each particle trajectory is essentially stored as a time series where the coordinate data (**`lon`**, **`lat`**, **`time`**) are a function of the observation (`obs`).\n", + "The data zarr file are organised according to the [CF-conventions](http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#discrete-sampling-geometries) implemented with the [NCEI trajectory template](http://www.nodc.noaa.gov/data/formats/netcdf/v2.0/trajectoryIncomplete.cdl). The data is stored in a **two-dimensional array** with the dimensions **`traj`** and **`obs`**. Each particle trajectory is essentially stored as a time series where the coordinate data (**`lon`**, **`lat`**, **`time`**) are a function of the observation (`obs`).\n", "\n", "The output dataset used here contains **10 particles** and **13 observations**. Not every particle has 13 observations however; since we released particles at different times some particle trajectories are shorter than others." 
] @@ -447,33 +303,6 @@ " mean_lat_x += [np.nanmean(data_xarray['lat'].where(data_xarray['time']==time).values)] # find the data that share the time" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Conditional selection is even easier in numpy arrays without the xarray formatting since it accepts the 2D boolean array that results from `time_netcdf4 == time` as a mask that you can use to directly select the data." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "# Using netCDF4\n", - "mean_lon_n = []\n", - "mean_lat_n = []\n", - "\n", - "timerange = np.arange(np.nanmin(time_netcdf4), \n", - " np.nanmax(time_netcdf4)+delta(hours=2).total_seconds(), \n", - " delta(hours=2).total_seconds()) \n", - "\n", - "for time in timerange:\n", - " if np.all(np.any(time_netcdf4 == time, axis=1)): # if all trajectories share an observation at time\n", - " mean_lon_n += [np.mean(lon_netcdf4[time_netcdf4 == time])] # find the data that share the time\n", - " mean_lat_n += [np.mean(lat_netcdf4[time_netcdf4 == time])] # find the data that share the time" - ] - }, { "cell_type": "code", "execution_count": 15, @@ -498,9 +327,7 @@ "ax.set_ylabel('Meridional distance [m]')\n", "ax.set_xlabel('Zonal distance [m]')\n", "ax.grid()\n", - "ax.scatter(mean_lon_x,mean_lat_x,marker='^',label='xarray',s = 80)\n", - "ax.scatter(mean_lon_n,mean_lat_n,marker='o',label='netcdf')\n", - "plt.legend()\n", + "ax.scatter(mean_lon_x,mean_lat_x,marker='^',s = 80)\n", "plt.show()" ] }, From 6859a426d7f6d8a194ca444143f45b693bf3c5c6 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 13:10:18 +0200 Subject: [PATCH 29/79] Update tutorial_parcels_structure to zarr output --- .../examples/tutorial_parcels_structure.ipynb | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/parcels/examples/tutorial_parcels_structure.ipynb b/parcels/examples/tutorial_parcels_structure.ipynb index 1cc6492d4..3dc01965d 100644 --- a/parcels/examples/tutorial_parcels_structure.ipynb +++ b/parcels/examples/tutorial_parcels_structure.ipynb @@ -313,7 +313,7 @@ } ], "source": [ - "output_file = pset.ParticleFile(name=\"GCParticles.nc\", outputdt=3600) # the file name and the time step of the outputs\n", + "output_file = pset.ParticleFile(name=\"GCParticles.zarr\", outputdt=3600) # the file name and the time step of the outputs\n", "\n", "pset.execute(kernels, # the kernel (which defines how particles move)\n", " runtime=86400*24, # the total length of the run in seconds\n", @@ -321,23 +321,6 @@ " output_file=output_file)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "While executing the `ParticleSet`, parcels stores the data in **npy** files in an output folder. 
To take all the data and store them in a netcdf file, you can use [**`ParticleFile.export()`**](https://oceanparcels.org/gh-pages/html/#parcels.particlefile.ParticleFile.export) if you want to keep the folder with npy files; or [**`ParticleFile.close()`**](https://oceanparcels.org/gh-pages/html/#parcels.particlefile.ParticleFile.close) if you only want to keep the netcdf file:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "output_file.export()\n", - "output_file.close()" - ] - }, { "cell_type": "markdown", "metadata": {}, From 1177146931dee9bc2d347d1c2c3e58e7148a332c Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 13:13:41 +0200 Subject: [PATCH 30/79] Update tutorial_periodic_boundaries.ipynb --- parcels/examples/tutorial_periodic_boundaries.ipynb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/parcels/examples/tutorial_periodic_boundaries.ipynb b/parcels/examples/tutorial_periodic_boundaries.ipynb index 0c97af31c..b3a4f5df0 100644 --- a/parcels/examples/tutorial_periodic_boundaries.ipynb +++ b/parcels/examples/tutorial_periodic_boundaries.ipynb @@ -156,8 +156,7 @@ } ], "source": [ - "output_file.export() # export the trajectory data to a netcdf file\n", - "plotTrajectoriesFile('PeriodicParticle.nc');" + "plotTrajectoriesFile('PeriodicParticle.zarr');" ] }, { From 7cb9a326f9281b01287b580f42bf438c9805b148 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 13:14:55 +0200 Subject: [PATCH 31/79] Update tutorial_sampling.ipynb --- parcels/examples/tutorial_sampling.ipynb | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/parcels/examples/tutorial_sampling.ipynb b/parcels/examples/tutorial_sampling.ipynb index 8ab497cd7..f332fb8d0 100644 --- a/parcels/examples/tutorial_sampling.ipynb +++ b/parcels/examples/tutorial_sampling.ipynb @@ -214,11 +214,9 @@ "source": [ "pset.execute(sample_kernel, dt=0) # by only executing the sample kernel we record the initial temperature of the particles\n", "\n", - "output_file = pset.ParticleFile(name=\"InitZero.nc\", outputdt=delta(hours=1))\n", + "output_file = pset.ParticleFile(name=\"InitZero.zarr\", outputdt=delta(hours=1))\n", "pset.execute(AdvectionRK4 + sample_kernel, runtime=delta(hours=30), dt=delta(minutes=5),\n", - " output_file=output_file)\n", - "output_file.export() # export the trajectory data to a netcdf file\n", - "output_file.close()" + " output_file=output_file)" ] }, { @@ -247,7 +245,7 @@ } ], "source": [ - "Particle_data = xr.open_dataset(\"InitZero.nc\")\n", + "Particle_data = xr.open_zarr(\"InitZero.zarr\")\n", "\n", "plt.figure()\n", "ax = plt.axes()\n", @@ -372,10 +370,9 @@ "source": [ "pset.execute(sample_kernel, dt=0) # by only executing the sample kernel we record the initial temperature of the particles\n", "\n", - "output_file = pset.ParticleFile(name=\"WriteOnce.nc\", outputdt=delta(hours=1))\n", + "output_file = pset.ParticleFile(name=\"WriteOnce.zarr\", outputdt=delta(hours=1))\n", "pset.execute(AdvectionRK4, runtime=delta(hours=24), dt=delta(minutes=5),\n", - " output_file=output_file)\n", - "output_file.close()" + " output_file=output_file)" ] }, { @@ -404,7 +401,7 @@ } ], "source": [ - "Particle_data = xr.open_dataset(\"WriteOnce.nc\")\n", + "Particle_data = xr.open_zarr(\"WriteOnce.zarr\")\n", "\n", "plt.figure()\n", "ax = plt.axes()\n", @@ -533,7 +530,7 @@ "pset = ParticleSet(fieldset=fieldset, pclass=SampleParticleInitZero, lon=[], lat=[], time=[]) # Using 
SampleParticleInitZero\n", "kernels = AdvectionRK4 + sample_kernel\n", "\n", - "output_file = pset.ParticleFile(name=\"RepeatLoop.nc\") # Do not specify the outputdt yet, so we can manually write the output\n", + "output_file = pset.ParticleFile(name=\"RepeatLoop.zarr\") # Do not specify the outputdt yet, so we can manually write the output\n", "\n", "for time in np.arange(0, runtime, outputdt):\n", " if np.isclose(np.fmod(time, repeatdt), 0): # time is a multiple of repeatdt\n", @@ -546,9 +543,7 @@ " pset.execute(kernels, runtime=outputdt, dt=delta(minutes=5))\n", " print('Length of pset at time %d: %d' % (time, len(pset)))\n", " \n", - "output_file.write(pset, time+outputdt) \n", - "\n", - "output_file.close()" + "output_file.write(pset, time+outputdt)" ] }, { @@ -574,7 +569,7 @@ } ], "source": [ - "Particle_data = xr.open_dataset(\"RepeatLoop.nc\")\n", + "Particle_data = xr.open_zarr(\"RepeatLoop.zarr\")\n", "print(Particle_data.time[:,0].values / np.timedelta64(1, 'h')) # The initial hour at which each particle is released\n", "assert np.allclose(Particle_data.time[:,0].values / np.timedelta64(1, 'h'), [int(k/10)*6 for k in range(40)])" ] From b91728e0a935f286fca6f1b34917972b5dd3b504 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 13:15:54 +0200 Subject: [PATCH 32/79] Update tutorial_timevaryingdepthdimensions.ipynb --- parcels/examples/tutorial_timevaryingdepthdimensions.ipynb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/parcels/examples/tutorial_timevaryingdepthdimensions.ipynb b/parcels/examples/tutorial_timevaryingdepthdimensions.ipynb index 17f83e832..947ae7011 100644 --- a/parcels/examples/tutorial_timevaryingdepthdimensions.ipynb +++ b/parcels/examples/tutorial_timevaryingdepthdimensions.ipynb @@ -131,8 +131,7 @@ "pfile = pset.ParticleFile(\"SwashParticles\", outputdt=delta(seconds=0.05))\n", "pset.execute(AdvectionRK4, dt=delta(seconds=0.005), output_file=pfile)\n", "\n", - "pfile.export() # export the trajectory data to a netcdf file\n", - "plotTrajectoriesFile('SwashParticles.nc');" + "plotTrajectoriesFile('SwashParticles.zarr');" ] }, { From e4890362a47da42d59f85df0b5823089d153628a Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 13:22:13 +0200 Subject: [PATCH 33/79] Update tutorial_particle_field_interaction.ipynb --- .../tutorial_particle_field_interaction.ipynb | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/parcels/examples/tutorial_particle_field_interaction.ipynb b/parcels/examples/tutorial_particle_field_interaction.ipynb index a1b4fff6a..796d51f6d 100644 --- a/parcels/examples/tutorial_particle_field_interaction.ipynb +++ b/parcels/examples/tutorial_particle_field_interaction.ipynb @@ -49,6 +49,7 @@ "import numpy as np\n", "from datetime import timedelta as delta\n", "import netCDF4\n", + "import xarray as xr\n", "import matplotlib.pyplot as plt" ] }, @@ -229,7 +230,7 @@ "\n", "Before running the advection, we will execute the ```pset``` with the ```WriteInitial``` for ```dt=0```: this will write the initial condition of fieldset.C to a ```netCDF``` file.\n", "\n", - "While particle outputs will be written in a file named ```interaction.nc``` at every ```outputdt```, the field will be automatically written in ```netCDF``` files named ```interaction_wxyzC.nc```, with ```wxyz``` being the number of the output and ```C``` the ```FieldSet``` variable of our interest. 
Note that you can use tools like [ncrcat](https://linux.die.net/man/1/ncrcat) (on linux/macOS) to combine these separate files into one large ```netCDF``` file after the simualtion." + "While particle outputs will be written in a file named ```interaction.zarr``` at every ```outputdt```, the field will be automatically written in ```netCDF``` files named ```interaction_wxyzC.nc```, with ```wxyz``` being the number of the output and ```C``` the ```FieldSet``` variable of our interest. Note that you can use tools like [ncrcat](https://linux.die.net/man/1/ncrcat) (on linux/macOS) to combine these separate files into one large ```netCDF``` file after the simualtion." ] }, { @@ -247,16 +248,14 @@ } ], "source": [ - "output_file = pset.ParticleFile(name=r'interaction.nc', outputdt=delta(days=1))\n", + "output_file = pset.ParticleFile(name=r'interaction.zarr', outputdt=delta(days=1))\n", "\n", "pset.execute(WriteInitial, dt=0., output_file=output_file)\n", "\n", "pset.execute(AdvectionRK4 + pset.Kernel(Interaction), # the particle will FIRST be transported by currents and THEN interact with the field\n", " dt=delta(days=1),\n", " runtime=delta(days=24), # we are going to track the particle and save its trajectory and tracer concentration for 24 days\n", - " output_file=output_file)\n", - "\n", - "output_file.close()" + " output_file=output_file)" ] }, { @@ -298,7 +297,7 @@ } ], "source": [ - "pset_traj = netCDF4.Dataset(r'interaction.nc')\n", + "pset_traj = xr.open_zarr(r'interaction.zarr')\n", "\n", "print(pset_traj['c'][:])\n", "\n", @@ -406,7 +405,7 @@ "field_cbar = plt.colorbar(fieldplot,ax=ax)\n", "field_cbar.ax.text(.6,.070,'$C_{field}$ concentration', rotation=270, fontsize=12)\n", "\n", - "particle = plt.scatter(pset_traj['lon'][:].data[0,:],pset_traj['lat'][:].data[0,:], c=pset_traj['c'][:].data[0,:],vmin=0, s=100, edgecolor='white') \n", + "particle = plt.scatter(pset_traj['lon'][:].values[0,:],pset_traj['lat'][:].values[0,:], c=pset_traj['c'][:].values[0,:],vmin=0, s=100, edgecolor='white') \n", "particle_cbar = plt.colorbar(particle,ax=ax, location = 'top')\n", "particle_cbar.ax.text(40,300,'$c_{particle}$ concentration', fontsize=12);" ] @@ -456,7 +455,7 @@ "\n", " fieldplot=ax[i,j].pcolormesh(x_centers[-28:-17,22:41],y_centers[-28:-17,22:41],c_results[-28:-18,22:40], vmin=0, vmax=0.2,cmap='viridis') \n", " \n", - " particle = ax[i,j].scatter(pset_traj['lon'][:].data[0,daycounter-1],pset_traj['lat'][:].data[0,daycounter-1], c=pset_traj['c'][:].data[0,daycounter-1],vmin=0, vmax=100, s=100, edgecolor='white') \n", + " particle = ax[i,j].scatter(pset_traj['lon'][:].values[0,daycounter-1],pset_traj['lat'][:].values[0,daycounter-1], c=pset_traj['c'][:].values[0,daycounter-1],vmin=0, vmax=100, s=100, edgecolor='white') \n", " # plotting particle location at current time step -- daycounter-1 due to different indexing\n", " \n", " ax[i,j].set_title('Day '+ str(daycounter-1))\n", From 8c533e1c6dcdf7dc78ee8dffdd91de3fd8b9137e Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 14:46:53 +0200 Subject: [PATCH 34/79] Update tutorial_particle_field_interaction.ipynb --- parcels/examples/tutorial_particle_field_interaction.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parcels/examples/tutorial_particle_field_interaction.ipynb b/parcels/examples/tutorial_particle_field_interaction.ipynb index 796d51f6d..e5d84308e 100644 --- a/parcels/examples/tutorial_particle_field_interaction.ipynb +++ b/parcels/examples/tutorial_particle_field_interaction.ipynb @@ 
-299,9 +299,9 @@ "source": [ "pset_traj = xr.open_zarr(r'interaction.zarr')\n", "\n", - "print(pset_traj['c'][:])\n", + "print(pset_traj['c'].values)\n", "\n", - "plotTrajectoriesFile('interaction.nc');" + "plotTrajectoriesFile('interaction.zarr');" ] }, { From 767818921e387fff6ccbec564fbefb96c2e53b03 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 14:59:14 +0200 Subject: [PATCH 35/79] Updating Field.to_write for new zarr output --- parcels/particleset/baseparticleset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parcels/particleset/baseparticleset.py b/parcels/particleset/baseparticleset.py index 50e60b368..1a25c62e7 100644 --- a/parcels/particleset/baseparticleset.py +++ b/parcels/particleset/baseparticleset.py @@ -485,7 +485,7 @@ def execute(self, pyfunc=AdvectionRK4, pyfunc_inter=None, endtime=None, runtime= if hasattr(fld, 'to_write') and fld.to_write: if fld.grid.tdim > 1: raise RuntimeError('Field writing during execution only works for Fields with one snapshot in time') - fldfilename = str(output_file.name).replace('.nc', '_%.4d' % fld.to_write) + fldfilename = str(output_file.name).replace('.zarr', '_%.4d' % fld.to_write) fld.write(fldfilename) fld.to_write += 1 if abs(time - next_output) < tol: From 2a95e2979ca945a708e253cdbb4296e57a18bf17 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 20 Jul 2022 15:13:59 +0200 Subject: [PATCH 36/79] Updating unit test for field.to_write to zarr output --- tests/test_fieldset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_fieldset.py b/tests/test_fieldset.py index e0fd94f16..6a48593be 100644 --- a/tests/test_fieldset.py +++ b/tests/test_fieldset.py @@ -580,7 +580,7 @@ def SampleUV2(particle, fieldset, time): @pytest.mark.parametrize('pset_mode', pset_modes) def test_fieldset_write(pset_mode, tmpdir): - filepath = tmpdir.join("fieldset_write.nc") + filepath = tmpdir.join("fieldset_write.zarr") xdim, ydim = 3, 4 lon = np.linspace(0., 10., xdim, dtype=np.float32) lat = np.linspace(0., 10., ydim, dtype=np.float32) @@ -604,7 +604,7 @@ def UpdateU(particle, fieldset, time): assert fieldset.U.data[0, 1, 0] == 11 - da = xr.open_dataset(str(filepath).replace('.nc', '_0005U.nc')) + da = xr.open_dataset(str(filepath).replace('.zarr', '_0005U.nc')) assert np.allclose(fieldset.U.data, da['U'].values) From 04cb9ba2ad50b77f2da2337204418d651611651d Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 21 Jul 2022 12:00:06 +0200 Subject: [PATCH 37/79] Adding support for MPI in dump_to_zarr Note that for simplicity, all writing is done on the rank==0 processor. 
This could potentially be improved --- parcels/particlefile/baseparticlefile.py | 138 ++++++++++++----------- tests/test_mpirun.py | 24 ++-- 2 files changed, 88 insertions(+), 74 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index a6581b80a..67035f002 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -172,68 +172,80 @@ def write(self, pset, time, deleted_only=False): """ data_dict, data_dict_once = pset.to_dict(self, time, deleted_only=deleted_only) - maxtraj = len(self.IDs_written) - if len(data_dict) > 0: - for i in data_dict['id']: - if i not in self.IDs_written: - self.IDs_written[i] = maxtraj - self.maxobs[i] = 0 - maxtraj += 1 - else: - self.maxobs[i] += 1 - - if len(data_dict_once) > 0: - for i in data_dict_once['id']: - if i not in self.IDs_written: - self.IDs_written[i] = maxtraj - self.maxobs[i] = -1 - maxtraj += 1 - - if len(data_dict) > 0: - if not self.written_first: - ds = xr.Dataset(attrs=self.metadata) - attrs = self._create_variables_attribute_dict() - ids = [self.IDs_written[i] for i in data_dict['id']] - for var in data_dict: - varout = 'z' if var == 'depth' else var - varout = 'trajectory' if varout == 'id' else varout - data = np.full((maxtraj, 1), np.nan, dtype=self.vars_to_write[var]) - data[ids, 0] = data_dict[var] - ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=attrs[varout]) - for var in data_dict_once: - if var != 'id': # TODO check if needed - data = np.full((maxtraj,), np.nan, dtype=self.vars_to_write_once[var]) - data[ids] = data_dict_once[var] - ds[var] = xr.DataArray(data=data, dims=["traj"], attrs=attrs[var]) - ds.to_zarr(self.fname, mode='w') - self.written_first = True - else: - store = zarr.DirectoryStore(self.fname) - Z = zarr.group(store=store, overwrite=False) - ids = [self.IDs_written[i] for i in data_dict['id']] - maxobs = [self.maxobs[i] for i in data_dict['id']] - - for var in data_dict: - varout = 'z' if var == 'depth' else var - varout = 'trajectory' if varout == 'id' else varout - if max(maxobs) >= Z[varout].shape[1]: - a = np.full((Z[varout].shape[0], 1), np.nan, - dtype=self.vars_to_write[var]) - Z[varout].append(a, axis=1) - zarr.consolidate_metadata(store) - if max(ids) >= Z[varout].shape[0]: - a = np.full((maxtraj-Z[varout].shape[0], Z[varout].shape[1]), np.nan, - dtype=self.vars_to_write[var]) - Z[varout].append(a, axis=0) - zarr.consolidate_metadata(store) - Z[varout].vindex[ids, maxobs] = data_dict[var] + if MPI: + all_data_dict = MPI.COMM_WORLD.gather(data_dict, root=0) + all_data_dict_once = MPI.COMM_WORLD.gather(data_dict_once, root=0) + rank = MPI.COMM_WORLD.Get_rank() + else: + all_data_dict = [data_dict] + all_data_dict_once = [data_dict_once] + rank = 0 + + if rank == 0: + + maxtraj = len(self.IDs_written) + for data_dict, data_dict_once in zip(all_data_dict, all_data_dict_once): + if len(data_dict) > 0: + for i in data_dict['id']: + if i not in self.IDs_written: + self.IDs_written[i] = maxtraj + self.maxobs[i] = 0 + maxtraj += 1 + else: + self.maxobs[i] += 1 + if len(data_dict_once) > 0: - ids = [self.IDs_written[i] for i in data_dict_once['id']] - for var in data_dict_once: - if var != 'id': # TODO check if needed - if max(ids) >= Z[var].shape[0]: - a = np.full((maxtraj - Z[var].shape[0],), np.nan, - dtype=self.vars_to_write_once[var]) - Z[var].append(a, axis=0) + for i in data_dict_once['id']: + if i not in self.IDs_written: + self.IDs_written[i] = maxtraj + self.maxobs[i] = -1 + maxtraj += 1 
+ + if len(data_dict) > 0: + if not self.written_first: + ds = xr.Dataset(attrs=self.metadata) + attrs = self._create_variables_attribute_dict() + ids = [self.IDs_written[i] for i in data_dict['id']] + for var in data_dict: + varout = 'z' if var == 'depth' else var + varout = 'trajectory' if varout == 'id' else varout + data = np.full((maxtraj, 1), np.nan, dtype=self.vars_to_write[var]) + data[ids, 0] = data_dict[var] + ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=attrs[varout]) + for var in data_dict_once: + if var != 'id': # TODO check if needed + data = np.full((maxtraj,), np.nan, dtype=self.vars_to_write_once[var]) + data[ids] = data_dict_once[var] + ds[var] = xr.DataArray(data=data, dims=["traj"], attrs=attrs[var]) + ds.to_zarr(self.fname, mode='w') + self.written_first = True + else: + store = zarr.DirectoryStore(self.fname) + Z = zarr.group(store=store, overwrite=False) + ids = [self.IDs_written[i] for i in data_dict['id']] + maxobs = [self.maxobs[i] for i in data_dict['id']] + + for var in data_dict: + varout = 'z' if var == 'depth' else var + varout = 'trajectory' if varout == 'id' else varout + if max(maxobs) >= Z[varout].shape[1]: + a = np.full((Z[varout].shape[0], 1), np.nan, + dtype=self.vars_to_write[var]) + Z[varout].append(a, axis=1) + zarr.consolidate_metadata(store) + if max(ids) >= Z[varout].shape[0]: + a = np.full((maxtraj-Z[varout].shape[0], Z[varout].shape[1]), np.nan, + dtype=self.vars_to_write[var]) + Z[varout].append(a, axis=0) zarr.consolidate_metadata(store) - Z[var].vindex[ids] = data_dict_once[var] + Z[varout].vindex[ids, maxobs] = data_dict[var] + if len(data_dict_once) > 0: + ids = [self.IDs_written[i] for i in data_dict_once['id']] + for var in data_dict_once: + if var != 'id': # TODO check if needed + if max(ids) >= Z[var].shape[0]: + a = np.full((maxtraj - Z[var].shape[0],), np.nan, + dtype=self.vars_to_write_once[var]) + Z[var].append(a, axis=0) + zarr.consolidate_metadata(store) + Z[var].vindex[ids] = data_dict_once[var] diff --git a/tests/test_mpirun.py b/tests/test_mpirun.py index 9dece91f8..e1f8d00dc 100644 --- a/tests/test_mpirun.py +++ b/tests/test_mpirun.py @@ -1,8 +1,8 @@ from os import path, system -from netCDF4 import Dataset import numpy as np import pytest import sys +import xarray as xr try: from mpi4py import MPI except: @@ -17,21 +17,23 @@ def test_mpi_run(pset_mode, tmpdir, repeatdt, maxage): if MPI: stommel_file = path.join(path.dirname(__file__), '..', 'parcels', 'examples', 'example_stommel.py') - outputMPI = tmpdir.join('StommelMPI.nc') - outputNoMPI = tmpdir.join('StommelNoMPI.nc') + outputMPI = tmpdir.join('StommelMPI.zarr') + outputNoMPI = tmpdir.join('StommelNoMPI.zarr') system('mpirun -np 2 python %s -p 4 -o %s -r %d -a %d -psm %s' % (stommel_file, outputMPI, repeatdt, maxage, pset_mode)) system('python %s -p 4 -o %s -r %d -a %d -psm %s' % (stommel_file, outputNoMPI, repeatdt, maxage, pset_mode)) - ncfile1 = Dataset(outputMPI, 'r', 'NETCDF4') - ncfile2 = Dataset(outputNoMPI, 'r', 'NETCDF4') + ds1 = xr.open_zarr(outputMPI) + ds2 = xr.open_zarr(outputNoMPI) - for v in ncfile2.variables.keys(): - assert np.allclose(ncfile1.variables[v][:], ncfile2.variables[v][:]) + for v in ds2.variables.keys(): + if v == 'time': + continue # skip because np.allclose does not work well on np.datetime64 + assert np.allclose(ds1.variables[v][:], ds2.variables[v][:], equal_nan=True) - for a in ncfile2.ncattrs(): + for a in ds2.attrs: if a != 'parcels_version': - assert getattr(ncfile1, a) == getattr(ncfile2, a) + assert ds1.attrs[a] 
== ds2.attrs[a] - ncfile1.close() - ncfile2.close() + ds1.close() + ds2.close() From f7b0e6ac41f9635ac84b0057f25a1321b795d5e1 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 21 Jul 2022 15:58:20 +0200 Subject: [PATCH 38/79] Removing .nc extension in output files from tests, tutorials and examples Also generating an error when a filename in ParticleFile has extension '.nc' --- parcels/examples/example_decaying_moving_eddy.py | 4 ++-- parcels/examples/example_globcurrent.py | 2 +- parcels/examples/tutorial_analyticaladvection.ipynb | 12 ++++++------ parcels/particlefile/baseparticlefile.py | 11 ++++------- parcels/particleset/baseparticleset.py | 2 +- tests/test_particle_file.py | 2 +- tests/test_scripts.py | 2 +- 7 files changed, 16 insertions(+), 19 deletions(-) diff --git a/parcels/examples/example_decaying_moving_eddy.py b/parcels/examples/example_decaying_moving_eddy.py index 1df622d18..155e4d485 100644 --- a/parcels/examples/example_decaying_moving_eddy.py +++ b/parcels/examples/example_decaying_moving_eddy.py @@ -70,7 +70,7 @@ def decaying_moving_example(fieldset, outfile, mode='scipy', method=AdvectionRK4 @pytest.mark.parametrize('mode', ['scipy', 'jit']) def test_rotation_example(mode, tmpdir): - outfile = tmpdir.join('DecayingMovingParticle.nc') + outfile = tmpdir.join('DecayingMovingParticle.zarr') fieldset = decaying_moving_eddy_fieldset() pset = decaying_moving_example(fieldset, outfile, mode=mode) vals = true_values(pset[0].time, start_lon, start_lat) # Calculate values for the particle. @@ -79,7 +79,7 @@ def test_rotation_example(mode, tmpdir): if __name__ == "__main__": fset_filename = 'decaying_moving_eddy' - outfile = 'DecayingMovingParticle.nc' + outfile = 'DecayingMovingParticle.zarr' fieldset = decaying_moving_eddy_fieldset() fieldset.write(fset_filename) diff --git a/parcels/examples/example_globcurrent.py b/parcels/examples/example_globcurrent.py index ffd9efa0b..d8887d1cb 100644 --- a/parcels/examples/example_globcurrent.py +++ b/parcels/examples/example_globcurrent.py @@ -264,7 +264,7 @@ def DeleteParticle(particle, fieldset, time): @pytest.mark.parametrize('dt', [-300, 300]) @pytest.mark.parametrize('pid_offset', [0, 20]) def test_globcurrent_pset_fromfile(mode, dt, pid_offset, tmpdir): - filename = tmpdir.join("pset_fromparticlefile.nc") + filename = tmpdir.join("pset_fromparticlefile.zarr") fieldset = set_globcurrent_fieldset() ptype[mode].setLastID(pid_offset) diff --git a/parcels/examples/tutorial_analyticaladvection.ipynb b/parcels/examples/tutorial_analyticaladvection.ipynb index 1b25befc0..c31680bad 100644 --- a/parcels/examples/tutorial_analyticaladvection.ipynb +++ b/parcels/examples/tutorial_analyticaladvection.ipynb @@ -149,7 +149,7 @@ "\n", "pset = ParticleSet(fieldsetRR, pclass=MyParticle, lon=0, lat=4e3, time=0)\n", "\n", - "output = pset.ParticleFile(name='radialAnalytical.nc', outputdt=delta(hours=1))\n", + "output = pset.ParticleFile(name='radialAnalytical.zarr', outputdt=delta(hours=1))\n", "pset.execute(pset.Kernel(UpdateR) + AdvectionAnalytical,\n", " runtime=delta(hours=24),\n", " dt=np.inf, # needs to be set to np.inf for Analytical Advection\n", @@ -192,7 +192,7 @@ ], "source": [ "output.close()\n", - "plotTrajectoriesFile('radialAnalytical.nc')\n", + "plotTrajectoriesFile('radialAnalytical.zarr')\n", "\n", "print('Particle radius at start of run %f' % pset.radius_start[0])\n", "print('Particle radius at end of run %f' % pset.radius[0])\n", @@ -265,7 +265,7 @@ "X, Y = np.meshgrid(np.arange(0.15, 1.85, 0.1), np.arange(0.15, 0.85, 
0.1))\n", "psetAA = ParticleSet(fieldsetDG, pclass=ScipyParticle, lon=X, lat=Y)\n", "\n", - "output = psetAA.ParticleFile(name='doublegyreAA.nc', outputdt=0.1)\n", + "output = psetAA.ParticleFile(name='doublegyreAA.zarr', outputdt=0.1)\n", "psetAA.execute(AdvectionAnalytical,\n", " dt=np.inf, # needs to be set to np.inf for Analytical Advection\n", " runtime=3,\n", @@ -1832,7 +1832,7 @@ ], "source": [ "output.close()\n", - "plotTrajectoriesFile('doublegyreAA.nc', mode='movie2d_notebook')" + "plotTrajectoriesFile('doublegyreAA.zarr', mode='movie2d_notebook')" ] }, { @@ -2008,7 +2008,7 @@ "\n", "psetAA = ParticleSet(fieldsetBJ, pclass=ScipyParticle, lon=X, lat=Y, time=0)\n", "\n", - "output = psetAA.ParticleFile(name='bickleyjetAA.nc', outputdt=delta(hours=1))\n", + "output = psetAA.ParticleFile(name='bickleyjetAA.zarr', outputdt=delta(hours=1))\n", "psetAA.execute(AdvectionAnalytical+psetAA.Kernel(ZonalBC),\n", " dt=np.inf,\n", " runtime=delta(days=1),\n", @@ -3161,7 +3161,7 @@ ], "source": [ "output.close()\n", - "plotTrajectoriesFile('bickleyjetAA.nc', mode='movie2d_notebook')" + "plotTrajectoriesFile('bickleyjetAA.zarr', mode='movie2d_notebook')" ] }, { diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 67035f002..ab81ee18f 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -39,7 +39,6 @@ class BaseParticleFile(ABC): write_ondelete = None outputdt = None lasttime_written = None - name = None particleset = None parcels_mesh = None time_origin = None @@ -51,7 +50,6 @@ def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False): self.outputdt = outputdt self.lasttime_written = None # variable to check if time has been written already - self.name = name self.particleset = particleset self.parcels_mesh = 'spherical' if self.particleset.fieldset is not None: @@ -88,11 +86,10 @@ def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False): np.uint16: np.iinfo(np.uint16).max, np.uint32: np.iinfo(np.uint32).max, np.uint64: np.iinfo(np.uint64).max} - extension = os.path.splitext(str(self.name))[1] - self.fname = self.name if extension in ['.nc', '.nc4', '.zarr'] else "%s.zarr" % self.name - if extension == '': - extension = '.zarr' - self.outputformat = extension + extension = os.path.splitext(str(name))[1] + if extension in ['.nc', '.nc4']: + raise RuntimeError('Output in NetCDF is not supported anymore. 
Use .zarr extension for ParticleFile name.') + self.fname = name if extension in ['.zarr'] else "%s.zarr" % name @abstractmethod def _reserved_var_names(self): diff --git a/parcels/particleset/baseparticleset.py b/parcels/particleset/baseparticleset.py index 1a25c62e7..20f18e681 100644 --- a/parcels/particleset/baseparticleset.py +++ b/parcels/particleset/baseparticleset.py @@ -485,7 +485,7 @@ def execute(self, pyfunc=AdvectionRK4, pyfunc_inter=None, endtime=None, runtime= if hasattr(fld, 'to_write') and fld.to_write: if fld.grid.tdim > 1: raise RuntimeError('Field writing during execution only works for Fields with one snapshot in time') - fldfilename = str(output_file.name).replace('.zarr', '_%.4d' % fld.to_write) + fldfilename = str(output_file.fname).replace('.zarr', '_%.4d' % fld.to_write) fld.write(fldfilename) fld.to_write += 1 if abs(time - next_output) < tol: diff --git a/tests/test_particle_file.py b/tests/test_particle_file.py index 4d53d827c..5dd4e7fc9 100644 --- a/tests/test_particle_file.py +++ b/tests/test_particle_file.py @@ -176,7 +176,7 @@ class MyParticle(ptype[mode]): @pytest.mark.parametrize('mode', ['scipy', 'jit']) @pytest.mark.parametrize('npart', [1, 2, 5]) def test_variable_written_once(fieldset, pset_mode, mode, tmpdir, npart): - filepath = tmpdir.join("pfile_once_written_variables.nc") + filepath = tmpdir.join("pfile_once_written_variables.zarr") def Update_v(particle, fieldset, time): particle.v_once += 1. diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 62533380c..757a104a0 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -25,7 +25,7 @@ def create_outputfiles(dir, pset_mode): y = (fieldset.U.lat[0] + x, fieldset.U.lat[-1] - x) lat = np.linspace(y[0], y[1], npart) - fp = dir.join("DelayParticle.nc") + fp = dir.join("DelayParticle.zarr") output_file = pset.ParticleFile(name=fp, outputdt=delaytime) for t in range(npart): From ef946ee28eb7832e5f3066cc668aa41d6ef03c5d Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 21 Jul 2022 16:57:10 +0200 Subject: [PATCH 39/79] Further changes from .nc to .zarr in unitest particlefiles --- tests/test_particle_file.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_particle_file.py b/tests/test_particle_file.py index 5dd4e7fc9..39855b17e 100644 --- a/tests/test_particle_file.py +++ b/tests/test_particle_file.py @@ -246,7 +246,7 @@ def IncrLon(particle, fieldset, time): @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) def test_write_timebackward(fieldset, pset_mode, mode, tmpdir): - outfilepath = tmpdir.join("pfile_write_timebackward.nc") + outfilepath = tmpdir.join("pfile_write_timebackward.zarr") def Update_lon(particle, fieldset, time): particle.lon -= 0.1 * particle.dt @@ -274,7 +274,7 @@ def test_set_calendar(): def test_reset_dt(fieldset, pset_mode, mode, tmpdir): # Assert that p.dt gets reset when a write_time is not a multiple of dt # for p.dt=0.02 to reach outputdt=0.05 and endtime=0.1, the steps should be [0.2, 0.2, 0.1, 0.2, 0.2, 0.1], resulting in 6 kernel executions - filepath = tmpdir.join("pfile_reset_dt.nc") + filepath = tmpdir.join("pfile_reset_dt.zarr") def Update_lon(particle, fieldset, time): particle.lon += 0.1 From 14b3d034102d9708a6352d8434ec7be4ca8f08c6 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 21 Jul 2022 17:02:02 +0200 Subject: [PATCH 40/79] Adding chunks argument to ParticleFile --- parcels/particlefile/baseparticlefile.py | 11 ++++++++--- 
parcels/particlefile/particlefileaos.py | 5 +++-- parcels/particlefile/particlefilesoa.py | 5 +++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index ab81ee18f..40160c435 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -34,6 +34,7 @@ class BaseParticleFile(ABC): :param outputdt: Interval which dictates the update frequency of file output while ParticleFile is given as an argument of ParticleSet.execute() It is either a timedelta object or a positive double. + :param chunks: Tuple (trajs, obs) to control the size of chunks in the zarr output. :param write_ondelete: Boolean to write particle data only when they are deleted. Default is False """ write_ondelete = None @@ -44,10 +45,11 @@ class BaseParticleFile(ABC): time_origin = None lonlatdepth_dtype = None - def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False): + def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_ondelete=False): self.write_ondelete = write_ondelete self.outputdt = outputdt + self.chunks = chunks self.lasttime_written = None # variable to check if time has been written already self.particleset = particleset @@ -200,18 +202,21 @@ def write(self, pset, time, deleted_only=False): if len(data_dict) > 0: if not self.written_first: + if self.chunks is None: + self.chunks = (maxtraj, 1) ds = xr.Dataset(attrs=self.metadata) attrs = self._create_variables_attribute_dict() ids = [self.IDs_written[i] for i in data_dict['id']] for var in data_dict: varout = 'z' if var == 'depth' else var varout = 'trajectory' if varout == 'id' else varout - data = np.full((maxtraj, 1), np.nan, dtype=self.vars_to_write[var]) + data = np.full(self.chunks, np.nan, dtype=self.vars_to_write[var]) data[ids, 0] = data_dict[var] ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=attrs[varout]) + ds[varout].encoding['chunks'] = self.chunks for var in data_dict_once: if var != 'id': # TODO check if needed - data = np.full((maxtraj,), np.nan, dtype=self.vars_to_write_once[var]) + data = np.full((self.chunks[0],), np.nan, dtype=self.vars_to_write_once[var]) data[ids] = data_dict_once[var] ds[var] = xr.DataArray(data=data, dims=["traj"], attrs=attrs[var]) ds.to_zarr(self.fname, mode='w') diff --git a/parcels/particlefile/particlefileaos.py b/parcels/particlefile/particlefileaos.py index 3ae0ce993..81186b0d1 100644 --- a/parcels/particlefile/particlefileaos.py +++ b/parcels/particlefile/particlefileaos.py @@ -14,12 +14,13 @@ class ParticleFileAOS(BaseParticleFile): :param outputdt: Interval which dictates the update frequency of file output while ParticleFile is given as an argument of ParticleSet.execute() It is either a timedelta object or a positive double. + :param chunks: Tuple (trajs, obs) to control the size of chunks in the zarr output. :param write_ondelete: Boolean to write particle data only when they are deleted. 
Default is False """ - def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False): + def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_ondelete=False): super(ParticleFileAOS, self).__init__(name=name, particleset=particleset, outputdt=outputdt, - write_ondelete=write_ondelete) + chunks=chunks, write_ondelete=write_ondelete) def __del__(self): super(ParticleFileAOS, self).__del__() diff --git a/parcels/particlefile/particlefilesoa.py b/parcels/particlefile/particlefilesoa.py index b7fe33f0e..1cd088d99 100644 --- a/parcels/particlefile/particlefilesoa.py +++ b/parcels/particlefile/particlefilesoa.py @@ -14,12 +14,13 @@ class ParticleFileSOA(BaseParticleFile): :param outputdt: Interval which dictates the update frequency of file output while ParticleFile is given as an argument of ParticleSet.execute() It is either a timedelta object or a positive double. + :param chunks: Tuple (trajs, obs) to control the size of chunks in the zarr output. :param write_ondelete: Boolean to write particle data only when they are deleted. Default is False """ - def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False): + def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_ondelete=False): super(ParticleFileSOA, self).__init__(name=name, particleset=particleset, outputdt=outputdt, - write_ondelete=write_ondelete) + chunks=chunks, write_ondelete=write_ondelete) def __del__(self): super(ParticleFileSOA, self).__del__() From d31c5f32f728d9ceec4911f266a74ee1582c41e1 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Fri, 22 Jul 2022 08:37:52 +0200 Subject: [PATCH 41/79] Update to chunk treatment in dump_to_zarr Including an error if chunk trajs is too small --- parcels/particlefile/baseparticlefile.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 40160c435..7756d3fec 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -204,6 +204,9 @@ def write(self, pset, time, deleted_only=False): if not self.written_first: if self.chunks is None: self.chunks = (maxtraj, 1) + if self.chunks[0] < maxtraj: + raise RuntimeError(f"chunks[0] is smaller than the size of the initial particleset ({self.chunks[0]} < {maxtraj}). 
" + "Please increase 'chunks' in your ParticleFile.") ds = xr.Dataset(attrs=self.metadata) attrs = self._create_variables_attribute_dict() ids = [self.IDs_written[i] for i in data_dict['id']] @@ -231,12 +234,13 @@ def write(self, pset, time, deleted_only=False): varout = 'z' if var == 'depth' else var varout = 'trajectory' if varout == 'id' else varout if max(maxobs) >= Z[varout].shape[1]: - a = np.full((Z[varout].shape[0], 1), np.nan, + a = np.full((Z[varout].shape[0], self.chunks[1]), np.nan, dtype=self.vars_to_write[var]) Z[varout].append(a, axis=1) zarr.consolidate_metadata(store) if max(ids) >= Z[varout].shape[0]: - a = np.full((maxtraj-Z[varout].shape[0], Z[varout].shape[1]), np.nan, + extra_trajs = max(maxtraj-Z[varout].shape[0], self.chunks[0]) + a = np.full((extra_trajs, Z[varout].shape[1]), np.nan, dtype=self.vars_to_write[var]) Z[varout].append(a, axis=0) zarr.consolidate_metadata(store) From d82a67cc7f1d8d8fc76d40c92addbbd05fa7511b Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Mon, 25 Jul 2022 15:42:05 +0200 Subject: [PATCH 42/79] Changing default zarr chunk to 10 timesteps Also adding unit test for chunks in zarr --- parcels/examples/example_mitgcm.py | 4 ++-- parcels/examples/tutorial_diffusion.ipynb | 2 ++ parcels/particlefile/baseparticlefile.py | 2 +- tests/test_advection.py | 2 +- tests/test_particle_file.py | 19 +++++++++++++------ 5 files changed, 19 insertions(+), 10 deletions(-) diff --git a/parcels/examples/example_mitgcm.py b/parcels/examples/example_mitgcm.py index 5118f1945..e20de820a 100644 --- a/parcels/examples/example_mitgcm.py +++ b/parcels/examples/example_mitgcm.py @@ -48,7 +48,7 @@ def periodicBC(particle, fieldset, time): size=10, ) pfile = ParticleFile( - "MIT_particles_" + str(mode) + ".zarr", pset, outputdt=delta(days=1) + "MIT_particles_" + str(mode) + ".zarr", pset, outputdt=delta(days=1), chunks=(len(pset), 1) ) kernels = AdvectionRK4 + pset.Kernel(periodicBC) pset.execute( @@ -61,7 +61,7 @@ def test_mitgcm_output_compare(): run_mitgcm_zonally_reentrant("scipy") run_mitgcm_zonally_reentrant("jit") - ds_jit = xr.open_zarr("MIT_particles_jit.zarr") + ds_jit = xr.open_zarr("MIT_particles_jit.zarr", decode_times=False) ds_scipy = xr.open_zarr("MIT_particles_scipy.zarr") np.testing.assert_allclose(ds_jit.lat.data, ds_scipy.lat.data) diff --git a/parcels/examples/tutorial_diffusion.ipynb b/parcels/examples/tutorial_diffusion.ipynb index c65629d06..fceffab84 100644 --- a/parcels/examples/tutorial_diffusion.ipynb +++ b/parcels/examples/tutorial_diffusion.ipynb @@ -226,6 +226,7 @@ "dt = 0.001\n", "testParticles = get_test_particles()\n", "output_file = testParticles.ParticleFile(name=\"M1_out.zarr\",\n", + " chunks=(len(testParticles), 1),\n", " outputdt=timedelta(seconds=dt))\n", "ParcelsRandom.seed(1636) # Random seed for reproducibility\n", "testParticles.execute(AdvectionDiffusionM1,\n", @@ -312,6 +313,7 @@ "dt = 0.001\n", "testParticles = get_test_particles()\n", "output_file = testParticles.ParticleFile(name=\"EM_out.zarr\",\n", + " chunks=(len(testParticles), 1),\n", " outputdt=timedelta(seconds=dt))\n", "ParcelsRandom.seed(1636) # Random seed for reproducibility\n", "testParticles.execute(AdvectionDiffusionEM,\n", diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 7756d3fec..63e251911 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -203,7 +203,7 @@ def write(self, pset, time, deleted_only=False): if len(data_dict) > 0: if not 
self.written_first: if self.chunks is None: - self.chunks = (maxtraj, 1) + self.chunks = (maxtraj, 10) if self.chunks[0] < maxtraj: raise RuntimeError(f"chunks[0] is smaller than the size of the initial particleset ({self.chunks[0]} < {maxtraj}). " "Please increase 'chunks' in your ParticleFile.") diff --git a/tests/test_advection.py b/tests/test_advection.py index cbc189197..3e366f1ec 100644 --- a/tests/test_advection.py +++ b/tests/test_advection.py @@ -490,7 +490,7 @@ def test_uniform_analytical(pset_mode, mode, u, v, w, direction, tmpdir): pset = pset_type[pset_mode]['pset'](fieldset, pclass=ptype[mode], lon=x0, lat=y0, depth=z0) outfile_path = tmpdir.join("uniformanalytical.zarr") - outfile = pset.ParticleFile(name=outfile_path, outputdt=1) + outfile = pset.ParticleFile(name=outfile_path, outputdt=1, chunks=(1, 1)) pset.execute(AdvectionAnalytical, runtime=4, dt=direction, output_file=outfile) assert np.abs(pset.lon - x0 - 4 * u * direction) < 1e-6 diff --git a/tests/test_particle_file.py b/tests/test_particle_file.py index 39855b17e..f76983cbf 100644 --- a/tests/test_particle_file.py +++ b/tests/test_particle_file.py @@ -80,13 +80,15 @@ def Update_lon(particle, fieldset, time): @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) -def test_pfile_array_remove_all_particles(fieldset, pset_mode, mode, tmpdir, npart=10): +@pytest.mark.parametrize('chunks_obs', [1, None]) +def test_pfile_array_remove_all_particles(fieldset, pset_mode, mode, chunks_obs, tmpdir, npart=10): filepath = tmpdir.join("pfile_array_remove_particles.zarr") pset = pset_type[pset_mode]['pset'](fieldset, pclass=ptype[mode], lon=np.linspace(0, 1, npart), lat=0.5*np.ones(npart), time=0) - pfile = pset.ParticleFile(filepath) + chunks = (npart, chunks_obs) if chunks_obs else None + pfile = pset.ParticleFile(filepath, chunks=chunks) pfile.write(pset, 0) for _ in range(npart): pset.remove_indices(-1) @@ -94,7 +96,12 @@ def test_pfile_array_remove_all_particles(fieldset, pset_mode, mode, tmpdir, npa pfile.write(pset, 2) ds = xr.open_zarr(filepath) - assert ds['time'][:].shape == (npart, 1) + assert np.allclose(ds['time'][:, 0], np.timedelta64(0, 's'), atol=np.timedelta64(1, 'ms')) + if chunks_obs is not None: + assert ds['time'][:].shape == chunks + else: + assert ds['time'][:].shape[0] == npart + assert np.all(np.isnan(ds['time'][:, 1:])) ds.close() @@ -119,7 +126,7 @@ def DeleteP(particle, fieldset, time): pset = pset_type[pset_mode]['pset'](fieldset, pclass=ptype[mode], lon=lon, lat=lat) - outfile = pset.ParticleFile(name=filepath, write_ondelete=True) + outfile = pset.ParticleFile(name=filepath, write_ondelete=True, chunks=(len(pset), 1)) outfile.add_metadata('runtime', runtime) pset.execute(move_west, runtime=runtime, dt=dt, output_file=outfile, recovery={ErrorCode.ErrorOutOfBounds: DeleteP}) @@ -127,7 +134,7 @@ def DeleteP(particle, fieldset, time): ds = xr.open_zarr(filepath) assert ds.runtime == runtime lon = ds['lon'][:] - assert (lon.size == noutside) + assert (sum(np.isfinite(lon)) == noutside) ds.close() @@ -219,7 +226,7 @@ class MyParticle(ptype[mode]): elif type == 'timearr': pset = pset_type[pset_mode]['pset'](fieldset, lon=np.zeros(runtime), lat=np.zeros(runtime), pclass=MyParticle, time=list(range(runtime))) outfilepath = tmpdir.join("pfile_repeated_release.zarr") - pfile = pset.ParticleFile(outfilepath, outputdt=abs(dt)) + pfile = pset.ParticleFile(outfilepath, outputdt=abs(dt), chunks=(1, 1)) def IncrLon(particle, fieldset, time): particle.sample_var += 1. 
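Taken together, the patches above replace the old npy-based output path with a single chunked zarr store: the first write creates (traj, obs) arrays one chunk wide, later writes grow the obs axis by appending whole chunks of fill values, and individual observations are scatter-written with zarr's vindex indexing. The sketch below shows those zarr calls in isolation; it is not Parcels code, and the store name, array name and values are made up purely for illustration.

    import numpy as np
    import zarr

    # Standalone sketch of the zarr mechanics relied on by the write() code above;
    # "sketch.zarr" and "lon" are illustrative names, not part of the Parcels API.
    store = zarr.DirectoryStore("sketch.zarr")
    root = zarr.group(store=store, overwrite=True)

    # A (traj, obs) array created one chunk wide, analogous to the first to_zarr() write
    lon = root.create_dataset("lon", data=np.full((4, 10), np.nan, dtype="f4"),
                              chunks=(4, 10))          # chunks = (trajs, obs)

    # The obs dimension grows by appending a whole chunk of fill values
    lon.append(np.full((4, 10), np.nan, dtype="f4"), axis=1)

    # Later observations are scatter-written with coordinate (vindex) indexing
    ids, obs = [0, 2, 3], [10, 10, 11]
    lon.vindex[ids, obs] = [1.5, 2.5, 3.5]
    zarr.consolidate_metadata(store)

    print(lon.shape, lon.chunks)  # (4, 20) (4, 10)

Because the arrays only ever grow by whole chunks of NaN fill values, a finished store can contain trailing NaNs along obs, which is why the updated tests above count finite values (sum(np.isfinite(lon))) rather than relying on the array shape.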
From 0c63c2cc0415dc256336942b71f220fe5393864d Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 26 Jul 2022 10:24:08 +0200 Subject: [PATCH 43/79] Cleanup of particlefile.to_write() --- parcels/examples/example_mitgcm.py | 2 +- parcels/particlefile/baseparticlefile.py | 24 ++++++++++++++---------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/parcels/examples/example_mitgcm.py b/parcels/examples/example_mitgcm.py index e20de820a..cde1ee092 100644 --- a/parcels/examples/example_mitgcm.py +++ b/parcels/examples/example_mitgcm.py @@ -61,7 +61,7 @@ def test_mitgcm_output_compare(): run_mitgcm_zonally_reentrant("scipy") run_mitgcm_zonally_reentrant("jit") - ds_jit = xr.open_zarr("MIT_particles_jit.zarr", decode_times=False) + ds_jit = xr.open_zarr("MIT_particles_jit.zarr") ds_scipy = xr.open_zarr("MIT_particles_scipy.zarr") np.testing.assert_allclose(ds_jit.lat.data, ds_scipy.lat.data) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 63e251911..5d452f2b6 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -162,6 +162,14 @@ def add_metadata(self, name, message): """ self.metadata[name] = message + def _convert_varout_name(self, var): + if var == 'depth': + return 'z' + elif var == 'id': + return 'trajectory' + else: + return var + def write(self, pset, time, deleted_only=False): """Write all data from one time step to the zarr file @@ -169,21 +177,19 @@ def write(self, pset, time, deleted_only=False): :param time: Time at which to write ParticleSet :param deleted_only: Flag to write only the deleted Particles """ - data_dict, data_dict_once = pset.to_dict(self, time, deleted_only=deleted_only) + data_dicts = pset.to_dict(self, time, deleted_only=deleted_only) if MPI: - all_data_dict = MPI.COMM_WORLD.gather(data_dict, root=0) - all_data_dict_once = MPI.COMM_WORLD.gather(data_dict_once, root=0) + all_data_dicts = MPI.COMM_WORLD.gather(data_dicts, root=0) rank = MPI.COMM_WORLD.Get_rank() else: - all_data_dict = [data_dict] - all_data_dict_once = [data_dict_once] + all_data_dicts = [data_dicts] rank = 0 if rank == 0: maxtraj = len(self.IDs_written) - for data_dict, data_dict_once in zip(all_data_dict, all_data_dict_once): + for data_dict, data_dict_once in all_data_dicts: if len(data_dict) > 0: for i in data_dict['id']: if i not in self.IDs_written: @@ -211,8 +217,7 @@ def write(self, pset, time, deleted_only=False): attrs = self._create_variables_attribute_dict() ids = [self.IDs_written[i] for i in data_dict['id']] for var in data_dict: - varout = 'z' if var == 'depth' else var - varout = 'trajectory' if varout == 'id' else varout + varout = self._convert_varout_name(var) data = np.full(self.chunks, np.nan, dtype=self.vars_to_write[var]) data[ids, 0] = data_dict[var] ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=attrs[varout]) @@ -231,8 +236,7 @@ def write(self, pset, time, deleted_only=False): maxobs = [self.maxobs[i] for i in data_dict['id']] for var in data_dict: - varout = 'z' if var == 'depth' else var - varout = 'trajectory' if varout == 'id' else varout + varout = self._convert_varout_name(var) if max(maxobs) >= Z[varout].shape[1]: a = np.full((Z[varout].shape[0], self.chunks[1]), np.nan, dtype=self.vars_to_write[var]) From f2e4da93f4f9d4da9dae868c1c9b9431a83783ed Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 26 Jul 2022 12:12:00 +0200 Subject: [PATCH 44/79] Updating more examples to zarr output --- 
parcels/examples/example_nemo_curvilinear.py | 10 +++++----- parcels/examples/example_stommel.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/parcels/examples/example_nemo_curvilinear.py b/parcels/examples/example_nemo_curvilinear.py index 5ccd726f1..b8dc45cf9 100644 --- a/parcels/examples/example_nemo_curvilinear.py +++ b/parcels/examples/example_nemo_curvilinear.py @@ -56,7 +56,7 @@ def periodicBC(particle, fieldSet, time): def make_plot(trajfile): - from netCDF4 import Dataset + import xarray as xr import matplotlib.pyplot as plt import cartopy @@ -66,10 +66,10 @@ def __init__(self): def load_particles_file(fname, varnames): T = ParticleData() - pfile = Dataset(fname, 'r') - T.id = pfile.variables['trajectory'][:] + ds = xr.open_zarr(fname) + T.id = ds['trajectory'][:] for v in varnames: - setattr(T, v, pfile.variables[v][:]) + setattr(T, v, ds[v][:]) return T T = load_particles_file(trajfile, ['lon', 'lat', 'time']) @@ -121,4 +121,4 @@ def test_nemo_3D_samegrid(): outfile = "nemo_particles" run_nemo_curvilinear(args.mode, outfile) - make_plot(outfile+'.nc') + make_plot(outfile+'.zarr') diff --git a/parcels/examples/example_stommel.py b/parcels/examples/example_stommel.py index 0d8baca8d..89e54556b 100644 --- a/parcels/examples/example_stommel.py +++ b/parcels/examples/example_stommel.py @@ -81,7 +81,7 @@ def AgeP(particle, fieldset, time): def stommel_example(npart=1, mode='jit', verbose=False, method=AdvectionRK4, grid_type='A', - outfile="StommelParticle.nc", repeatdt=None, maxage=None, write_fields=True, pset_mode='soa'): + outfile="StommelParticle.zarr", repeatdt=None, maxage=None, write_fields=True, pset_mode='soa'): timer.fieldset = timer.Timer('FieldSet', parent=timer.stommel) fieldset = stommel_fieldset(grid_type=grid_type) if write_fields: @@ -158,7 +158,7 @@ def test_stommel_fieldset(pset_mode, mode, grid_type, tmpdir): help='Print particle information before and after execution') p.add_argument('-m', '--method', choices=('RK4', 'EE', 'RK45'), default='RK4', help='Numerical method used for advection') - p.add_argument('-o', '--outfile', default='StommelParticle.nc', + p.add_argument('-o', '--outfile', default='StommelParticle.zarr', help='Name of output file') p.add_argument('-r', '--repeatdt', default=None, type=int, help='repeatdt of the ParticleSet') From da449d849786936323201fbb5c8d379c040c8937 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 26 Jul 2022 12:13:09 +0200 Subject: [PATCH 45/79] Moving extending zarr dimensions to separate method --- parcels/particlefile/baseparticlefile.py | 44 +++++++++++++----------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 5d452f2b6..a83e3ad77 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -69,6 +69,7 @@ def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_onde self.written_once = [] self.IDs_written = {} self.maxobs = {} + self.maxtraj = 0 self.written_first = False self.metadata = {"feature_type": "trajectory", "Conventions": "CF-1.6/CF-1.7", @@ -170,6 +171,18 @@ def _convert_varout_name(self, var): else: return var + def _extend_zarr_dims(self, Z, store, dtype, axis): + if axis == 1: + a = np.full((Z.shape[0], self.chunks[1]), np.nan, dtype=dtype) + else: + extra_trajs = max(self.maxtraj - Z.shape[0], self.chunks[0]) + if len(Z.shape) == 2: + a = np.full((extra_trajs, Z.shape[1]), np.nan, dtype=dtype) + else: + 
a = np.full((extra_trajs,), np.nan, dtype=dtype) + Z.append(a, axis=axis) + zarr.consolidate_metadata(store) + def write(self, pset, time, deleted_only=False): """Write all data from one time step to the zarr file @@ -188,30 +201,29 @@ def write(self, pset, time, deleted_only=False): if rank == 0: - maxtraj = len(self.IDs_written) for data_dict, data_dict_once in all_data_dicts: if len(data_dict) > 0: for i in data_dict['id']: if i not in self.IDs_written: - self.IDs_written[i] = maxtraj + self.IDs_written[i] = self.maxtraj self.maxobs[i] = 0 - maxtraj += 1 + self.maxtraj += 1 else: self.maxobs[i] += 1 if len(data_dict_once) > 0: for i in data_dict_once['id']: if i not in self.IDs_written: - self.IDs_written[i] = maxtraj + self.IDs_written[i] = self.maxtraj self.maxobs[i] = -1 - maxtraj += 1 + self.maxtraj += 1 if len(data_dict) > 0: if not self.written_first: if self.chunks is None: - self.chunks = (maxtraj, 10) - if self.chunks[0] < maxtraj: - raise RuntimeError(f"chunks[0] is smaller than the size of the initial particleset ({self.chunks[0]} < {maxtraj}). " + self.chunks = (self.maxtraj, 10) + if self.chunks[0] < self.maxtraj: + raise RuntimeError(f"chunks[0] is smaller than the size of the initial particleset ({self.chunks[0]} < {self.maxtraj}). " "Please increase 'chunks' in your ParticleFile.") ds = xr.Dataset(attrs=self.metadata) attrs = self._create_variables_attribute_dict() @@ -238,24 +250,14 @@ def write(self, pset, time, deleted_only=False): for var in data_dict: varout = self._convert_varout_name(var) if max(maxobs) >= Z[varout].shape[1]: - a = np.full((Z[varout].shape[0], self.chunks[1]), np.nan, - dtype=self.vars_to_write[var]) - Z[varout].append(a, axis=1) - zarr.consolidate_metadata(store) + self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=1) if max(ids) >= Z[varout].shape[0]: - extra_trajs = max(maxtraj-Z[varout].shape[0], self.chunks[0]) - a = np.full((extra_trajs, Z[varout].shape[1]), np.nan, - dtype=self.vars_to_write[var]) - Z[varout].append(a, axis=0) - zarr.consolidate_metadata(store) + self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=0) Z[varout].vindex[ids, maxobs] = data_dict[var] if len(data_dict_once) > 0: ids = [self.IDs_written[i] for i in data_dict_once['id']] for var in data_dict_once: if var != 'id': # TODO check if needed if max(ids) >= Z[var].shape[0]: - a = np.full((maxtraj - Z[var].shape[0],), np.nan, - dtype=self.vars_to_write_once[var]) - Z[var].append(a, axis=0) - zarr.consolidate_metadata(store) + self._extend_zarr_dims(Z[var], store, dtype=self.vars_to_write_once[var], axis=0) Z[var].vindex[ids] = data_dict_once[var] From 31b3a6b53091d7d5a5248918a3012d40d773d1fe Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 26 Jul 2022 18:34:37 +0200 Subject: [PATCH 46/79] Simplifying data_dicts from collections_toDict --- parcels/collection/collectionaos.py | 36 ++++---- parcels/collection/collections.py | 11 ++- parcels/collection/collectionsoa.py | 38 ++++---- parcels/particle.py | 5 ++ parcels/particlefile/baseparticlefile.py | 109 ++++++++++------------- 5 files changed, 94 insertions(+), 105 deletions(-) diff --git a/parcels/collection/collectionaos.py b/parcels/collection/collectionaos.py index 97ad0d328..f2b2f5852 100644 --- a/parcels/collection/collectionaos.py +++ b/parcels/collection/collectionaos.py @@ -896,20 +896,16 @@ def toDictionary(self, pfile, time, deleted_only=False): :param pfile: ParticleFile object requesting the conversion :param time: Time at which to write ParticleSet 
:param deleted_only: Flag to write only the deleted Particles - returns two dictionaries: one for all variables to be written each outputdt, - and one for all variables to be written once + returns a dictionary with data of all variables to be written This function depends on the specific collection in question and thus needs to be specified in specific derivative classes. """ data_dict = {} - data_dict_once = {} - time = time.total_seconds() if isinstance(time, delta) else time indices_to_write = [] - if pfile.lasttime_written != time and \ - (pfile.write_ondelete is False or deleted_only): + if pfile.lasttime_written != time and (pfile.write_ondelete is False or deleted_only is not False): if self._ncount == 0: logger.warning("ParticleSet is empty on writing as array at time %g" % time) else: @@ -925,29 +921,29 @@ def toDictionary(self, pfile, time, deleted_only=False): else: indices_to_write = _to_write_particles(self._data, time) if len(indices_to_write) > 0: + ids = [np.int64(p.id) for p in self._data[indices_to_write]] for var in pfile.vars_to_write: - if 'id' in var: - data_dict[var] = np.array([np.int64(getattr(p, var)) for p in self._data[indices_to_write]]) - else: - data_dict[var] = np.array([getattr(p, var) for p in self._data[indices_to_write]]) + if self.ptype[var].to_write != 'once': + data_dict[var] = dict(zip(ids, [getattr(p, var) for p in self._data[indices_to_write]])) + + if self.has_write_once_variables(): + first_write = [p for p in self._data if _is_particle_started_yet(p, time) + and (np.int64(p.id) not in pfile.written_once)] + if np.any(first_write): + written_once_ids = [np.int64(p.id) for p in first_write] + pfile.written_once.extend(written_once_ids) + for var in pfile.vars_to_write: + if self.ptype[var].to_write == 'once': + data_dict[var] = dict(zip(written_once_ids, [getattr(p, var) for p in first_write])) pset_errs = [p for p in self._data[indices_to_write] if p.state != OperationCode.Delete and abs(time-p.time) > 1e-3 and np.isfinite(p.time)] for p in pset_errs: logger.warning_once('time argument in pfile.write() is %g, but a particle has time % g.' % (time, p.time)) - if len(pfile.vars_to_write_once) > 0: - # _to_write_particles(self._data, time) - first_write = [p for p in self._data if _is_particle_started_yet(p, time) and (np.int64(p.id) not in pfile.written_once)] - if np.any(first_write): - data_dict_once['id'] = np.array([p.id for p in first_write]).astype(dtype=np.int64) - for var in pfile.vars_to_write_once: - data_dict_once[var] = np.array([getattr(p, var) for p in first_write]) - pfile.written_once.extend(np.array(data_dict_once['id']).astype(dtype=np.int64).tolist()) - if deleted_only is False: pfile.lasttime_written = time - return data_dict, data_dict_once + return data_dict def toArray(self): """ diff --git a/parcels/collection/collections.py b/parcels/collection/collections.py index fa9695560..3e87562c9 100644 --- a/parcels/collection/collections.py +++ b/parcels/collection/collections.py @@ -898,15 +898,20 @@ def __getattr__(self, name): else: return False + def has_write_once_variables(self): + for var in self.ptype.variables: + if var.to_write == 'once': + return True + return False + @abstractmethod - def toDictionary(self): + def toDictionary(self, pfile, time, deleted_only=False): """ Convert all Particle data from one time step to a python dictionary. 
:param pfile: ParticleFile object requesting the conversion :param time: Time at which to write ParticleSet :param deleted_only: Flag to write only the deleted Particles - returns two dictionaries: one for all variables to be written each outputdt, - and one for all variables to be written once + returns a dictionary with data of all variables to be written This function depends on the specific collection in question and thus needs to be specified in specific derivatives classes. diff --git a/parcels/collection/collectionsoa.py b/parcels/collection/collectionsoa.py index b1b2b9b76..4c24a924d 100644 --- a/parcels/collection/collectionsoa.py +++ b/parcels/collection/collectionsoa.py @@ -821,52 +821,50 @@ def toDictionary(self, pfile, time, deleted_only=False): :param pfile: ParticleFile object requesting the conversion :param time: Time at which to write ParticleSet :param deleted_only: Flag to write only the deleted Particles - returns two dictionaries: one for all variables to be written each outputdt, - and one for all variables to be written once + returns a dictionary with data of all variables to be written This function depends on the specific collection in question and thus needs to be specified in specific derivative classes. """ - data_dict = {} - data_dict_once = {} - time = time.total_seconds() if isinstance(time, delta) else time indices_to_write = [] - if pfile.lasttime_written != time and \ - (pfile.write_ondelete is False or deleted_only is not False): + if pfile.lasttime_written != time and (pfile.write_ondelete is False or deleted_only is not False): if self._data['id'].size == 0: logger.warning("ParticleSet is empty on writing as array at time %g" % time) else: if deleted_only is not False: if type(deleted_only) not in [list, np.ndarray] and deleted_only in [True, 1]: - indices_to_write = np.where(np.isin(self._data['state'], - [OperationCode.Delete]))[0] + indices_to_write = np.where(np.isin(self._data['state'], [OperationCode.Delete]))[0] elif type(deleted_only) in [list, np.ndarray]: indices_to_write = deleted_only else: indices_to_write = _to_write_particles(self._data, time) - if np.any(indices_to_write): + if len(indices_to_write) > 0: + ids = self._data['id'][indices_to_write] for var in pfile.vars_to_write: - data_dict[var] = self._data[var][indices_to_write] + if self.ptype[var].to_write != 'once': + data_dict[var] = dict(zip(ids, self._data[var][indices_to_write])) + + if self.has_write_once_variables(): + first_write = (_to_write_particles(self._data, time) & _is_particle_started_yet(self._data, time) + & np.isin(self._data['id'], pfile.written_once, invert=True)) + if np.any(first_write): + written_once_ids = np.array(self._data['id'][first_write]).astype(dtype=np.int64).tolist() + pfile.written_once.extend(written_once_ids) + for var in pfile.vars_to_write: + if self.ptype[var].to_write == 'once': + data_dict[var] = dict(zip(written_once_ids, self._data[var][first_write])) pset_errs = ((self._data['state'][indices_to_write] != OperationCode.Delete) & np.greater(np.abs(time - self._data['time'][indices_to_write]), 1e-3, where=np.isfinite(self._data['time'][indices_to_write]))) if np.count_nonzero(pset_errs) > 0: logger.warning_once('time argument in pfile.write() is {}, but particles have time {}'.format(time, self._data['time'][pset_errs])) - if len(pfile.vars_to_write_once) > 0: - first_write = (_to_write_particles(self._data, time) & _is_particle_started_yet(self._data, time) & np.isin(self._data['id'], pfile.written_once, invert=True)) - if 
np.any(first_write): - data_dict_once['id'] = np.array(self._data['id'][first_write]).astype(dtype=np.int64) - for var in pfile.vars_to_write_once: - data_dict_once[var] = self._data[var][first_write] - pfile.written_once.extend(np.array(self._data['id'][first_write]).astype(dtype=np.int64).tolist()) - if deleted_only is False: pfile.lasttime_written = time - return data_dict, data_dict_once + return data_dict def toArray(self): """ diff --git a/parcels/particle.py b/parcels/particle.py index 767f7bd8a..5d45294a5 100644 --- a/parcels/particle.py +++ b/parcels/particle.py @@ -85,6 +85,11 @@ def __init__(self, pclass): def __repr__(self): return "PType<%s>::%s" % (self.name, self.variables) + def __getitem__(self, item): + for v in self.variables: + if v.name == item: + return v + @property def _cache_key(self): return "-".join(["%s:%s" % (v.name, v.dtype) for v in self.variables]) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index a83e3ad77..c531dedf4 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -58,19 +58,17 @@ def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_onde self.parcels_mesh = self.particleset.fieldset.gridset.grids[0].mesh self.time_origin = self.particleset.time_origin self.lonlatdepth_dtype = self.particleset.collection.lonlatdepth_dtype - self.vars_to_write = {} - self.vars_to_write_once = {} - for v in self.particleset.collection.ptype.variables: - if v.to_write == 'once': - self.vars_to_write_once[v.name] = v.dtype - elif v.to_write is True: - self.vars_to_write[v.name] = v.dtype - if len(self.vars_to_write_once) > 0: - self.written_once = [] - self.IDs_written = {} - self.maxobs = {} - self.maxtraj = 0 + self.written_once = [] + self.ids_written = {} + self.obs_written = {} + self.maxids = 0 + self.maxobs = 0 self.written_first = False + self.vars_to_write = {} + for var in self.particleset.collection.ptype.variables: + self.obs_written[var.name] = {} + if var.to_write: + self.vars_to_write[var.name] = var.dtype self.metadata = {"feature_type": "trajectory", "Conventions": "CF-1.6/CF-1.7", "ncei_template_version": "NCEI_NetCDF_Trajectory_Template_v2.0", @@ -141,12 +139,6 @@ def _create_variables_attribute_dict(self): "standard_name": vname, "units": "unknown"} - for vname in self.vars_to_write_once: - attrs[vname] = {"_FillValue": self.fill_value_map[self.vars_to_write_once[vname]], - "long_name": "", - "standard_name": vname, - "units": "unknown"} - return attrs def __del__(self): @@ -171,11 +163,14 @@ def _convert_varout_name(self, var): else: return var + def write_once(self, var): + return self.particleset.collection.ptype[var].to_write == 'once' + def _extend_zarr_dims(self, Z, store, dtype, axis): if axis == 1: a = np.full((Z.shape[0], self.chunks[1]), np.nan, dtype=dtype) else: - extra_trajs = max(self.maxtraj - Z.shape[0], self.chunks[0]) + extra_trajs = max(self.maxids - Z.shape[0], self.chunks[0]) if len(Z.shape) == 2: a = np.full((extra_trajs, Z.shape[1]), np.nan, dtype=dtype) else: @@ -200,64 +195,54 @@ def write(self, pset, time, deleted_only=False): rank = 0 if rank == 0: - - for data_dict, data_dict_once in all_data_dicts: + for data_dict in all_data_dicts: if len(data_dict) > 0: - for i in data_dict['id']: - if i not in self.IDs_written: - self.IDs_written[i] = self.maxtraj - self.maxobs[i] = 0 - self.maxtraj += 1 - else: - self.maxobs[i] += 1 - - if len(data_dict_once) > 0: - for i in data_dict_once['id']: - if i not in 
self.IDs_written: - self.IDs_written[i] = self.maxtraj - self.maxobs[i] = -1 - self.maxtraj += 1 + for var in data_dict: + for i in data_dict[var].keys(): + if i not in self.ids_written: + self.ids_written[i] = self.maxids + self.maxids += 1 + if i in self.obs_written[var]: + self.obs_written[var][i] += 1 + else: + self.obs_written[var][i] = 0 - if len(data_dict) > 0: if not self.written_first: if self.chunks is None: - self.chunks = (self.maxtraj, 10) - if self.chunks[0] < self.maxtraj: - raise RuntimeError(f"chunks[0] is smaller than the size of the initial particleset ({self.chunks[0]} < {self.maxtraj}). " + self.chunks = (self.maxids, 10) + if self.chunks[0] < self.maxids: + raise RuntimeError(f"chunks[0] is smaller than the size of the initial particleset ({self.chunks[0]} < {self.maxids}). " "Please increase 'chunks' in your ParticleFile.") ds = xr.Dataset(attrs=self.metadata) attrs = self._create_variables_attribute_dict() - ids = [self.IDs_written[i] for i in data_dict['id']] for var in data_dict: + ids = [self.ids_written[i] for i in data_dict[var].keys()] varout = self._convert_varout_name(var) - data = np.full(self.chunks, np.nan, dtype=self.vars_to_write[var]) - data[ids, 0] = data_dict[var] - ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=attrs[varout]) + if self.write_once(var): + data = np.full((self.chunks[0],), np.nan, dtype=self.vars_to_write[var]) + data[ids] = [*data_dict[var].values()] + dims = ["traj"] + else: + data = np.full(self.chunks, np.nan, dtype=self.vars_to_write[var]) + data[ids, 0] = [*data_dict[var].values()] + dims = ["traj", "obs"] + ds[varout] = xr.DataArray(data=data, dims=dims, attrs=attrs[varout]) ds[varout].encoding['chunks'] = self.chunks - for var in data_dict_once: - if var != 'id': # TODO check if needed - data = np.full((self.chunks[0],), np.nan, dtype=self.vars_to_write_once[var]) - data[ids] = data_dict_once[var] - ds[var] = xr.DataArray(data=data, dims=["traj"], attrs=attrs[var]) ds.to_zarr(self.fname, mode='w') self.written_first = True else: store = zarr.DirectoryStore(self.fname) Z = zarr.group(store=store, overwrite=False) - ids = [self.IDs_written[i] for i in data_dict['id']] - maxobs = [self.maxobs[i] for i in data_dict['id']] - for var in data_dict: + maxobs = max(self.obs_written[var].values()) + obs = [self.obs_written[var][i] for i in data_dict[var].keys()] + ids = [self.ids_written[i] for i in data_dict[var].keys()] varout = self._convert_varout_name(var) - if max(maxobs) >= Z[varout].shape[1]: - self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=1) - if max(ids) >= Z[varout].shape[0]: + if self.maxids > Z[varout].shape[0]: self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=0) - Z[varout].vindex[ids, maxobs] = data_dict[var] - if len(data_dict_once) > 0: - ids = [self.IDs_written[i] for i in data_dict_once['id']] - for var in data_dict_once: - if var != 'id': # TODO check if needed - if max(ids) >= Z[var].shape[0]: - self._extend_zarr_dims(Z[var], store, dtype=self.vars_to_write_once[var], axis=0) - Z[var].vindex[ids] = data_dict_once[var] + if self.write_once(var): + Z[varout].vindex[ids] = [*data_dict[var].values()] + else: + if maxobs >= Z[varout].shape[1]: + self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=1) + Z[varout].vindex[ids, obs] = [*data_dict[var].values()] From 0ef5d9693444faa031d49c60c6b85c7e6a9116ed Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 27 Jul 2022 15:49:57 +0200 Subject: [PATCH 47/79] Removing use 
of toDictionary And also speeding up after dramatic slowdown in previous refactor --- parcels/collection/collectionaos.py | 83 ++++---------------- parcels/collection/collections.py | 14 ---- parcels/collection/collectionsoa.py | 74 ++++-------------- parcels/particlefile/baseparticlefile.py | 99 +++++++++++++++++------- parcels/particleset/particlesetaos.py | 11 --- parcels/particleset/particlesetsoa.py | 11 --- 6 files changed, 97 insertions(+), 195 deletions(-) diff --git a/parcels/collection/collectionaos.py b/parcels/collection/collectionaos.py index f2b2f5852..1efe8d2ce 100644 --- a/parcels/collection/collectionaos.py +++ b/parcels/collection/collectionaos.py @@ -1,4 +1,3 @@ -from datetime import timedelta as delta from operator import attrgetter # NOQA from ctypes import c_void_p @@ -28,24 +27,6 @@ __all__ = ['ParticleCollectionAOS', 'ParticleCollectionIterableAOS', 'ParticleCollectionIteratorAOS'] -def _to_write_particles(pd, time): - """We don't want to write a particle that is not started yet. - Particle will be written if particle.time is between time-dt/2 and time+dt (/2) - """ - return [i for i, p in enumerate(pd) if (((time - np.abs(p.dt/2) <= p.time < time + np.abs(p.dt)) - or (np.isnan(p.dt) and np.equal(time, p.time))) - and np.isfinite(p.id))] - - -def _is_particle_started_yet(particle, time): - """We don't want to write a particle that is not started yet. - Particle will be written if: - * particle.time is equal to time argument of pfile.write() - * particle.time is before time (in case particle was deleted between previous export and current one) - """ - return (particle.dt*particle.time <= particle.dt*time or np.isclose(particle.time, time)) - - def _convert_to_flat_array(var): """Convert lists and single integers/floats to one-dimensional numpy arrays @@ -890,60 +871,22 @@ def cstruct(self): cstruct = self._data_c.ctypes.data_as(c_void_p) return cstruct - def toDictionary(self, pfile, time, deleted_only=False): + def _to_write_particles(self, pd, time): + """We don't want to write a particle that is not started yet. + Particle will be written if particle.time is between time-dt/2 and time+dt (/2) """ - Convert all Particle data from one time step to a python dictionary. - :param pfile: ParticleFile object requesting the conversion - :param time: Time at which to write ParticleSet - :param deleted_only: Flag to write only the deleted Particles - returns a dictionary with data of all variables to be written + return np.array([i for i, p in enumerate(pd) if (((time - np.abs(p.dt/2) <= p.time < time + np.abs(p.dt)) + or (np.isnan(p.dt) and np.equal(time, p.time))) + and np.isfinite(p.id))]) - This function depends on the specific collection in question and thus needs to be specified in specific - derivative classes. 
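A minimal sketch of the pattern this refactor moves towards, with a plain dict of numpy arrays standing in for the collection and a simplified time window (the real _to_write_particles above also handles NaN dt and non-finite ids and times): the indices to write are computed once per output step, and each variable is then sliced on demand instead of being copied into an intermediate dictionary.

    import numpy as np

    pd = {'lon':  np.array([31.0, 30.5, 29.9]),
          'time': np.array([0.0, 0.0, 3600.0]),
          'dt':   np.array([60.0, 60.0, 60.0])}

    def to_write_indices(pd, time):
        # keep particles whose time falls inside the current output window
        return np.where((pd['time'] >= time - np.abs(pd['dt']) / 2)
                        & (pd['time'] < time + np.abs(pd['dt']) / 2))[0]

    def getvardata(pd, var, indices=None):
        return pd[var] if indices is None else pd[var][indices]

    indices = to_write_indices(pd, time=0.0)
    print(getvardata(pd, 'lon', indices))  # -> [31.  30.5]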
- """ - data_dict = {} - time = time.total_seconds() if isinstance(time, delta) else time + def getvardata(self, var, indices=None): + if indices is None: + return np.array([getattr(p, var) for p in self._data]) + else: + return np.array([getattr(p, var) for p in self._data[indices]]) - indices_to_write = [] - if pfile.lasttime_written != time and (pfile.write_ondelete is False or deleted_only is not False): - if self._ncount == 0: - logger.warning("ParticleSet is empty on writing as array at time %g" % time) - else: - if deleted_only: - if type(deleted_only) not in [list, np.ndarray] and deleted_only in [True, 1]: - data_states = [p.state for p in self._data] - indices_to_write = np.where(np.isin(data_states, [OperationCode.Delete]))[0] - elif type(deleted_only) in [list, np.ndarray] and len(deleted_only) > 0: - if type(deleted_only[0]) in [int, np.int32, np.uint32]: - indices_to_write = deleted_only - elif isinstance(deleted_only[0], ScipyParticle): - indices_to_write = [i for i, p in self._data if p in deleted_only] - else: - indices_to_write = _to_write_particles(self._data, time) - if len(indices_to_write) > 0: - ids = [np.int64(p.id) for p in self._data[indices_to_write]] - for var in pfile.vars_to_write: - if self.ptype[var].to_write != 'once': - data_dict[var] = dict(zip(ids, [getattr(p, var) for p in self._data[indices_to_write]])) - - if self.has_write_once_variables(): - first_write = [p for p in self._data if _is_particle_started_yet(p, time) - and (np.int64(p.id) not in pfile.written_once)] - if np.any(first_write): - written_once_ids = [np.int64(p.id) for p in first_write] - pfile.written_once.extend(written_once_ids) - for var in pfile.vars_to_write: - if self.ptype[var].to_write == 'once': - data_dict[var] = dict(zip(written_once_ids, [getattr(p, var) for p in first_write])) - - pset_errs = [p for p in self._data[indices_to_write] if p.state != OperationCode.Delete and abs(time-p.time) > 1e-3 and np.isfinite(p.time)] - for p in pset_errs: - logger.warning_once('time argument in pfile.write() is %g, but a particle has time % g.' % (time, p.time)) - - if deleted_only is False: - pfile.lasttime_written = time - - return data_dict + def setvardata(self, var, index, val): + setattr(self._data[index], var, val) def toArray(self): """ diff --git a/parcels/collection/collections.py b/parcels/collection/collections.py index 3e87562c9..8174db02d 100644 --- a/parcels/collection/collections.py +++ b/parcels/collection/collections.py @@ -904,20 +904,6 @@ def has_write_once_variables(self): return True return False - @abstractmethod - def toDictionary(self, pfile, time, deleted_only=False): - """ - Convert all Particle data from one time step to a python dictionary. - :param pfile: ParticleFile object requesting the conversion - :param time: Time at which to write ParticleSet - :param deleted_only: Flag to write only the deleted Particles - returns a dictionary with data of all variables to be written - - This function depends on the specific collection in question and thus needs to be specified in specific - derivatives classes. 
- """ - pass - @abstractmethod def set_variable_write_status(self, var, write_status): """ diff --git a/parcels/collection/collectionsoa.py b/parcels/collection/collectionsoa.py index 4c24a924d..ded3fe052 100644 --- a/parcels/collection/collectionsoa.py +++ b/parcels/collection/collectionsoa.py @@ -1,4 +1,3 @@ -from datetime import timedelta as delta from operator import attrgetter from ctypes import Structure, POINTER from bisect import bisect_left @@ -26,17 +25,6 @@ 'See http://oceanparcels.org/#parallel_install for more information') -def _to_write_particles(pd, time): - """We don't want to write a particle that is not started yet. - Particle will be written if particle.time is between time-dt/2 and time+dt (/2) - """ - return ((np.less_equal(time - np.abs(pd['dt']/2), pd['time'], where=np.isfinite(pd['time'])) - & np.greater_equal(time + np.abs(pd['dt'] / 2), pd['time'], where=np.isfinite(pd['time'])) - | ((np.isnan(pd['dt'])) & np.equal(time, pd['time'], where=np.isfinite(pd['time'])))) - & (np.isfinite(pd['id'])) - & (np.isfinite(pd['time']))) - - def _is_particle_started_yet(pd, time): """We don't want to write a particle that is not started yet. Particle will be written if: @@ -815,56 +803,24 @@ def flatten_dense_data_array(vname): cstruct = CParticles(*cdata) return cstruct - def toDictionary(self, pfile, time, deleted_only=False): + def _to_write_particles(self, pd, time): + """We don't want to write a particle that is not started yet. + Particle will be written if particle.time is between time-dt/2 and time+dt (/2) """ - Convert all Particle data from one time step to a python dictionary. - :param pfile: ParticleFile object requesting the conversion - :param time: Time at which to write ParticleSet - :param deleted_only: Flag to write only the deleted Particles - returns a dictionary with data of all variables to be written + return np.where((np.less_equal(time - np.abs(pd['dt'] / 2), pd['time'], where=np.isfinite(pd['time'])) + & np.greater_equal(time + np.abs(pd['dt'] / 2), pd['time'], where=np.isfinite(pd['time'])) + | ((np.isnan(pd['dt'])) & np.equal(time, pd['time'], where=np.isfinite(pd['time'])))) + & (np.isfinite(pd['id'])) + & (np.isfinite(pd['time'])))[0] - This function depends on the specific collection in question and thus needs to be specified in specific - derivative classes. 
- """ - data_dict = {} - time = time.total_seconds() if isinstance(time, delta) else time + def getvardata(self, var, indices=None): + if indices is None: + return self._data[var] + else: + return self._data[var][indices] - indices_to_write = [] - if pfile.lasttime_written != time and (pfile.write_ondelete is False or deleted_only is not False): - if self._data['id'].size == 0: - logger.warning("ParticleSet is empty on writing as array at time %g" % time) - else: - if deleted_only is not False: - if type(deleted_only) not in [list, np.ndarray] and deleted_only in [True, 1]: - indices_to_write = np.where(np.isin(self._data['state'], [OperationCode.Delete]))[0] - elif type(deleted_only) in [list, np.ndarray]: - indices_to_write = deleted_only - else: - indices_to_write = _to_write_particles(self._data, time) - if len(indices_to_write) > 0: - ids = self._data['id'][indices_to_write] - for var in pfile.vars_to_write: - if self.ptype[var].to_write != 'once': - data_dict[var] = dict(zip(ids, self._data[var][indices_to_write])) - - if self.has_write_once_variables(): - first_write = (_to_write_particles(self._data, time) & _is_particle_started_yet(self._data, time) - & np.isin(self._data['id'], pfile.written_once, invert=True)) - if np.any(first_write): - written_once_ids = np.array(self._data['id'][first_write]).astype(dtype=np.int64).tolist() - pfile.written_once.extend(written_once_ids) - for var in pfile.vars_to_write: - if self.ptype[var].to_write == 'once': - data_dict[var] = dict(zip(written_once_ids, self._data[var][first_write])) - - pset_errs = ((self._data['state'][indices_to_write] != OperationCode.Delete) & np.greater(np.abs(time - self._data['time'][indices_to_write]), 1e-3, where=np.isfinite(self._data['time'][indices_to_write]))) - if np.count_nonzero(pset_errs) > 0: - logger.warning_once('time argument in pfile.write() is {}, but particles have time {}'.format(time, self._data['time'][pset_errs])) - - if deleted_only is False: - pfile.lasttime_written = time - - return data_dict + def setvardata(self, var, index, val): + self._data[var][index] = val def toArray(self): """ diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index c531dedf4..973f89e8d 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -1,11 +1,15 @@ """Module controlling the writing of ParticleSets to Zarr file""" from abc import ABC from abc import abstractmethod +from datetime import timedelta as delta import os import numpy as np import xarray as xr import zarr +from parcels.tools.loggers import logger +from parcels.tools.statuscodes import OperationCode + try: from mpi4py import MPI except: @@ -62,14 +66,17 @@ def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_onde self.ids_written = {} self.obs_written = {} self.maxids = 0 - self.maxobs = 0 self.written_first = False self.vars_to_write = {} + self.obs_written = np.empty((0,), dtype=int) for var in self.particleset.collection.ptype.variables: - self.obs_written[var.name] = {} if var.to_write: self.vars_to_write[var.name] = var.dtype + self.data_dict_time = 0 + self.indices_time = 0 + self.writing_time = 0 + self.metadata = {"feature_type": "trajectory", "Conventions": "CF-1.6/CF-1.7", "ncei_template_version": "NCEI_NetCDF_Trajectory_Template_v2.0", "parcels_version": parcels_version, @@ -178,6 +185,12 @@ def _extend_zarr_dims(self, Z, store, dtype, axis): Z.append(a, axis=axis) zarr.consolidate_metadata(store) + def 
has_write_once_variables(self): + for var in self.vars_to_write: + if self.write_once(var): + return True + return False + def write(self, pset, time, deleted_only=False): """Write all data from one time step to the zarr file @@ -185,27 +198,54 @@ def write(self, pset, time, deleted_only=False): :param time: Time at which to write ParticleSet :param deleted_only: Flag to write only the deleted Particles """ - data_dicts = pset.to_dict(self, time, deleted_only=deleted_only) - if MPI: - all_data_dicts = MPI.COMM_WORLD.gather(data_dicts, root=0) - rank = MPI.COMM_WORLD.Get_rank() - else: - all_data_dicts = [data_dicts] - rank = 0 + time = time.total_seconds() if isinstance(time, delta) else time + + # if MPI: + # all_psets = MPI.COMM_WORLD.gather(pset, root=0) + # rank = MPI.COMM_WORLD.Get_rank() + # else: + all_psets = [pset] + rank = 0 if rank == 0: - for data_dict in all_data_dicts: - if len(data_dict) > 0: - for var in data_dict: - for i in data_dict[var].keys(): - if i not in self.ids_written: - self.ids_written[i] = self.maxids + for pset in all_psets: + + indices_to_write = [] + ids1D = [] + ids2D = [] + if self.lasttime_written != time and (self.write_ondelete is False or deleted_only is not False): + if pset.collection._ncount == 0: + logger.warning("ParticleSet is empty on writing as array at time %g" % time) + else: + if deleted_only is not False: + if type(deleted_only) not in [list, np.ndarray] and deleted_only in [True, 1]: + indices_to_write = np.where(np.isin(pset.collection.getvardata('state'), [OperationCode.Delete]))[0] + elif type(deleted_only) == np.ndarray and set(deleted_only).issubset([0, 1]): + indices_to_write = np.where(deleted_only)[0] + elif type(deleted_only) in [list, np.ndarray]: + indices_to_write = deleted_only + else: + indices_to_write = pset.collection._to_write_particles(pset.collection._data, time) + if len(indices_to_write) > 0: + ids2D = pset.collection.getvardata('fileid', indices_to_write) + for i in np.where(ids2D == -1)[0]: + ids2D[i] = self.maxids + pset.collection.setvardata('fileid', indices_to_write[i], self.maxids) self.maxids += 1 - if i in self.obs_written[var]: - self.obs_written[var][i] += 1 - else: - self.obs_written[var][i] = 0 + + if self.has_write_once_variables(): + first_write = np.isin(indices_to_write, self.written_once, invert=True) + if np.any(first_write): + first_write = indices_to_write[first_write] + ids1D = pset.collection.getvardata('fileid', first_write) + self.written_once.extend(first_write) + + if deleted_only is False: + self.lasttime_written = time + + if len(indices_to_write) > 0: + self.obs_written = np.append(self.obs_written, np.zeros((self.maxids-len(self.obs_written)), dtype=int)) if not self.written_first: if self.chunks is None: @@ -215,16 +255,15 @@ def write(self, pset, time, deleted_only=False): "Please increase 'chunks' in your ParticleFile.") ds = xr.Dataset(attrs=self.metadata) attrs = self._create_variables_attribute_dict() - for var in data_dict: - ids = [self.ids_written[i] for i in data_dict[var].keys()] + for var in self.vars_to_write: varout = self._convert_varout_name(var) if self.write_once(var): data = np.full((self.chunks[0],), np.nan, dtype=self.vars_to_write[var]) - data[ids] = [*data_dict[var].values()] + data[ids1D] = pset.collection.getvardata(var, first_write) dims = ["traj"] else: data = np.full(self.chunks, np.nan, dtype=self.vars_to_write[var]) - data[ids, 0] = [*data_dict[var].values()] + data[ids2D, 0] = pset.collection.getvardata(var, indices_to_write) dims = ["traj", "obs"] 
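    # Editorial sketch, not part of the patch: the layout the first write produces,
    # with invented variable names and sizes. A to_write='once' variable becomes a
    # 1-D ("traj",) array; time-varying variables become ("traj", "obs") arrays that
    # are pre-filled with NaN and one chunk wide in the obs dimension.
    import numpy as np
    import xarray as xr

    chunks = (3, 10)                              # (trajectories, observations)
    ds = xr.Dataset()
    lon = np.full(chunks, np.nan, dtype=np.float32)
    lon[[0, 1, 2], 0] = [31.0, 30.5, 29.9]        # first observation of each particle
    ds['lon'] = xr.DataArray(lon, dims=["traj", "obs"])
    traj = np.full((chunks[0],), np.nan, dtype=np.float64)
    traj[[0, 1, 2]] = [0, 1, 2]                   # a write-once variable
    ds['trajectory'] = xr.DataArray(traj, dims=["traj"])
    ds.to_zarr('example_output.zarr', mode='w')   # hypothetical output path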
ds[varout] = xr.DataArray(data=data, dims=dims, attrs=attrs[varout]) ds[varout].encoding['chunks'] = self.chunks @@ -233,16 +272,16 @@ def write(self, pset, time, deleted_only=False): else: store = zarr.DirectoryStore(self.fname) Z = zarr.group(store=store, overwrite=False) - for var in data_dict: - maxobs = max(self.obs_written[var].values()) - obs = [self.obs_written[var][i] for i in data_dict[var].keys()] - ids = [self.ids_written[i] for i in data_dict[var].keys()] + for var in self.vars_to_write: varout = self._convert_varout_name(var) if self.maxids > Z[varout].shape[0]: self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=0) if self.write_once(var): - Z[varout].vindex[ids] = [*data_dict[var].values()] + if len(ids1D) > 0: + Z[varout].vindex[ids1D] = pset.collection.getvardata(var, first_write) else: - if maxobs >= Z[varout].shape[1]: + obs = self.obs_written[np.array(ids2D)] + if max(obs) >= Z[varout].shape[1]: self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=1) - Z[varout].vindex[ids, obs] = [*data_dict[var].values()] + Z[varout].vindex[ids2D, obs] = pset.collection.getvardata(var, indices_to_write) + self.obs_written[np.array(ids2D)] += 1 diff --git a/parcels/particleset/particlesetaos.py b/parcels/particleset/particlesetaos.py index 74debd624..663c2e6f7 100644 --- a/parcels/particleset/particlesetaos.py +++ b/parcels/particleset/particlesetaos.py @@ -573,17 +573,6 @@ def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime depth=vars['depth'], time=vars['time'], pid_orig=vars['id'], lonlatdepth_dtype=lonlatdepth_dtype, repeatdt=repeatdt, **kwargs) - def to_dict(self, pfile, time, deleted_only=False): - """ - Convert all Particle data from one time step to a python dictionary. - :param pfile: ParticleFile object requesting the conversion - :param time: Time at which to write ParticleSet - :param deleted_only: Flag to write only the deleted Particles - returns two dictionaries: one for all variables to be written each outputdt, and one for all variables to be written once - """ - return self._collection.toDictionary(pfile=pfile, time=time, - deleted_only=deleted_only) - def __iadd__(self, particles): """Add particles to the ParticleSet. Note that this is an incremental add, the particles will be added to the ParticleSet diff --git a/parcels/particleset/particlesetsoa.py b/parcels/particleset/particlesetsoa.py index 9274ef111..7cc439a6c 100644 --- a/parcels/particleset/particlesetsoa.py +++ b/parcels/particleset/particlesetsoa.py @@ -481,17 +481,6 @@ def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime depth=vars['depth'], time=vars['time'], pid_orig=vars['id'], lonlatdepth_dtype=lonlatdepth_dtype, repeatdt=repeatdt, **kwargs) - def to_dict(self, pfile, time, deleted_only=False): - """ - Convert all Particle data from one time step to a python dictionary. 
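Once the file exists, later writes take the append path shown above: each zarr row keeps its own observation counter, the obs dimension is extended with NaN-filled chunks when a counter reaches the current width, and vindex writes one (traj, obs) cell per particle. A minimal sketch of that coordinate write, assuming the hypothetical 'example_output.zarr' store from the previous sketch:

    import numpy as np
    import zarr

    store = zarr.DirectoryStore('example_output.zarr')
    Z = zarr.group(store=store, overwrite=False)
    ids = np.array([0, 2])   # zarr rows assigned to the particles being written
    obs = np.array([1, 1])   # next free observation slot for each of those rows
    Z['lon'].vindex[ids, obs] = np.array([31.1, 29.7], dtype=np.float32)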
- :param pfile: ParticleFile object requesting the conversion - :param time: Time at which to write ParticleSet - :param deleted_only: Flag to write only the deleted Particles - returns two dictionaries: one for all variables to be written each outputdt, and one for all variables to be written once - """ - return self._collection.toDictionary(pfile=pfile, time=time, - deleted_only=deleted_only) - def compute_neighbor_tree(self, time, dt): active_mask = self.active_particles_mask(time, dt) From b997bc37024648c8aed43957c30e30f90100ac4c Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 27 Jul 2022 16:48:26 +0200 Subject: [PATCH 48/79] Making trajectory a to_write='once' variable by default Also fixing some breaking integration tests and further cleaning up the code --- parcels/collection/collectionaos.py | 4 ++++ parcels/collection/collections.py | 8 ++++++++ parcels/collection/collectionsoa.py | 12 +++--------- parcels/particle.py | 2 +- parcels/particlefile/baseparticlefile.py | 20 ++++++++++---------- tests/test_particle_file.py | 2 +- 6 files changed, 27 insertions(+), 21 deletions(-) diff --git a/parcels/collection/collectionaos.py b/parcels/collection/collectionaos.py index 1efe8d2ce..585d4e255 100644 --- a/parcels/collection/collectionaos.py +++ b/parcels/collection/collectionaos.py @@ -888,6 +888,10 @@ def getvardata(self, var, indices=None): def setvardata(self, var, index, val): setattr(self._data[index], var, val) + def setallvardata(self, var, val): + for i in range(len(self._data)): + setattr(self._data[i], var, val) + def toArray(self): """ This function converts (or: transforms; reformats; translates) this collection into an array-like structure diff --git a/parcels/collection/collections.py b/parcels/collection/collections.py index 8174db02d..1aa06f627 100644 --- a/parcels/collection/collections.py +++ b/parcels/collection/collections.py @@ -904,6 +904,14 @@ def has_write_once_variables(self): return True return False + @abstractmethod + def getvardata(self, var, indices=None): + pass + + @abstractmethod + def setvardata(self, var, index, val): + pass + @abstractmethod def set_variable_write_status(self, var, write_status): """ diff --git a/parcels/collection/collectionsoa.py b/parcels/collection/collectionsoa.py index ded3fe052..14766ca79 100644 --- a/parcels/collection/collectionsoa.py +++ b/parcels/collection/collectionsoa.py @@ -25,15 +25,6 @@ 'See http://oceanparcels.org/#parallel_install for more information') -def _is_particle_started_yet(pd, time): - """We don't want to write a particle that is not started yet. 
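A short sketch of what this change means when reading the output back (using the hypothetical path from the sketches above): because 'trajectory' is now written once per particle, it comes back as a 1-D variable over "traj" instead of being repeated along the "obs" dimension, which is also why the test below indexes it as ds['trajectory'][:].

    import xarray as xr

    ds = xr.open_zarr('example_output.zarr')
    print(ds['trajectory'].dims)  # ("traj",)
    print(ds['lon'].dims)         # ("traj", "obs")
    ds.close()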
- Particle will be written if: - * particle.time is equal to time argument of pfile.write() - * particle.time is before time (in case particle was deleted between previous export and current one) - """ - return np.less_equal(pd['dt']*pd['time'], pd['dt']*time) | np.isclose(pd['time'], time) - - def _convert_to_flat_array(var): """Convert lists and single integers/floats to one-dimensional numpy arrays @@ -822,6 +813,9 @@ def getvardata(self, var, indices=None): def setvardata(self, var, index, val): self._data[var][index] = val + def setallvardata(self, var, val): + self._data[var][:] = val + def toArray(self): """ This function converts (or: transforms; reformats; translates) this collection into an array-like structure diff --git a/parcels/particle.py b/parcels/particle.py index 5d45294a5..605b891e9 100644 --- a/parcels/particle.py +++ b/parcels/particle.py @@ -185,7 +185,7 @@ class ScipyParticle(_Particle): lat = Variable('lat', dtype=np.float32) depth = Variable('depth', dtype=np.float32) time = Variable('time', dtype=np.float64) - id = Variable('id', dtype=np.int64) + id = Variable('id', dtype=np.int64, to_write='once') fileid = Variable('fileid', dtype=np.int32, initial=-1, to_write=False) dt = Variable('dt', dtype=np.float64, to_write=False) state = Variable('state', dtype=np.int32, initial=StateCode.Evaluate, to_write=False) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 973f89e8d..a4235af94 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -73,9 +73,8 @@ def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_onde if var.to_write: self.vars_to_write[var.name] = var.dtype - self.data_dict_time = 0 - self.indices_time = 0 - self.writing_time = 0 + # Reset fileid of each particle, in case new ParticleFile created for a ParticleSet + particleset.collection.setallvardata('fileid', -1) self.metadata = {"feature_type": "trajectory", "Conventions": "CF-1.6/CF-1.7", "ncei_template_version": "NCEI_NetCDF_Trajectory_Template_v2.0", @@ -221,12 +220,16 @@ def write(self, pset, time, deleted_only=False): if deleted_only is not False: if type(deleted_only) not in [list, np.ndarray] and deleted_only in [True, 1]: indices_to_write = np.where(np.isin(pset.collection.getvardata('state'), [OperationCode.Delete]))[0] - elif type(deleted_only) == np.ndarray and set(deleted_only).issubset([0, 1]): - indices_to_write = np.where(deleted_only)[0] - elif type(deleted_only) in [list, np.ndarray]: - indices_to_write = deleted_only + elif type(deleted_only) == np.ndarray: + if set(deleted_only).issubset([0, 1]): + indices_to_write = np.where(deleted_only)[0] + else: + indices_to_write = deleted_only + elif type(deleted_only) == list: + indices_to_write = np.array(deleted_only) else: indices_to_write = pset.collection._to_write_particles(pset.collection._data, time) + self.lasttime_written = time if len(indices_to_write) > 0: ids2D = pset.collection.getvardata('fileid', indices_to_write) for i in np.where(ids2D == -1)[0]: @@ -241,9 +244,6 @@ def write(self, pset, time, deleted_only=False): ids1D = pset.collection.getvardata('fileid', first_write) self.written_once.extend(first_write) - if deleted_only is False: - self.lasttime_written = time - if len(indices_to_write) > 0: self.obs_written = np.append(self.obs_written, np.zeros((self.maxids-len(self.obs_written)), dtype=int)) diff --git a/tests/test_particle_file.py b/tests/test_particle_file.py index f76983cbf..d98496af4 100644 --- 
a/tests/test_particle_file.py +++ b/tests/test_particle_file.py @@ -264,7 +264,7 @@ def Update_lon(particle, fieldset, time): pset.execute(pset.Kernel(Update_lon), runtime=4, dt=-1., output_file=pfile) ds = xr.open_zarr(outfilepath) - trajs = ds['trajectory'][:, 0] + trajs = ds['trajectory'][:] assert np.all(np.diff(trajs.values) < 0) # all particles written in order of start time ds.close() From cd110949782682bfcedd4be5f4af989f9a216d45 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 28 Jul 2022 13:39:37 +0200 Subject: [PATCH 49/79] Further cleanup of baseparticlefile Also adding argument `create_new_zarrfile` to ParticleFile, to either create (default) or append to file --- parcels/particlefile/baseparticlefile.py | 166 +++++++++++------------ 1 file changed, 76 insertions(+), 90 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index a4235af94..7e97a0d6c 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -40,6 +40,7 @@ class BaseParticleFile(ABC): It is either a timedelta object or a positive double. :param chunks: Tuple (trajs, obs) to control the size of chunks in the zarr output. :param write_ondelete: Boolean to write particle data only when they are deleted. Default is False + :param create_new_zarrfile: Boolean to create a new file. Default is True """ write_ondelete = None outputdt = None @@ -49,7 +50,8 @@ class BaseParticleFile(ABC): time_origin = None lonlatdepth_dtype = None - def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_ondelete=False): + def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_ondelete=False, + create_new_zarrfile=True): self.write_ondelete = write_ondelete self.outputdt = outputdt @@ -66,7 +68,7 @@ def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_onde self.ids_written = {} self.obs_written = {} self.maxids = 0 - self.written_first = False + self.create_new_zarrfile = create_new_zarrfile self.vars_to_write = {} self.obs_written = np.empty((0,), dtype=int) for var in self.particleset.collection.ptype.variables: @@ -184,12 +186,6 @@ def _extend_zarr_dims(self, Z, store, dtype, axis): Z.append(a, axis=axis) zarr.consolidate_metadata(store) - def has_write_once_variables(self): - for var in self.vars_to_write: - if self.write_once(var): - return True - return False - def write(self, pset, time, deleted_only=False): """Write all data from one time step to the zarr file @@ -200,88 +196,78 @@ def write(self, pset, time, deleted_only=False): time = time.total_seconds() if isinstance(time, delta) else time - # if MPI: - # all_psets = MPI.COMM_WORLD.gather(pset, root=0) - # rank = MPI.COMM_WORLD.Get_rank() - # else: - all_psets = [pset] - rank = 0 - - if rank == 0: - for pset in all_psets: - - indices_to_write = [] - ids1D = [] - ids2D = [] - if self.lasttime_written != time and (self.write_ondelete is False or deleted_only is not False): - if pset.collection._ncount == 0: - logger.warning("ParticleSet is empty on writing as array at time %g" % time) + if self.lasttime_written != time and (self.write_ondelete is False or deleted_only is not False): + if pset.collection._ncount == 0: + logger.warning("ParticleSet is empty on writing as array at time %g" % time) + return + + if deleted_only is not False: + if type(deleted_only) not in [list, np.ndarray] and deleted_only in [True, 1]: + indices_to_write = np.where(np.isin(pset.collection.getvardata('state'), 
[OperationCode.Delete]))[0] + elif type(deleted_only) == np.ndarray: + if set(deleted_only).issubset([0, 1]): + indices_to_write = np.where(deleted_only)[0] else: - if deleted_only is not False: - if type(deleted_only) not in [list, np.ndarray] and deleted_only in [True, 1]: - indices_to_write = np.where(np.isin(pset.collection.getvardata('state'), [OperationCode.Delete]))[0] - elif type(deleted_only) == np.ndarray: - if set(deleted_only).issubset([0, 1]): - indices_to_write = np.where(deleted_only)[0] - else: - indices_to_write = deleted_only - elif type(deleted_only) == list: - indices_to_write = np.array(deleted_only) - else: - indices_to_write = pset.collection._to_write_particles(pset.collection._data, time) - self.lasttime_written = time - if len(indices_to_write) > 0: - ids2D = pset.collection.getvardata('fileid', indices_to_write) - for i in np.where(ids2D == -1)[0]: - ids2D[i] = self.maxids - pset.collection.setvardata('fileid', indices_to_write[i], self.maxids) - self.maxids += 1 - - if self.has_write_once_variables(): - first_write = np.isin(indices_to_write, self.written_once, invert=True) - if np.any(first_write): - first_write = indices_to_write[first_write] - ids1D = pset.collection.getvardata('fileid', first_write) - self.written_once.extend(first_write) - - if len(indices_to_write) > 0: + indices_to_write = deleted_only + elif type(deleted_only) == list: + indices_to_write = np.array(deleted_only) + else: + indices_to_write = pset.collection._to_write_particles(pset.collection._data, time) + self.lasttime_written = time + + if len(indices_to_write) > 0: + ids2D = pset.collection.getvardata('fileid', indices_to_write) + for i in np.where(ids2D == -1)[0]: + ids2D[i] = self.maxids + pset.collection.setvardata('fileid', indices_to_write[i], self.maxids) + self.maxids += 1 + + first_write = np.isin(indices_to_write, self.written_once, invert=True) + if np.any(first_write): + first_write = indices_to_write[first_write] + ids1D = pset.collection.getvardata('fileid', first_write) + self.written_once.extend(first_write) + else: + ids1D = [] + + if self.maxids > len(self.obs_written): self.obs_written = np.append(self.obs_written, np.zeros((self.maxids-len(self.obs_written)), dtype=int)) - if not self.written_first: - if self.chunks is None: - self.chunks = (self.maxids, 10) - if self.chunks[0] < self.maxids: - raise RuntimeError(f"chunks[0] is smaller than the size of the initial particleset ({self.chunks[0]} < {self.maxids}). 
" - "Please increase 'chunks' in your ParticleFile.") - ds = xr.Dataset(attrs=self.metadata) - attrs = self._create_variables_attribute_dict() - for var in self.vars_to_write: - varout = self._convert_varout_name(var) - if self.write_once(var): - data = np.full((self.chunks[0],), np.nan, dtype=self.vars_to_write[var]) - data[ids1D] = pset.collection.getvardata(var, first_write) - dims = ["traj"] - else: - data = np.full(self.chunks, np.nan, dtype=self.vars_to_write[var]) - data[ids2D, 0] = pset.collection.getvardata(var, indices_to_write) - dims = ["traj", "obs"] - ds[varout] = xr.DataArray(data=data, dims=dims, attrs=attrs[varout]) - ds[varout].encoding['chunks'] = self.chunks - ds.to_zarr(self.fname, mode='w') - self.written_first = True - else: - store = zarr.DirectoryStore(self.fname) - Z = zarr.group(store=store, overwrite=False) - for var in self.vars_to_write: - varout = self._convert_varout_name(var) - if self.maxids > Z[varout].shape[0]: - self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=0) - if self.write_once(var): - if len(ids1D) > 0: - Z[varout].vindex[ids1D] = pset.collection.getvardata(var, first_write) - else: - obs = self.obs_written[np.array(ids2D)] - if max(obs) >= Z[varout].shape[1]: - self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=1) - Z[varout].vindex[ids2D, obs] = pset.collection.getvardata(var, indices_to_write) - self.obs_written[np.array(ids2D)] += 1 + if self.create_new_zarrfile: + if self.chunks is None: + self.chunks = (self.maxids, 10) + if self.chunks[0] < self.maxids: + raise RuntimeError(f"chunks[0] is smaller than the size of the initial particleset ({self.chunks[0]} < {self.maxids}). " + "Please increase 'chunks' in your ParticleFile.") + ds = xr.Dataset(attrs=self.metadata) + attrs = self._create_variables_attribute_dict() + for var in self.vars_to_write: + varout = self._convert_varout_name(var) + if self.write_once(var): + data = np.full((self.chunks[0],), np.nan, dtype=self.vars_to_write[var]) + data[ids1D] = pset.collection.getvardata(var, first_write) + dims = ["traj"] + else: + data = np.full(self.chunks, np.nan, dtype=self.vars_to_write[var]) + data[ids2D, 0] = pset.collection.getvardata(var, indices_to_write) + dims = ["traj", "obs"] + ds[varout] = xr.DataArray(data=data, dims=dims, attrs=attrs[varout]) + ds[varout].encoding['chunks'] = self.chunks + ds.to_zarr(self.fname, mode='w') + self.create_new_zarrfile = False + else: + store = zarr.DirectoryStore(self.fname) + Z = zarr.group(store=store, overwrite=False) + for var in self.vars_to_write: + varout = self._convert_varout_name(var) + if self.maxids > Z[varout].shape[0]: + self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=0) + if self.write_once(var): + if len(ids1D) > 0: + Z[varout].vindex[ids1D] = pset.collection.getvardata(var, first_write) + else: + obs = self.obs_written[np.array(ids2D)] + if max(obs) >= Z[varout].shape[1]: + self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=1) + Z[varout].vindex[ids2D, obs] = pset.collection.getvardata(var, indices_to_write) + self.obs_written[np.array(ids2D)] += 1 From b2caf8584382d0ac5ac3d843fa65c1e3f87377f8 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 28 Jul 2022 15:13:21 +0200 Subject: [PATCH 50/79] Simplifying computation of to_write='once' variable indices --- parcels/particlefile/baseparticlefile.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git 
a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 7e97a0d6c..f9d898478 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -64,13 +64,10 @@ def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_onde self.parcels_mesh = self.particleset.fieldset.gridset.grids[0].mesh self.time_origin = self.particleset.time_origin self.lonlatdepth_dtype = self.particleset.collection.lonlatdepth_dtype - self.written_once = [] - self.ids_written = {} - self.obs_written = {} self.maxids = 0 + self.obs_written = np.empty((0,), dtype=int) self.create_new_zarrfile = create_new_zarrfile self.vars_to_write = {} - self.obs_written = np.empty((0,), dtype=int) for var in self.particleset.collection.ptype.variables: if var.to_write: self.vars_to_write[var.name] = var.dtype @@ -217,19 +214,16 @@ def write(self, pset, time, deleted_only=False): if len(indices_to_write) > 0: ids2D = pset.collection.getvardata('fileid', indices_to_write) - for i in np.where(ids2D == -1)[0]: - ids2D[i] = self.maxids - pset.collection.setvardata('fileid', indices_to_write[i], self.maxids) + new_ids = np.where(ids2D == -1)[0] + ids1D = np.empty((len(new_ids),), dtype=int) + first_write = np.empty((len(new_ids),), dtype=int) + for i, id in enumerate(new_ids): + ids2D[id] = self.maxids + pset.collection.setvardata('fileid', indices_to_write[id], self.maxids) + ids1D[i] = self.maxids + first_write[i] = indices_to_write[id] self.maxids += 1 - first_write = np.isin(indices_to_write, self.written_once, invert=True) - if np.any(first_write): - first_write = indices_to_write[first_write] - ids1D = pset.collection.getvardata('fileid', first_write) - self.written_once.extend(first_write) - else: - ids1D = [] - if self.maxids > len(self.obs_written): self.obs_written = np.append(self.obs_written, np.zeros((self.maxids-len(self.obs_written)), dtype=int)) From 9e5381bce2c7d94035be5af1893d6af2d8582c3e Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 28 Jul 2022 18:49:06 +0200 Subject: [PATCH 51/79] Updating baseparticlefile to support mpi --- parcels/particlefile/baseparticlefile.py | 76 ++++++++++++++++++------ 1 file changed, 58 insertions(+), 18 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index f9d898478..9a15a51f3 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -71,6 +71,7 @@ def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_onde for var in self.particleset.collection.ptype.variables: if var.to_write: self.vars_to_write[var.name] = var.dtype + self.mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 # Reset fileid of each particle, in case new ParticleFile created for a ParticleSet particleset.collection.setallvardata('fileid', -1) @@ -224,38 +225,77 @@ def write(self, pset, time, deleted_only=False): first_write[i] = indices_to_write[id] self.maxids += 1 + if MPI: + maxids = MPI.COMM_WORLD.gather(max(ids2D)+1, root=0) + ids2Dlens = MPI.COMM_WORLD.gather(len(ids2D), root=0) + + if self.mpi_rank == 0: + maxids = max(maxids) + ids2Dlens = min(ids2Dlens) + minchunks = int(MPI.COMM_WORLD.bcast(ids2Dlens, root=0)) + self.maxids = int(MPI.COMM_WORLD.bcast(maxids, root=0)) + else: + minchunks = len(ids2D) + self.maxids = max(ids2D)+1 + if self.maxids > len(self.obs_written): self.obs_written = np.append(self.obs_written, np.zeros((self.maxids-len(self.obs_written)), dtype=int)) if 
self.create_new_zarrfile: if self.chunks is None: - self.chunks = (self.maxids, 10) - if self.chunks[0] < self.maxids: + self.chunks = (minchunks, 10) + if self.chunks[0] < minchunks: raise RuntimeError(f"chunks[0] is smaller than the size of the initial particleset ({self.chunks[0]} < {self.maxids}). " "Please increase 'chunks' in your ParticleFile.") - ds = xr.Dataset(attrs=self.metadata) - attrs = self._create_variables_attribute_dict() - for var in self.vars_to_write: - varout = self._convert_varout_name(var) - if self.write_once(var): - data = np.full((self.chunks[0],), np.nan, dtype=self.vars_to_write[var]) - data[ids1D] = pset.collection.getvardata(var, first_write) - dims = ["traj"] - else: - data = np.full(self.chunks, np.nan, dtype=self.vars_to_write[var]) - data[ids2D, 0] = pset.collection.getvardata(var, indices_to_write) - dims = ["traj", "obs"] - ds[varout] = xr.DataArray(data=data, dims=dims, attrs=attrs[varout]) - ds[varout].encoding['chunks'] = self.chunks - ds.to_zarr(self.fname, mode='w') + if self.mpi_rank == 0: + ds = xr.Dataset(attrs=self.metadata) + attrs = self._create_variables_attribute_dict() + for var in self.vars_to_write: + varout = self._convert_varout_name(var) + if self.write_once(var): + data = np.full((self.chunks[0],), np.nan, dtype=self.vars_to_write[var]) + data[ids1D] = pset.collection.getvardata(var, first_write) + dims = ["traj"] + else: + data = np.full(self.chunks, np.nan, dtype=self.vars_to_write[var]) + data[ids2D, 0] = pset.collection.getvardata(var, indices_to_write) + dims = ["traj", "obs"] + ds[varout] = xr.DataArray(data=data, dims=dims, attrs=attrs[varout]) + ds[varout].encoding['chunks'] = self.chunks + ds.to_zarr(self.fname, mode='w') self.create_new_zarrfile = False + if MPI: + MPI.COMM_WORLD.barrier() + if self.mpi_rank > 0: + store = zarr.DirectoryStore(self.fname) + Z = zarr.group(store=store, overwrite=False) + for var in self.vars_to_write: + varout = self._convert_varout_name(var) + if self.maxids > Z[varout].shape[0]: + self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=0) + + if self.write_once(var): + if len(ids1D) > 0: + Z[varout].vindex[ids1D] = pset.collection.getvardata(var, first_write) + else: + obs = self.obs_written[np.array(ids2D)] + if max(obs) >= Z[varout].shape[1]: + self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=1) + Z[varout].vindex[ids2D, obs] = pset.collection.getvardata(var, indices_to_write) + else: store = zarr.DirectoryStore(self.fname) Z = zarr.group(store=store, overwrite=False) for var in self.vars_to_write: varout = self._convert_varout_name(var) if self.maxids > Z[varout].shape[0]: - self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=0) + if self.mpi_rank == 0: + self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=0) + if MPI: + MPI.COMM_WORLD.barrier() + + for var in self.vars_to_write: + varout = self._convert_varout_name(var) if self.write_once(var): if len(ids1D) > 0: Z[varout].vindex[ids1D] = pset.collection.getvardata(var, first_write) From 5bf4955c42cbcd97c222e364fdd7231c5623e80b Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 28 Jul 2022 19:39:50 +0200 Subject: [PATCH 52/79] Further fix to mpi and zarr --- parcels/particlefile/baseparticlefile.py | 22 ++++++++++++++-------- tests/test_particle_file.py | 2 +- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 9a15a51f3..6c202a71c 
100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -72,6 +72,9 @@ def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_onde if var.to_write: self.vars_to_write[var.name] = var.dtype self.mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 + self.fileidoffset = particleset.collection.getvardata('id', [0]) + if MPI: + self.fileidoffset = MPI.COMM_WORLD.bcast(self.fileidoffset, root=0)[0] # Reset fileid of each particle, in case new ParticleFile created for a ParticleSet particleset.collection.setallvardata('fileid', -1) @@ -214,16 +217,15 @@ def write(self, pset, time, deleted_only=False): self.lasttime_written = time if len(indices_to_write) > 0: - ids2D = pset.collection.getvardata('fileid', indices_to_write) - new_ids = np.where(ids2D == -1)[0] + ids2D = pset.collection.getvardata('id', indices_to_write) - self.fileidoffset + once_id = pset.collection.getvardata('fileid', indices_to_write) + new_ids = np.where(once_id == -1)[0] ids1D = np.empty((len(new_ids),), dtype=int) first_write = np.empty((len(new_ids),), dtype=int) for i, id in enumerate(new_ids): - ids2D[id] = self.maxids - pset.collection.setvardata('fileid', indices_to_write[id], self.maxids) - ids1D[i] = self.maxids + pset.collection.setvardata('fileid', indices_to_write[id], 1) + ids1D[i] = ids2D[id] first_write[i] = indices_to_write[id] - self.maxids += 1 if MPI: maxids = MPI.COMM_WORLD.gather(max(ids2D)+1, root=0) @@ -250,14 +252,18 @@ def write(self, pset, time, deleted_only=False): if self.mpi_rank == 0: ds = xr.Dataset(attrs=self.metadata) attrs = self._create_variables_attribute_dict() + if max(ids2D) > self.chunks[0]: + arrsize = (self.maxids, self.chunks[1]) + else: + arrsize = self.chunks for var in self.vars_to_write: varout = self._convert_varout_name(var) if self.write_once(var): - data = np.full((self.chunks[0],), np.nan, dtype=self.vars_to_write[var]) + data = np.full((arrsize[0],), np.nan, dtype=self.vars_to_write[var]) data[ids1D] = pset.collection.getvardata(var, first_write) dims = ["traj"] else: - data = np.full(self.chunks, np.nan, dtype=self.vars_to_write[var]) + data = np.full(arrsize, np.nan, dtype=self.vars_to_write[var]) data[ids2D, 0] = pset.collection.getvardata(var, indices_to_write) dims = ["traj", "obs"] ds[varout] = xr.DataArray(data=data, dims=dims, attrs=attrs[varout]) diff --git a/tests/test_particle_file.py b/tests/test_particle_file.py index d98496af4..20753022f 100644 --- a/tests/test_particle_file.py +++ b/tests/test_particle_file.py @@ -265,7 +265,7 @@ def Update_lon(particle, fieldset, time): output_file=pfile) ds = xr.open_zarr(outfilepath) trajs = ds['trajectory'][:] - assert np.all(np.diff(trajs.values) < 0) # all particles written in order of start time + assert np.all(np.diff(trajs.values) > 0) # all particles written in order of traj ID ds.close() From cca1a32737c5128f82220d93107bf78ceb4beb6e Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Fri, 29 Jul 2022 15:11:43 +0200 Subject: [PATCH 53/79] Fixing bug in setting fileoffsets when particleset is empty --- parcels/collection/collectionaos.py | 5 ++++- parcels/collection/collectionsoa.py | 5 ++++- parcels/particlefile/baseparticlefile.py | 3 ++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/parcels/collection/collectionaos.py b/parcels/collection/collectionaos.py index 585d4e255..b561e9902 100644 --- a/parcels/collection/collectionaos.py +++ b/parcels/collection/collectionaos.py @@ -883,7 +883,10 @@ def getvardata(self, var, 
indices=None): if indices is None: return np.array([getattr(p, var) for p in self._data]) else: - return np.array([getattr(p, var) for p in self._data[indices]]) + try: + return np.array([getattr(p, var) for p in self._data[indices]]) + except: # Can occur for zero-length ParticleSets + return None def setvardata(self, var, index, val): setattr(self._data[index], var, val) diff --git a/parcels/collection/collectionsoa.py b/parcels/collection/collectionsoa.py index 14766ca79..ec8b2188b 100644 --- a/parcels/collection/collectionsoa.py +++ b/parcels/collection/collectionsoa.py @@ -808,7 +808,10 @@ def getvardata(self, var, indices=None): if indices is None: return self._data[var] else: - return self._data[var][indices] + try: + return self._data[var][indices] + except: # Can occur for zero-length ParticleSets + return None def setvardata(self, var, index, val): self._data[var][index] = val diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 6c202a71c..111fea943 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -72,7 +72,8 @@ def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_onde if var.to_write: self.vars_to_write[var.name] = var.dtype self.mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 - self.fileidoffset = particleset.collection.getvardata('id', [0]) + id0 = particleset.collection.getvardata('id', [0]) + self.fileidoffset = [0] if id0 is None else id0 if MPI: self.fileidoffset = MPI.COMM_WORLD.bcast(self.fileidoffset, root=0)[0] From 381c3751ebc92f7597aef4fc7d4ed4177a1815c2 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Fri, 29 Jul 2022 15:35:56 +0200 Subject: [PATCH 54/79] Fixing bug in MPI-zarr writing where extending ons-dimension could be handled by multiple threads --- parcels/particlefile/baseparticlefile.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 111fea943..9006dabc7 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -293,11 +293,16 @@ def write(self, pset, time, deleted_only=False): else: store = zarr.DirectoryStore(self.fname) Z = zarr.group(store=store, overwrite=False) - for var in self.vars_to_write: - varout = self._convert_varout_name(var) - if self.maxids > Z[varout].shape[0]: - if self.mpi_rank == 0: + obs = self.obs_written[np.array(ids2D)] + if self.mpi_rank == 0: + for var in self.vars_to_write: + varout = self._convert_varout_name(var) + if self.maxids > Z[varout].shape[0]: self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=0) + if not self.write_once(var): + if max(obs) >= Z[varout].shape[1]: + self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=1) + if MPI: MPI.COMM_WORLD.barrier() @@ -307,8 +312,5 @@ def write(self, pset, time, deleted_only=False): if len(ids1D) > 0: Z[varout].vindex[ids1D] = pset.collection.getvardata(var, first_write) else: - obs = self.obs_written[np.array(ids2D)] - if max(obs) >= Z[varout].shape[1]: - self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=1) Z[varout].vindex[ids2D, obs] = pset.collection.getvardata(var, indices_to_write) self.obs_written[np.array(ids2D)] += 1 From ffd3b42aa9671cad1ce33cfe769d73c0a769e618 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Mon, 1 Aug 2022 09:58:04 +0200 Subject: [PATCH 55/79] Cleaning up 
baseparticlefile --- parcels/particlefile/baseparticlefile.py | 72 ++++++++++-------------- 1 file changed, 29 insertions(+), 43 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 9006dabc7..29cd51c82 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -196,6 +196,31 @@ def write(self, pset, time, deleted_only=False): :param deleted_only: Flag to write only the deleted Particles """ + def add_data_to_zarr(firstcall=False): + # Helper function to write to a zarr file + store = zarr.DirectoryStore(self.fname) + Z = zarr.group(store=store, overwrite=False) + obs = self.obs_written[np.array(ids2D)] + if self.mpi_rank == 0: + for var in self.vars_to_write: + varout = self._convert_varout_name(var) + if self.maxids > Z[varout].shape[0]: + self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=0) + if not self.write_once(var): + if max(obs) >= Z[varout].shape[1]: + self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=1) + + if MPI and (not firstcall): + MPI.COMM_WORLD.barrier() + + for var in self.vars_to_write: + varout = self._convert_varout_name(var) + if self.write_once(var): + if len(ids1D) > 0: + Z[varout].vindex[ids1D] = pset.collection.getvardata(var, first_write) + else: + Z[varout].vindex[ids2D, obs] = pset.collection.getvardata(var, indices_to_write) + time = time.total_seconds() if isinstance(time, delta) else time if self.lasttime_written != time and (self.write_ondelete is False or deleted_only is not False): @@ -247,13 +272,10 @@ def write(self, pset, time, deleted_only=False): if self.create_new_zarrfile: if self.chunks is None: self.chunks = (minchunks, 10) - if self.chunks[0] < minchunks: - raise RuntimeError(f"chunks[0] is smaller than the size of the initial particleset ({self.chunks[0]} < {self.maxids}). 
" - "Please increase 'chunks' in your ParticleFile.") if self.mpi_rank == 0: ds = xr.Dataset(attrs=self.metadata) attrs = self._create_variables_attribute_dict() - if max(ids2D) > self.chunks[0]: + if self.maxids > minchunks: arrsize = (self.maxids, self.chunks[1]) else: arrsize = self.chunks @@ -268,49 +290,13 @@ def write(self, pset, time, deleted_only=False): data[ids2D, 0] = pset.collection.getvardata(var, indices_to_write) dims = ["traj", "obs"] ds[varout] = xr.DataArray(data=data, dims=dims, attrs=attrs[varout]) - ds[varout].encoding['chunks'] = self.chunks + ds[varout].encoding['chunks'] = self.chunks[0] if self.write_once(var) else self.chunks ds.to_zarr(self.fname, mode='w') self.create_new_zarrfile = False if MPI: MPI.COMM_WORLD.barrier() if self.mpi_rank > 0: - store = zarr.DirectoryStore(self.fname) - Z = zarr.group(store=store, overwrite=False) - for var in self.vars_to_write: - varout = self._convert_varout_name(var) - if self.maxids > Z[varout].shape[0]: - self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=0) - - if self.write_once(var): - if len(ids1D) > 0: - Z[varout].vindex[ids1D] = pset.collection.getvardata(var, first_write) - else: - obs = self.obs_written[np.array(ids2D)] - if max(obs) >= Z[varout].shape[1]: - self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=1) - Z[varout].vindex[ids2D, obs] = pset.collection.getvardata(var, indices_to_write) - + add_data_to_zarr(firstcall=True) else: - store = zarr.DirectoryStore(self.fname) - Z = zarr.group(store=store, overwrite=False) - obs = self.obs_written[np.array(ids2D)] - if self.mpi_rank == 0: - for var in self.vars_to_write: - varout = self._convert_varout_name(var) - if self.maxids > Z[varout].shape[0]: - self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=0) - if not self.write_once(var): - if max(obs) >= Z[varout].shape[1]: - self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=1) - - if MPI: - MPI.COMM_WORLD.barrier() - - for var in self.vars_to_write: - varout = self._convert_varout_name(var) - if self.write_once(var): - if len(ids1D) > 0: - Z[varout].vindex[ids1D] = pset.collection.getvardata(var, first_write) - else: - Z[varout].vindex[ids2D, obs] = pset.collection.getvardata(var, indices_to_write) + add_data_to_zarr() self.obs_written[np.array(ids2D)] += 1 From adfd755e770c1dc576d2ad260b208bc8df96e6b5 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Mon, 1 Aug 2022 12:11:37 +0200 Subject: [PATCH 56/79] Renaming particle.fileid to particle.once_written --- parcels/collection/collectionsoa.py | 2 +- parcels/particle.py | 4 ++-- parcels/particlefile/baseparticlefile.py | 10 +++++----- parcels/particleset/particlesetaos.py | 2 +- parcels/particleset/particlesetsoa.py | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/parcels/collection/collectionsoa.py b/parcels/collection/collectionsoa.py index ec8b2188b..16f3027a1 100644 --- a/parcels/collection/collectionsoa.py +++ b/parcels/collection/collectionsoa.py @@ -132,7 +132,7 @@ def __init__(self, pclass, lon, lat, depth, time, lonlatdepth_dtype, pid_orig, p self._data['depth'][:] = depth self._data['time'][:] = time self._data['id'][:] = pid - self._data['fileid'][:] = -1 + self._data['once_written'][:] = 0 # special case for exceptions which can only be handled from scipy self._data['exception'] = np.empty(self.ncount, dtype=object) diff --git a/parcels/particle.py b/parcels/particle.py index 605b891e9..fdc1c6251 100644 --- a/parcels/particle.py 
+++ b/parcels/particle.py @@ -186,7 +186,7 @@ class ScipyParticle(_Particle): depth = Variable('depth', dtype=np.float32) time = Variable('time', dtype=np.float64) id = Variable('id', dtype=np.int64, to_write='once') - fileid = Variable('fileid', dtype=np.int32, initial=-1, to_write=False) + once_written = Variable('once_written', dtype=np.int32, initial=0, to_write=False) # np.bool not implemented in JIT dt = Variable('dt', dtype=np.float64, to_write=False) state = Variable('state', dtype=np.int32, initial=StateCode.Evaluate, to_write=False) next_dt = Variable('_next_dt', dtype=np.float64, initial=np.nan, to_write=False) @@ -200,7 +200,7 @@ def __init__(self, lon, lat, pid, fieldset=None, ngrids=None, depth=0., time=0., type(self).time.initial = time type(self).id.initial = pid _Particle.lastID = max(_Particle.lastID, pid) - type(self).fileid.initial = -1 + type(self).once_written.initial = 0 type(self).dt.initial = None type(self).next_dt.initial = np.nan diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 29cd51c82..d11c136c7 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -77,8 +77,8 @@ def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_onde if MPI: self.fileidoffset = MPI.COMM_WORLD.bcast(self.fileidoffset, root=0)[0] - # Reset fileid of each particle, in case new ParticleFile created for a ParticleSet - particleset.collection.setallvardata('fileid', -1) + # Reset once-written flag of each particle, in case new ParticleFile created for a ParticleSet + particleset.collection.setallvardata('once_written', 0) self.metadata = {"feature_type": "trajectory", "Conventions": "CF-1.6/CF-1.7", "ncei_template_version": "NCEI_NetCDF_Trajectory_Template_v2.0", @@ -244,12 +244,12 @@ def add_data_to_zarr(firstcall=False): if len(indices_to_write) > 0: ids2D = pset.collection.getvardata('id', indices_to_write) - self.fileidoffset - once_id = pset.collection.getvardata('fileid', indices_to_write) - new_ids = np.where(once_id == -1)[0] + once_written = pset.collection.getvardata('once_written', indices_to_write) + new_ids = np.where(once_written == 0)[0] ids1D = np.empty((len(new_ids),), dtype=int) first_write = np.empty((len(new_ids),), dtype=int) for i, id in enumerate(new_ids): - pset.collection.setvardata('fileid', indices_to_write[id], 1) + pset.collection.setvardata('once_written', indices_to_write[id], 1) ids1D[i] = ids2D[id] first_write[i] = indices_to_write[id] diff --git a/parcels/particleset/particlesetaos.py b/parcels/particleset/particlesetaos.py index 663c2e6f7..8a29f3cef 100644 --- a/parcels/particleset/particlesetaos.py +++ b/parcels/particleset/particlesetaos.py @@ -538,7 +538,7 @@ def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime for v in pclass.getPType().variables: if v.name in pfile_vars: vars[v.name] = np.ma.filled(pfile.variables[v.name], np.nan) - elif v.name not in ['xi', 'yi', 'zi', 'ti', 'dt', '_next_dt', 'depth', 'id', 'fileid', 'state'] \ + elif v.name not in ['xi', 'yi', 'zi', 'ti', 'dt', '_next_dt', 'depth', 'id', 'once_written', 'state'] \ and v.to_write: raise RuntimeError('Variable %s is in pclass but not in the particlefile' % v.name) to_write[v.name] = v.to_write diff --git a/parcels/particleset/particlesetsoa.py b/parcels/particleset/particlesetsoa.py index 7cc439a6c..bce522d9c 100644 --- a/parcels/particleset/particlesetsoa.py +++ b/parcels/particleset/particlesetsoa.py @@ -446,7 +446,7 @@ def 
from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime for v in pclass.getPType().variables: if v.name in pfile_vars: vars[v.name] = np.ma.filled(pfile.variables[v.name], np.nan) - elif v.name not in ['xi', 'yi', 'zi', 'ti', 'dt', '_next_dt', 'depth', 'id', 'fileid', 'state'] \ + elif v.name not in ['xi', 'yi', 'zi', 'ti', 'dt', '_next_dt', 'depth', 'id', 'once_written', 'state'] \ and v.to_write: raise RuntimeError('Variable %s is in pclass but not in the particlefile' % v.name) to_write[v.name] = v.to_write From 0667291551566186aa06c1a2d52fc2ba0bcbea6e Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Fri, 5 Aug 2022 15:13:33 +0200 Subject: [PATCH 57/79] Removing for-loop from baseparticlefile Still needed in aos-mode; but not anymore in soa-mode. So moving for-loop to collectionaos Also renaming some variables to make their use clearer --- parcels/collection/collectionaos.py | 6 +++- parcels/particlefile/baseparticlefile.py | 36 +++++++++++------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/parcels/collection/collectionaos.py b/parcels/collection/collectionaos.py index b561e9902..30c961c63 100644 --- a/parcels/collection/collectionaos.py +++ b/parcels/collection/collectionaos.py @@ -889,7 +889,11 @@ def getvardata(self, var, indices=None): return None def setvardata(self, var, index, val): - setattr(self._data[index], var, val) + if isinstance(index, (np.int64, int, np.int32)): + setattr(self._data[index], var, val) + else: + for i, v in zip(index, val): + setattr(self._data[i], var, v) def setallvardata(self, var, val): for i in range(len(self._data)): diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index d11c136c7..43651bea9 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -200,7 +200,7 @@ def add_data_to_zarr(firstcall=False): # Helper function to write to a zarr file store = zarr.DirectoryStore(self.fname) Z = zarr.group(store=store, overwrite=False) - obs = self.obs_written[np.array(ids2D)] + obs = self.obs_written[np.array(ids)] if self.mpi_rank == 0: for var in self.vars_to_write: varout = self._convert_varout_name(var) @@ -216,10 +216,10 @@ def add_data_to_zarr(firstcall=False): for var in self.vars_to_write: varout = self._convert_varout_name(var) if self.write_once(var): - if len(ids1D) > 0: - Z[varout].vindex[ids1D] = pset.collection.getvardata(var, first_write) + if len(ids_once) > 0: + Z[varout].vindex[ids_once] = pset.collection.getvardata(var, indices_to_write_once) else: - Z[varout].vindex[ids2D, obs] = pset.collection.getvardata(var, indices_to_write) + Z[varout].vindex[ids, obs] = pset.collection.getvardata(var, indices_to_write) time = time.total_seconds() if isinstance(time, delta) else time @@ -243,19 +243,15 @@ def add_data_to_zarr(firstcall=False): self.lasttime_written = time if len(indices_to_write) > 0: - ids2D = pset.collection.getvardata('id', indices_to_write) - self.fileidoffset - once_written = pset.collection.getvardata('once_written', indices_to_write) - new_ids = np.where(once_written == 0)[0] - ids1D = np.empty((len(new_ids),), dtype=int) - first_write = np.empty((len(new_ids),), dtype=int) - for i, id in enumerate(new_ids): - pset.collection.setvardata('once_written', indices_to_write[id], 1) - ids1D[i] = ids2D[id] - first_write[i] = indices_to_write[id] + ids = pset.collection.getvardata('id', indices_to_write) - self.fileidoffset + once_ids = np.where(pset.collection.getvardata('once_written', 
indices_to_write) == 0)[0] + ids_once = ids[once_ids] + indices_to_write_once = indices_to_write[once_ids] + pset.collection.setvardata('once_written', indices_to_write_once, np.ones(len(ids_once))) if MPI: - maxids = MPI.COMM_WORLD.gather(max(ids2D)+1, root=0) - ids2Dlens = MPI.COMM_WORLD.gather(len(ids2D), root=0) + ids2Dlens = MPI.COMM_WORLD.gather(len(ids), root=0) + maxids = MPI.COMM_WORLD.gather(max(ids)+1, root=0) if self.mpi_rank == 0: maxids = max(maxids) @@ -263,8 +259,8 @@ def add_data_to_zarr(firstcall=False): minchunks = int(MPI.COMM_WORLD.bcast(ids2Dlens, root=0)) self.maxids = int(MPI.COMM_WORLD.bcast(maxids, root=0)) else: - minchunks = len(ids2D) - self.maxids = max(ids2D)+1 + minchunks = len(ids) + self.maxids = max(ids)+1 if self.maxids > len(self.obs_written): self.obs_written = np.append(self.obs_written, np.zeros((self.maxids-len(self.obs_written)), dtype=int)) @@ -283,11 +279,11 @@ def add_data_to_zarr(firstcall=False): varout = self._convert_varout_name(var) if self.write_once(var): data = np.full((arrsize[0],), np.nan, dtype=self.vars_to_write[var]) - data[ids1D] = pset.collection.getvardata(var, first_write) + data[ids_once] = pset.collection.getvardata(var, indices_to_write_once) dims = ["traj"] else: data = np.full(arrsize, np.nan, dtype=self.vars_to_write[var]) - data[ids2D, 0] = pset.collection.getvardata(var, indices_to_write) + data[ids, 0] = pset.collection.getvardata(var, indices_to_write) dims = ["traj", "obs"] ds[varout] = xr.DataArray(data=data, dims=dims, attrs=attrs[varout]) ds[varout].encoding['chunks'] = self.chunks[0] if self.write_once(var) else self.chunks @@ -299,4 +295,4 @@ def add_data_to_zarr(firstcall=False): add_data_to_zarr(firstcall=True) else: add_data_to_zarr() - self.obs_written[np.array(ids2D)] += 1 + self.obs_written[np.array(ids)] += 1 From 229488d50a1977daa43e3c0e980ef33b48898aac Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 24 Aug 2022 16:41:46 +0200 Subject: [PATCH 58/79] Updating test_mpi to also run with 8 particles Currently fails --- tests/test_mpirun.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_mpirun.py b/tests/test_mpirun.py index e1f8d00dc..7864e4f7c 100644 --- a/tests/test_mpirun.py +++ b/tests/test_mpirun.py @@ -11,17 +11,17 @@ @pytest.mark.skipif(sys.platform.startswith("darwin"), reason="skipping macOS test as problem with file in pytest") @pytest.mark.parametrize('pset_mode', ['soa', 'aos']) -@pytest.mark.parametrize('repeatdt', [200*86400, 10*86400]) -@pytest.mark.parametrize('maxage', [600*86400, 10*86400]) -def test_mpi_run(pset_mode, tmpdir, repeatdt, maxage): +@pytest.mark.parametrize('repeatdt, maxage', [(20*86400, 600*86400), (10*86400, 10*86400)]) +@pytest.mark.parametrize('nump', [4, 8]) +def test_mpi_run(pset_mode, tmpdir, repeatdt, maxage, nump): if MPI: stommel_file = path.join(path.dirname(__file__), '..', 'parcels', 'examples', 'example_stommel.py') outputMPI = tmpdir.join('StommelMPI.zarr') outputNoMPI = tmpdir.join('StommelNoMPI.zarr') - system('mpirun -np 2 python %s -p 4 -o %s -r %d -a %d -psm %s' % (stommel_file, outputMPI, repeatdt, maxage, pset_mode)) - system('python %s -p 4 -o %s -r %d -a %d -psm %s' % (stommel_file, outputNoMPI, repeatdt, maxage, pset_mode)) + system('mpirun -np 2 python %s -p %d -o %s -r %d -a %d -psm %s' % (stommel_file, nump, outputMPI, repeatdt, maxage, pset_mode)) + system('python %s -p %d -o %s -r %d -a %d -psm %s' % (stommel_file, nump, outputNoMPI, repeatdt, maxage, pset_mode)) ds1 = xr.open_zarr(outputMPI) ds2 
= xr.open_zarr(outputNoMPI) From b1c5d1e616ff4d4f357bf933e4358e4ebc1a919b Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 24 Aug 2022 17:21:25 +0200 Subject: [PATCH 59/79] Fixing small bug when chunks < maxids in baseparticlefile --- parcels/particlefile/baseparticlefile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 43651bea9..6e07985a7 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -271,7 +271,7 @@ def add_data_to_zarr(firstcall=False): if self.mpi_rank == 0: ds = xr.Dataset(attrs=self.metadata) attrs = self._create_variables_attribute_dict() - if self.maxids > minchunks: + if (self.maxids > minchunks) or (self.maxids > self.chunks[0]): arrsize = (self.maxids, self.chunks[1]) else: arrsize = self.chunks From 18c75c287fe36ece9110d16204628b6764a6c7bb Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 25 Aug 2022 08:07:56 +0200 Subject: [PATCH 60/79] Updating delaystart tutorial to use zarr output --- parcels/examples/tutorial_delaystart.ipynb | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/parcels/examples/tutorial_delaystart.ipynb b/parcels/examples/tutorial_delaystart.ipynb index 5ae07afd2..3559a1d9b 100644 --- a/parcels/examples/tutorial_delaystart.ipynb +++ b/parcels/examples/tutorial_delaystart.ipynb @@ -1896,7 +1896,7 @@ " particle.mass = particle.mass / 2.\n", "\n", "pset = ParticleSet(fieldset=fieldset, pclass=GrowingParticle, lon=0, lat=0)\n", - "outfile = ParticleFile('growingparticles.nc', pset, outputdt=1)\n", + "outfile = ParticleFile('growingparticles.zarr', pset, outputdt=1)\n", "\n", "for t in range(40):\n", " pset.execute(GrowParticles, runtime=1, dt=1, output_file=outfile)\n", @@ -1904,8 +1904,7 @@ " if p.splittime > 0:\n", " pset.add(ParticleSet(fieldset=fieldset, pclass=GrowingParticle, lon=0, lat=0, \n", " time=p.splittime, mass=p.splitmass))\n", - " p.splittime = -1 # reset splittime\n", - "outfile.close()" + " p.splittime = -1 # reset splittime" ] }, { @@ -1936,7 +1935,7 @@ } ], "source": [ - "ds = xr.open_dataset('growingparticles.nc')\n", + "ds = xr.open_zarr('growingparticles.zarr')\n", "plt.plot(ds.time.values[:].astype('timedelta64[s]').T, ds.mass.T)\n", "plt.grid()\n", "plt.xlabel('Time')\n", From f789075419bf48deb6c691250eb7bcc1bdb8c683 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 25 Aug 2022 14:36:52 +0200 Subject: [PATCH 61/79] Updating MPI version of zarr writing Simplifying (and speeidng up) the writing to zarr in MPI mode, by letting each processor write to its own file and then combine with `xr.merge()` at the end --- parcels/particlefile/baseparticlefile.py | 112 +++++++++-------------- tests/test_mpirun.py | 9 +- tests/test_particle_file.py | 2 +- 3 files changed, 53 insertions(+), 70 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 6e07985a7..0a7915e65 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -66,16 +66,13 @@ def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_onde self.lonlatdepth_dtype = self.particleset.collection.lonlatdepth_dtype self.maxids = 0 self.obs_written = np.empty((0,), dtype=int) + self.pids_written = {} self.create_new_zarrfile = create_new_zarrfile self.vars_to_write = {} for var in self.particleset.collection.ptype.variables: if var.to_write: 
self.vars_to_write[var.name] = var.dtype self.mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 - id0 = particleset.collection.getvardata('id', [0]) - self.fileidoffset = [0] if id0 is None else id0 - if MPI: - self.fileidoffset = MPI.COMM_WORLD.bcast(self.fileidoffset, root=0)[0] # Reset once-written flag of each particle, in case new ParticleFile created for a ParticleSet particleset.collection.setallvardata('once_written', 0) @@ -101,6 +98,8 @@ def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_onde if extension in ['.nc', '.nc4']: raise RuntimeError('Output in NetCDF is not supported anymore. Use .zarr extension for ParticleFile name.') self.fname = name if extension in ['.zarr'] else "%s.zarr" % name + if MPI.COMM_WORLD.Get_size() > 1: + self.fname = os.path.join(self.fname, f"proc{self.mpi_rank}") # TODO check if we can also do this with zarr-groups @abstractmethod def _reserved_var_names(self): @@ -196,31 +195,6 @@ def write(self, pset, time, deleted_only=False): :param deleted_only: Flag to write only the deleted Particles """ - def add_data_to_zarr(firstcall=False): - # Helper function to write to a zarr file - store = zarr.DirectoryStore(self.fname) - Z = zarr.group(store=store, overwrite=False) - obs = self.obs_written[np.array(ids)] - if self.mpi_rank == 0: - for var in self.vars_to_write: - varout = self._convert_varout_name(var) - if self.maxids > Z[varout].shape[0]: - self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=0) - if not self.write_once(var): - if max(obs) >= Z[varout].shape[1]: - self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=1) - - if MPI and (not firstcall): - MPI.COMM_WORLD.barrier() - - for var in self.vars_to_write: - varout = self._convert_varout_name(var) - if self.write_once(var): - if len(ids_once) > 0: - Z[varout].vindex[ids_once] = pset.collection.getvardata(var, indices_to_write_once) - else: - Z[varout].vindex[ids, obs] = pset.collection.getvardata(var, indices_to_write) - time = time.total_seconds() if isinstance(time, delta) else time if self.lasttime_written != time and (self.write_ondelete is False or deleted_only is not False): @@ -243,56 +217,58 @@ def add_data_to_zarr(firstcall=False): self.lasttime_written = time if len(indices_to_write) > 0: - ids = pset.collection.getvardata('id', indices_to_write) - self.fileidoffset + ids = np.zeros(len(indices_to_write), dtype=int) + for i, pid in enumerate(pset.collection.getvardata('id', indices_to_write)): # TODO check if we can avoid for-loop here + if pid not in self.pids_written: + self.pids_written[pid] = self.maxids + self.maxids += 1 + ids[i] = self.pids_written[pid] + once_ids = np.where(pset.collection.getvardata('once_written', indices_to_write) == 0)[0] ids_once = ids[once_ids] indices_to_write_once = indices_to_write[once_ids] pset.collection.setvardata('once_written', indices_to_write_once, np.ones(len(ids_once))) - if MPI: - ids2Dlens = MPI.COMM_WORLD.gather(len(ids), root=0) - maxids = MPI.COMM_WORLD.gather(max(ids)+1, root=0) - - if self.mpi_rank == 0: - maxids = max(maxids) - ids2Dlens = min(ids2Dlens) - minchunks = int(MPI.COMM_WORLD.bcast(ids2Dlens, root=0)) - self.maxids = int(MPI.COMM_WORLD.bcast(maxids, root=0)) - else: - minchunks = len(ids) - self.maxids = max(ids)+1 - if self.maxids > len(self.obs_written): self.obs_written = np.append(self.obs_written, np.zeros((self.maxids-len(self.obs_written)), dtype=int)) if self.create_new_zarrfile: if self.chunks is None: - self.chunks = (minchunks, 10) - if 
self.mpi_rank == 0: - ds = xr.Dataset(attrs=self.metadata) - attrs = self._create_variables_attribute_dict() - if (self.maxids > minchunks) or (self.maxids > self.chunks[0]): - arrsize = (self.maxids, self.chunks[1]) + self.chunks = (len(ids), 10) + ds = xr.Dataset(attrs=self.metadata) + attrs = self._create_variables_attribute_dict() + if (self.maxids > len(ids)) or (self.maxids > self.chunks[0]): + arrsize = (self.maxids, self.chunks[1]) + else: + arrsize = self.chunks + for var in self.vars_to_write: + varout = self._convert_varout_name(var) + if self.write_once(var): + data = np.full((arrsize[0],), np.nan, dtype=self.vars_to_write[var]) + data[ids_once] = pset.collection.getvardata(var, indices_to_write_once) + dims = ["traj"] else: - arrsize = self.chunks - for var in self.vars_to_write: - varout = self._convert_varout_name(var) - if self.write_once(var): - data = np.full((arrsize[0],), np.nan, dtype=self.vars_to_write[var]) - data[ids_once] = pset.collection.getvardata(var, indices_to_write_once) - dims = ["traj"] - else: - data = np.full(arrsize, np.nan, dtype=self.vars_to_write[var]) - data[ids, 0] = pset.collection.getvardata(var, indices_to_write) - dims = ["traj", "obs"] - ds[varout] = xr.DataArray(data=data, dims=dims, attrs=attrs[varout]) - ds[varout].encoding['chunks'] = self.chunks[0] if self.write_once(var) else self.chunks - ds.to_zarr(self.fname, mode='w') + data = np.full(arrsize, np.nan, dtype=self.vars_to_write[var]) + data[ids, 0] = pset.collection.getvardata(var, indices_to_write) + dims = ["traj", "obs"] + ds[varout] = xr.DataArray(data=data, dims=dims, attrs=attrs[varout]) + ds[varout].encoding['chunks'] = self.chunks[0] if self.write_once(var) else self.chunks + ds.to_zarr(self.fname, mode='w') # , group=self.mpi_rank) #TODO try to get to work with groups self.create_new_zarrfile = False - if MPI: - MPI.COMM_WORLD.barrier() - if self.mpi_rank > 0: - add_data_to_zarr(firstcall=True) else: - add_data_to_zarr() + store = zarr.DirectoryStore(self.fname) + Z = zarr.group(store=store, overwrite=False) # .create_group(self.mpi_rank) #TODO try to get to work with groups + obs = self.obs_written[np.array(ids)] + for var in self.vars_to_write: + varout = self._convert_varout_name(var) + if self.maxids > Z[varout].shape[0]: + self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=0) + if self.write_once(var): + if len(ids_once) > 0: + Z[varout].vindex[ids_once] = pset.collection.getvardata(var, indices_to_write_once) + else: + if max(obs) >= Z[varout].shape[1]: + self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=1) + Z[varout].vindex[ids, obs] = pset.collection.getvardata(var, indices_to_write) + self.obs_written[np.array(ids)] += 1 diff --git a/tests/test_mpirun.py b/tests/test_mpirun.py index 7864e4f7c..3b1709823 100644 --- a/tests/test_mpirun.py +++ b/tests/test_mpirun.py @@ -1,4 +1,5 @@ from os import path, system +from glob import glob import numpy as np import pytest import sys @@ -23,7 +24,13 @@ def test_mpi_run(pset_mode, tmpdir, repeatdt, maxage, nump): system('mpirun -np 2 python %s -p %d -o %s -r %d -a %d -psm %s' % (stommel_file, nump, outputMPI, repeatdt, maxage, pset_mode)) system('python %s -p %d -o %s -r %d -a %d -psm %s' % (stommel_file, nump, outputNoMPI, repeatdt, maxage, pset_mode)) - ds1 = xr.open_zarr(outputMPI) + files = glob(path.join(outputMPI, "proc*")) + ds11 = xr.open_zarr(files[0]) + ds12 = xr.open_zarr(files[1]) + ds11 = ds11.assign_coords({'traj': ('traj', ds11.trajectory.values)}) + ds12 = 
ds12.assign_coords({'traj': ('traj', ds12.trajectory.values)}) + ds1 = xr.merge([ds11, ds12], compat='no_conflicts') + ds2 = xr.open_zarr(outputNoMPI) for v in ds2.variables.keys(): diff --git a/tests/test_particle_file.py b/tests/test_particle_file.py index 20753022f..ce102d01e 100644 --- a/tests/test_particle_file.py +++ b/tests/test_particle_file.py @@ -265,7 +265,7 @@ def Update_lon(particle, fieldset, time): output_file=pfile) ds = xr.open_zarr(outfilepath) trajs = ds['trajectory'][:] - assert np.all(np.diff(trajs.values) > 0) # all particles written in order of traj ID + assert np.all(np.diff(trajs.values) < 0) # all particles written in order of release ds.close() From 6e2c05e1bfd1c131528d6209a65f4bee7a8ed87d Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Thu, 25 Aug 2022 15:14:38 +0200 Subject: [PATCH 62/79] Fixing bug when MPI not installed --- parcels/particlefile/baseparticlefile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 0a7915e65..8a93c5d1f 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -98,7 +98,7 @@ def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_onde if extension in ['.nc', '.nc4']: raise RuntimeError('Output in NetCDF is not supported anymore. Use .zarr extension for ParticleFile name.') self.fname = name if extension in ['.zarr'] else "%s.zarr" % name - if MPI.COMM_WORLD.Get_size() > 1: + if MPI and MPI.COMM_WORLD.Get_size() > 1: self.fname = os.path.join(self.fname, f"proc{self.mpi_rank}") # TODO check if we can also do this with zarr-groups @abstractmethod From dd61b21f094302d87cdfa4721d648ada9ee17e19 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Fri, 26 Aug 2022 09:45:51 +0200 Subject: [PATCH 63/79] Making trajectory a coordinate variable in zarr For easier merging of multiple zarr outputs in MPI mode --- parcels/particlefile/baseparticlefile.py | 15 ++++++++++----- tests/test_mpirun.py | 2 -- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 8a93c5d1f..f9c0227f8 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -218,7 +218,8 @@ def write(self, pset, time, deleted_only=False): if len(indices_to_write) > 0: ids = np.zeros(len(indices_to_write), dtype=int) - for i, pid in enumerate(pset.collection.getvardata('id', indices_to_write)): # TODO check if we can avoid for-loop here + pids = pset.collection.getvardata('id', indices_to_write) + for i, pid in enumerate(pids): # TODO check if we can avoid for-loop here if pid not in self.pids_written: self.pids_written[pid] = self.maxids self.maxids += 1 @@ -235,7 +236,11 @@ def write(self, pset, time, deleted_only=False): if self.create_new_zarrfile: if self.chunks is None: self.chunks = (len(ids), 10) - ds = xr.Dataset(attrs=self.metadata) + elif self.chunks[0] > len(ids): + logger.warning(f'Chunk size for trajectory ({self.chunks[0]}) is larger than length of initial set to write. 
' + f'Reducing ParticleFile chunks to ({len(ids)}, {self.chunks[1]})') + self.chunks = (len(ids), self.chunks[1]) + ds = xr.Dataset(attrs=self.metadata, coords={"trajectory": ("trajectory", pids)}) attrs = self._create_variables_attribute_dict() if (self.maxids > len(ids)) or (self.maxids > self.chunks[0]): arrsize = (self.maxids, self.chunks[1]) @@ -243,14 +248,14 @@ def write(self, pset, time, deleted_only=False): arrsize = self.chunks for var in self.vars_to_write: varout = self._convert_varout_name(var) - if self.write_once(var): + if self.write_once(var) and var not in ['trajectory']: data = np.full((arrsize[0],), np.nan, dtype=self.vars_to_write[var]) data[ids_once] = pset.collection.getvardata(var, indices_to_write_once) - dims = ["traj"] + dims = ["trajectory"] else: data = np.full(arrsize, np.nan, dtype=self.vars_to_write[var]) data[ids, 0] = pset.collection.getvardata(var, indices_to_write) - dims = ["traj", "obs"] + dims = ["trajectory", "obs"] ds[varout] = xr.DataArray(data=data, dims=dims, attrs=attrs[varout]) ds[varout].encoding['chunks'] = self.chunks[0] if self.write_once(var) else self.chunks ds.to_zarr(self.fname, mode='w') # , group=self.mpi_rank) #TODO try to get to work with groups diff --git a/tests/test_mpirun.py b/tests/test_mpirun.py index 3b1709823..f5c194aad 100644 --- a/tests/test_mpirun.py +++ b/tests/test_mpirun.py @@ -27,8 +27,6 @@ def test_mpi_run(pset_mode, tmpdir, repeatdt, maxage, nump): files = glob(path.join(outputMPI, "proc*")) ds11 = xr.open_zarr(files[0]) ds12 = xr.open_zarr(files[1]) - ds11 = ds11.assign_coords({'traj': ('traj', ds11.trajectory.values)}) - ds12 = ds12.assign_coords({'traj': ('traj', ds12.trajectory.values)}) ds1 = xr.merge([ds11, ds12], compat='no_conflicts') ds2 = xr.open_zarr(outputNoMPI) From 31ea6672c7deab8f0f1bb6f4b125eb7351cfdd72 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Mon, 29 Aug 2022 14:40:36 +0200 Subject: [PATCH 64/79] Simplifying code in baseparticlefile for findings ids to write --- parcels/particlefile/baseparticlefile.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index f9c0227f8..95d3335be 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -217,13 +217,12 @@ def write(self, pset, time, deleted_only=False): self.lasttime_written = time if len(indices_to_write) > 0: - ids = np.zeros(len(indices_to_write), dtype=int) pids = pset.collection.getvardata('id', indices_to_write) - for i, pid in enumerate(pids): # TODO check if we can avoid for-loop here - if pid not in self.pids_written: - self.pids_written[pid] = self.maxids - self.maxids += 1 - ids[i] = self.pids_written[pid] + to_add = sorted(set(pids) - set(self.pids_written.keys())) + for i, pid in enumerate(to_add): + self.pids_written[pid] = self.maxids + i + ids = np.array([self.pids_written[p] for p in pids], dtype=int) + self.maxids = len(self.pids_written) once_ids = np.where(pset.collection.getvardata('once_written', indices_to_write) == 0)[0] ids_once = ids[once_ids] From 1a9731dcd38efcb79d1533caa82721e040153dda Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Mon, 29 Aug 2022 17:18:53 +0200 Subject: [PATCH 65/79] Adding an obs coordinate to the zarr file --- parcels/particlefile/baseparticlefile.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 
95d3335be..0e7550ae7 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -99,7 +99,7 @@ def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_onde raise RuntimeError('Output in NetCDF is not supported anymore. Use .zarr extension for ParticleFile name.') self.fname = name if extension in ['.zarr'] else "%s.zarr" % name if MPI and MPI.COMM_WORLD.Get_size() > 1: - self.fname = os.path.join(self.fname, f"proc{self.mpi_rank}") # TODO check if we can also do this with zarr-groups + self.fname = os.path.join(self.fname, f"proc{self.mpi_rank:02d}") # TODO check if we can also do this with zarr-groups @abstractmethod def _reserved_var_names(self): @@ -178,6 +178,9 @@ def write_once(self, var): def _extend_zarr_dims(self, Z, store, dtype, axis): if axis == 1: a = np.full((Z.shape[0], self.chunks[1]), np.nan, dtype=dtype) + obs = zarr.group(store=store, overwrite=False)["obs"] + if len(obs) == Z.shape[1]: + obs.append(np.arange(self.chunks[1])+obs[-1]+1) else: extra_trajs = max(self.maxids - Z.shape[0], self.chunks[0]) if len(Z.shape) == 2: @@ -239,12 +242,13 @@ def write(self, pset, time, deleted_only=False): logger.warning(f'Chunk size for trajectory ({self.chunks[0]}) is larger than length of initial set to write. ' f'Reducing ParticleFile chunks to ({len(ids)}, {self.chunks[1]})') self.chunks = (len(ids), self.chunks[1]) - ds = xr.Dataset(attrs=self.metadata, coords={"trajectory": ("trajectory", pids)}) - attrs = self._create_variables_attribute_dict() if (self.maxids > len(ids)) or (self.maxids > self.chunks[0]): arrsize = (self.maxids, self.chunks[1]) else: arrsize = self.chunks + ds = xr.Dataset(attrs=self.metadata, coords={"trajectory": ("trajectory", pids), + "obs": ("obs", np.arange(arrsize[1], dtype=np.int32))}) + attrs = self._create_variables_attribute_dict() for var in self.vars_to_write: varout = self._convert_varout_name(var) if self.write_once(var) and var not in ['trajectory']: From 44b859326a6ae45de9565599e4ceaaa906c9de43 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Mon, 29 Aug 2022 20:19:52 +0200 Subject: [PATCH 66/79] Add unit test to check if zarr trajectory.dtype is int64 --- tests/test_particle_file.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_particle_file.py b/tests/test_particle_file.py index ce102d01e..c57c637a1 100644 --- a/tests/test_particle_file.py +++ b/tests/test_particle_file.py @@ -265,6 +265,7 @@ def Update_lon(particle, fieldset, time): output_file=pfile) ds = xr.open_zarr(outfilepath) trajs = ds['trajectory'][:] + assert trajs.values.dtype == 'int64' assert np.all(np.diff(trajs.values) < 0) # all particles written in order of release ds.close() From 4279a20e9097ea3f9f249b34575035155c76401d Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Mon, 29 Aug 2022 21:04:25 +0200 Subject: [PATCH 67/79] Fixing bug where trajectory written as float In xarray version 2022.6.0. Thanks for noticing, @JamiePringle! 
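A quick way to confirm the fix on a written output store (a sketch; 'out.zarr' stands in for any ParticleFile output, and the check mirrors the unit test added in the previous commit):

    import xarray as xr

    ds = xr.open_zarr('out.zarr')
    assert ds['trajectory'].dtype == 'int64'  # was cast to float64 by xarray 2022.6.0 before this fix
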
--- parcels/particlefile/baseparticlefile.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 0e7550ae7..0513c5c09 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -251,16 +251,17 @@ def write(self, pset, time, deleted_only=False): attrs = self._create_variables_attribute_dict() for var in self.vars_to_write: varout = self._convert_varout_name(var) - if self.write_once(var) and var not in ['trajectory']: - data = np.full((arrsize[0],), np.nan, dtype=self.vars_to_write[var]) - data[ids_once] = pset.collection.getvardata(var, indices_to_write_once) - dims = ["trajectory"] - else: - data = np.full(arrsize, np.nan, dtype=self.vars_to_write[var]) - data[ids, 0] = pset.collection.getvardata(var, indices_to_write) - dims = ["trajectory", "obs"] - ds[varout] = xr.DataArray(data=data, dims=dims, attrs=attrs[varout]) - ds[varout].encoding['chunks'] = self.chunks[0] if self.write_once(var) else self.chunks + if varout not in ['trajectory']: + if self.write_once(var): + data = np.full((arrsize[0],), np.nan, dtype=self.vars_to_write[var]) + data[ids_once] = pset.collection.getvardata(var, indices_to_write_once) + dims = ["trajectory"] + else: + data = np.full(arrsize, np.nan, dtype=self.vars_to_write[var]) + data[ids, 0] = pset.collection.getvardata(var, indices_to_write) + dims = ["trajectory", "obs"] + ds[varout] = xr.DataArray(data=data, dims=dims, attrs=attrs[varout]) + ds[varout].encoding['chunks'] = self.chunks[0] if self.write_once(var) else self.chunks ds.to_zarr(self.fname, mode='w') # , group=self.mpi_rank) #TODO try to get to work with groups self.create_new_zarrfile = False else: From 8a02ccd86f34028a153ebc31c906c41233cb9127 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 30 Aug 2022 11:55:08 +0200 Subject: [PATCH 68/79] Fixing naming of zarr files for MPI --- parcels/particlefile/baseparticlefile.py | 13 ++++++++----- tests/test_mpirun.py | 7 +++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index 0513c5c09..ad8f10697 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -97,9 +97,12 @@ def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_onde extension = os.path.splitext(str(name))[1] if extension in ['.nc', '.nc4']: raise RuntimeError('Output in NetCDF is not supported anymore. 
Use .zarr extension for ParticleFile name.') - self.fname = name if extension in ['.zarr'] else "%s.zarr" % name if MPI and MPI.COMM_WORLD.Get_size() > 1: - self.fname = os.path.join(self.fname, f"proc{self.mpi_rank:02d}") # TODO check if we can also do this with zarr-groups + self.fname = os.path.join(name, f"proc{self.mpi_rank:02d}.zarr") + if extension in ['.zarr']: + logger.warning(f'The ParticleFile name contains .zarr extension, but zarr files will be written per processor in MPI mode at {self.fname}') + else: + self.fname = name if extension in ['.zarr'] else "%s.zarr" % name @abstractmethod def _reserved_var_names(self): @@ -251,7 +254,7 @@ def write(self, pset, time, deleted_only=False): attrs = self._create_variables_attribute_dict() for var in self.vars_to_write: varout = self._convert_varout_name(var) - if varout not in ['trajectory']: + if varout not in ['trajectory']: # because 'trajectory' is written as coordinate if self.write_once(var): data = np.full((arrsize[0],), np.nan, dtype=self.vars_to_write[var]) data[ids_once] = pset.collection.getvardata(var, indices_to_write_once) @@ -262,11 +265,11 @@ def write(self, pset, time, deleted_only=False): dims = ["trajectory", "obs"] ds[varout] = xr.DataArray(data=data, dims=dims, attrs=attrs[varout]) ds[varout].encoding['chunks'] = self.chunks[0] if self.write_once(var) else self.chunks - ds.to_zarr(self.fname, mode='w') # , group=self.mpi_rank) #TODO try to get to work with groups + ds.to_zarr(self.fname, mode='w') self.create_new_zarrfile = False else: store = zarr.DirectoryStore(self.fname) - Z = zarr.group(store=store, overwrite=False) # .create_group(self.mpi_rank) #TODO try to get to work with groups + Z = zarr.group(store=store, overwrite=False) obs = self.obs_written[np.array(ids)] for var in self.vars_to_write: varout = self._convert_varout_name(var) diff --git a/tests/test_mpirun.py b/tests/test_mpirun.py index f5c194aad..589fe51c9 100644 --- a/tests/test_mpirun.py +++ b/tests/test_mpirun.py @@ -18,16 +18,15 @@ def test_mpi_run(pset_mode, tmpdir, repeatdt, maxage, nump): if MPI: stommel_file = path.join(path.dirname(__file__), '..', 'parcels', 'examples', 'example_stommel.py') - outputMPI = tmpdir.join('StommelMPI.zarr') + outputMPI = tmpdir.join('StommelMPI') outputNoMPI = tmpdir.join('StommelNoMPI.zarr') system('mpirun -np 2 python %s -p %d -o %s -r %d -a %d -psm %s' % (stommel_file, nump, outputMPI, repeatdt, maxage, pset_mode)) system('python %s -p %d -o %s -r %d -a %d -psm %s' % (stommel_file, nump, outputNoMPI, repeatdt, maxage, pset_mode)) files = glob(path.join(outputMPI, "proc*")) - ds11 = xr.open_zarr(files[0]) - ds12 = xr.open_zarr(files[1]) - ds1 = xr.merge([ds11, ds12], compat='no_conflicts') + ds1 = xr.concat([xr.open_zarr(f) for f in files], dim='trajectory', + compat='no_conflicts', coords='minimal').sortby(['trajectory']) ds2 = xr.open_zarr(outputNoMPI) From 33654602b57b594186bfa6f03c21f822913a4318 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 30 Aug 2022 12:07:46 +0200 Subject: [PATCH 69/79] Adding info on zarr output in MPI to documentation --- parcels/examples/documentation_MPI.ipynb | 39 ++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/parcels/examples/documentation_MPI.ipynb b/parcels/examples/documentation_MPI.ipynb index a2d120719..4f6703994 100644 --- a/parcels/examples/documentation_MPI.ipynb +++ b/parcels/examples/documentation_MPI.ipynb @@ -37,6 +37,45 @@ "Note that in principle this means that all MPI processors need access to the full `FieldSet`, which can be 
Gigabytes in size for large global datasets. Therefore, efficient parallelisation only works if at the same time we also chunk the `FieldSet` into smaller domains" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reading in the ParticleFile data in zarr format\n", + "\n", + "For efficiency, each processor will write its own data to a `zarr`-store. If the name of your `ParticleFile` is `fname`, then these stores will be located at `fname/proc00.zarr`, `fname/proc01.zarr`, etc.\n", + "\n", + "Reading in these stores and merging them into one `xarray.Dataset` can be done with" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from glob import glob\n", + "files = glob(path.join(fname, \"proc*\"))\n", + "ds = xr.concat([xr.open_zarr(f) for f in files], dim='trajectory', compat='no_conflicts', coords='minimal')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that, if you have added particles during the `execute()` (for example because you used `repeatdt`), then the trajectories will not be ordered monotonically. While this may not be a problem, this will result in a different Dataset than a single-core simulation. If you do want the outputs of the MPI run to be the same as the single-core run, add `.sortby(['trajectory'])` at the end of the `xr.concat()` command" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ds = xr.concat([xr.open_zarr(f) for f in files], dim='trajectory', \n", + " compat='no_conflicts', coords='minimal').sortby(['trajectory'])" + ] + }, { "cell_type": "markdown", "metadata": {}, From 1f2b2a0240c9c773fd6fa5c8f22e94f8221b86da Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 30 Aug 2022 15:38:05 +0200 Subject: [PATCH 70/79] Adding info on MPI concatenation to output tutorial --- parcels/examples/tutorial_output.ipynb | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/parcels/examples/tutorial_output.ipynb b/parcels/examples/tutorial_output.ipynb index 58999f505..8bf097b15 100644 --- a/parcels/examples/tutorial_output.ipynb +++ b/parcels/examples/tutorial_output.ipynb @@ -69,7 +69,7 @@ "source": [ "## Reading the output file\n", "\n", - "Parcels exports output trajectories in [`zarr` format](https://zarr.readthedocs.io/en/stable/). Files in `zarr` are typically _much_ smaller in size than netcdf, although may be slightly more challenging to handle (although `xarray` has a fairly seamless [`open_zarr()` method](https://docs.xarray.dev/en/stable/generated/xarray.open_zarr.html))." + "Parcels exports output trajectories in [`zarr` format](https://zarr.readthedocs.io/en/stable/). Files in `zarr` are typically _much_ smaller in size than netcdf, although may be slightly more challenging to handle (but `xarray` has a fairly seamless [`open_zarr()` method](https://docs.xarray.dev/en/stable/generated/xarray.open_zarr.html))." ] }, { @@ -137,6 +137,13 @@ "print(data_xarray['trajectory'])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that if you are running Parcels on multiple processors with `mpirun`, you will need to concatenate the files of each processor, see also the [MPI documentation](https://nbviewer.jupyter.org/github/OceanParcels/parcels/blob/master/parcels/examples/documentation_MPI.ipynb#Reading-in-the-ParticleFile-data-in-zarr-format)." 
+ ] + }, { "cell_type": "markdown", "metadata": {}, From 051c8bf20fdc57530498df16a7a381a185de250a Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 30 Aug 2022 16:25:12 +0200 Subject: [PATCH 71/79] Updating text of zarr explanation in MPI and output tutorials --- parcels/examples/documentation_MPI.ipynb | 19 +++++++++++++++++++ parcels/examples/tutorial_output.ipynb | 4 +++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/parcels/examples/documentation_MPI.ipynb b/parcels/examples/documentation_MPI.ipynb index 4f6703994..541c930cc 100644 --- a/parcels/examples/documentation_MPI.ipynb +++ b/parcels/examples/documentation_MPI.ipynb @@ -76,6 +76,25 @@ " compat='no_conflicts', coords='minimal').sortby(['trajectory'])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that if you want, you can save this new DataSet with the `.to_zarr()` or `.to_netcdf()` methods. \n", + "\n", + "When using `.to_zarr()`, then it further analyses may be sped up by first rechunking the DataSet, by using `ds.chunk()`. Note that in some cases, you will first need to remove the chunks encoding information manually, using a code like below" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for v in ds.variables:\n", + " del ds[v].encoding['chunks']" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/parcels/examples/tutorial_output.ipynb b/parcels/examples/tutorial_output.ipynb index 8bf097b15..05f7e43a8 100644 --- a/parcels/examples/tutorial_output.ipynb +++ b/parcels/examples/tutorial_output.ipynb @@ -141,7 +141,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Note that if you are running Parcels on multiple processors with `mpirun`, you will need to concatenate the files of each processor, see also the [MPI documentation](https://nbviewer.jupyter.org/github/OceanParcels/parcels/blob/master/parcels/examples/documentation_MPI.ipynb#Reading-in-the-ParticleFile-data-in-zarr-format)." + "Note that if you are running Parcels on multiple processors with `mpirun`, you will need to concatenate the files of each processor, see also the [MPI documentation](https://nbviewer.jupyter.org/github/OceanParcels/parcels/blob/master/parcels/examples/documentation_MPI.ipynb#Reading-in-the-ParticleFile-data-in-zarr-format). \n", + "\n", + "Also, once you have loaded the data as an `xarray` DataSet using `xr.open_zarr()`, you can always save the file to NetCDF if you prefer with the `.to_netcdf()` method." ] }, { From 66807da50411343ff5fc7186dd6685d0bff40659 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 30 Aug 2022 17:12:55 +0200 Subject: [PATCH 72/79] Updating default zarr chunks to 1 obs --- parcels/particlefile/baseparticlefile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index ad8f10697..ec98169fb 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -240,7 +240,7 @@ def write(self, pset, time, deleted_only=False): if self.create_new_zarrfile: if self.chunks is None: - self.chunks = (len(ids), 10) + self.chunks = (len(ids), 1) elif self.chunks[0] > len(ids): logger.warning(f'Chunk size for trajectory ({self.chunks[0]}) is larger than length of initial set to write. 
' f'Reducing ParticleFile chunks to ({len(ids)}, {self.chunks[1]})') From 884686132e4d7880307e89298455ab7cc37cd4ee Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 31 Aug 2022 08:07:08 +0200 Subject: [PATCH 73/79] Updating MPI documentation to also import os.path --- parcels/examples/documentation_MPI.ipynb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/parcels/examples/documentation_MPI.ipynb b/parcels/examples/documentation_MPI.ipynb index 541c930cc..77c7efd53 100644 --- a/parcels/examples/documentation_MPI.ipynb +++ b/parcels/examples/documentation_MPI.ipynb @@ -55,6 +55,8 @@ "outputs": [], "source": [ "from glob import glob\n", + "from os import path\n", + "\n", "files = glob(path.join(fname, \"proc*\"))\n", "ds = xr.concat([xr.open_zarr(f) for f in files], dim='trajectory', compat='no_conflicts', coords='minimal')" ] From 01bf12f05c828c89be29c490cf6be5351da0b622 Mon Sep 17 00:00:00 2001 From: JamiePringle Date: Tue, 6 Sep 2022 14:42:14 -0400 Subject: [PATCH 74/79] Add documentation to combine lMPI run output into single Zarr store Code to combine output from MPI run into a single Zarr file, re-chunk the data, and change variable types, and add variables to the zarr file even for very large output sizes. --- documentation_LargeRunsOutput.ipynb | 484 ++++++++++++++++++++++++++++ 1 file changed, 484 insertions(+) create mode 100644 documentation_LargeRunsOutput.ipynb diff --git a/documentation_LargeRunsOutput.ipynb b/documentation_LargeRunsOutput.ipynb new file mode 100644 index 000000000..5eef45387 --- /dev/null +++ b/documentation_LargeRunsOutput.ipynb @@ -0,0 +1,484 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e2d8b054", + "metadata": {}, + "source": [ + "# Combining large ocean-parcels datasets and choosing an optimal chunking. " + ] + }, + { + "cell_type": "markdown", + "id": "eb5792d0", + "metadata": {}, + "source": [ + "You might imagine that if you followed the instructions [on the making of parallel runs](https://github.com/OceanParcels/parcels/blob/dump_to_zarr/parcels/examples/documentation_MPI.ipynb) and the loading of the resulting dataset, you could just use the `dataset.to_zarr()` function to save the data to a single zarr datastore. This is true for small enough datasets -- but for a number of reasons, including the non-contiguous trajectory coordinate, if you try this with datasets whose size is larger than a 1/3 or so of the memory of your computer, you will find that it takes a long time, then fails as it exhausts the memory of your computer. \n", + "\n", + "This may be fixed in xarray or zarr in the future, but for now, we can work around this problem by saving the output in steps. \n", + "\n", + "At the same time, we can change the datatype of the output and modify the chunking of the dataset. Both of these can improve both performance and the size of the data on the disk. However, some care is required to do this, and this will be described in more detail below. " + ] + }, + { + "cell_type": "markdown", + "id": "4010c8d0", + "metadata": {}, + "source": [ + "## Why are we doing this? And what chunk sizes should we choose?" + ] + }, + { + "cell_type": "markdown", + "id": "46cb22d5", + "metadata": {}, + "source": [ + "If you are running a relatively small case (perhaps 1/10 the size of the memory of your machine), nearly anything you do will work. However, as your problems get larger, it can help to write the data into a single zarr datastore, and to chunk that store appropriately. 
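To see what chunking an existing output store already uses, a minimal sketch with the zarr library (the store name 'tracks.zarr' and the variable 'lon' are simply the ones from the example further below) is:

    import zarr

    g = zarr.open('tracks.zarr', mode='r')   # open the store read-only
    print(g['lon'].shape, g['lon'].chunks)   # full array size versus on-disk chunk shape
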
\n", + "To illustrate this, here is the time it takes to retrieve all the results (with `ds['variableName'].values`) of some common data structures with different chunk sizes. (What is a chunk size? More on that below). The data in this example has 39 million trajectories starting over 120 times, and there are 250 observations, resulting in a directory size of 88Gb in double precision and 39Gb in single. In this table, \"trajectory:5e4, obs:10\" indicates that each chunk extends over 50,000 trajectories and 10 obs. The chunking in the original data is roughly a few thousand trajectories and 10 obs. \n", + "\n", + "|File type|open [s]|read 1 obs, all traj [s]|read 23 obs, all traj [s]|read 8000 contiguous traj, all obs [s]|read traj that start at a given time, all obs [s]|\n", + "|---|---|---|---|---|---|\n", + "|Straight from parcels|2.9|8.4|59.9|1.5|17.4|\n", + "|trajectory:5e4, obs:10|0.48|2.5|19.5|0.4|10.33|\n", + "|trajectory:5e4, obs:100|0.55|20.5|13.8|0.5|3.88|\n", + "|trajectory:5e5, obs:10|0.54|2.2|16.3|0.85|18.5|\n", + "|trajectory:5e5, obs:100|0.46|19.9|40.0|0.62|49.36|\n", + "\n", + "\n", + "You can see several things in this. It is always quicker to open a single file, and for all data access patterns, there are chunksizes that are more efficient than the default output. Why is this?\n", + "\n", + "The chunksize determines how data is stored on disk. For the default Zarr datastore, each chunk of data is stored as a single compressed file. In netCDF, chunking is similar except that the compressed data is stored within a single file. In either case, if you must access any data from within a chunk, you must read the entire chunk from disk. \n", + "\n", + "So when we access one obs dimension and many trajectories, the chunking scheme that is elongated in the trajectory direction is fastest. When we get all the observations for a scattered set of trajectories, the chunking that is elongated in observations is the best. In general, the product of the two chunksizes (the number of data points in a chunk) should be hundreds of thousands to 10s of millions. A suboptimal chunking scheme is usually not tragic, but if you know how you will most often access the data, you can save considerable time. " + ] + }, + { + "cell_type": "markdown", + "id": "0d5d3385", + "metadata": {}, + "source": [ + "## How to save the output of an MPI ocean parcels run to a single zarr dataset" + ] + }, + { + "cell_type": "markdown", + "id": "b5001615", + "metadata": {}, + "source": [ + "First, we need to import the necessary modules, specify the directory `inputDir` which contains the output of the parcels run (the directory that has proc01, proc02 and so forth), the location of the output zarr file `outputDir` and a dictionary giving the chunk size for the `trajectory` and `obs` coordinates, `chunksize`. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "2622a91d", + "metadata": {}, + "outputs": [], + "source": [ + "import xarray as xr\n", + "from pylab import *\n", + "from numpy import *\n", + "from glob import glob\n", + "from os import path\n", + "import time\n", + "\n", + "#first specify the directory in which the MPI code wrote its output\n", + "inputDir=('dataPathsTemp/'+\n", + " 'theAmericas_wholeGlobe_range100km_depthFrom_1m_to_500m_habitatTree_months01_to_02_fixed_1m/'+\n", + " '2008/tracks.zarr')\n", + "\n", + "\n", + "#specify chunksize and where the output zarr file should go; also set chunksize of output file\n", + "chunksize={'trajectory':5*int(1e4),'obs':10}; \n", + "outputDir='/home/pringle/jnkData/singleFile_5e4_X_10_example.zarr'" + ] + }, + { + "cell_type": "markdown", + "id": "33383cbe", + "metadata": {}, + "source": [ + "Now for large datasets, this code can take a while to run; for 36 million trajectories and 250 observations, it can take an hour and a half. I prefer not to accidentally destroy data that takes more than an hour to create, so I put in a safety check and only let the code run if the output directory does not exist. " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "51a1414d", + "metadata": {}, + "outputs": [], + "source": [ + "#do not overwrite existing data sets\n", + "if path.exists(outputDir):\n", + " print('the output path',outputDir,'exists')\n", + " print('please delete if you want to replace it')\n", + " assert False,'stopping execution'" + ] + }, + { + "cell_type": "markdown", + "id": "b8818397", + "metadata": {}, + "source": [ + "It will often be useful to change the [`dtype`](https://numpy.org/doc/stable/reference/generated/numpy.dtype.html) of the output data. Doing so can save a great deal of disk space. For example, the input data for this example is 88Gb in size, but by changing lat, lon and z to single precision, I can make the file about half as big. \n", + "\n", + "This comes at the cost of some accuracy. Float64 has 14 digits of accuracy, float32 has 7. For latitude and longitude, going from float64 to float32 increases the error by the circumference of the Earth divided by 1e7, or about 1m. This is good enough for what I am doing. However, a year of time has about 3.15e7 seconds, and we often want to know within a second when a particle is released (to avoid floating point issues when picking out particles that start at a specific time). So the 3.15e7/1e7 error (a few seconds) in the time coordinate could cause problems. So I don't want to reduce the precision of time. 
\n", + "\n", + "To change precision, put an entry into the dictionary `varType` whose key is the name of the variable, and whose value is the type you wish the variable to be cast to:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9ca2bb15", + "metadata": {}, + "outputs": [], + "source": [ + "varType={\n", + " 'lat':dtype('float32'),\n", + " 'lon':dtype('float32'),\n", + " 'time':dtype('datetime64'),\n", + " 'z':dtype('float32'),\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "c26e9ba7", + "metadata": {}, + "source": [ + "Now we need to read in the data as discussed in the section on making an MPI run:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e7dd9f61", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "opening data from multiple process files\n", + " done opening in 2.19\n" + ] + } + ], + "source": [ + "print('opening data from multiple process files')\n", + "tic=time.time()\n", + "files = glob(path.join(inputDir, \"proc*\")); \n", + "dataIn = xr.concat([xr.open_zarr(f) for f in files], dim='trajectory', \n", + " compat='no_conflicts', coords='minimal') \n", + "print(' done opening in %5.2f'%(time.time()-tic))" + ] + }, + { + "cell_type": "markdown", + "id": "f93a60ff", + "metadata": {}, + "source": [ + "Now we can take advantage of the `.astype` operator to change the type of the variables. This is a lazy operator, and it will only be applied to the data when the data values are requested below, when the data is written to a new zarr store. " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6819cd84", + "metadata": {}, + "outputs": [], + "source": [ + "for v in varType.keys():\n", + " dataIn[v]=dataIn[v].astype(varType[v])" + ] + }, + { + "cell_type": "markdown", + "id": "f8410589", + "metadata": {}, + "source": [ + "The dataset is then rechunked to our desired shape. This does not actually do anything right now, but will when the data is written below. Before doing this, it is useful to remove the per-variable chunking metadata, because of inconsistencies which arrise due to (I think) each MPI process output having a different chunking. This is explained in more detail in https://github.com/dcs4cop/xcube/issues/347 " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9a56c3cc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "re-chunking\n", + " done in 3.4046807289123535\n" + ] + } + ], + "source": [ + "print('re-chunking')\n", + "tic=time.time()\n", + "for v in dataIn.variables:\n", + " if 'chunks' in dataIn[v].encoding:\n", + " del dataIn[v].encoding['chunks']\n", + "dataIn=dataIn.chunk(chunksize)\n", + "print(' done in',time.time()-tic)" + ] + }, + { + "cell_type": "markdown", + "id": "6f59018b", + "metadata": {}, + "source": [ + "The dataset `dataIn` is now ready to be written back out in stages. But how big is each segment that will be written out? I have found that writing out about 650 million points per variable is easy on a laptop with 16Gb of memory. Larger sizes might be more efficient, but one does run into the law of diminishing returns fairly quickly. 
The number of points to write out is called `nPointsWrite` and it is integer devided by the `obs` dimension to get the number of rows to write out, `writeChunkLen`:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "de5415ed", + "metadata": {}, + "outputs": [], + "source": [ + "nPointsWrite=int(5e6)*130\n", + "writeChunkLen=nPointsWrite//dataIn.dims['obs']" + ] + }, + { + "cell_type": "markdown", + "id": "2b210d2c", + "metadata": {}, + "source": [ + "Now, at some point in the future, one could imagine making this code parallel; to make that easier, make sure that writeChunkLen is an even multiple of the trajectory chunk size, and also make sure it is at least one." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ecda3883", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " writing 2600000 trajectory's at a time\n" + ] + } + ], + "source": [ + "writeChunkLen=(writeChunkLen//chunksize['trajectory'])*chunksize['trajectory']\n", + "writeChunkLen=max(1,writeChunkLen) #must be at least 1!\n", + "print(\" writing %d trajectory's at a time\"%(writeChunkLen,))" + ] + }, + { + "cell_type": "markdown", + "id": "a1bd78bc", + "metadata": {}, + "source": [ + "Now we want to make a list of the indices that are to be written out, `chunkList`, so it goes from 0 to the total size of the trajectory dimension:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c993c932", + "metadata": {}, + "outputs": [], + "source": [ + "#make list of indices to write out, inclusive of end of the trajectory dimension\n", + "chunkList=list(range(0,dataIn.dims['trajectory'],writeChunkLen))\n", + "if chunkList[-1]!=dataIn.dims['trajectory']:\n", + " chunkList.append(dataIn.dims['trajectory'])" + ] + }, + { + "cell_type": "markdown", + "id": "52d6ace4", + "metadata": {}, + "source": [ + "Now iterate through the indices in chunkList, and write out each bit. The following code is a bit chatty, but as the code can take a while to run, I find it reassuring to know it is doing something. If one errs in using xarray, it can take an excessive amount of time to do things, so it is worth keeping an eye on things. After you start the code below running, go teach a class or revise a manuscript. It can take a while. On my test 88Gb file, it takes about an hour." 
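Because the loop below can run for an hour or more, two cheap checks before starting it can save a lot of grief. The snippet below is an illustrative sketch (not part of the original notebook): it confirms that the rechunking above actually took effect, and counts how many trajectories are already in the output store, which is useful if one later adapts the loop to resume an interrupted write.

```python
# 1) confirm the chunking matches the requested chunksize, so every append is aligned
for v in ['lat', 'lon', 'z', 'time']:
    print(v, dataIn[v].data.chunksize)    # expect (50000, 10) with the chunksize above

# 2) count what an earlier, interrupted run may already have written
def trajectories_already_written(outputDir):
    if not path.exists(outputDir):
        return 0
    with xr.open_zarr(outputDir) as ds:
        return ds.dims['trajectory']

# inside the loop below one could then skip a chunk whenever
# chunkList[nChunk+1] <= trajectories_already_written(outputDir)
print('already written:', trajectories_already_written(outputDir))
```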
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c14e2d39", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "starting to write to /home/pringle/jnkData/singleFile_5e4_X_10_example.zarr\n", + " getting chunk 0\n", + " starting to write\n", + " done with chunk 0 of 15 from 0 to 2600000 in 484.8957693576813\n", + " getting chunk 1\n", + " starting to write\n", + " done with chunk 1 of 15 from 2600000 to 5200000 in 490.55356001853943\n", + " getting chunk 2\n", + " starting to write\n", + " done with chunk 2 of 15 from 5200000 to 7800000 in 550.8107087612152\n", + " getting chunk 3\n", + " starting to write\n", + " done with chunk 3 of 15 from 7800000 to 10400000 in 484.8843400478363\n", + " getting chunk 4\n", + " starting to write\n", + " done with chunk 4 of 15 from 10400000 to 13000000 in 440.20081901550293\n", + " getting chunk 5\n", + " starting to write\n", + " done with chunk 5 of 15 from 13000000 to 15600000 in 344.02604961395264\n", + " getting chunk 6\n", + " starting to write\n", + " done with chunk 6 of 15 from 15600000 to 18200000 in 337.93852186203003\n", + " getting chunk 7\n", + " starting to write\n", + " done with chunk 7 of 15 from 18200000 to 20800000 in 455.27793192863464\n", + " getting chunk 8\n", + " starting to write\n", + " done with chunk 8 of 15 from 20800000 to 23400000 in 405.1483941078186\n", + " getting chunk 9\n", + " starting to write\n", + " done with chunk 9 of 15 from 23400000 to 26000000 in 310.73776030540466\n", + " getting chunk 10\n", + " starting to write\n", + " done with chunk 10 of 15 from 26000000 to 28600000 in 286.34414768218994\n", + " getting chunk 11\n", + " starting to write\n", + " done with chunk 11 of 15 from 28600000 to 31200000 in 247.74302196502686\n", + " getting chunk 12\n", + " starting to write\n", + " done with chunk 12 of 15 from 31200000 to 33800000 in 165.62516450881958\n", + " getting chunk 13\n", + " starting to write\n", + " done with chunk 13 of 15 from 33800000 to 36400000 in 155.9800374507904\n", + " getting chunk 14\n", + " starting to write\n", + " done with chunk 14 of 15 from 36400000 to 39000000 in 204.0485861301422\n", + " getting chunk 15\n", + " starting to write\n", + " done with chunk 15 of 15 from 39000000 to 39692941 in 43.03104019165039\n", + "Done writing in 5425.2\n" + ] + } + ], + "source": [ + "#write out the chunks\n", + "print('starting to write to',outputDir)\n", + "for nChunk in range(len(chunkList)-1):\n", + " innerTic=time.time()\n", + " print(' getting chunk',nChunk)\n", + " newds=dataIn.isel(trajectory=np.arange(chunkList[nChunk],chunkList[nChunk+1])) #do not sort by trajectory\n", + " print(' starting to write')\n", + " if nChunk==0:\n", + " newds.to_zarr(outputDir,mode='w')\n", + " else:\n", + " newds.to_zarr(outputDir,append_dim='trajectory')\n", + " print(' done with chunk %d of %d from'%(nChunk,len(chunkList)-2),\n", + " chunkList[nChunk],'to',chunkList[nChunk+1],'in',time.time()-innerTic)\n", + "\n", + "print('Done writing in %5.1f'%(time.time()-tic,))" + ] + }, + { + "cell_type": "markdown", + "id": "9080025f", + "metadata": {}, + "source": [ + "We can now load the zarr data set we have created, and see what is in it, compared to what was in the input dataset. 
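Beyond comparing the printed metadata, a numerical spot-check of a few trajectories is cheap insurance (an illustrative sketch, not part of the original notebook; it must run while `dataIn` still holds the dataset that was just written):

```python
import numpy as np

dataCheck = xr.open_zarr(outputDir)
sample = np.random.default_rng(0).integers(0, dataCheck.dims['trajectory'], size=5)
for v in ['lat', 'lon', 'z']:
    a = dataCheck[v].isel(trajectory=sample).values
    b = dataIn[v].isel(trajectory=sample).values
    print(v, 'max abs difference:', np.nanmax(np.abs(a - b)))   # expect 0.0
```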
" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "3157592c", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The original data\n", + " \n", + "Dimensions: (trajectory: 39692941, obs: 250)\n", + "Coordinates:\n", + " * obs (obs) int32 0 1 2 3 4 5 6 7 ... 242 243 244 245 246 247 248 249\n", + " * trajectory (trajectory) int64 16 23 68 165 ... 39692792 39692889 39692920\n", + "Data variables:\n", + " age (trajectory, obs) float32 dask.array\n", + " lat (trajectory, obs) float64 dask.array\n", + " lon (trajectory, obs) float64 dask.array\n", + " time (trajectory, obs) datetime64[ns] dask.array\n", + " z (trajectory, obs) float64 dask.array\n", + "Attributes:\n", + " Conventions: CF-1.6/CF-1.7\n", + " feature_type: trajectory\n", + " ncei_template_version: NCEI_NetCDF_Trajectory_Template_v2.0\n", + " parcels_mesh: spherical\n", + " parcels_version: 2.3.2.dev137 \n", + "\n", + "The new dataSet\n", + " \n", + "Dimensions: (trajectory: 39692941, obs: 250)\n", + "Coordinates:\n", + " * obs (obs) int32 0 1 2 3 4 5 6 7 ... 242 243 244 245 246 247 248 249\n", + " * trajectory (trajectory) int64 16 23 68 165 ... 39692792 39692889 39692920\n", + "Data variables:\n", + " age (trajectory, obs) float32 dask.array\n", + " lat (trajectory, obs) float32 dask.array\n", + " lon (trajectory, obs) float32 dask.array\n", + " time (trajectory, obs) datetime64[ns] dask.array\n", + " z (trajectory, obs) float32 dask.array\n", + "Attributes:\n", + " Conventions: CF-1.6/CF-1.7\n", + " feature_type: trajectory\n", + " ncei_template_version: NCEI_NetCDF_Trajectory_Template_v2.0\n", + " parcels_mesh: spherical\n", + " parcels_version: 2.3.2.dev137\n" + ] + } + ], + "source": [ + "dataOriginal=dataIn = xr.concat([xr.open_zarr(f) for f in files], dim='trajectory', \n", + " compat='no_conflicts', coords='minimal') \n", + "dataProcessed=xr.open_zarr(outputDir)\n", + "print('The original data\\n',dataOriginal,'\\n\\nThe new dataSet\\n',dataProcessed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "186c678b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 125dc61fe465ca88436089d16578a2ee0e4e1600 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 13 Sep 2022 11:56:31 +0200 Subject: [PATCH 75/79] Moving documentation_largeoutputfile to parcels/examples folder --- .../examples/documentation_LargeRunsOutput.ipynb | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename documentation_LargeRunsOutput.ipynb => parcels/examples/documentation_LargeRunsOutput.ipynb (100%) diff --git a/documentation_LargeRunsOutput.ipynb b/parcels/examples/documentation_LargeRunsOutput.ipynb similarity index 100% rename from documentation_LargeRunsOutput.ipynb rename to parcels/examples/documentation_LargeRunsOutput.ipynb From 0b1f4a2edd3481a78937e69a1bc088bebaa6ccf4 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Tue, 13 Sep 2022 12:00:52 +0200 Subject: [PATCH 76/79] Adding cell on large output runs to mpi documentation Note link will not work yet; only when merged into master --- 
parcels/examples/documentation_MPI.ipynb | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/parcels/examples/documentation_MPI.ipynb b/parcels/examples/documentation_MPI.ipynb index 77c7efd53..4e31f3643 100644 --- a/parcels/examples/documentation_MPI.ipynb +++ b/parcels/examples/documentation_MPI.ipynb @@ -34,7 +34,7 @@ "\n", "Parcels will then split the `ParticleSet` into `` smaller ParticleSets, based on a `sklearn.cluster.KMeans` clustering. Each of those smaller `ParticleSets` will be executed by one of the `` MPI processors.\n", "\n", - "Note that in principle this means that all MPI processors need access to the full `FieldSet`, which can be Gigabytes in size for large global datasets. Therefore, efficient parallelisation only works if at the same time we also chunk the `FieldSet` into smaller domains" + "Note that in principle this means that all MPI processors need access to the full `FieldSet`, which can be Gigabytes in size for large global datasets. Therefore, efficient parallelisation only works if at the same time we also chunk the `FieldSet` into smaller domains." ] }, { @@ -97,6 +97,13 @@ " del ds[v].encoding['chunks']" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For small projects, the above instructions are sufficient. If your project is large, then it is helpful to combine the `proc*` directories into a single zarr dataset and to optimize the chunking for your analysis. What is \"large\"? If you find yourself running out of memory while doing your analyses, saving the results, or sorting the dataset, or if reading the data is taking longer than you can tolerate, your problem is \"large.\" Another rule of thumb is if the size of your output directory is 1/3 or more of the memory of your machine, your problem is large. Chunking and combining the `proc*` data in order to speed up analysis is discussed [in the documentation on runs with large output](https://nbviewer.org/github/OceanParcels/parcels/blob/master/parcels/examples/documentation_LargeRunsOutput.ipynb). " + ] + }, { "cell_type": "markdown", "metadata": {}, From cfd482150d406dce1b3b61a43be1f24f8069b4a9 Mon Sep 17 00:00:00 2001 From: JamiePringle Date: Tue, 13 Sep 2022 11:22:59 -0400 Subject: [PATCH 77/79] update documentation to fix xaray datetime64[ns] bug change documentation_MPI.ipynb and documentation_LargeRunsOutput.ipynb to work around slow .to_zarr() of datasets that contain datetime variables. --- .../documentation_LargeRunsOutput.ipynb | 203 ++++-------------- parcels/examples/documentation_MPI.ipynb | 4 +- 2 files changed, 40 insertions(+), 167 deletions(-) diff --git a/parcels/examples/documentation_LargeRunsOutput.ipynb b/parcels/examples/documentation_LargeRunsOutput.ipynb index 5eef45387..f0ee8b9c7 100644 --- a/parcels/examples/documentation_LargeRunsOutput.ipynb +++ b/parcels/examples/documentation_LargeRunsOutput.ipynb @@ -13,11 +13,9 @@ "id": "eb5792d0", "metadata": {}, "source": [ - "You might imagine that if you followed the instructions [on the making of parallel runs](https://github.com/OceanParcels/parcels/blob/dump_to_zarr/parcels/examples/documentation_MPI.ipynb) and the loading of the resulting dataset, you could just use the `dataset.to_zarr()` function to save the data to a single zarr datastore. 
This is true for small enough datasets -- but for a number of reasons, including the non-contiguous trajectory coordinate, if you try this with datasets whose size is larger than a 1/3 or so of the memory of your computer, you will find that it takes a long time, then fails as it exhausts the memory of your computer. \n", + "You might imagine that if you followed the instructions [on the making of parallel runs](https://github.com/OceanParcels/parcels/blob/dump_to_zarr/parcels/examples/documentation_MPI.ipynb) and the loading of the resulting dataset, you could just use the `dataset.to_zarr()` function to save the data to a single zarr datastore. This is true for small enough datasets. However, there is a bug in xarray which makes this ineffecient for large data sets. \n", "\n", - "This may be fixed in xarray or zarr in the future, but for now, we can work around this problem by saving the output in steps. \n", - "\n", - "At the same time, we can change the datatype of the output and modify the chunking of the dataset. Both of these can improve both performance and the size of the data on the disk. However, some care is required to do this, and this will be described in more detail below. " + "At the same time, it will often improve performance if large datasets are saved as a single zarr store, chunked appropriately, and the type of the variables in them modified. It is often also useful to add other variables to the dataset. This document describes how to do all this." ] }, { @@ -71,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "2622a91d", "metadata": {}, "outputs": [], @@ -82,11 +80,13 @@ "from glob import glob\n", "from os import path\n", "import time\n", + "import dask\n", + "from dask.diagnostics import ProgressBar\n", "\n", "#first specify the directory in which the MPI code wrote its output\n", "inputDir=('dataPathsTemp/'+\n", " 'theAmericas_wholeGlobe_range100km_depthFrom_1m_to_500m_habitatTree_months01_to_02_fixed_1m/'+\n", - " '2008/tracks.zarr')\n", + " '2007/tracks.zarr')\n", "\n", "\n", "#specify chunksize and where the output zarr file should go; also set chunksize of output file\n", @@ -104,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "51a1414d", "metadata": {}, "outputs": [], @@ -125,12 +125,14 @@ "\n", "This comes at the cost of some accuracy. Float64 has 14 digits of accuracy, float32 has 7. For latitude and longitude, going from float64 to float32 increases the error by the circumfrence of the Earth devided 1e7, or about 1m. This is good enough for what I am doing. However, a year of time has about 3.15e7 seconds, and we often want to know within a second when a particle is released (to avoid floating point issues when picking out particles that start at a specific time). So the 3.15e7/1e7 error (a few seconds) in the time coordinate could cause problems. So I don't want to reduce the precision of time. \n", "\n", + "There is one other important issue. Due to a bug in xarray, it is much slower to save datasets with a datetime64 variable in them. So time here will be given as float64. 
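A tiny round-trip sketch (not part of this patch) of why the float64 workaround is safe: as long as the CF-style `units` attribute survives, xarray's default time decoding turns the float seconds back into datetimes when the store is opened.

```python
import numpy as np
import xarray as xr

# time stored as plain float64 seconds, with a CF-style units attribute
t = xr.DataArray(np.array([0.0, 86400.0]), dims='obs',
                 attrs={'units': 'seconds since 2007-01-01'})
xr.Dataset({'time': t}).to_zarr('time_roundtrip_demo.zarr', mode='w')

# default decoding on read recovers datetime64 values
print(xr.open_zarr('time_roundtrip_demo.zarr')['time'].values)
# ['2007-01-01T00:00:00', '2007-01-02T00:00:00'] (as datetime64[ns])
```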
If (as we do below) the attribute data is preserved, it will still appear as a datetime type when the data file is loaded\n", + "\n", "To change precision, put an entry into the dictionary `varType` whose key is the name of the variable, and whose value is the type you wish the variable to be cast to:" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 14, "id": "9ca2bb15", "metadata": {}, "outputs": [], @@ -138,7 +140,7 @@ "varType={\n", " 'lat':dtype('float32'),\n", " 'lon':dtype('float32'),\n", - " 'time':dtype('datetime64'),\n", + " 'time':dtype('float64'), #to avoid bug in xarray\n", " 'z':dtype('float32'),\n", " }" ] @@ -148,12 +150,12 @@ "id": "c26e9ba7", "metadata": {}, "source": [ - "Now we need to read in the data as discussed in the section on making an MPI run:" + "Now we need to read in the data as discussed in the section on making an MPI run. However, note that `xr.open_zarr()` is given the `decode_times=False` option, which prevents the time variable from being converted into a datetime64[ns] object. This is neccessary due to a bug in xarray. As discussed above, when the data set is read back in, time will again be interpreted as a datetime." ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 15, "id": "e7dd9f61", "metadata": {}, "outputs": [ @@ -162,7 +164,7 @@ "output_type": "stream", "text": [ "opening data from multiple process files\n", - " done opening in 2.19\n" + " done opening in 6.37\n" ] } ], @@ -170,7 +172,7 @@ "print('opening data from multiple process files')\n", "tic=time.time()\n", "files = glob(path.join(inputDir, \"proc*\")); \n", - "dataIn = xr.concat([xr.open_zarr(f) for f in files], dim='trajectory', \n", + "dataIn = xr.concat([xr.open_zarr(f,decode_times=False) for f in files], dim='trajectory', \n", " compat='no_conflicts', coords='minimal') \n", "print(' done opening in %5.2f'%(time.time()-tic))" ] @@ -185,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 16, "id": "6819cd84", "metadata": {}, "outputs": [], @@ -204,7 +206,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 17, "id": "9a56c3cc", "metadata": {}, "outputs": [ @@ -213,7 +215,7 @@ "output_type": "stream", "text": [ "re-chunking\n", - " done in 3.4046807289123535\n" + " done in 9.15590238571167\n" ] } ], @@ -232,156 +234,27 @@ "id": "6f59018b", "metadata": {}, "source": [ - "The dataset `dataIn` is now ready to be written back out in stages. But how big is each segment that will be written out? I have found that writing out about 650 million points per variable is easy on a laptop with 16Gb of memory. Larger sizes might be more efficient, but one does run into the law of diminishing returns fairly quickly. The number of points to write out is called `nPointsWrite` and it is integer devided by the `obs` dimension to get the number of rows to write out, `writeChunkLen`:" + "The dataset `dataIn` is now ready to be written back out with dataIn.to_zarr(). Because this can take a while, it is nice to delay computation and then compute() the resulting object with a progress bar, so we know how long we have to get a cup of coffee or tea. 
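If the default threaded scheduler saturates memory or the file system during the write, one variation (a sketch, not part of this patch) is to cap the number of dask workers around the same `to_zarr` call used in the next cell:

```python
# same delayed write as below, but with an explicit cap on dask workers
with dask.config.set(scheduler='threads', num_workers=4):
    delayedObj = dataIn.to_zarr(outputDir, compute=False)
    with ProgressBar():
        delayedObj.compute()
```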
" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 19, "id": "de5415ed", "metadata": {}, - "outputs": [], - "source": [ - "nPointsWrite=int(5e6)*130\n", - "writeChunkLen=nPointsWrite//dataIn.dims['obs']" - ] - }, - { - "cell_type": "markdown", - "id": "2b210d2c", - "metadata": {}, - "source": [ - "Now, at some point in the future, one could imagine making this code parallel; to make that easier, make sure that writeChunkLen is an even multiple of the trajectory chunk size, and also make sure it is at least one." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "ecda3883", - "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - " writing 2600000 trajectory's at a time\n" + "[########################################] | 100% Completed | 33m 9ss\n" ] } ], "source": [ - "writeChunkLen=(writeChunkLen//chunksize['trajectory'])*chunksize['trajectory']\n", - "writeChunkLen=max(1,writeChunkLen) #must be at least 1!\n", - "print(\" writing %d trajectory's at a time\"%(writeChunkLen,))" - ] - }, - { - "cell_type": "markdown", - "id": "a1bd78bc", - "metadata": {}, - "source": [ - "Now we want to make a list of the indices that are to be written out, `chunkList`, so it goes from 0 to the total size of the trajectory dimension:" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "c993c932", - "metadata": {}, - "outputs": [], - "source": [ - "#make list of indices to write out, inclusive of end of the trajectory dimension\n", - "chunkList=list(range(0,dataIn.dims['trajectory'],writeChunkLen))\n", - "if chunkList[-1]!=dataIn.dims['trajectory']:\n", - " chunkList.append(dataIn.dims['trajectory'])" - ] - }, - { - "cell_type": "markdown", - "id": "52d6ace4", - "metadata": {}, - "source": [ - "Now iterate through the indices in chunkList, and write out each bit. The following code is a bit chatty, but as the code can take a while to run, I find it reassuring to know it is doing something. If one errs in using xarray, it can take an excessive amount of time to do things, so it is worth keeping an eye on things. After you start the code below running, go teach a class or revise a manuscript. It can take a while. On my test 88Gb file, it takes about an hour." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "c14e2d39", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "starting to write to /home/pringle/jnkData/singleFile_5e4_X_10_example.zarr\n", - " getting chunk 0\n", - " starting to write\n", - " done with chunk 0 of 15 from 0 to 2600000 in 484.8957693576813\n", - " getting chunk 1\n", - " starting to write\n", - " done with chunk 1 of 15 from 2600000 to 5200000 in 490.55356001853943\n", - " getting chunk 2\n", - " starting to write\n", - " done with chunk 2 of 15 from 5200000 to 7800000 in 550.8107087612152\n", - " getting chunk 3\n", - " starting to write\n", - " done with chunk 3 of 15 from 7800000 to 10400000 in 484.8843400478363\n", - " getting chunk 4\n", - " starting to write\n", - " done with chunk 4 of 15 from 10400000 to 13000000 in 440.20081901550293\n", - " getting chunk 5\n", - " starting to write\n", - " done with chunk 5 of 15 from 13000000 to 15600000 in 344.02604961395264\n", - " getting chunk 6\n", - " starting to write\n", - " done with chunk 6 of 15 from 15600000 to 18200000 in 337.93852186203003\n", - " getting chunk 7\n", - " starting to write\n", - " done with chunk 7 of 15 from 18200000 to 20800000 in 455.27793192863464\n", - " getting chunk 8\n", - " starting to write\n", - " done with chunk 8 of 15 from 20800000 to 23400000 in 405.1483941078186\n", - " getting chunk 9\n", - " starting to write\n", - " done with chunk 9 of 15 from 23400000 to 26000000 in 310.73776030540466\n", - " getting chunk 10\n", - " starting to write\n", - " done with chunk 10 of 15 from 26000000 to 28600000 in 286.34414768218994\n", - " getting chunk 11\n", - " starting to write\n", - " done with chunk 11 of 15 from 28600000 to 31200000 in 247.74302196502686\n", - " getting chunk 12\n", - " starting to write\n", - " done with chunk 12 of 15 from 31200000 to 33800000 in 165.62516450881958\n", - " getting chunk 13\n", - " starting to write\n", - " done with chunk 13 of 15 from 33800000 to 36400000 in 155.9800374507904\n", - " getting chunk 14\n", - " starting to write\n", - " done with chunk 14 of 15 from 36400000 to 39000000 in 204.0485861301422\n", - " getting chunk 15\n", - " starting to write\n", - " done with chunk 15 of 15 from 39000000 to 39692941 in 43.03104019165039\n", - "Done writing in 5425.2\n" - ] - } - ], - "source": [ - "#write out the chunks\n", - "print('starting to write to',outputDir)\n", - "for nChunk in range(len(chunkList)-1):\n", - " innerTic=time.time()\n", - " print(' getting chunk',nChunk)\n", - " newds=dataIn.isel(trajectory=np.arange(chunkList[nChunk],chunkList[nChunk+1])) #do not sort by trajectory\n", - " print(' starting to write')\n", - " if nChunk==0:\n", - " newds.to_zarr(outputDir,mode='w')\n", - " else:\n", - " newds.to_zarr(outputDir,append_dim='trajectory')\n", - " print(' done with chunk %d of %d from'%(nChunk,len(chunkList)-2),\n", - " chunkList[nChunk],'to',chunkList[nChunk+1],'in',time.time()-innerTic)\n", - "\n", - "print('Done writing in %5.1f'%(time.time()-tic,))" + "delayedObj=dataIn.to_zarr(outputDir,compute=False)\n", + "with ProgressBar():\n", + " results=delayedObj.compute()" ] }, { @@ -389,12 +262,12 @@ "id": "9080025f", "metadata": {}, "source": [ - "We can now load the zarr data set we have created, and see what is in it, compared to what was in the input dataset. " + "We can now load the zarr data set we have created, and see what is in it, compared to what was in the input dataset. 
Note that since we have not used \"decode_times=False\", the time coordinate appears as a datetime object. " ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 20, "id": "3157592c", "metadata": { "scrolled": true @@ -406,16 +279,16 @@ "text": [ "The original data\n", " \n", - "Dimensions: (trajectory: 39692941, obs: 250)\n", + "Dimensions: (trajectory: 39363539, obs: 250)\n", "Coordinates:\n", " * obs (obs) int32 0 1 2 3 4 5 6 7 ... 242 243 244 245 246 247 248 249\n", - " * trajectory (trajectory) int64 16 23 68 165 ... 39692792 39692889 39692920\n", + " * trajectory (trajectory) int64 0 22 32 40 ... 39363210 39363255 39363379\n", "Data variables:\n", - " age (trajectory, obs) float32 dask.array\n", - " lat (trajectory, obs) float64 dask.array\n", - " lon (trajectory, obs) float64 dask.array\n", - " time (trajectory, obs) datetime64[ns] dask.array\n", - " z (trajectory, obs) float64 dask.array\n", + " age (trajectory, obs) float32 dask.array\n", + " lat (trajectory, obs) float64 dask.array\n", + " lon (trajectory, obs) float64 dask.array\n", + " time (trajectory, obs) datetime64[ns] dask.array\n", + " z (trajectory, obs) float64 dask.array\n", "Attributes:\n", " Conventions: CF-1.6/CF-1.7\n", " feature_type: trajectory\n", @@ -425,15 +298,15 @@ "\n", "The new dataSet\n", " \n", - "Dimensions: (trajectory: 39692941, obs: 250)\n", + "Dimensions: (trajectory: 39363539, obs: 250)\n", "Coordinates:\n", " * obs (obs) int32 0 1 2 3 4 5 6 7 ... 242 243 244 245 246 247 248 249\n", - " * trajectory (trajectory) int64 16 23 68 165 ... 39692792 39692889 39692920\n", + " * trajectory (trajectory) int64 0 22 32 40 ... 39363210 39363255 39363379\n", "Data variables:\n", " age (trajectory, obs) float32 dask.array\n", " lat (trajectory, obs) float32 dask.array\n", " lon (trajectory, obs) float32 dask.array\n", - " time (trajectory, obs) datetime64[ns] dask.array\n", + " time (trajectory, obs) datetime64[ns] dask.array\n", " z (trajectory, obs) float32 dask.array\n", "Attributes:\n", " Conventions: CF-1.6/CF-1.7\n", @@ -445,7 +318,7 @@ } ], "source": [ - "dataOriginal=dataIn = xr.concat([xr.open_zarr(f) for f in files], dim='trajectory', \n", + "dataOriginal=xr.concat([xr.open_zarr(f) for f in files], dim='trajectory', \n", " compat='no_conflicts', coords='minimal') \n", "dataProcessed=xr.open_zarr(outputDir)\n", "print('The original data\\n',dataOriginal,'\\n\\nThe new dataSet\\n',dataProcessed)" @@ -454,7 +327,7 @@ { "cell_type": "code", "execution_count": null, - "id": "186c678b", + "id": "74f5a9f5", "metadata": {}, "outputs": [], "source": [] diff --git a/parcels/examples/documentation_MPI.ipynb b/parcels/examples/documentation_MPI.ipynb index 4e31f3643..5f77c2692 100644 --- a/parcels/examples/documentation_MPI.ipynb +++ b/parcels/examples/documentation_MPI.ipynb @@ -381,7 +381,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -395,7 +395,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.7" + "version": "3.10.6" } }, "nbformat": 4, From c1f4f36164a1d9e7dd7e521db4c8d2c7b7944a9e Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 5 Oct 2022 09:57:59 +0200 Subject: [PATCH 78/79] Pinning dask version to 2022.9.0 Since this version still worked, and 2022.9.2 gives an error with https://github.com/OceanParcels/parcels/actions/runs/3180136958/jobs/5199374392 --- environment_py3_win.yml | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/environment_py3_win.yml b/environment_py3_win.yml index 1ea0e9ab3..f4ecf30fd 100644 --- a/environment_py3_win.yml +++ b/environment_py3_win.yml @@ -19,7 +19,7 @@ dependencies: - scipy>=0.16.0 - tqdm - xarray>=0.5.1 - - dask>=2.0 + - dask<=2022.9.0 - cftime>=1.3.1 - ipykernel - pytest From 0326f773e0faee0cfb1400ad2bd0385833123936 Mon Sep 17 00:00:00 2001 From: Erik van Sebille Date: Wed, 5 Oct 2022 12:26:41 +0200 Subject: [PATCH 79/79] Fixing 3D plotting bug in Windows --- parcels/scripts/plottrajectoriesfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parcels/scripts/plottrajectoriesfile.py b/parcels/scripts/plottrajectoriesfile.py index b8b3a83f3..f215c43db 100644 --- a/parcels/scripts/plottrajectoriesfile.py +++ b/parcels/scripts/plottrajectoriesfile.py @@ -71,7 +71,7 @@ def plotTrajectoriesFile(filename, mode='2d', tracerfile=None, tracerfield='P', if mode == '3d': from mpl_toolkits.mplot3d import Axes3D # noqa plt.clf() # clear the figure - ax = fig.gca(projection='3d') + ax = plt.axes(projection='3d') for p in range(len(lon)): ax.plot(lon[p, :], lat[p, :], z[p, :], '.-') ax.set_xlabel('Longitude')
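For context on the one-line fix above (an illustrative sketch, not part of the patch): recent Matplotlib releases no longer accept `fig.gca(projection='3d')`, and creating the 3D axes explicitly is the supported replacement.

```python
import numpy as np
import matplotlib.pyplot as plt

fig = plt.figure()
ax = plt.axes(projection='3d')            # replaces the old fig.gca(projection='3d')
t = np.linspace(0, 4 * np.pi, 200)
ax.plot(np.cos(t), np.sin(t), t, '.-')    # a dummy spiral standing in for a trajectory
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_zlabel('Depth')
plt.show()
```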