diff --git a/environment_py3_win.yml b/environment_py3_win.yml index 1ea0e9ab3..f4ecf30fd 100644 --- a/environment_py3_win.yml +++ b/environment_py3_win.yml @@ -19,7 +19,7 @@ dependencies: - scipy>=0.16.0 - tqdm - xarray>=0.5.1 - - dask>=2.0 + - dask<=2022.9.0 - cftime>=1.3.1 - ipykernel - pytest diff --git a/parcels/collection/collectionaos.py b/parcels/collection/collectionaos.py index adddb8fe7..30c961c63 100644 --- a/parcels/collection/collectionaos.py +++ b/parcels/collection/collectionaos.py @@ -1,4 +1,3 @@ -from datetime import timedelta as delta from operator import attrgetter # NOQA from ctypes import c_void_p @@ -28,24 +27,6 @@ __all__ = ['ParticleCollectionAOS', 'ParticleCollectionIterableAOS', 'ParticleCollectionIteratorAOS'] -def _to_write_particles(pd, time): - """We don't want to write a particle that is not started yet. - Particle will be written if particle.time is between time-dt/2 and time+dt (/2) - """ - return [i for i, p in enumerate(pd) if (((time - np.abs(p.dt/2) <= p.time < time + np.abs(p.dt)) - or (np.isnan(p.dt) and np.equal(time, p.time))) - and np.isfinite(p.id))] - - -def _is_particle_started_yet(particle, time): - """We don't want to write a particle that is not started yet. - Particle will be written if: - * particle.time is equal to time argument of pfile.write() - * particle.time is before time (in case particle was deleted between previous export and current one) - """ - return (particle.dt*particle.time <= particle.dt*time or np.isclose(particle.time, time)) - - def _convert_to_flat_array(var): """Convert lists and single integers/floats to one-dimensional numpy arrays @@ -890,64 +871,33 @@ def cstruct(self): cstruct = self._data_c.ctypes.data_as(c_void_p) return cstruct - def toDictionary(self, pfile, time, deleted_only=False): - """ - Convert all Particle data from one time step to a python dictionary. - :param pfile: ParticleFile object requesting the conversion - :param time: Time at which to write ParticleSet - :param deleted_only: Flag to write only the deleted Particles - returns two dictionaries: one for all variables to be written each outputdt, - and one for all variables to be written once - - This function depends on the specific collection in question and thus needs to be specified in specific - derivative classes. + def _to_write_particles(self, pd, time): + """We don't want to write a particle that is not started yet. 
+ Particle will be written if particle.time is between time-dt/2 and time+dt (/2) """ - data_dict = {} - data_dict_once = {} - - time = time.total_seconds() if isinstance(time, delta) else time + return np.array([i for i, p in enumerate(pd) if (((time - np.abs(p.dt/2) <= p.time < time + np.abs(p.dt)) + or (np.isnan(p.dt) and np.equal(time, p.time))) + and np.isfinite(p.id))]) - indices_to_write = [] - if pfile.lasttime_written != time and \ - (pfile.write_ondelete is False or deleted_only): - if self._ncount == 0: - logger.warning("ParticleSet is empty on writing as array at time %g" % time) - else: - if deleted_only: - if type(deleted_only) not in [list, np.ndarray] and deleted_only in [True, 1]: - data_states = [p.state for p in self._data] - indices_to_write = np.where(np.isin(data_states, [OperationCode.Delete]))[0] - elif type(deleted_only) in [list, np.ndarray] and len(deleted_only) > 0: - if type(deleted_only[0]) in [int, np.int32, np.uint32]: - indices_to_write = deleted_only - elif isinstance(deleted_only[0], ScipyParticle): - indices_to_write = [i for i, p in self._data if p in deleted_only] - else: - indices_to_write = _to_write_particles(self._data, time) - if len(indices_to_write) > 0: - for var in pfile.var_names: - if 'id' in var: - data_dict[var] = np.array([np.int64(getattr(p, var)) for p in self._data[indices_to_write]]) - else: - data_dict[var] = np.array([getattr(p, var) for p in self._data[indices_to_write]]) - - pset_errs = [p for p in self._data[indices_to_write] if p.state != OperationCode.Delete and abs(time-p.time) > 1e-3 and np.isfinite(p.time)] - for p in pset_errs: - logger.warning_once('time argument in pfile.write() is %g, but a particle has time % g.' % (time, p.time)) - - if len(pfile.var_names_once) > 0: - # _to_write_particles(self._data, time) - first_write = [p for p in self._data if _is_particle_started_yet(p, time) and (np.int64(p.id) not in pfile.written_once)] - if np.any(first_write): - data_dict_once['id'] = np.array([p.id for p in first_write]).astype(dtype=np.int64) - for var in pfile.var_names_once: - data_dict_once[var] = np.array([getattr(p, var) for p in first_write]) - pfile.written_once.extend(np.array(data_dict_once['id']).astype(dtype=np.int64).tolist()) - - if deleted_only is False: - pfile.lasttime_written = time + def getvardata(self, var, indices=None): + if indices is None: + return np.array([getattr(p, var) for p in self._data]) + else: + try: + return np.array([getattr(p, var) for p in self._data[indices]]) + except: # Can occur for zero-length ParticleSets + return None + + def setvardata(self, var, index, val): + if isinstance(index, (np.int64, int, np.int32)): + setattr(self._data[index], var, val) + else: + for i, v in zip(index, val): + setattr(self._data[i], var, v) - return data_dict, data_dict_once + def setallvardata(self, var, val): + for i in range(len(self._data)): + setattr(self._data[i], var, val) def toArray(self): """ diff --git a/parcels/collection/collections.py b/parcels/collection/collections.py index fa9695560..1aa06f627 100644 --- a/parcels/collection/collections.py +++ b/parcels/collection/collections.py @@ -898,19 +898,18 @@ def __getattr__(self, name): else: return False + def has_write_once_variables(self): + for var in self.ptype.variables: + if var.to_write == 'once': + return True + return False + @abstractmethod - def toDictionary(self): - """ - Convert all Particle data from one time step to a python dictionary. 
- :param pfile: ParticleFile object requesting the conversion - :param time: Time at which to write ParticleSet - :param deleted_only: Flag to write only the deleted Particles - returns two dictionaries: one for all variables to be written each outputdt, - and one for all variables to be written once + def getvardata(self, var, indices=None): + pass - This function depends on the specific collection in question and thus needs to be specified in specific - derivatives classes. - """ + @abstractmethod + def setvardata(self, var, index, val): pass @abstractmethod diff --git a/parcels/collection/collectionsoa.py b/parcels/collection/collectionsoa.py index f739acad8..16f3027a1 100644 --- a/parcels/collection/collectionsoa.py +++ b/parcels/collection/collectionsoa.py @@ -1,4 +1,3 @@ -from datetime import timedelta as delta from operator import attrgetter from ctypes import Structure, POINTER from bisect import bisect_left @@ -26,26 +25,6 @@ 'See http://oceanparcels.org/#parallel_install for more information') -def _to_write_particles(pd, time): - """We don't want to write a particle that is not started yet. - Particle will be written if particle.time is between time-dt/2 and time+dt (/2) - """ - return ((np.less_equal(time - np.abs(pd['dt']/2), pd['time'], where=np.isfinite(pd['time'])) - & np.greater_equal(time + np.abs(pd['dt'] / 2), pd['time'], where=np.isfinite(pd['time'])) - | ((np.isnan(pd['dt'])) & np.equal(time, pd['time'], where=np.isfinite(pd['time'])))) - & (np.isfinite(pd['id'])) - & (np.isfinite(pd['time']))) - - -def _is_particle_started_yet(pd, time): - """We don't want to write a particle that is not started yet. - Particle will be written if: - * particle.time is equal to time argument of pfile.write() - * particle.time is before time (in case particle was deleted between previous export and current one) - """ - return np.less_equal(pd['dt']*pd['time'], pd['dt']*time) | np.isclose(pd['time'], time) - - def _convert_to_flat_array(var): """Convert lists and single integers/floats to one-dimensional numpy arrays @@ -153,7 +132,7 @@ def __init__(self, pclass, lon, lat, depth, time, lonlatdepth_dtype, pid_orig, p self._data['depth'][:] = depth self._data['time'][:] = time self._data['id'][:] = pid - self._data['fileid'][:] = -1 + self._data['once_written'][:] = 0 # special case for exceptions which can only be handled from scipy self._data['exception'] = np.empty(self.ncount, dtype=object) @@ -815,58 +794,30 @@ def flatten_dense_data_array(vname): cstruct = CParticles(*cdata) return cstruct - def toDictionary(self, pfile, time, deleted_only=False): + def _to_write_particles(self, pd, time): + """We don't want to write a particle that is not started yet. + Particle will be written if particle.time is between time-dt/2 and time+dt (/2) """ - Convert all Particle data from one time step to a python dictionary. 
- :param pfile: ParticleFile object requesting the conversion - :param time: Time at which to write ParticleSet - :param deleted_only: Flag to write only the deleted Particles - returns two dictionaries: one for all variables to be written each outputdt, - and one for all variables to be written once + return np.where((np.less_equal(time - np.abs(pd['dt'] / 2), pd['time'], where=np.isfinite(pd['time'])) + & np.greater_equal(time + np.abs(pd['dt'] / 2), pd['time'], where=np.isfinite(pd['time'])) + | ((np.isnan(pd['dt'])) & np.equal(time, pd['time'], where=np.isfinite(pd['time'])))) + & (np.isfinite(pd['id'])) + & (np.isfinite(pd['time'])))[0] - This function depends on the specific collection in question and thus needs to be specified in specific - derivative classes. - """ - - data_dict = {} - data_dict_once = {} + def getvardata(self, var, indices=None): + if indices is None: + return self._data[var] + else: + try: + return self._data[var][indices] + except: # Can occur for zero-length ParticleSets + return None - time = time.total_seconds() if isinstance(time, delta) else time + def setvardata(self, var, index, val): + self._data[var][index] = val - indices_to_write = [] - if pfile.lasttime_written != time and \ - (pfile.write_ondelete is False or deleted_only is not False): - if self._data['id'].size == 0: - logger.warning("ParticleSet is empty on writing as array at time %g" % time) - else: - if deleted_only is not False: - if type(deleted_only) not in [list, np.ndarray] and deleted_only in [True, 1]: - indices_to_write = np.where(np.isin(self._data['state'], - [OperationCode.Delete]))[0] - elif type(deleted_only) in [list, np.ndarray]: - indices_to_write = deleted_only - else: - indices_to_write = _to_write_particles(self._data, time) - if np.any(indices_to_write): - for var in pfile.var_names: - data_dict[var] = self._data[var][indices_to_write] - - pset_errs = ((self._data['state'][indices_to_write] != OperationCode.Delete) & np.greater(np.abs(time - self._data['time'][indices_to_write]), 1e-3, where=np.isfinite(self._data['time'][indices_to_write]))) - if np.count_nonzero(pset_errs) > 0: - logger.warning_once('time argument in pfile.write() is {}, but particles have time {}'.format(time, self._data['time'][pset_errs])) - - if len(pfile.var_names_once) > 0: - first_write = (_to_write_particles(self._data, time) & _is_particle_started_yet(self._data, time) & np.isin(self._data['id'], pfile.written_once, invert=True)) - if np.any(first_write): - data_dict_once['id'] = np.array(self._data['id'][first_write]).astype(dtype=np.int64) - for var in pfile.var_names_once: - data_dict_once[var] = self._data[var][first_write] - pfile.written_once.extend(np.array(self._data['id'][first_write]).astype(dtype=np.int64).tolist()) - - if deleted_only is False: - pfile.lasttime_written = time - - return data_dict, data_dict_once + def setallvardata(self, var, val): + self._data[var][:] = val def toArray(self): """ diff --git a/parcels/examples/documentation_LargeRunsOutput.ipynb b/parcels/examples/documentation_LargeRunsOutput.ipynb new file mode 100644 index 000000000..f0ee8b9c7 --- /dev/null +++ b/parcels/examples/documentation_LargeRunsOutput.ipynb @@ -0,0 +1,357 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e2d8b054", + "metadata": {}, + "source": [ + "# Combining large ocean-parcels datasets and choosing an optimal chunking. 
" + ] + }, + { + "cell_type": "markdown", + "id": "eb5792d0", + "metadata": {}, + "source": [ + "You might imagine that if you followed the instructions [on the making of parallel runs](https://github.com/OceanParcels/parcels/blob/dump_to_zarr/parcels/examples/documentation_MPI.ipynb) and the loading of the resulting dataset, you could just use the `dataset.to_zarr()` function to save the data to a single zarr datastore. This is true for small enough datasets. However, there is a bug in xarray which makes this ineffecient for large data sets. \n", + "\n", + "At the same time, it will often improve performance if large datasets are saved as a single zarr store, chunked appropriately, and the type of the variables in them modified. It is often also useful to add other variables to the dataset. This document describes how to do all this." + ] + }, + { + "cell_type": "markdown", + "id": "4010c8d0", + "metadata": {}, + "source": [ + "## Why are we doing this? And what chunk sizes should we choose?" + ] + }, + { + "cell_type": "markdown", + "id": "46cb22d5", + "metadata": {}, + "source": [ + "If you are running a relatively small case (perhaps 1/10 the size of the memory of your machine), nearly anything you do will work. However, as your problems get larger, it can help to write the data into a single zarr datastore, and to chunk that store appropriately. \n", + "\n", + "To illustrate this, here is the time it takes to retrieve all the results (with `ds['variableName'].values`) of some common data structures with different chunk sizes. (What is a chunk size? More on that below). The data in this example has 39 million trajectories starting over 120 times, and there are 250 observations, resulting in a directory size of 88Gb in double precision and 39 in single. In this table, \"trajectory:5e4, obs:10\" indicates that each chunk extends over 50,000 trajectories and 10 obs. The chunking in the original data is roughly a few thousand observations and 10 obs. \n", + "\n", + "|File type|open [s]|read 1 obs, all traj [s]|read 23 obs, all traj [s]|read 8000 contigous traj, all obs [s]|read traj that start at a given time, all obs [s]|\n", + "|---|---|---|---|---|---|\n", + "|Straigth from parcels|2.9|8.4|59.9|1.5|17.4|\n", + "|trajectory:5e4, obs:10|0.48|2.5|19.5|0.4|10.33|\n", + "|trajectory:5e4, obs:100|0.55|20.5|13.8|0.5|3.88|\n", + "|trajectory:5e5, obs:10|0.54|2.2|16.3|0.85|18.5|\n", + "|trajectory:5e5, obs:100|0.46|19.9|40.0|0.62|49.36|\n", + "\n", + "\n", + "You can see several things in this. It is always quicker to open a single file, and for all data access patterns, there is are chunksizes that are more efficient than the default output. Why is this?\n", + "\n", + "The chunksize determines how data is stored on disk. For the default Zarr datastore, each chunk of data is stored as a single compressed file. In netCDF, chunking is similar except that the compressed data is stored within a single file. In either case, if you must access any data from within a chunk, you must read the entire chunk from disk. \n", + "\n", + "So when we access one obs dimension and many trajectories, the chunking scheme that is elongated in the trajectory direction is fastest. When we get all the observation for a scattered set of trajectories, the chunking that is elongated in observations is the best. In general, the product of the two chunksizes (the number of data points in a chunk) should be hundreds of thousands to 10s of millions. 
A suboptimal chunking scheme is usually not tragic, but if you know how you will most often access the data, you can save considerable time. " ] }, { "cell_type": "markdown", "id": "0d5d3385", "metadata": {}, "source": [ "## How to save the output of an MPI ocean parcels run to a single zarr dataset" ] }, { "cell_type": "markdown", "id": "b5001615", "metadata": {}, "source": [ "First, we need to import the necessary modules, specify the directory `inputDir` which contains the output of the parcels run (the directory that has proc01, proc02 and so forth), the location of the output zarr file `outputDir`, and a dictionary `chunksize` giving the chunk size for the `trajectory` and `obs` coordinates. " ] }, { "cell_type": "code", "execution_count": null, "id": "2622a91d", "metadata": {}, "outputs": [], "source": [ "import xarray as xr\n", "from pylab import *\n", "from numpy import *\n", "from glob import glob\n", "from os import path\n", "import time\n", "import dask\n", "from dask.diagnostics import ProgressBar\n", "\n", "#first specify the directory in which the MPI code wrote its output\n", "inputDir=('dataPathsTemp/'+\n", " 'theAmericas_wholeGlobe_range100km_depthFrom_1m_to_500m_habitatTree_months01_to_02_fixed_1m/'+\n", " '2007/tracks.zarr')\n", "\n", "\n", "#specify chunksize and where the output zarr file should go; also set chunksize of output file\n", "chunksize={'trajectory':5*int(1e4),'obs':10}; \n", "outputDir='/home/pringle/jnkData/singleFile_5e4_X_10_example.zarr'" ] }, { "cell_type": "markdown", "id": "33383cbe", "metadata": {}, "source": [ "Now for large datasets, this code can take a while to run; for 36 million trajectories and 250 observations, it can take an hour and a half. I prefer not to accidentally destroy data that takes more than an hour to create, so I put in a safety check and only let the code run if the output directory does not exist. " ] }, { "cell_type": "code", "execution_count": null, "id": "51a1414d", "metadata": {}, "outputs": [], "source": [ "#do not overwrite existing data sets\n", "if path.exists(outputDir):\n", " print('the output path',outputDir,'exists')\n", " print('please delete if you want to replace it')\n", " assert False,'stopping execution'" ] }, { "cell_type": "markdown", "id": "b8818397", "metadata": {}, "source": [ "It will often be useful to change the [`dtype`](https://numpy.org/doc/stable/reference/generated/numpy.dtype.html) of the output data. Doing so can save a great deal of disk space. For example, the input data for this example is 88GB in size, but by changing lat, lon and z to single precision, I can make the file about half as big. \n", "\n", "This comes at the cost of some accuracy. Float64 carries roughly 16 significant decimal digits, float32 roughly 7. For latitude and longitude, going from float64 to float32 increases the error by the circumference of the Earth divided by 1e7, or a few metres. This is good enough for what I am doing. However, a year of time has about 3.15e7 seconds, and we often want to know within a second when a particle is released (to avoid floating point issues when picking out particles that start at a specific time). So the 3.15e7/1e7 error (a few seconds) in the time coordinate could cause problems, and I don't want to reduce the precision of time. \n", "\n", "There is one other important issue. Due to a bug in xarray, it is much slower to save datasets with a datetime64 variable in them. So time here will be given as float64. If (as we do below) the attribute data is preserved, it will still appear as a datetime type when the data file is loaded.\n", "\n", "To change precision, put an entry into the dictionary `varType` whose key is the name of the variable, and whose value is the type you wish the variable to be cast to:" ] }, { "cell_type": "code", "execution_count": 14, "id": "9ca2bb15", "metadata": {}, "outputs": [], "source": [ "varType={\n", " 'lat':dtype('float32'),\n", " 'lon':dtype('float32'),\n", " 'time':dtype('float64'), #to avoid bug in xarray\n", " 'z':dtype('float32'),\n", " }" ] },
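{ "cell_type": "markdown", "id": "precision-note", "metadata": {}, "source": [ "As a quick, optional check of the precision argument above, the next cell (purely illustrative, not needed for the rest of this workflow, and with placeholder cell ids added here) asks `numpy.finfo` how many decimal digits each float type carries; note that `finfo` reports a conservative count:" ] }, { "cell_type": "code", "execution_count": null, "id": "precision-check", "metadata": {}, "outputs": [], "source": [ "import numpy as np  # explicit import; the `from numpy import *` above would also provide finfo\n", "for t in ['float32', 'float64']:\n", "    info = np.finfo(t)  # machine limits for this float type\n", "    print(f\"{t}: ~{info.precision} significant decimal digits (resolution {info.resolution})\")" ] },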
{ "cell_type": "markdown", "id": "c26e9ba7", "metadata": {}, "source": [ "Now we need to read in the data as discussed in the section on making an MPI run. However, note that `xr.open_zarr()` is given the `decode_times=False` option, which prevents the time variable from being converted into a datetime64[ns] object. This is necessary due to a bug in xarray. As discussed above, when the dataset is read back in, time will again be interpreted as a datetime." ] }, { "cell_type": "code", "execution_count": 15, "id": "e7dd9f61", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "opening data from multiple process files\n", " done opening in 6.37\n" ] } ], "source": [ "print('opening data from multiple process files')\n", "tic=time.time()\n", "files = glob(path.join(inputDir, \"proc*\")); \n", "dataIn = xr.concat([xr.open_zarr(f,decode_times=False) for f in files], dim='trajectory', \n", " compat='no_conflicts', coords='minimal') \n", "print(' done opening in %5.2f'%(time.time()-tic))" ] }, { "cell_type": "markdown", "id": "f93a60ff", "metadata": {}, "source": [ "Now we can take advantage of the `.astype` method to change the type of the variables. This is a lazy operation, and it will only be applied when the data values are requested below, as the data is written to the new zarr store. " ] }, { "cell_type": "code", "execution_count": 16, "id": "6819cd84", "metadata": {}, "outputs": [], "source": [ "for v in varType.keys():\n", " dataIn[v]=dataIn[v].astype(varType[v])" ] }, { "cell_type": "markdown", "id": "f8410589", "metadata": {}, "source": [ "The dataset is then rechunked to our desired shape. This does not actually do anything right now, but will when the data is written below. Before doing this, it is useful to remove the per-variable chunking metadata, because of inconsistencies which arise due to (I think) each MPI process output having a different chunking. 
This is explained in more detail in https://github.com/dcs4cop/xcube/issues/347 " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "9a56c3cc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "re-chunking\n", + " done in 9.15590238571167\n" + ] + } + ], + "source": [ + "print('re-chunking')\n", + "tic=time.time()\n", + "for v in dataIn.variables:\n", + " if 'chunks' in dataIn[v].encoding:\n", + " del dataIn[v].encoding['chunks']\n", + "dataIn=dataIn.chunk(chunksize)\n", + "print(' done in',time.time()-tic)" + ] + }, + { + "cell_type": "markdown", + "id": "6f59018b", + "metadata": {}, + "source": [ + "The dataset `dataIn` is now ready to be written back out with dataIn.to_zarr(). Because this can take a while, it is nice to delay computation and then compute() the resulting object with a progress bar, so we know how long we have to get a cup of coffee or tea. " + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "de5415ed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 33m 9ss\n" + ] + } + ], + "source": [ + "delayedObj=dataIn.to_zarr(outputDir,compute=False)\n", + "with ProgressBar():\n", + " results=delayedObj.compute()" + ] + }, + { + "cell_type": "markdown", + "id": "9080025f", + "metadata": {}, + "source": [ + "We can now load the zarr data set we have created, and see what is in it, compared to what was in the input dataset. Note that since we have not used \"decode_times=False\", the time coordinate appears as a datetime object. " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "3157592c", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The original data\n", + " \n", + "Dimensions: (trajectory: 39363539, obs: 250)\n", + "Coordinates:\n", + " * obs (obs) int32 0 1 2 3 4 5 6 7 ... 242 243 244 245 246 247 248 249\n", + " * trajectory (trajectory) int64 0 22 32 40 ... 39363210 39363255 39363379\n", + "Data variables:\n", + " age (trajectory, obs) float32 dask.array\n", + " lat (trajectory, obs) float64 dask.array\n", + " lon (trajectory, obs) float64 dask.array\n", + " time (trajectory, obs) datetime64[ns] dask.array\n", + " z (trajectory, obs) float64 dask.array\n", + "Attributes:\n", + " Conventions: CF-1.6/CF-1.7\n", + " feature_type: trajectory\n", + " ncei_template_version: NCEI_NetCDF_Trajectory_Template_v2.0\n", + " parcels_mesh: spherical\n", + " parcels_version: 2.3.2.dev137 \n", + "\n", + "The new dataSet\n", + " \n", + "Dimensions: (trajectory: 39363539, obs: 250)\n", + "Coordinates:\n", + " * obs (obs) int32 0 1 2 3 4 5 6 7 ... 242 243 244 245 246 247 248 249\n", + " * trajectory (trajectory) int64 0 22 32 40 ... 
39363210 39363255 39363379\n", + "Data variables:\n", + " age (trajectory, obs) float32 dask.array\n", + " lat (trajectory, obs) float32 dask.array\n", + " lon (trajectory, obs) float32 dask.array\n", + " time (trajectory, obs) datetime64[ns] dask.array\n", + " z (trajectory, obs) float32 dask.array\n", + "Attributes:\n", + " Conventions: CF-1.6/CF-1.7\n", + " feature_type: trajectory\n", + " ncei_template_version: NCEI_NetCDF_Trajectory_Template_v2.0\n", + " parcels_mesh: spherical\n", + " parcels_version: 2.3.2.dev137\n" + ] + } + ], + "source": [ + "dataOriginal=xr.concat([xr.open_zarr(f) for f in files], dim='trajectory', \n", + " compat='no_conflicts', coords='minimal') \n", + "dataProcessed=xr.open_zarr(outputDir)\n", + "print('The original data\\n',dataOriginal,'\\n\\nThe new dataSet\\n',dataProcessed)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74f5a9f5", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/parcels/examples/documentation_MPI.ipynb b/parcels/examples/documentation_MPI.ipynb index 8b3cba3c3..687736a05 100644 --- a/parcels/examples/documentation_MPI.ipynb +++ b/parcels/examples/documentation_MPI.ipynb @@ -41,7 +41,74 @@ "\n", "Parcels will then split the `ParticleSet` into `` smaller ParticleSets, based on a `sklearn.cluster.KMeans` clustering. Each of those smaller `ParticleSets` will be executed by one of the `` MPI processors.\n", "\n", - "Note that in principle this means that all MPI processors need access to the full `FieldSet`, which can be Gigabytes in size for large global datasets. Therefore, efficient parallelisation only works if at the same time we also chunk the `FieldSet` into smaller domains" + "Note that in principle this means that all MPI processors need access to the full `FieldSet`, which can be Gigabytes in size for large global datasets. Therefore, efficient parallelisation only works if at the same time we also chunk the `FieldSet` into smaller domains." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reading in the ParticleFile data in zarr format\n", + "\n", + "For efficiency, each processor will write its own data to a `zarr`-store. If the name of your `ParticleFile` is `fname`, then these stores will be located at `fname/proc00.zarr`, `fname/proc01.zarr`, etc.\n", + "\n", + "Reading in these stores and merging them into one `xarray.Dataset` can be done with" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from glob import glob\n", + "from os import path\n", + "\n", + "files = glob(path.join(fname, \"proc*\"))\n", + "ds = xr.concat([xr.open_zarr(f) for f in files], dim='trajectory', compat='no_conflicts', coords='minimal')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that, if you have added particles during the `execute()` (for example because you used `repeatdt`), then the trajectories will not be ordered monotonically. While this may not be a problem, this will result in a different Dataset than a single-core simulation. 
If you do want the outputs of the MPI run to be the same as the single-core run, add `.sortby(['trajectory'])` at the end of the `xr.concat()` command." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ds = xr.concat([xr.open_zarr(f) for f in files], dim='trajectory', \n", " compat='no_conflicts', coords='minimal').sortby(['trajectory'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that if you want, you can save this new DataSet with the `.to_zarr()` or `.to_netcdf()` methods. \n", "\n", "When using `.to_zarr()`, further analyses may be sped up by first rechunking the DataSet using `ds.chunk()`. Note that in some cases you will first need to remove the chunks encoding information manually, using code like the cell below." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for v in ds.variables:\n", " del ds[v].encoding['chunks']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For small projects, the above instructions are sufficient. If your project is large, then it is helpful to combine the `proc*` directories into a single zarr dataset and to optimize the chunking for your analysis. What is \"large\"? If you find yourself running out of memory while doing your analyses, saving the results, or sorting the dataset, or if reading the data is taking longer than you can tolerate, your problem is \"large.\" Another rule of thumb is that if the size of your output directory is 1/3 or more of the memory of your machine, your problem is large. Chunking and combining the `proc*` data in order to speed up analysis is discussed [in the documentation on runs with large output](https://nbviewer.org/github/OceanParcels/parcels/blob/master/parcels/examples/documentation_LargeRunsOutput.ipynb). " ] }, { @@ -321,7 +388,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -335,7 +402,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.7" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/parcels/examples/example_decaying_moving_eddy.py b/parcels/examples/example_decaying_moving_eddy.py index 1df622d18..155e4d485 100644 --- a/parcels/examples/example_decaying_moving_eddy.py +++ b/parcels/examples/example_decaying_moving_eddy.py @@ -70,7 +70,7 @@ def decaying_moving_example(fieldset, outfile, mode='scipy', method=AdvectionRK4 @pytest.mark.parametrize('mode', ['scipy', 'jit']) def test_rotation_example(mode, tmpdir): - outfile = tmpdir.join('DecayingMovingParticle.nc') + outfile = tmpdir.join('DecayingMovingParticle.zarr') fieldset = decaying_moving_eddy_fieldset() pset = decaying_moving_example(fieldset, outfile, mode=mode) vals = true_values(pset[0].time, start_lon, start_lat) # Calculate values for the particle. 
@@ -79,7 +79,7 @@ def test_rotation_example(mode, tmpdir): if __name__ == "__main__": fset_filename = 'decaying_moving_eddy' - outfile = 'DecayingMovingParticle.nc' + outfile = 'DecayingMovingParticle.zarr' fieldset = decaying_moving_eddy_fieldset() fieldset.write(fset_filename) diff --git a/parcels/examples/example_globcurrent.py b/parcels/examples/example_globcurrent.py index 483b3fcc8..53a72529d 100644 --- a/parcels/examples/example_globcurrent.py +++ b/parcels/examples/example_globcurrent.py @@ -264,7 +264,7 @@ def DeleteParticle(particle, fieldset, time): @pytest.mark.parametrize('dt', [-300, 300]) @pytest.mark.parametrize('pid_offset', [0, 20]) def test_globcurrent_pset_fromfile(mode, dt, pid_offset, tmpdir): - filename = tmpdir.join("pset_fromparticlefile.nc") + filename = tmpdir.join("pset_fromparticlefile.zarr") fieldset = set_globcurrent_fieldset() ptype[mode].setLastID(pid_offset) diff --git a/parcels/examples/example_mitgcm.py b/parcels/examples/example_mitgcm.py index 854c73c7f..cde1ee092 100644 --- a/parcels/examples/example_mitgcm.py +++ b/parcels/examples/example_mitgcm.py @@ -48,7 +48,7 @@ def periodicBC(particle, fieldset, time): size=10, ) pfile = ParticleFile( - "MIT_particles_" + str(mode) + ".nc", pset, outputdt=delta(days=1) + "MIT_particles_" + str(mode) + ".zarr", pset, outputdt=delta(days=1), chunks=(len(pset), 1) ) kernels = AdvectionRK4 + pset.Kernel(periodicBC) pset.execute( @@ -61,8 +61,8 @@ def test_mitgcm_output_compare(): run_mitgcm_zonally_reentrant("scipy") run_mitgcm_zonally_reentrant("jit") - ds_jit = xr.open_dataset("MIT_particles_jit.nc") - ds_scipy = xr.open_dataset("MIT_particles_scipy.nc") + ds_jit = xr.open_zarr("MIT_particles_jit.zarr") + ds_scipy = xr.open_zarr("MIT_particles_scipy.zarr") np.testing.assert_allclose(ds_jit.lat.data, ds_scipy.lat.data) np.testing.assert_allclose(ds_jit.lon.data, ds_scipy.lon.data) diff --git a/parcels/examples/example_nemo_curvilinear.py b/parcels/examples/example_nemo_curvilinear.py index 5ccd726f1..b8dc45cf9 100644 --- a/parcels/examples/example_nemo_curvilinear.py +++ b/parcels/examples/example_nemo_curvilinear.py @@ -56,7 +56,7 @@ def periodicBC(particle, fieldSet, time): def make_plot(trajfile): - from netCDF4 import Dataset + import xarray as xr import matplotlib.pyplot as plt import cartopy @@ -66,10 +66,10 @@ def __init__(self): def load_particles_file(fname, varnames): T = ParticleData() - pfile = Dataset(fname, 'r') - T.id = pfile.variables['trajectory'][:] + ds = xr.open_zarr(fname) + T.id = ds['trajectory'][:] for v in varnames: - setattr(T, v, pfile.variables[v][:]) + setattr(T, v, ds[v][:]) return T T = load_particles_file(trajfile, ['lon', 'lat', 'time']) @@ -121,4 +121,4 @@ def test_nemo_3D_samegrid(): outfile = "nemo_particles" run_nemo_curvilinear(args.mode, outfile) - make_plot(outfile+'.nc') + make_plot(outfile+'.zarr') diff --git a/parcels/examples/example_stommel.py b/parcels/examples/example_stommel.py index 5cb510cd2..b203c1f85 100644 --- a/parcels/examples/example_stommel.py +++ b/parcels/examples/example_stommel.py @@ -81,7 +81,7 @@ def AgeP(particle, fieldset, time): def stommel_example(npart=1, mode='jit', verbose=False, method=AdvectionRK4, grid_type='A', - outfile="StommelParticle.nc", repeatdt=None, maxage=None, write_fields=True, pset_mode='soa'): + outfile="StommelParticle.zarr", repeatdt=None, maxage=None, write_fields=True, pset_mode='soa'): timer.fieldset = timer.Timer('FieldSet', parent=timer.stommel) fieldset = stommel_fieldset(grid_type=grid_type) if write_fields: @@ -158,7 
+158,7 @@ def test_stommel_fieldset(pset_mode, mode, grid_type, tmpdir): help='Print particle information before and after execution') p.add_argument('-m', '--method', choices=('RK4', 'EE', 'RK45'), default='RK4', help='Numerical method used for advection') - p.add_argument('-o', '--outfile', default='StommelParticle.nc', + p.add_argument('-o', '--outfile', default='StommelParticle.zarr', help='Name of output file') p.add_argument('-r', '--repeatdt', default=None, type=int, help='repeatdt of the ParticleSet') diff --git a/parcels/examples/parcels_tutorial.ipynb b/parcels/examples/parcels_tutorial.ipynb index 163e2b041..8a000500f 100644 --- a/parcels/examples/parcels_tutorial.ipynb +++ b/parcels/examples/parcels_tutorial.ipynb @@ -178,7 +178,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The final step is to run (or 'execute') the `ParticelSet`. We run the particles using the `AdvectionRK4` kernel, which is a 4th order Runge-Kutte implementation that comes with Parcels. We run the particles for 6 days (using the `timedelta` function from `datetime`), at an RK4 timestep of 5 minutes. We store the trajectory information at an interval of 1 hour in a file called `EddyParticles.nc`. Because `time` was `not_yet_set`, the particles will be advected from the first date available in the `fieldset`, which is the default behaviour." + "The final step is to run (or 'execute') the `ParticelSet`. We run the particles using the `AdvectionRK4` kernel, which is a 4th order Runge-Kutte implementation that comes with Parcels. We run the particles for 6 days (using the `timedelta` function from `datetime`), at an RK4 timestep of 5 minutes. We store the trajectory information at an interval of 1 hour in a file called `EddyParticles.zarr`. Because `time` was `not_yet_set`, the particles will be advected from the first date available in the `fieldset`, which is the default behaviour." ] }, { @@ -195,7 +195,7 @@ } ], "source": [ - "output_file = pset.ParticleFile(name=\"EddyParticles.nc\", outputdt=timedelta(hours=1)) # the file name and the time step of the outputs\n", + "output_file = pset.ParticleFile(name=\"EddyParticles.zarr\", outputdt=timedelta(hours=1)) # the file name and the time step of the outputs\n", "pset.execute(AdvectionRK4, # the kernel (which defines how particles move)\n", " runtime=timedelta(days=6), # the total length of the run\n", " dt=timedelta(minutes=5), # the timestep of the kernel\n", @@ -251,7 +251,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The trajectory information of the particles can be written to the `EddyParticles.nc` file by using the `.export()` method on the output file. The trajectory can then be quickly plotted using the `plotTrajectoriesFile` function." + "The trajectories in the `EddyParticles.zarr` file can be quickly plotted using the `plotTrajectoriesFile` function." 
] }, { @@ -273,8 +273,7 @@ } ], "source": [ - "output_file.export()\n", - "plotTrajectoriesFile('EddyParticles.nc');" + "plotTrajectoriesFile('EddyParticles.zarr');" ] }, { @@ -1151,7 +1150,7 @@ } ], "source": [ - "plotTrajectoriesFile('EddyParticles.nc', mode='movie2d_notebook')" + "plotTrajectoriesFile('EddyParticles.zarr', mode='movie2d_notebook')" ] }, { @@ -1180,7 +1179,7 @@ } ], "source": [ - "plotTrajectoriesFile('EddyParticles.nc', mode='hist2d', bins=[30, 20]);" + "plotTrajectoriesFile('EddyParticles.zarr', mode='hist2d', bins=[30, 20]);" ] }, { @@ -1226,7 +1225,7 @@ "metadata": {}, "outputs": [], "source": [ - "output_file = pset.ParticleFile(name=\"EddyParticles_Bwd.nc\", outputdt=timedelta(hours=1)) # the file name and the time step of the outputs\n", + "output_file = pset.ParticleFile(name=\"EddyParticles_Bwd.zarr\", outputdt=timedelta(hours=1)) # the file name and the time step of the outputs\n", "pset.execute(AdvectionRK4,\n", " dt=-timedelta(minutes=5), # negative timestep for backward run\n", " runtime=timedelta(days=6), # the run time\n", @@ -1329,7 +1328,7 @@ "\n", "k_WestVel = pset.Kernel(WestVel) # casting the WestVel function to a kernel object\n", "\n", - "output_file = pset.ParticleFile(name=\"EddyParticles_WestVel.nc\", outputdt=timedelta(hours=1))\n", + "output_file = pset.ParticleFile(name=\"EddyParticles_WestVel.zarr\", outputdt=timedelta(hours=1))\n", "pset.execute(AdvectionRK4 + k_WestVel, # simply add kernels using the + operator\n", " runtime=timedelta(days=2),\n", " dt=timedelta(minutes=5),\n", @@ -1362,8 +1361,7 @@ } ], "source": [ - "output_file.export()\n", - "plotTrajectoriesFile('EddyParticles_WestVel.nc');" + "plotTrajectoriesFile('EddyParticles_WestVel.zarr');" ] }, { @@ -1492,7 +1490,7 @@ } ], "source": [ - "output_file = pset.ParticleFile(name=\"GlobCurrentParticles.nc\", outputdt=timedelta(hours=6))\n", + "output_file = pset.ParticleFile(name=\"GlobCurrentParticles.zarr\", outputdt=timedelta(hours=6))\n", "pset.execute(AdvectionRK4,\n", " runtime=timedelta(days=10),\n", " dt=timedelta(minutes=5),\n", @@ -1532,8 +1530,7 @@ } ], "source": [ - "output_file.export()\n", - "plotTrajectoriesFile('GlobCurrentParticles.nc',\n", + "plotTrajectoriesFile('GlobCurrentParticles.zarr',\n", " tracerfile='GlobCurrent_example_data/20020101000000-GLOBCURRENT-L4-CUReul_hs-ALT_SUM-v02.0-fv01.0.nc',\n", " tracerlon='lon',\n", " tracerlat='lat',\n", @@ -1825,14 +1822,14 @@ "pset.execute(AdvectionRK4 + k_dist, # Add kernels using the + operator.\n", " runtime=timedelta(days=6),\n", " dt=timedelta(minutes=5),\n", - " output_file=pset.ParticleFile(name=\"GlobCurrentParticles_Dist.nc\", outputdt=timedelta(hours=1)))" + " output_file=pset.ParticleFile(name=\"GlobCurrentParticles_Dist.zarr\", outputdt=timedelta(hours=1)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "And finally print the distance in km that each particle has travelled (note that this is also stored in the `EddyParticles_Dist.nc` file)" + "And finally print the distance in km that each particle has travelled (note that this is also stored in the `EddyParticles_Dist.zarr` file)" ] }, { diff --git a/parcels/examples/tutorial_Argofloats.ipynb b/parcels/examples/tutorial_Argofloats.ipynb index d823792da..fdd3ef621 100644 --- a/parcels/examples/tutorial_Argofloats.ipynb +++ b/parcels/examples/tutorial_Argofloats.ipynb @@ -156,17 +156,15 @@ ], "source": [ "%matplotlib inline\n", - "import netCDF4\n", + "import xarray as xr\n", "from mpl_toolkits.mplot3d import Axes3D\n", "import matplotlib.pyplot 
as plt\n", "\n", - "output_file.export() # export the trajectory data to a netcdf file\n", - "\n", - "nc = netCDF4.Dataset(\"argo_float.nc\")\n", - "x = nc.variables[\"lon\"][:].squeeze()\n", - "y = nc.variables[\"lat\"][:].squeeze()\n", - "z = nc.variables[\"z\"][:].squeeze()\n", - "nc.close()\n", + "ds = xr.open_zarr(\"argo_float.zarr\")\n", + "x = ds[\"lon\"][:].squeeze()\n", + "y = ds[\"lat\"][:].squeeze()\n", + "z = ds[\"z\"][:].squeeze()\n", + "ds.close()\n", "\n", "fig = plt.figure(figsize=(13,10))\n", "ax = plt.axes(projection='3d')\n", diff --git a/parcels/examples/tutorial_NestedFields.ipynb b/parcels/examples/tutorial_NestedFields.ipynb index f9292f22b..186638344 100644 --- a/parcels/examples/tutorial_NestedFields.ipynb +++ b/parcels/examples/tutorial_NestedFields.ipynb @@ -133,11 +133,10 @@ ], "source": [ "pset = ParticleSet(fieldset, pclass=JITParticle, lon=[0], lat=[1000])\n", - "output_file = pset.ParticleFile(name='NestedFieldParticle.nc', outputdt=50)\n", + "output_file = pset.ParticleFile(name='NestedFieldParticle.zarr', outputdt=50)\n", "pset.execute(AdvectionRK4, runtime=14000, dt=10, output_file=output_file)\n", - "output_file.export() # export the trajectory data to a netcdf file\n", "\n", - "plt = plotTrajectoriesFile('NestedFieldParticle.nc', show_plt=False)\n", + "plt = plotTrajectoriesFile('NestedFieldParticle.zarr', show_plt=False)\n", "plt.plot([0,2e3,2e3,0,0],[0,0,2e3,2e3,0], c='orange')\n", "plt.plot([-2e3,18e3,18e3,-2e3,-2e3],[-1e3,-1e3,3e3,3e3,-1e3], c='green');" ] diff --git a/parcels/examples/tutorial_SummedFields.ipynb b/parcels/examples/tutorial_SummedFields.ipynb index 63d7c04ea..55b0bb226 100644 --- a/parcels/examples/tutorial_SummedFields.ipynb +++ b/parcels/examples/tutorial_SummedFields.ipynb @@ -82,10 +82,10 @@ ], "source": [ "pset = ParticleSet(fieldset_flow, pclass=JITParticle, lon=[0], lat=[900])\n", - "output_file = pset.ParticleFile(name='SummedFieldParticle_flow.nc', outputdt=1)\n", + "output_file = pset.ParticleFile(name='SummedFieldParticle_flow.zarr', outputdt=1)\n", "pset.execute(AdvectionRK4, runtime=10, dt=1, output_file=output_file)\n", - "output_file.export() # export the trajectory data to a netcdf file\n", - "plotTrajectoriesFile('SummedFieldParticle_flow.nc');" + "\n", + "plotTrajectoriesFile('SummedFieldParticle_flow.zarr');" ] }, { @@ -148,10 +148,10 @@ ], "source": [ "pset = ParticleSet(fieldset_stokes, pclass=JITParticle, lon=[0], lat=[900])\n", - "output_file = pset.ParticleFile(name='SummedFieldParticle_stokes.nc', outputdt=1)\n", + "output_file = pset.ParticleFile(name='SummedFieldParticle_stokes.zarr', outputdt=1)\n", "pset.execute(AdvectionRK4, runtime=10, dt=1, output_file=output_file)\n", - "output_file.export() # export the trajectory data to a netcdf file\n", - "plotTrajectoriesFile('SummedFieldParticle_stokes.nc');" + "\n", + "plotTrajectoriesFile('SummedFieldParticle_stokes.zarr');" ] }, { @@ -202,10 +202,10 @@ ], "source": [ "pset = ParticleSet(fieldset_sum, pclass=JITParticle, lon=[0], lat=[900])\n", - "output_file = pset.ParticleFile(name='SummedFieldParticle_sum.nc', outputdt=1)\n", + "output_file = pset.ParticleFile(name='SummedFieldParticle_sum.zarr', outputdt=1)\n", "pset.execute(AdvectionRK4, runtime=10, dt=1, output_file=output_file)\n", - "output_file.export() # export the trajectory data to a netcdf file\n", - "plotTrajectoriesFile('SummedFieldParticle_sum.nc');" + "\n", + "plotTrajectoriesFile('SummedFieldParticle_sum.zarr');" ] }, { diff --git a/parcels/examples/tutorial_analyticaladvection.ipynb 
b/parcels/examples/tutorial_analyticaladvection.ipynb index 1b25befc0..c31680bad 100644 --- a/parcels/examples/tutorial_analyticaladvection.ipynb +++ b/parcels/examples/tutorial_analyticaladvection.ipynb @@ -149,7 +149,7 @@ "\n", "pset = ParticleSet(fieldsetRR, pclass=MyParticle, lon=0, lat=4e3, time=0)\n", "\n", - "output = pset.ParticleFile(name='radialAnalytical.nc', outputdt=delta(hours=1))\n", + "output = pset.ParticleFile(name='radialAnalytical.zarr', outputdt=delta(hours=1))\n", "pset.execute(pset.Kernel(UpdateR) + AdvectionAnalytical,\n", " runtime=delta(hours=24),\n", " dt=np.inf, # needs to be set to np.inf for Analytical Advection\n", @@ -192,7 +192,7 @@ ], "source": [ "output.close()\n", - "plotTrajectoriesFile('radialAnalytical.nc')\n", + "plotTrajectoriesFile('radialAnalytical.zarr')\n", "\n", "print('Particle radius at start of run %f' % pset.radius_start[0])\n", "print('Particle radius at end of run %f' % pset.radius[0])\n", @@ -265,7 +265,7 @@ "X, Y = np.meshgrid(np.arange(0.15, 1.85, 0.1), np.arange(0.15, 0.85, 0.1))\n", "psetAA = ParticleSet(fieldsetDG, pclass=ScipyParticle, lon=X, lat=Y)\n", "\n", - "output = psetAA.ParticleFile(name='doublegyreAA.nc', outputdt=0.1)\n", + "output = psetAA.ParticleFile(name='doublegyreAA.zarr', outputdt=0.1)\n", "psetAA.execute(AdvectionAnalytical,\n", " dt=np.inf, # needs to be set to np.inf for Analytical Advection\n", " runtime=3,\n", @@ -1832,7 +1832,7 @@ ], "source": [ "output.close()\n", - "plotTrajectoriesFile('doublegyreAA.nc', mode='movie2d_notebook')" + "plotTrajectoriesFile('doublegyreAA.zarr', mode='movie2d_notebook')" ] }, { @@ -2008,7 +2008,7 @@ "\n", "psetAA = ParticleSet(fieldsetBJ, pclass=ScipyParticle, lon=X, lat=Y, time=0)\n", "\n", - "output = psetAA.ParticleFile(name='bickleyjetAA.nc', outputdt=delta(hours=1))\n", + "output = psetAA.ParticleFile(name='bickleyjetAA.zarr', outputdt=delta(hours=1))\n", "psetAA.execute(AdvectionAnalytical+psetAA.Kernel(ZonalBC),\n", " dt=np.inf,\n", " runtime=delta(days=1),\n", @@ -3161,7 +3161,7 @@ ], "source": [ "output.close()\n", - "plotTrajectoriesFile('bickleyjetAA.nc', mode='movie2d_notebook')" + "plotTrajectoriesFile('bickleyjetAA.zarr', mode='movie2d_notebook')" ] }, { diff --git a/parcels/examples/tutorial_delaystart.ipynb b/parcels/examples/tutorial_delaystart.ipynb index f333ad5e8..3559a1d9b 100644 --- a/parcels/examples/tutorial_delaystart.ipynb +++ b/parcels/examples/tutorial_delaystart.ipynb @@ -103,10 +103,9 @@ } ], "source": [ - "output_file = pset.ParticleFile(name=\"DelayParticle_time.nc\", outputdt=delta(hours=1))\n", + "output_file = pset.ParticleFile(name=\"DelayParticle_time.zarr\", outputdt=delta(hours=1))\n", "pset.execute(AdvectionRK4, runtime=delta(hours=24), dt=delta(minutes=5),\n", - " output_file=output_file)\n", - "output_file.export() # export the trajectory data to a netcdf file" + " output_file=output_file)" ] }, { @@ -550,7 +549,7 @@ } ], "source": [ - "plotTrajectoriesFile('DelayParticle_time.nc', mode='movie2d_notebook')" + "plotTrajectoriesFile('DelayParticle_time.zarr', mode='movie2d_notebook')" ] }, { @@ -1163,8 +1162,7 @@ } ], "source": [ - "output_file.export() # export the trajectory data to a netcdf file\n", - "plotTrajectoriesFile('DelayParticle_releasedt.nc', mode='movie2d_notebook')" + "plotTrajectoriesFile('DelayParticle_releasedt.zarr', mode='movie2d_notebook')" ] }, { @@ -1718,8 +1716,7 @@ "pset.execute(AdvectionRK4, runtime=delta(hours=15), dt=delta(minutes=5),\n", " output_file=output_file)\n", "\n", - "output_file.export() # export the 
trajectory data to a netcdf file\n", - "plotTrajectoriesFile('DelayParticle_releasedt_9hrs.nc', mode='movie2d_notebook')" + "plotTrajectoriesFile('DelayParticle_releasedt_9hrs.zarr', mode='movie2d_notebook')" ] }, { @@ -1767,7 +1764,7 @@ } ], "source": [ - "outfilepath = \"DelayParticle_nonmatchingtime.nc\"\n", + "outfilepath = \"DelayParticle_nonmatchingtime.zarr\"\n", "\n", "pset = ParticleSet(fieldset=fieldset, pclass=JITParticle,\n", " lat=[3e3]*3, lon=[3e3]*3, time=[0, 1, 2])\n", @@ -1899,7 +1896,7 @@ " particle.mass = particle.mass / 2.\n", "\n", "pset = ParticleSet(fieldset=fieldset, pclass=GrowingParticle, lon=0, lat=0)\n", - "outfile = ParticleFile('growingparticles.nc', pset, outputdt=1)\n", + "outfile = ParticleFile('growingparticles.zarr', pset, outputdt=1)\n", "\n", "for t in range(40):\n", " pset.execute(GrowParticles, runtime=1, dt=1, output_file=outfile)\n", @@ -1907,8 +1904,7 @@ " if p.splittime > 0:\n", " pset.add(ParticleSet(fieldset=fieldset, pclass=GrowingParticle, lon=0, lat=0, \n", " time=p.splittime, mass=p.splitmass))\n", - " p.splittime = -1 # reset splittime\n", - "outfile.close()" + " p.splittime = -1 # reset splittime" ] }, { @@ -1939,7 +1935,7 @@ } ], "source": [ - "ds = xr.open_dataset('growingparticles.nc')\n", + "ds = xr.open_zarr('growingparticles.zarr')\n", "plt.plot(ds.time.values[:].astype('timedelta64[s]').T, ds.mass.T)\n", "plt.grid()\n", "plt.xlabel('Time')\n", diff --git a/parcels/examples/tutorial_diffusion.ipynb b/parcels/examples/tutorial_diffusion.ipynb index 2a3e5ffc6..9fd09fbcb 100644 --- a/parcels/examples/tutorial_diffusion.ipynb +++ b/parcels/examples/tutorial_diffusion.ipynb @@ -225,15 +225,15 @@ "source": [ "dt = 0.001\n", "testParticles = get_test_particles()\n", - "output_file = testParticles.ParticleFile(name=\"M1_out.nc\",\n", + "output_file = testParticles.ParticleFile(name=\"M1_out.zarr\",\n", + " chunks=(len(testParticles), 1),\n", " outputdt=timedelta(seconds=dt))\n", "ParcelsRandom.seed(1636) # Random seed for reproducibility\n", "testParticles.execute(AdvectionDiffusionM1,\n", " runtime=timedelta(seconds=0.3),\n", " dt=timedelta(seconds=dt),\n", " output_file=output_file,\n", - " verbose_progress=True)\n", - "output_file.close() # to write the output to a netCDF file, since `output_file` does not close automatically when using notebooks" + " verbose_progress=True)" ] }, { @@ -242,7 +242,7 @@ "metadata": {}, "outputs": [], "source": [ - "M1_out = xr.open_dataset(\"M1_out.nc\")" + "M1_out = xr.open_zarr(\"M1_out.zarr\")" ] }, { @@ -312,15 +312,15 @@ "source": [ "dt = 0.001\n", "testParticles = get_test_particles()\n", - "output_file = testParticles.ParticleFile(name=\"EM_out.nc\",\n", + "output_file = testParticles.ParticleFile(name=\"EM_out.zarr\",\n", + " chunks=(len(testParticles), 1),\n", " outputdt=timedelta(seconds=dt))\n", "ParcelsRandom.seed(1636) # Random seed for reproducibility\n", "testParticles.execute(AdvectionDiffusionEM,\n", " runtime=timedelta(seconds=0.3),\n", " dt=timedelta(seconds=dt),\n", " output_file=output_file,\n", - " verbose_progress=True)\n", - "output_file.close() # to write the output to a netCDF file, since `output_file` does not close automatically when using notebooks" + " verbose_progress=True)" ] }, { @@ -329,7 +329,7 @@ "metadata": {}, "outputs": [], "source": [ - "EM_out = xr.open_dataset(\"EM_out.nc\")" + "EM_out = xr.open_zarr(\"EM_out.zarr\")" ] }, { @@ -537,7 +537,7 @@ ], "source": [ "kernels = pset.Kernel(AdvectionRK4) + pset.Kernel(smagdiff)\n", - "output_file = 
pset.ParticleFile(name='Global_smagdiff.nc', outputdt=timedelta(hours=6))\n", + "output_file = pset.ParticleFile(name='Global_smagdiff.zarr', outputdt=timedelta(hours=6))\n", "\n", "pset.execute(kernels, runtime=timedelta(days=5), dt=timedelta(minutes=5), output_file=output_file, recovery={ErrorCode.ErrorOutOfBounds: DeleteParticle})\n", "pset.show(field=fieldset.U)" @@ -578,7 +578,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Save the output file and visualise the trajectories\n" + "Visualise the trajectories" ] }, { @@ -600,8 +600,7 @@ } ], "source": [ - "output_file.export()\n", - "plotTrajectoriesFile('Global_smagdiff.nc',\n", + "plotTrajectoriesFile('Global_smagdiff.zarr',\n", " tracerfile='GlobCurrent_example_data/20020120000000-GLOBCURRENT-L4-CUReul_hs-ALT_SUM-v02.0-fv01.0.nc',\n", " tracerlon='lon', tracerlat='lat', tracerfield='eastward_eulerian_current_velocity');" ] diff --git a/parcels/examples/tutorial_interaction.ipynb b/parcels/examples/tutorial_interaction.ipynb index c835090a6..509893ea4 100644 --- a/parcels/examples/tutorial_interaction.ipynb +++ b/parcels/examples/tutorial_interaction.ipynb @@ -101,7 +101,7 @@ " n.lon += dlon\n", " n.depth += ddepth\n", "\n", - " mutator[n.id].append((f, d_vec)) # add mutation to the mutator\n" + " mutator[n.id].append((f, d_vec)) # add mutation to the mutator" ] }, { @@ -141,13 +141,11 @@ " interaction_distance=0.5, # note the interaction_distance argument here\n", " attractor=attractor)\n", "\n", - "output_file = pset.ParticleFile(name=\"InteractingParticles.nc\", outputdt=1)\n", + "output_file = pset.ParticleFile(name=\"InteractingParticles.zarr\", outputdt=1)\n", "\n", "pset.execute(pyfunc=DiffusionUniformKh, \n", " pyfunc_inter=Pull, # note the pyfunc_inter here\n", - " runtime=60, dt=1, output_file=output_file)\n", - "\n", - "output_file.close()" + " runtime=60, dt=1, output_file=output_file)" ] }, { @@ -158,7 +156,7 @@ "outputs": [], "source": [ "%%capture\n", - "data_xarray = xr.open_dataset('InteractingParticles.nc')\n", + "data_xarray = xr.open_zarr('InteractingParticles.zarr')\n", "data_attr = data_xarray.where(data_xarray['attractor']==1, drop=True)\n", "data_other = data_xarray.where(data_xarray['attractor']==0, drop=True)\n", "\n", @@ -31273,13 +31271,11 @@ " lon=X, lat=Y,\n", " interaction_distance=0.05) # note the interaction_distance argument here\n", "\n", - "output_file = pset.ParticleFile(name=\"MergingParticles.nc\", outputdt=1)\n", + "output_file = pset.ParticleFile(name=\"MergingParticles.zarr\", outputdt=1)\n", "\n", "pset.execute(pyfunc=DiffusionUniformKh, \n", " pyfunc_inter=pset.InteractionKernel(NearestNeighborWithinRange) + MergeWithNearestNeighbor, # note the pyfunc_inter here\n", - " runtime=60, dt=1, output_file=output_file)\n", - "\n", - "output_file.close()" + " runtime=60, dt=1, output_file=output_file)" ] }, { @@ -31290,7 +31286,7 @@ "outputs": [], "source": [ "%%capture\n", - "data_xarray = xr.open_dataset('MergingParticles.nc')\n", + "data_xarray = xr.open_zarr('MergingParticles.zarr')\n", "\n", "timerange = np.arange(np.nanmin(data_xarray['time'].values),\n", " np.nanmax(data_xarray['time'].values), np.timedelta64(1, 's')) # timerange in nanoseconds\n", @@ -62047,7 +62043,7 @@ "\n", "# The function that creates the animation\n", "def Animate():\n", - " data_xarray = xr.open_dataset('InteractingParticles.nc')\n", + " data_xarray = xr.open_zarr('InteractingParticles.zarr')\n", " data_attr = data_xarray.where(data_xarray['ptype']==1, drop=True)\n", " data_other = 
data_xarray.where(data_xarray['ptype']==-1, drop=True)\n", "\n", @@ -86874,11 +86870,10 @@ " lon=X, lat=Y,\n", " interaction_distance=0.7, # note the interaction_distance argument here\n", " ptype=ptype)\n", - "output_file = pset.ParticleFile(name=\"InteractingParticles.nc\", outputdt=1)\n", + "output_file = pset.ParticleFile(name=\"InteractingParticles.zarr\", outputdt=1)\n", "pset.execute(#pyfunc=DiffusionUniformKh, \n", " pyfunc_inter=Move, # note the pyfunc_inter here\n", " runtime=60, dt=1, output_file=output_file)\n", - "output_file.close()\n", "anim = Animate() # Create animation\n", "HTML(anim.to_jshtml())" ] @@ -111541,12 +111536,11 @@ " lon=X, lat=Y,\n", " interaction_distance=0.7, # note the interaction_distance argument here\n", " ptype=ptype)\n", - "output_file = pset.ParticleFile(name=\"InteractingParticles.nc\", outputdt=1)\n", + "output_file = pset.ParticleFile(name=\"InteractingParticles.zarr\", outputdt=1)\n", "\n", "pset.execute(#pyfunc=DiffusionUniformKh, \n", " pyfunc_inter=Move, # note the pyfunc_inter here\n", " runtime=60, dt=1, output_file=output_file)\n", - "output_file.close()\n", "anim = Animate()\n", "HTML(anim.to_jshtml())" ] diff --git a/parcels/examples/tutorial_interpolation.ipynb b/parcels/examples/tutorial_interpolation.ipynb index 64f6b6581..8dbf10442 100644 --- a/parcels/examples/tutorial_interpolation.ipynb +++ b/parcels/examples/tutorial_interpolation.ipynb @@ -312,9 +312,8 @@ } ], "source": [ - "pfile = pset.ParticleFile(\"interpolation_offset.nc\", outputdt=1)\n", - "pset.execute(kernels, endtime=1, dt=1, output_file=pfile)\n", - "pfile.close()" + "pfile = pset.ParticleFile(\"interpolation_offset.zarr\", outputdt=1)\n", + "pset.execute(kernels, endtime=1, dt=1, output_file=pfile)" ] }, { @@ -347,7 +346,7 @@ "metadata": {}, "outputs": [], "source": [ - "ds = xr.open_dataset(\"interpolation_offset.nc\").isel(obs=1)\n", + "ds = xr.open_zarr(\"interpolation_offset.zarr\").isel(obs=1)\n", "for i in range(len(ds['p'])):\n", " assert np.isclose(ds['p'].values[i], calc_p(float(ds['time'].values[i])/1e9, ds['lat'].values[i], ds['lon'].values[i]))" ] @@ -358,13 +357,6 @@ "source": [ "As a bit of background for why sampling needs to be done this way: the reason is that the particles are already moved within the AdvectionRK4 kernel, but the time is not updated yet until all concatenated kernels are completed. 
" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/parcels/examples/tutorial_nemo_curvilinear.ipynb b/parcels/examples/tutorial_nemo_curvilinear.ipynb index c0a9864de..b0933bc93 100644 --- a/parcels/examples/tutorial_nemo_curvilinear.ipynb +++ b/parcels/examples/tutorial_nemo_curvilinear.ipynb @@ -185,8 +185,7 @@ } ], "source": [ - "pfile.export() # export the trajectory data to a netcdf file\n", - "plotTrajectoriesFile(\"nemo_particles.nc\");" + "plotTrajectoriesFile(\"nemo_particles.zarr\");" ] }, { @@ -206,13 +205,6 @@ "pset = ParticleSet.from_list(field_set, JITParticle, lon=lonp, lat=latp)\n", "pset.populate_indices()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/parcels/examples/tutorial_output.ipynb b/parcels/examples/tutorial_output.ipynb index 2e9b7eb51..05f7e43a8 100644 --- a/parcels/examples/tutorial_output.ipynb +++ b/parcels/examples/tutorial_output.ipynb @@ -58,47 +58,9 @@ "\n", "pset = ParticleSet(fieldset=fieldset, pclass=JITParticle, lon=lon, lat=lat, time=time)\n", "\n", - "output_file = pset.ParticleFile(name=\"Output.nc\", outputdt=delta(hours=2))\n", + "output_file = pset.ParticleFile(name=\"Output.zarr\", outputdt=delta(hours=2))\n", "pset.execute(AdvectionRK4, runtime=delta(hours=24), dt=delta(minutes=5),\n", - " output_file=output_file)\n", - "output_file.close() # export the trajectory data to a netcdf file" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Exporting trajectory data in `zarr` format" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Parcels can also output trajectories in [`zarr` format](https://zarr.readthedocs.io/en/stable/). Files in `zarr` are typically _much_ smaller in size than the default netcdf output, but may be slightly more challenging to handle (although `xarray` has a fairly seamless [`open_zarr()` method](https://docs.xarray.dev/en/stable/generated/xarray.open_zarr.html)). IN order to output to `zarr` format, simply make sure the extension of the output file name is `.zarr`. " - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "sh: None: command not found\n", - "INFO: Compiled ArrayJITParticleAdvectionRK4 ==> /var/folders/s5/gxtkk3c12yqgd7hkt1b_s0vr0000gq/T/parcels-503/libf8e8a9d3cb076076d75b2b159b1317da_0.so\n" - ] - } - ], - "source": [ - "pset = ParticleSet(fieldset=fieldset, pclass=JITParticle, lon=lon, lat=lat, time=time)\n", - "\n", - "output_file = pset.ParticleFile(name=\"Output.zarr\", outputdt=delta(hours=2)) # note .zarr extension in name!\n", - "pset.execute(AdvectionRK4, runtime=delta(hours=24), dt=delta(minutes=5),\n", - " output_file=output_file)\n", - "output_file.close() # export the trajectory data to a netcdf file" + " output_file=output_file)" ] }, { @@ -106,75 +68,8 @@ "metadata": {}, "source": [ "## Reading the output file\n", - "### Using the netCDF4 package\n", - "The [`parcels.particlefile.ParticleFile`](https://oceanparcels.org/gh-pages/html/#parcels.particlefile.ParticleFile) class creates a netCDF file to store the particle trajectories. It uses the [**`netCDF4` package**](https://unidata.github.io/netcdf4-python/netCDF4/index.html), which is also suitable to open and read the files for analysis. 
The [`Dataset` class](https://unidata.github.io/netcdf4-python/netCDF4/index.html#netCDF4.Dataset) opens a netCDF file in reading mode by default. Data can be accessed with the `Dataset.variables` dictionary which can return (masked) numpy arrays." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "root group (NETCDF4 data model, file format HDF5):\n", - " feature_type: trajectory\n", - " Conventions: CF-1.6/CF-1.7\n", - " ncei_template_version: NCEI_NetCDF_Trajectory_Template_v2.0\n", - " parcels_version: v2.3.0-176-gbe3424c9\n", - " parcels_mesh: flat\n", - " dimensions(sizes): traj(10), obs(13)\n", - " variables(dimensions): float64 time(traj, obs), int64 trajectory(traj, obs), float32 lon(traj, obs), float32 lat(traj, obs), float32 z(traj, obs)\n", - " groups: \n" - ] - } - ], - "source": [ - "import netCDF4\n", "\n", - "data_netcdf4 = netCDF4.Dataset('Output.nc')\n", - "print(data_netcdf4)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[0 0 0 0 0 0 0 0 0 0 0 0 0]\n", - " [1 1 1 1 1 1 1 1 1 1 1 1 --]\n", - " [2 2 2 2 2 2 2 2 2 2 2 -- --]\n", - " [3 3 3 3 3 3 3 3 3 3 -- -- --]\n", - " [4 4 4 4 4 4 4 4 4 -- -- -- --]\n", - " [5 5 5 5 5 5 5 5 -- -- -- -- --]\n", - " [6 6 6 6 6 6 6 -- -- -- -- -- --]\n", - " [7 7 7 7 7 7 -- -- -- -- -- -- --]\n", - " [8 8 8 8 8 -- -- -- -- -- -- -- --]\n", - " [9 9 9 9 -- -- -- -- -- -- -- -- --]]\n" - ] - } - ], - "source": [ - "trajectory_netcdf4 = data_netcdf4.variables['trajectory'][:]\n", - "time_netcdf4 = data_netcdf4.variables['time'][:]\n", - "lon_netcdf4 = data_netcdf4.variables['lon'][:]\n", - "lat_netcdf4 = data_netcdf4.variables['lat'][:]\n", - "print(trajectory_netcdf4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using the xarray package\n", - "An often-used alternative to netCDF4, which also comes with the parcels installation, is [**xarray**](http://xarray.pydata.org/en/stable/index.html). Its labelled arrays allow for intuitive and accessible handling of data stored in the netCDF format." + "Parcels exports output trajectories in [`zarr` format](https://zarr.readthedocs.io/en/stable/). Files in `zarr` are typically _much_ smaller in size than netcdf, although may be slightly more challenging to handle (but `xarray` has a fairly seamless [`open_zarr()` method](https://docs.xarray.dev/en/stable/generated/xarray.open_zarr.html))." 
] }, { @@ -207,49 +102,10 @@ "source": [ "import xarray as xr\n", "\n", - "data_xarray = xr.open_dataset('Output.nc')\n", + "data_xarray = xr.open_zarr('Output.zarr')\n", "print(data_xarray)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that opening the `.zarr` file (see [Exporting trajectory data in zarr format](#Exporting-trajectory-data-in-zarr-format)) using `xr.open_zarr()` leads to a similar object" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Dimensions: (traj: 10, obs: 13)\n", - "Dimensions without coordinates: traj, obs\n", - "Data variables:\n", - " lat (traj, obs) float32 dask.array\n", - " lon (traj, obs) float32 dask.array\n", - " time (traj, obs) timedelta64[ns] dask.array\n", - " trajectory (traj, obs) float64 dask.array\n", - " z (traj, obs) float32 dask.array\n", - "Attributes:\n", - " Conventions: CF-1.6/CF-1.7\n", - " feature_type: trajectory\n", - " ncei_template_version: NCEI_NetCDF_Trajectory_Template_v2.0\n", - " parcels_mesh: flat\n", - " parcels_version: v2.3.0-176-gbe3424c9\n" - ] - } - ], - "source": [ - "data_xarray_zarr = xr.open_zarr('Output.zarr')\n", - "print(data_xarray_zarr)" - ] - }, { "cell_type": "code", "execution_count": 8, @@ -281,12 +137,21 @@ "print(data_xarray['trajectory'])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that if you are running Parcels on multiple processors with `mpirun`, you will need to concatenate the files of each processor; see also the [MPI documentation](https://nbviewer.jupyter.org/github/OceanParcels/parcels/blob/master/parcels/examples/documentation_MPI.ipynb#Reading-in-the-ParticleFile-data-in-zarr-format). \n", + "\n", + "Also, once you have loaded the data as an `xarray` Dataset using `xr.open_zarr()`, you can always save the file to NetCDF if you prefer with the `.to_netcdf()` method." + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Trajectory data structure\n", - "The data in the netCDF file are organised according to the [CF-conventions](http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#discrete-sampling-geometries) implemented with the [NCEI trajectory template](http://www.nodc.noaa.gov/data/formats/netcdf/v2.0/trajectoryIncomplete.cdl). The data is stored in a **two-dimensional array** with the dimensions **`traj`** and **`obs`**. Each particle trajectory is essentially stored as a time series where the coordinate data (**`lon`**, **`lat`**, **`time`**) are a function of the observation (`obs`).\n", + "The data in the zarr file are organised according to the [CF-conventions](http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#discrete-sampling-geometries) implemented with the [NCEI trajectory template](http://www.nodc.noaa.gov/data/formats/netcdf/v2.0/trajectoryIncomplete.cdl). The data is stored in a **two-dimensional array** with the dimensions **`traj`** and **`obs`**. Each particle trajectory is essentially stored as a time series where the coordinate data (**`lon`**, **`lat`**, **`time`**) are a function of the observation (`obs`).\n", "\n", "The output dataset used here contains **10 particles** and **13 observations**. Not every particle has 13 observations however; since we released particles at different times some particle trajectories are shorter than others."
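As a brief illustration of the reading workflow noted above, here is a hedged sketch that opens the zarr output with `xarray` and optionally converts it to NetCDF; the filename `Output_converted.nc` is just an example and not part of the tutorial.

```python
import xarray as xr

ds = xr.open_zarr("Output.zarr")     # lazily open the particle trajectories
print(ds.dims)                       # inspect the trajectory and obs dimensions
ds.to_netcdf("Output_converted.nc")  # optional: write a NetCDF copy for other tools
```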
] @@ -447,33 +312,6 @@ " mean_lat_x += [np.nanmean(data_xarray['lat'].where(data_xarray['time']==time).values)] # find the data that share the time" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Conditional selection is even easier in numpy arrays without the xarray formatting since it accepts the 2D boolean array that results from `time_netcdf4 == time` as a mask that you can use to directly select the data." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "# Using netCDF4\n", - "mean_lon_n = []\n", - "mean_lat_n = []\n", - "\n", - "timerange = np.arange(np.nanmin(time_netcdf4), \n", - " np.nanmax(time_netcdf4)+delta(hours=2).total_seconds(), \n", - " delta(hours=2).total_seconds()) \n", - "\n", - "for time in timerange:\n", - " if np.all(np.any(time_netcdf4 == time, axis=1)): # if all trajectories share an observation at time\n", - " mean_lon_n += [np.mean(lon_netcdf4[time_netcdf4 == time])] # find the data that share the time\n", - " mean_lat_n += [np.mean(lat_netcdf4[time_netcdf4 == time])] # find the data that share the time" - ] - }, { "cell_type": "code", "execution_count": 15, @@ -498,9 +336,7 @@ "ax.set_ylabel('Meridional distance [m]')\n", "ax.set_xlabel('Zonal distance [m]')\n", "ax.grid()\n", - "ax.scatter(mean_lon_x,mean_lat_x,marker='^',label='xarray',s = 80)\n", - "ax.scatter(mean_lon_n,mean_lat_n,marker='o',label='netcdf')\n", - "plt.legend()\n", + "ax.scatter(mean_lon_x,mean_lat_x,marker='^',s = 80)\n", "plt.show()" ] }, diff --git a/parcels/examples/tutorial_parcels_structure.ipynb b/parcels/examples/tutorial_parcels_structure.ipynb index 1cc6492d4..3dc01965d 100644 --- a/parcels/examples/tutorial_parcels_structure.ipynb +++ b/parcels/examples/tutorial_parcels_structure.ipynb @@ -313,7 +313,7 @@ } ], "source": [ - "output_file = pset.ParticleFile(name=\"GCParticles.nc\", outputdt=3600) # the file name and the time step of the outputs\n", + "output_file = pset.ParticleFile(name=\"GCParticles.zarr\", outputdt=3600) # the file name and the time step of the outputs\n", "\n", "pset.execute(kernels, # the kernel (which defines how particles move)\n", " runtime=86400*24, # the total length of the run in seconds\n", @@ -321,23 +321,6 @@ " output_file=output_file)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "While executing the `ParticleSet`, parcels stores the data in **npy** files in an output folder. 
To take all the data and store them in a netcdf file, you can use [**`ParticleFile.export()`**](https://oceanparcels.org/gh-pages/html/#parcels.particlefile.ParticleFile.export) if you want to keep the folder with npy files; or [**`ParticleFile.close()`**](https://oceanparcels.org/gh-pages/html/#parcels.particlefile.ParticleFile.close) if you only want to keep the netcdf file:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "output_file.export()\n", - "output_file.close()" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/parcels/examples/tutorial_particle_field_interaction.ipynb b/parcels/examples/tutorial_particle_field_interaction.ipynb index a1b4fff6a..e5d84308e 100644 --- a/parcels/examples/tutorial_particle_field_interaction.ipynb +++ b/parcels/examples/tutorial_particle_field_interaction.ipynb @@ -49,6 +49,7 @@ "import numpy as np\n", "from datetime import timedelta as delta\n", "import netCDF4\n", + "import xarray as xr\n", "import matplotlib.pyplot as plt" ] }, @@ -229,7 +230,7 @@ "\n", "Before running the advection, we will execute the ```pset``` with the ```WriteInitial``` for ```dt=0```: this will write the initial condition of fieldset.C to a ```netCDF``` file.\n", "\n", - "While particle outputs will be written in a file named ```interaction.nc``` at every ```outputdt```, the field will be automatically written in ```netCDF``` files named ```interaction_wxyzC.nc```, with ```wxyz``` being the number of the output and ```C``` the ```FieldSet``` variable of our interest. Note that you can use tools like [ncrcat](https://linux.die.net/man/1/ncrcat) (on linux/macOS) to combine these separate files into one large ```netCDF``` file after the simualtion." + "While particle outputs will be written in a file named ```interaction.zarr``` at every ```outputdt```, the field will be automatically written in ```netCDF``` files named ```interaction_wxyzC.nc```, with ```wxyz``` being the number of the output and ```C``` the ```FieldSet``` variable of our interest. Note that you can use tools like [ncrcat](https://linux.die.net/man/1/ncrcat) (on linux/macOS) to combine these separate files into one large ```netCDF``` file after the simulation."
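As a Python alternative to `ncrcat`, the hedged sketch below combines the per-output field snapshots with `xarray.open_mfdataset`; the glob pattern and the combined filename are illustrative assumptions based on the naming convention described above, and it presumes the snapshot files share the same grid.

```python
import xarray as xr

# Assumption: each interaction_wxyzC.nc file holds one snapshot of the C field on
# the same grid, so the snapshots can be stacked along a new 'time' dimension.
ds = xr.open_mfdataset("interaction_*C.nc", combine="nested", concat_dim="time")
ds.to_netcdf("interaction_C_combined.nc")  # hypothetical name for the merged file
```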
] }, { @@ -247,16 +248,14 @@ } ], "source": [ - "output_file = pset.ParticleFile(name=r'interaction.nc', outputdt=delta(days=1))\n", + "output_file = pset.ParticleFile(name=r'interaction.zarr', outputdt=delta(days=1))\n", "\n", "pset.execute(WriteInitial, dt=0., output_file=output_file)\n", "\n", "pset.execute(AdvectionRK4 + pset.Kernel(Interaction), # the particle will FIRST be transported by currents and THEN interact with the field\n", " dt=delta(days=1),\n", " runtime=delta(days=24), # we are going to track the particle and save its trajectory and tracer concentration for 24 days\n", - " output_file=output_file)\n", - "\n", - "output_file.close()" + " output_file=output_file)" ] }, { @@ -298,11 +297,11 @@ } ], "source": [ - "pset_traj = netCDF4.Dataset(r'interaction.nc')\n", + "pset_traj = xr.open_zarr(r'interaction.zarr')\n", "\n", - "print(pset_traj['c'][:])\n", + "print(pset_traj['c'].values)\n", "\n", - "plotTrajectoriesFile('interaction.nc');" + "plotTrajectoriesFile('interaction.zarr');" ] }, { @@ -406,7 +405,7 @@ "field_cbar = plt.colorbar(fieldplot,ax=ax)\n", "field_cbar.ax.text(.6,.070,'$C_{field}$ concentration', rotation=270, fontsize=12)\n", "\n", - "particle = plt.scatter(pset_traj['lon'][:].data[0,:],pset_traj['lat'][:].data[0,:], c=pset_traj['c'][:].data[0,:],vmin=0, s=100, edgecolor='white') \n", + "particle = plt.scatter(pset_traj['lon'][:].values[0,:],pset_traj['lat'][:].values[0,:], c=pset_traj['c'][:].values[0,:],vmin=0, s=100, edgecolor='white') \n", "particle_cbar = plt.colorbar(particle,ax=ax, location = 'top')\n", "particle_cbar.ax.text(40,300,'$c_{particle}$ concentration', fontsize=12);" ] @@ -456,7 +455,7 @@ "\n", " fieldplot=ax[i,j].pcolormesh(x_centers[-28:-17,22:41],y_centers[-28:-17,22:41],c_results[-28:-18,22:40], vmin=0, vmax=0.2,cmap='viridis') \n", " \n", - " particle = ax[i,j].scatter(pset_traj['lon'][:].data[0,daycounter-1],pset_traj['lat'][:].data[0,daycounter-1], c=pset_traj['c'][:].data[0,daycounter-1],vmin=0, vmax=100, s=100, edgecolor='white') \n", + " particle = ax[i,j].scatter(pset_traj['lon'][:].values[0,daycounter-1],pset_traj['lat'][:].values[0,daycounter-1], c=pset_traj['c'][:].values[0,daycounter-1],vmin=0, vmax=100, s=100, edgecolor='white') \n", " # plotting particle location at current time step -- daycounter-1 due to different indexing\n", " \n", " ax[i,j].set_title('Day '+ str(daycounter-1))\n", diff --git a/parcels/examples/tutorial_periodic_boundaries.ipynb b/parcels/examples/tutorial_periodic_boundaries.ipynb index 0c97af31c..b3a4f5df0 100644 --- a/parcels/examples/tutorial_periodic_boundaries.ipynb +++ b/parcels/examples/tutorial_periodic_boundaries.ipynb @@ -156,8 +156,7 @@ } ], "source": [ - "output_file.export() # export the trajectory data to a netcdf file\n", - "plotTrajectoriesFile('PeriodicParticle.nc');" + "plotTrajectoriesFile('PeriodicParticle.zarr');" ] }, { diff --git a/parcels/examples/tutorial_sampling.ipynb b/parcels/examples/tutorial_sampling.ipynb index 8ab497cd7..f332fb8d0 100644 --- a/parcels/examples/tutorial_sampling.ipynb +++ b/parcels/examples/tutorial_sampling.ipynb @@ -214,11 +214,9 @@ "source": [ "pset.execute(sample_kernel, dt=0) # by only executing the sample kernel we record the initial temperature of the particles\n", "\n", - "output_file = pset.ParticleFile(name=\"InitZero.nc\", outputdt=delta(hours=1))\n", + "output_file = pset.ParticleFile(name=\"InitZero.zarr\", outputdt=delta(hours=1))\n", "pset.execute(AdvectionRK4 + sample_kernel, runtime=delta(hours=30), dt=delta(minutes=5),\n", - " 
output_file=output_file)\n", - "output_file.export() # export the trajectory data to a netcdf file\n", - "output_file.close()" + " output_file=output_file)" ] }, { @@ -247,7 +245,7 @@ } ], "source": [ - "Particle_data = xr.open_dataset(\"InitZero.nc\")\n", + "Particle_data = xr.open_zarr(\"InitZero.zarr\")\n", "\n", "plt.figure()\n", "ax = plt.axes()\n", @@ -372,10 +370,9 @@ "source": [ "pset.execute(sample_kernel, dt=0) # by only executing the sample kernel we record the initial temperature of the particles\n", "\n", - "output_file = pset.ParticleFile(name=\"WriteOnce.nc\", outputdt=delta(hours=1))\n", + "output_file = pset.ParticleFile(name=\"WriteOnce.zarr\", outputdt=delta(hours=1))\n", "pset.execute(AdvectionRK4, runtime=delta(hours=24), dt=delta(minutes=5),\n", - " output_file=output_file)\n", - "output_file.close()" + " output_file=output_file)" ] }, { @@ -404,7 +401,7 @@ } ], "source": [ - "Particle_data = xr.open_dataset(\"WriteOnce.nc\")\n", + "Particle_data = xr.open_zarr(\"WriteOnce.zarr\")\n", "\n", "plt.figure()\n", "ax = plt.axes()\n", @@ -533,7 +530,7 @@ "pset = ParticleSet(fieldset=fieldset, pclass=SampleParticleInitZero, lon=[], lat=[], time=[]) # Using SampleParticleInitZero\n", "kernels = AdvectionRK4 + sample_kernel\n", "\n", - "output_file = pset.ParticleFile(name=\"RepeatLoop.nc\") # Do not specify the outputdt yet, so we can manually write the output\n", + "output_file = pset.ParticleFile(name=\"RepeatLoop.zarr\") # Do not specify the outputdt yet, so we can manually write the output\n", "\n", "for time in np.arange(0, runtime, outputdt):\n", " if np.isclose(np.fmod(time, repeatdt), 0): # time is a multiple of repeatdt\n", @@ -546,9 +543,7 @@ " pset.execute(kernels, runtime=outputdt, dt=delta(minutes=5))\n", " print('Length of pset at time %d: %d' % (time, len(pset)))\n", " \n", - "output_file.write(pset, time+outputdt) \n", - "\n", - "output_file.close()" + "output_file.write(pset, time+outputdt)" ] }, { @@ -574,7 +569,7 @@ } ], "source": [ - "Particle_data = xr.open_dataset(\"RepeatLoop.nc\")\n", + "Particle_data = xr.open_zarr(\"RepeatLoop.zarr\")\n", "print(Particle_data.time[:,0].values / np.timedelta64(1, 'h')) # The initial hour at which each particle is released\n", "assert np.allclose(Particle_data.time[:,0].values / np.timedelta64(1, 'h'), [int(k/10)*6 for k in range(40)])" ] diff --git a/parcels/examples/tutorial_timevaryingdepthdimensions.ipynb b/parcels/examples/tutorial_timevaryingdepthdimensions.ipynb index 17f83e832..947ae7011 100644 --- a/parcels/examples/tutorial_timevaryingdepthdimensions.ipynb +++ b/parcels/examples/tutorial_timevaryingdepthdimensions.ipynb @@ -131,8 +131,7 @@ "pfile = pset.ParticleFile(\"SwashParticles\", outputdt=delta(seconds=0.05))\n", "pset.execute(AdvectionRK4, dt=delta(seconds=0.005), output_file=pfile)\n", "\n", - "pfile.export() # export the trajectory data to a netcdf file\n", - "plotTrajectoriesFile('SwashParticles.nc');" + "plotTrajectoriesFile('SwashParticles.zarr');" ] }, { diff --git a/parcels/particle.py b/parcels/particle.py index 767f7bd8a..fdc1c6251 100644 --- a/parcels/particle.py +++ b/parcels/particle.py @@ -85,6 +85,11 @@ def __init__(self, pclass): def __repr__(self): return "PType<%s>::%s" % (self.name, self.variables) + def __getitem__(self, item): + for v in self.variables: + if v.name == item: + return v + @property def _cache_key(self): return "-".join(["%s:%s" % (v.name, v.dtype) for v in self.variables]) @@ -180,8 +185,8 @@ class ScipyParticle(_Particle): lat = Variable('lat', dtype=np.float32) 
depth = Variable('depth', dtype=np.float32) time = Variable('time', dtype=np.float64) - id = Variable('id', dtype=np.int64) - fileid = Variable('fileid', dtype=np.int32, initial=-1, to_write=False) + id = Variable('id', dtype=np.int64, to_write='once') + once_written = Variable('once_written', dtype=np.int32, initial=0, to_write=False) # np.bool not implemented in JIT dt = Variable('dt', dtype=np.float64, to_write=False) state = Variable('state', dtype=np.int32, initial=StateCode.Evaluate, to_write=False) next_dt = Variable('_next_dt', dtype=np.float64, initial=np.nan, to_write=False) @@ -195,7 +200,7 @@ def __init__(self, lon, lat, pid, fieldset=None, ngrids=None, depth=0., time=0., type(self).time.initial = time type(self).id.initial = pid _Particle.lastID = max(_Particle.lastID, pid) - type(self).fileid.initial = -1 + type(self).once_written.initial = 0 type(self).dt.initial = None type(self).next_dt.initial = np.nan diff --git a/parcels/particlefile/baseparticlefile.py b/parcels/particlefile/baseparticlefile.py index c31fe4287..ec98169fb 100644 --- a/parcels/particlefile/baseparticlefile.py +++ b/parcels/particlefile/baseparticlefile.py @@ -1,13 +1,14 @@ -"""Module controlling the writing of ParticleSets to NetCDF file""" -import os -import random -import shutil -import string +"""Module controlling the writing of ParticleSets to Zarr file""" from abc import ABC from abc import abstractmethod -import gzip - +from datetime import timedelta as delta +import os import numpy as np +import xarray as xr +import zarr + +from parcels.tools.loggers import logger +from parcels.tools.statuscodes import OperationCode try: from mpi4py import MPI @@ -17,12 +18,6 @@ from parcels._version import version as parcels_version except: raise EnvironmentError('Parcels version can not be retrieved. Have you run ''python setup.py install''?') -try: - from os import getuid -except: - # Windows does not have getuid(), so define to simply return 'tmp' - def getuid(): - return 'tmp' __all__ = ['BaseParticleFile'] @@ -43,94 +38,71 @@ class BaseParticleFile(ABC): :param outputdt: Interval which dictates the update frequency of file output while ParticleFile is given as an argument of ParticleSet.execute() It is either a timedelta object or a positive double. + :param chunks: Tuple (trajs, obs) to control the size of chunks in the zarr output. :param write_ondelete: Boolean to write particle data only when they are deleted. Default is False - :param convert_at_end: Boolean to convert npy files to netcdf at end of run. Default is True - :param tempwritedir: directories to write temporary files to during executing. - Default is out-XXXXXX where Xs are random capitals. Files for individual - processors are written to subdirectories 0, 1, 2 etc under tempwritedir - :param pset_info: dictionary of info on the ParticleSet, stored in tempwritedir/XX/pset_info.npy, - used to create NetCDF file from npy-files. + :param create_new_zarrfile: Boolean to create a new file. 
Default is True """ write_ondelete = None - convert_at_end = None outputdt = None lasttime_written = None - dataset = None - name = None particleset = None parcels_mesh = None time_origin = None lonlatdepth_dtype = None - var_names = None - var_dtypes = None - file_list = None - var_names_once = None - var_dtypes_once = None - file_list_once = None - maxid_written = -1 - tempwritedir_base = None - tempwritedir = None - - def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, convert_at_end=True, - tempwritedir=None, pset_info=None): + + def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_ondelete=False, + create_new_zarrfile=True): self.write_ondelete = write_ondelete - self.convert_at_end = convert_at_end self.outputdt = outputdt + self.chunks = chunks self.lasttime_written = None # variable to check if time has been written already - self.dataset = None - if pset_info: - for v in pset_info.keys(): - setattr(self, v, pset_info[v]) - else: - self.name = name - self.particleset = particleset - self.parcels_mesh = 'spherical' - if self.particleset.fieldset is not None: - self.parcels_mesh = self.particleset.fieldset.gridset.grids[0].mesh - self.time_origin = self.particleset.time_origin - self.lonlatdepth_dtype = self.particleset.collection.lonlatdepth_dtype - self.var_names = [] - self.var_dtypes = [] - self.var_names_once = [] - self.var_dtypes_once = [] - for v in self.particleset.collection.ptype.variables: - if v.to_write == 'once': - self.var_names_once += [v.name] - self.var_dtypes_once += [v.dtype] - elif v.to_write is True: - self.var_names += [v.name] - self.var_dtypes += [v.dtype] - if len(self.var_names_once) > 0: - self.written_once = [] - self.file_list_once = [] - - self.file_list = [] + self.particleset = particleset + self.parcels_mesh = 'spherical' + if self.particleset.fieldset is not None: + self.parcels_mesh = self.particleset.fieldset.gridset.grids[0].mesh + self.time_origin = self.particleset.time_origin + self.lonlatdepth_dtype = self.particleset.collection.lonlatdepth_dtype + self.maxids = 0 + self.obs_written = np.empty((0,), dtype=int) + self.pids_written = {} + self.create_new_zarrfile = create_new_zarrfile + self.vars_to_write = {} + for var in self.particleset.collection.ptype.variables: + if var.to_write: + self.vars_to_write[var.name] = var.dtype + self.mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 + + # Reset once-written flag of each particle, in case new ParticleFile created for a ParticleSet + particleset.collection.setallvardata('once_written', 0) self.metadata = {"feature_type": "trajectory", "Conventions": "CF-1.6/CF-1.7", "ncei_template_version": "NCEI_NetCDF_Trajectory_Template_v2.0", "parcels_version": parcels_version, "parcels_mesh": self.parcels_mesh} - tmp_dir = tempwritedir - if tempwritedir is None: - tmp_dir = os.path.join(os.path.dirname(str(self.name)), "out-%s" % ''.join(random.choice(string.ascii_uppercase) for _ in range(8))) - else: - tmp_dir = tempwritedir - - if MPI: - mpi_rank = MPI.COMM_WORLD.Get_rank() - self.tempwritedir_base = MPI.COMM_WORLD.bcast(tmp_dir, root=0) + # Create dictionary to translate datatypes and fill_values + self.fmt_map = {np.float16: 'f2', np.float32: 'f4', np.float64: 'f8', + np.bool_: 'i1', np.int8: 'i1', np.int16: 'i2', + np.int32: 'i4', np.int64: 'i8', np.uint8: 'u1', + np.uint16: 'u2', np.uint32: 'u4', np.uint64: 'u8'} + self.fill_value_map = {np.float16: np.nan, np.float32: np.nan, np.float64: np.nan, + np.bool_: np.iinfo(np.int8).max, np.int8: 
np.iinfo(np.int8).max, + np.int16: np.iinfo(np.int16).max, np.int32: np.iinfo(np.int32).max, + np.int64: np.iinfo(np.int64).max, np.uint8: np.iinfo(np.uint8).max, + np.uint16: np.iinfo(np.uint16).max, np.uint32: np.iinfo(np.uint32).max, + np.uint64: np.iinfo(np.uint64).max} + + extension = os.path.splitext(str(name))[1] + if extension in ['.nc', '.nc4']: + raise RuntimeError('Output in NetCDF is not supported anymore. Use .zarr extension for ParticleFile name.') + if MPI and MPI.COMM_WORLD.Get_size() > 1: + self.fname = os.path.join(name, f"proc{self.mpi_rank:02d}.zarr") + if extension in ['.zarr']: + logger.warning(f'The ParticleFile name contains .zarr extension, but zarr files will be written per processor in MPI mode at {self.fname}') else: - self.tempwritedir_base = tmp_dir - mpi_rank = 0 - self.tempwritedir = os.path.join(self.tempwritedir_base, "%d" % mpi_rank) - - if not os.path.exists(self.tempwritedir): - os.makedirs(self.tempwritedir) - elif pset_info is None: - raise IOError("output directory %s already exists. Please remove the directory." % self.tempwritedir) + self.fname = name if extension in ['.zarr'] else "%s.zarr" % name @abstractmethod def _reserved_var_names(self): @@ -139,26 +111,6 @@ def _reserved_var_names(self): """ pass - def open_output_file(self, data_shape): - """Initialise file for trajectory output. - The output follows the format outlined in the Discrete Sampling Geometries - section of the CF-conventions: - http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#discrete-sampling-geometries - The current implementation is based on the NCEI template: - http://www.nodc.noaa.gov/data/formats/netcdf/v2.0/trajectoryIncomplete.cdl - - :param data_shape: shape of the variables in the output file - """ - extension = os.path.splitext(str(self.name))[1] - self.fname = self.name if extension in ['.nc', '.nc4', '.zarr'] else "%s.nc" % self.name - self.outputformat = extension - if os.path.exists(str(self.fname)): - if 'zarr' in self.outputformat: - shutil.rmtree(str(self.fname)) - else: - os.remove(str(self.fname)) - self.attrs = self._create_variables_attribute_dict() - def _create_variables_attribute_dict(self): """ creates the dictionary with variable attributes. 
@@ -192,34 +144,20 @@ def _create_variables_attribute_dict(self): attrs['time']['units'] = "seconds since " + str(self.time_origin) attrs['time']['calendar'] = 'standard' if self.time_origin.calendar == 'np_datetime64' else self.time_origin.calendar - for vname, dtype in zip(self.var_names, self.var_dtypes): + for vname in self.vars_to_write: if vname not in self._reserved_var_names(): - attrs[vname] = {"_FillValue": self.fill_value_map[dtype], + attrs[vname] = {"_FillValue": self.fill_value_map[self.vars_to_write[vname]], "long_name": "", "standard_name": vname, "units": "unknown"} - for vname, dtype in zip(self.var_names_once, self.var_dtypes_once): - attrs[vname] = {"_FillValue": self.fill_value_map[dtype], - "long_name": "", - "standard_name": vname, - "units": "unknown"} - return attrs def __del__(self): - if self.convert_at_end: - self.close() + self.close() def close(self, delete_tempfiles=True): - """Close the ParticleFile object by exporting and then deleting - the temporary npy files""" - self.export() - mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 - if mpi_rank == 0: - if delete_tempfiles: - self.delete_tempwritedir(tempwritedir=self.tempwritedir_base) - self.convert_at_end = False + pass def add_metadata(self, name, message): """Add metadata to :class:`parcels.particleset.ParticleSet` @@ -229,91 +167,120 @@ def add_metadata(self, name, message): """ self.metadata[name] = message - def dump_dict_to_npy(self, data_dict, data_dict_once): - """Buffer data to set of temporary numpy files, using np.save""" - - if not os.path.exists(self.tempwritedir): - os.makedirs(self.tempwritedir) - - if len(data_dict) > 0: - tmpfilename = os.path.join(self.tempwritedir, str(len(self.file_list)) + ".npy.gz") - with gzip.open(tmpfilename, 'wb') as f: - np.save(f, data_dict) - self.file_list.append(tmpfilename) - - if len(data_dict_once) > 0: - tmpfilename = os.path.join(self.tempwritedir, str(len(self.file_list)) + '_once.npy.gz') - with gzip.open(tmpfilename, 'wb') as f: - np.save(f, data_dict_once) - self.file_list_once.append(tmpfilename) + def _convert_varout_name(self, var): + if var == 'depth': + return 'z' + elif var == 'id': + return 'trajectory' + else: + return var - @abstractmethod - def get_pset_info_attributes(self): - """ - returns the main attributes of the pset_info.npy file. + def write_once(self, var): + return self.particleset.collection.ptype[var].to_write == 'once' - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - """ - return None - - def dump_psetinfo_to_npy(self): - """ - function writes the major attributes and values to a pset information file (*.npy). 
- """ - pset_info = {} - attrs_to_dump = self.get_pset_info_attributes() - if attrs_to_dump is None: - return - for a in attrs_to_dump: - if hasattr(self, a): - pset_info[a] = getattr(self, a) - with open(os.path.join(self.tempwritedir, 'pset_info.npy'), 'wb') as f: - np.save(f, pset_info) + def _extend_zarr_dims(self, Z, store, dtype, axis): + if axis == 1: + a = np.full((Z.shape[0], self.chunks[1]), np.nan, dtype=dtype) + obs = zarr.group(store=store, overwrite=False)["obs"] + if len(obs) == Z.shape[1]: + obs.append(np.arange(self.chunks[1])+obs[-1]+1) + else: + extra_trajs = max(self.maxids - Z.shape[0], self.chunks[0]) + if len(Z.shape) == 2: + a = np.full((extra_trajs, Z.shape[1]), np.nan, dtype=dtype) + else: + a = np.full((extra_trajs,), np.nan, dtype=dtype) + Z.append(a, axis=axis) + zarr.consolidate_metadata(store) def write(self, pset, time, deleted_only=False): - """Write all data from one time step to a temporary npy-file - using a python dictionary. The data is saved in the folder 'out'. + """Write all data from one time step to the zarr file :param pset: ParticleSet object to write :param time: Time at which to write ParticleSet :param deleted_only: Flag to write only the deleted Particles """ - data_dict, data_dict_once = pset.to_dict(self, time, deleted_only=deleted_only) - self.dump_dict_to_npy(data_dict, data_dict_once) - self.dump_psetinfo_to_npy() - - @abstractmethod - def read_from_npy(self, file_list, time_steps, var): - """ - Read NPY-files for one variable using a loop over all files. - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - - :param file_list: List that contains all file names in the output directory - :param time_steps: Number of time steps that were written in out directory - :param var: name of the variable to read - """ - return None - - @abstractmethod - def export(self): - """ - Exports outputs in temporary NPY-files to NetCDF file - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. 
- """ - pass - - def delete_tempwritedir(self, tempwritedir=None): - """Deleted all temporary npy files - - :param tempwritedir Optional path of the directory to delete - """ - if tempwritedir is None: - tempwritedir = self.tempwritedir - if os.path.exists(tempwritedir): - shutil.rmtree(tempwritedir) + time = time.total_seconds() if isinstance(time, delta) else time + + if self.lasttime_written != time and (self.write_ondelete is False or deleted_only is not False): + if pset.collection._ncount == 0: + logger.warning("ParticleSet is empty on writing as array at time %g" % time) + return + + if deleted_only is not False: + if type(deleted_only) not in [list, np.ndarray] and deleted_only in [True, 1]: + indices_to_write = np.where(np.isin(pset.collection.getvardata('state'), [OperationCode.Delete]))[0] + elif type(deleted_only) == np.ndarray: + if set(deleted_only).issubset([0, 1]): + indices_to_write = np.where(deleted_only)[0] + else: + indices_to_write = deleted_only + elif type(deleted_only) == list: + indices_to_write = np.array(deleted_only) + else: + indices_to_write = pset.collection._to_write_particles(pset.collection._data, time) + self.lasttime_written = time + + if len(indices_to_write) > 0: + pids = pset.collection.getvardata('id', indices_to_write) + to_add = sorted(set(pids) - set(self.pids_written.keys())) + for i, pid in enumerate(to_add): + self.pids_written[pid] = self.maxids + i + ids = np.array([self.pids_written[p] for p in pids], dtype=int) + self.maxids = len(self.pids_written) + + once_ids = np.where(pset.collection.getvardata('once_written', indices_to_write) == 0)[0] + ids_once = ids[once_ids] + indices_to_write_once = indices_to_write[once_ids] + pset.collection.setvardata('once_written', indices_to_write_once, np.ones(len(ids_once))) + + if self.maxids > len(self.obs_written): + self.obs_written = np.append(self.obs_written, np.zeros((self.maxids-len(self.obs_written)), dtype=int)) + + if self.create_new_zarrfile: + if self.chunks is None: + self.chunks = (len(ids), 1) + elif self.chunks[0] > len(ids): + logger.warning(f'Chunk size for trajectory ({self.chunks[0]}) is larger than length of initial set to write. 
' + f'Reducing ParticleFile chunks to ({len(ids)}, {self.chunks[1]})') + self.chunks = (len(ids), self.chunks[1]) + if (self.maxids > len(ids)) or (self.maxids > self.chunks[0]): + arrsize = (self.maxids, self.chunks[1]) + else: + arrsize = self.chunks + ds = xr.Dataset(attrs=self.metadata, coords={"trajectory": ("trajectory", pids), + "obs": ("obs", np.arange(arrsize[1], dtype=np.int32))}) + attrs = self._create_variables_attribute_dict() + for var in self.vars_to_write: + varout = self._convert_varout_name(var) + if varout not in ['trajectory']: # because 'trajectory' is written as coordinate + if self.write_once(var): + data = np.full((arrsize[0],), np.nan, dtype=self.vars_to_write[var]) + data[ids_once] = pset.collection.getvardata(var, indices_to_write_once) + dims = ["trajectory"] + else: + data = np.full(arrsize, np.nan, dtype=self.vars_to_write[var]) + data[ids, 0] = pset.collection.getvardata(var, indices_to_write) + dims = ["trajectory", "obs"] + ds[varout] = xr.DataArray(data=data, dims=dims, attrs=attrs[varout]) + ds[varout].encoding['chunks'] = self.chunks[0] if self.write_once(var) else self.chunks + ds.to_zarr(self.fname, mode='w') + self.create_new_zarrfile = False + else: + store = zarr.DirectoryStore(self.fname) + Z = zarr.group(store=store, overwrite=False) + obs = self.obs_written[np.array(ids)] + for var in self.vars_to_write: + varout = self._convert_varout_name(var) + if self.maxids > Z[varout].shape[0]: + self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=0) + if self.write_once(var): + if len(ids_once) > 0: + Z[varout].vindex[ids_once] = pset.collection.getvardata(var, indices_to_write_once) + else: + if max(obs) >= Z[varout].shape[1]: + self._extend_zarr_dims(Z[varout], store, dtype=self.vars_to_write[var], axis=1) + Z[varout].vindex[ids, obs] = pset.collection.getvardata(var, indices_to_write) + + self.obs_written[np.array(ids)] += 1 diff --git a/parcels/particlefile/particlefileaos.py b/parcels/particlefile/particlefileaos.py index bcff77290..81186b0d1 100644 --- a/parcels/particlefile/particlefileaos.py +++ b/parcels/particlefile/particlefileaos.py @@ -1,14 +1,5 @@ """Module controlling the writing of ParticleSets to NetCDF file""" -import os -from glob import glob import numpy as np -import xarray as xr -import gzip - -try: - from mpi4py import MPI -except: - MPI = None from parcels.particlefile.baseparticlefile import BaseParticleFile @@ -23,20 +14,13 @@ class ParticleFileAOS(BaseParticleFile): :param outputdt: Interval which dictates the update frequency of file output while ParticleFile is given as an argument of ParticleSet.execute() It is either a timedelta object or a positive double. + :param chunks: Tuple (trajs, obs) to control the size of chunks in the zarr output. :param write_ondelete: Boolean to write particle data only when they are deleted. Default is False - :param convert_at_end: Boolean to convert npy files to netcdf at end of run. Default is True - :param tempwritedir: directories to write temporary files to during executing. - Default is out-XXXXXX where Xs are random capitals. Files for individual - processors are written to subdirectories 0, 1, 2 etc under tempwritedir - :param pset_info: dictionary of info on the ParticleSet, stored in tempwritedir/XX/pset_info.npy, - used to create NetCDF file from npy-files. 
""" - def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, convert_at_end=True, - tempwritedir=None, pset_info=None): + def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_ondelete=False): super(ParticleFileAOS, self).__init__(name=name, particleset=particleset, outputdt=outputdt, - write_ondelete=write_ondelete, convert_at_end=convert_at_end, - tempwritedir=tempwritedir, pset_info=pset_info) + chunks=chunks, write_ondelete=write_ondelete) def __del__(self): super(ParticleFileAOS, self).__del__() @@ -46,134 +30,3 @@ def _reserved_var_names(self): returns the reserved dimension names not to be written just once. """ return ['time', 'lat', 'lon', 'depth', 'id'] - - def _create_trajectory_records(self, coords): - super(ParticleFileAOS, self)._create_trajectory_records(coords=coords) - - def get_pset_info_attributes(self): - """ - returns the main attributes of the pset_info.npy file. - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - """ - attributes = ['name', 'var_names', 'var_dtypes', 'var_names_once', 'var_dtypes_once', - 'time_origin', 'lonlatdepth_dtype', 'file_list', 'file_list_once', - 'parcels_mesh', 'metadata'] - return attributes - - def read_from_npy(self, file_list, n_timesteps, var, dtype): - """ - Read NPY-files for one variable using a loop over all files. - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - - :param file_list: List that contains all file names in the output directory - :param n_timesteps: Dictionary with (for each particle) number of time steps that were written in out directory - :param var: name of the variable to read - """ - max_timesteps = max(n_timesteps.values()) if n_timesteps.keys() else 0 - fill_value = self.fill_value_map[dtype] - data = fill_value * np.ones((len(n_timesteps), max_timesteps), dtype=dtype) - time_index = np.zeros(len(n_timesteps)) - id_index = {} - count = 0 - for i in sorted(n_timesteps.keys()): - id_index[i] = count - count += 1 - - # loop over all files - for npyfile in file_list: - try: - with gzip.open(npyfile, 'rb') as f: - data_dict = np.load(f, allow_pickle=True).item() - except NameError: - raise RuntimeError('Cannot combine npy files into netcdf file because your ParticleFile is ' - 'still open on interpreter shutdown.\nYou can use ' - '"parcels_convert_npydir_to_netcdf %s" to convert these to ' - 'a NetCDF file yourself.\nTo avoid this error, make sure you ' - 'close() your ParticleFile at the end of your script.' % self.tempwritedir) - for ii, i in enumerate(data_dict["id"]): - id_ind = id_index[i] - t_ind = int(time_index[id_ind]) if 'once' not in file_list[0] else 0 - data[id_ind, t_ind] = data_dict[var][ii] - time_index[id_ind] = time_index[id_ind] + 1 - - if dtype == np.bool_: - data = data.astype(np.bool_) - # remove rows and columns that are completely filled with nan values - return data[time_index > 0, :] - - def export(self): - """ - Exports outputs in temporary NPY-files to output file (either netcdf or zarr) - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - """ - if MPI: - # The export can only start when all threads are done. 
- MPI.COMM_WORLD.Barrier() - if MPI.COMM_WORLD.Get_rank() > 0: - return # export only on threat 0 - - # Create dictionary to translate datatypes and fill_values - self.fmt_map = {np.float16: 'f2', np.float32: 'f4', np.float64: 'f8', - np.bool_: 'i1', np.int8: 'i1', np.int16: 'i2', - np.int32: 'i4', np.int64: 'i8', np.uint8: 'u1', - np.uint16: 'u2', np.uint32: 'u4', np.uint64: 'u8'} - self.fill_value_map = {np.float16: np.nan, np.float32: np.nan, np.float64: np.nan, - np.bool_: np.iinfo(np.int8).max, np.int8: np.iinfo(np.int8).max, - np.int16: np.iinfo(np.int16).max, np.int32: np.iinfo(np.int32).max, - np.int64: np.iinfo(np.int64).max, np.uint8: np.iinfo(np.uint8).max, - np.uint16: np.iinfo(np.uint16).max, np.uint32: np.iinfo(np.uint32).max, - np.uint64: np.iinfo(np.uint64).max} - - # Retrieve all temporary writing directories and sort them in numerical order - temp_names = sorted(glob(os.path.join("%s" % self.tempwritedir_base, "*")), - key=lambda x: int(os.path.basename(x))) - - if len(temp_names) == 0: - raise RuntimeError("No npy files found in %s" % self.tempwritedir_base) - - n_timesteps = {} - global_file_list = [] - if len(self.var_names_once) > 0: - global_file_list_once = [] - for tempwritedir in temp_names: - if os.path.exists(tempwritedir): - pset_info_local = np.load(os.path.join(tempwritedir, 'pset_info.npy'), allow_pickle=True).item() - for npyfile in pset_info_local['file_list']: - with gzip.open(npyfile, 'rb') as f: - tmp_dict = np.load(f, allow_pickle=True).item() - for i in tmp_dict['id']: - if i in n_timesteps: - n_timesteps[i] += 1 - else: - n_timesteps[i] = 1 - global_file_list += pset_info_local['file_list'] - if len(self.var_names_once) > 0: - global_file_list_once += pset_info_local['file_list_once'] - - ds = xr.Dataset(attrs=pset_info_local['metadata']) - for var, dtype in zip(self.var_names, self.var_dtypes): - data = self.read_from_npy(global_file_list, n_timesteps, var, dtype) - if var == self.var_names[0]: - self.open_output_file(data.shape) - varout = 'z' if var == 'depth' else var - varout = 'trajectory' if varout == 'id' else varout - ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=self.attrs[varout]) - - if len(self.var_names_once) > 0: - n_timesteps_once = {} - for i in n_timesteps: - n_timesteps_once[i] = 1 - for var in self.var_names_once: - data = self.read_from_npy(global_file_list_once, n_timesteps_once, var, dtype) - ds[var] = xr.DataArray(data=data.flatten(), dims=["traj"], attrs=self.attrs[var]) - - if 'zarr' in self.outputformat: - ds.to_zarr(self.fname) - else: - ds.to_netcdf(self.fname) diff --git a/parcels/particlefile/particlefilesoa.py b/parcels/particlefile/particlefilesoa.py index 2180f0b44..1cd088d99 100644 --- a/parcels/particlefile/particlefilesoa.py +++ b/parcels/particlefile/particlefilesoa.py @@ -1,14 +1,5 @@ """Module controlling the writing of ParticleSets to NetCDF file""" -import os -from glob import glob import numpy as np -import xarray as xr -import gzip - -try: - from mpi4py import MPI -except: - MPI = None from parcels.particlefile.baseparticlefile import BaseParticleFile @@ -23,20 +14,13 @@ class ParticleFileSOA(BaseParticleFile): :param outputdt: Interval which dictates the update frequency of file output while ParticleFile is given as an argument of ParticleSet.execute() It is either a timedelta object or a positive double. + :param chunks: Tuple (trajs, obs) to control the size of chunks in the zarr output. :param write_ondelete: Boolean to write particle data only when they are deleted. 
Default is False - :param convert_at_end: Boolean to convert npy files to netcdf at end of run. Default is True - :param tempwritedir: directories to write temporary files to during executing. - Default is out-XXXXXX where Xs are random capitals. Files for individual - processors are written to subdirectories 0, 1, 2 etc under tempwritedir - :param pset_info: dictionary of info on the ParticleSet, stored in tempwritedir/XX/pset_info.npy, - used to create NetCDF file from npy-files. """ - def __init__(self, name, particleset, outputdt=np.infty, write_ondelete=False, convert_at_end=True, - tempwritedir=None, pset_info=None): + def __init__(self, name, particleset, outputdt=np.infty, chunks=None, write_ondelete=False): super(ParticleFileSOA, self).__init__(name=name, particleset=particleset, outputdt=outputdt, - write_ondelete=write_ondelete, convert_at_end=convert_at_end, - tempwritedir=tempwritedir, pset_info=pset_info) + chunks=chunks, write_ondelete=write_ondelete) def __del__(self): super(ParticleFileSOA, self).__del__() @@ -45,136 +29,4 @@ def _reserved_var_names(self): """ returns the reserved dimension names not to be written just once. """ - return ['time', 'lat', 'lon', 'depth', 'id'] # , 'index' - - def _create_trajectory_records(self, coords): - super(ParticleFileSOA, self)._create_trajectory_records(coords=coords) - - def get_pset_info_attributes(self): - """ - returns the main attributes of the pset_info.npy file. - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - """ - attributes = ['name', 'var_names', 'var_dtypes', 'var_names_once', 'var_dtypes_once', - 'time_origin', 'lonlatdepth_dtype', 'file_list', 'file_list_once', - 'parcels_mesh', 'metadata'] - return attributes - - def read_from_npy(self, file_list, n_timesteps, var, dtype): - """ - Read NPY-files for one variable using a loop over all files. - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - - :param file_list: List that contains all file names in the output directory - :param n_timesteps: Dictionary with (for each particle) number of time steps that were written in out directory - :param var: name of the variable to read - """ - max_timesteps = max(n_timesteps.values()) if n_timesteps.keys() else 0 - fill_value = self.fill_value_map[dtype] - data = fill_value * np.ones((len(n_timesteps), max_timesteps), dtype=dtype) - time_index = np.zeros(len(n_timesteps)) - id_index = {} - count = 0 - for i in sorted(n_timesteps.keys()): - id_index[i] = count - count += 1 - - # loop over all files - for npyfile in file_list: - try: - with gzip.open(npyfile, 'rb') as f: - data_dict = np.load(f, allow_pickle=True).item() - except NameError: - raise RuntimeError('Cannot combine npy files into netcdf file because your ParticleFile is ' - 'still open on interpreter shutdown.\nYou can use ' - '"parcels_convert_npydir_to_netcdf %s" to convert these to ' - 'a NetCDF file yourself.\nTo avoid this error, make sure you ' - 'close() your ParticleFile at the end of your script.' 
% self.tempwritedir) - for ii, i in enumerate(data_dict["id"]): - id_ind = id_index[i] - t_ind = int(time_index[id_ind]) if 'once' not in file_list[0] else 0 - data[id_ind, t_ind] = data_dict[var][ii] - time_index[id_ind] = time_index[id_ind] + 1 - - if dtype == np.bool_: - data = data.astype(np.bool_) - # remove rows and columns that are completely filled with nan values - return data[time_index > 0, :] - - def export(self): - """ - Exports outputs in temporary NPY-files to output file (either netcdf or zarr) - - Attention: - For ParticleSet structures other than SoA, and structures where ID != index, this has to be overridden. - """ - - if MPI: - # The export can only start when all threads are done. - MPI.COMM_WORLD.Barrier() - if MPI.COMM_WORLD.Get_rank() > 0: - return # export only on threat 0 - - # Create dictionary to translate datatypes and fill_values - self.fmt_map = {np.float16: 'f2', np.float32: 'f4', np.float64: 'f8', - np.bool_: 'i1', np.int8: 'i1', np.int16: 'i2', - np.int32: 'i4', np.int64: 'i8', np.uint8: 'u1', - np.uint16: 'u2', np.uint32: 'u4', np.uint64: 'u8'} - self.fill_value_map = {np.float16: np.nan, np.float32: np.nan, np.float64: np.nan, - np.bool_: np.iinfo(np.int8).max, np.int8: np.iinfo(np.int8).max, - np.int16: np.iinfo(np.int16).max, np.int32: np.iinfo(np.int32).max, - np.int64: np.iinfo(np.int64).max, np.uint8: np.iinfo(np.uint8).max, - np.uint16: np.iinfo(np.uint16).max, np.uint32: np.iinfo(np.uint32).max, - np.uint64: np.iinfo(np.uint64).max} - - # Retrieve all temporary writing directories and sort them in numerical order - temp_names = sorted(glob(os.path.join("%s" % self.tempwritedir_base, "*")), - key=lambda x: int(os.path.basename(x))) - - if len(temp_names) == 0: - raise RuntimeError("No npy files found in %s" % self.tempwritedir_base) - - n_timesteps = {} - global_file_list = [] - if len(self.var_names_once) > 0: - global_file_list_once = [] - for tempwritedir in temp_names: - if os.path.exists(tempwritedir): - pset_info_local = np.load(os.path.join(tempwritedir, 'pset_info.npy'), allow_pickle=True).item() - for npyfile in pset_info_local['file_list']: - with gzip.open(npyfile, 'rb') as f: - tmp_dict = np.load(f, allow_pickle=True).item() - for i in tmp_dict['id']: - if i in n_timesteps: - n_timesteps[i] += 1 - else: - n_timesteps[i] = 1 - global_file_list += pset_info_local['file_list'] - if len(self.var_names_once) > 0: - global_file_list_once += pset_info_local['file_list_once'] - - ds = xr.Dataset(attrs=pset_info_local['metadata']) - for var, dtype in zip(self.var_names, self.var_dtypes): - data = self.read_from_npy(global_file_list, n_timesteps, var, dtype) - if var == self.var_names[0]: - self.open_output_file(data.shape) - varout = 'z' if var == 'depth' else var - varout = 'trajectory' if varout == 'id' else varout - ds[varout] = xr.DataArray(data=data, dims=["traj", "obs"], attrs=self.attrs[varout]) - - if len(self.var_names_once) > 0: - n_timesteps_once = {} - for i in n_timesteps: - n_timesteps_once[i] = 1 - for var in self.var_names_once: - data = self.read_from_npy(global_file_list_once, n_timesteps_once, var, dtype) - ds[var] = xr.DataArray(data=data.flatten(), dims=["traj"], attrs=self.attrs[var]) - - if 'zarr' in self.outputformat: - ds.to_zarr(self.fname) - else: - ds.to_netcdf(self.fname) + return ['time', 'lat', 'lon', 'depth', 'id'] diff --git a/parcels/particleset/baseparticleset.py b/parcels/particleset/baseparticleset.py index 9e5c99807..20f18e681 100644 --- a/parcels/particleset/baseparticleset.py +++ 
b/parcels/particleset/baseparticleset.py @@ -436,9 +436,7 @@ def execute(self, pyfunc=AdvectionRK4, pyfunc_inter=None, endtime=None, runtime= if verbose_progress is None and time_module.time() - walltime_start > 10: # Showing progressbar if runtime > 10 seconds if output_file: - logger.info('Temporary output files are stored in %s.' % output_file.tempwritedir_base) - logger.info('You can use "parcels_convert_npydir_to_netcdf %s" to convert these ' - 'to a NetCDF file during the run.' % output_file.tempwritedir_base) + logger.info('Output files are stored in %s.' % output_file.fname) pbar = self.__create_progressbar(_starttime, endtime) verbose_progress = True @@ -487,7 +485,7 @@ def execute(self, pyfunc=AdvectionRK4, pyfunc_inter=None, endtime=None, runtime= if hasattr(fld, 'to_write') and fld.to_write: if fld.grid.tdim > 1: raise RuntimeError('Field writing during execution only works for Fields with one snapshot in time') - fldfilename = str(output_file.name).replace('.nc', '_%.4d' % fld.to_write) + fldfilename = str(output_file.fname).replace('.zarr', '_%.4d' % fld.to_write) fld.write(fldfilename) fld.to_write += 1 if abs(time - next_output) < tol: diff --git a/parcels/particleset/particlesetaos.py b/parcels/particleset/particlesetaos.py index ffde9c293..a5f584a72 100644 --- a/parcels/particleset/particlesetaos.py +++ b/parcels/particleset/particlesetaos.py @@ -507,9 +507,9 @@ def monte_carlo_sample(cls, start_field, size, mode='monte_carlo'): @classmethod def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime=None, repeatdt=None, lonlatdepth_dtype=None, **kwargs): - """Initialise the ParticleSet from a netcdf ParticleFile. - This creates a new ParticleSet based on the last locations and time of all particles - in the netcdf ParticleFile. Particle IDs are preserved if restart=True + """Initialise the ParticleSet from a zarr ParticleFile. + This creates a new ParticleSet based on locations of all particles written + in a zarr ParticleFile at a certain time. Particle IDs are preserved if restart=True :param fieldset: :mod:`parcels.fieldset.FieldSet` object from which to sample velocity :param pclass: mod:`parcels.particle.JITParticle` or :mod:`parcels.particle.ScipyParticle` @@ -517,6 +517,9 @@ def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime :param filename: Name of the particlefile from which to read initial conditions :param restart: Boolean to signal if pset is used for a restart (default is True). In that case, Particle IDs are preserved. + :param restarttime: time at which the Particles will be restarted. Default is the last time written. + Alternatively, restarttime could be a time value (including np.datetime64) or + a callable function such as np.nanmin. The last is useful when running with dt < 0. :param repeatdt: Optional interval (in seconds) on which to repeat the release of the ParticleSet :param lonlatdepth_dtype: Floating precision for lon, lat, depth particle coordinates. It is either np.float32 or np.float64. Default is np.float32 if fieldset.U.interp_method is 'linear' @@ -527,7 +530,7 @@ def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime 'setting a new repeatdt will start particles from the _new_ particle ' 'locations.' 
% filename) - pfile = xr.open_dataset(str(filename), decode_cf=True) + pfile = xr.open_zarr(str(filename)) pfile_vars = [v for v in pfile.data_vars] vars = {} @@ -535,7 +538,7 @@ def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime for v in pclass.getPType().variables: if v.name in pfile_vars: vars[v.name] = np.ma.filled(pfile.variables[v.name], np.nan) - elif v.name not in ['xi', 'yi', 'zi', 'ti', 'dt', '_next_dt', 'depth', 'id', 'fileid', 'state'] \ + elif v.name not in ['xi', 'yi', 'zi', 'ti', 'dt', '_next_dt', 'depth', 'id', 'once_written', 'state'] \ and v.to_write: raise RuntimeError('Variable %s is in pclass but not in the particlefile' % v.name) to_write[v.name] = v.to_write @@ -570,17 +573,6 @@ def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime depth=vars['depth'], time=vars['time'], pid_orig=vars['id'], lonlatdepth_dtype=lonlatdepth_dtype, repeatdt=repeatdt, **kwargs) - def to_dict(self, pfile, time, deleted_only=False): - """ - Convert all Particle data from one time step to a python dictionary. - :param pfile: ParticleFile object requesting the conversion - :param time: Time at which to write ParticleSet - :param deleted_only: Flag to write only the deleted Particles - returns two dictionaries: one for all variables to be written each outputdt, and one for all variables to be written once - """ - return self._collection.toDictionary(pfile=pfile, time=time, - deleted_only=deleted_only) - def __iadd__(self, particles): """Add particles to the ParticleSet. Note that this is an incremental add, the particles will be added to the ParticleSet diff --git a/parcels/particleset/particlesetsoa.py b/parcels/particleset/particlesetsoa.py index 85a3425fe..9b376799d 100644 --- a/parcels/particleset/particlesetsoa.py +++ b/parcels/particleset/particlesetsoa.py @@ -414,9 +414,9 @@ def monte_carlo_sample(cls, start_field, size, mode='monte_carlo'): @classmethod def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime=None, repeatdt=None, lonlatdepth_dtype=None, **kwargs): - """Initialise the ParticleSet from a netcdf ParticleFile. + """Initialise the ParticleSet from a zarr ParticleFile. This creates a new ParticleSet based on locations of all particles written - in a netcdf ParticleFile at a certain time. Particle IDs are preserved if restart=True + in a zarr ParticleFile at a certain time. Particle IDs are preserved if restart=True :param fieldset: :mod:`parcels.fieldset.FieldSet` object from which to sample velocity :param pclass: mod:`parcels.particle.JITParticle` or :mod:`parcels.particle.ScipyParticle` @@ -438,7 +438,7 @@ def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime 'setting a new repeatdt will start particles from the _new_ particle ' 'locations.' 
% filename) - pfile = xr.open_dataset(str(filename), decode_cf=True) + pfile = xr.open_zarr(str(filename)) pfile_vars = [v for v in pfile.data_vars] vars = {} @@ -446,7 +446,7 @@ def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime for v in pclass.getPType().variables: if v.name in pfile_vars: vars[v.name] = np.ma.filled(pfile.variables[v.name], np.nan) - elif v.name not in ['xi', 'yi', 'zi', 'ti', 'dt', '_next_dt', 'depth', 'id', 'fileid', 'state'] \ + elif v.name not in ['xi', 'yi', 'zi', 'ti', 'dt', '_next_dt', 'depth', 'id', 'once_written', 'state'] \ and v.to_write: raise RuntimeError('Variable %s is in pclass but not in the particlefile' % v.name) to_write[v.name] = v.to_write @@ -481,17 +481,6 @@ def from_particlefile(cls, fieldset, pclass, filename, restart=True, restarttime depth=vars['depth'], time=vars['time'], pid_orig=vars['id'], lonlatdepth_dtype=lonlatdepth_dtype, repeatdt=repeatdt, **kwargs) - def to_dict(self, pfile, time, deleted_only=False): - """ - Convert all Particle data from one time step to a python dictionary. - :param pfile: ParticleFile object requesting the conversion - :param time: Time at which to write ParticleSet - :param deleted_only: Flag to write only the deleted Particles - returns two dictionaries: one for all variables to be written each outputdt, and one for all variables to be written once - """ - return self._collection.toDictionary(pfile=pfile, time=time, - deleted_only=deleted_only) - def compute_neighbor_tree(self, time, dt): active_mask = self.active_particles_mask(time, dt) diff --git a/parcels/scripts/convert_npydir_to_netcdf.py b/parcels/scripts/convert_npydir_to_netcdf.py deleted file mode 100644 index 53d0c8a78..000000000 --- a/parcels/scripts/convert_npydir_to_netcdf.py +++ /dev/null @@ -1,55 +0,0 @@ -from argparse import ArgumentParser -from glob import glob -from os import path - -import numpy as np - -# == here those classes need to be impported to parse available ParticleFile classes and create the type from its name == # -from parcels import ParticleFile, ParticleFileSOA, ParticleFileAOS # NOQA - - -def convert_npydir_to_netcdf(tempwritedir_base, delete_tempfiles=False, pfile_class=None): - """Convert npy files in tempwritedir to a NetCDF file - :param tempwritedir_base: directory where the directories for temporary npy files - are stored (can be obtained from ParticleFile.tempwritedir_base attribute) - """ - - tempwritedir = sorted(glob(path.join("%s" % tempwritedir_base, "*")), - key=lambda x: int(path.basename(x)))[0] - pyset_file = path.join(tempwritedir, 'pset_info.npy') - if not path.isdir(tempwritedir): - raise ValueError('Output directory "%s" does not exist' % tempwritedir) - if not path.isfile(pyset_file): - raise ValueError('Output directory "%s" does not contain a pset_info.npy file' % tempwritedir) - - pset_info = np.load(pyset_file, allow_pickle=True).item() - pfconstructor = ParticleFile if pfile_class is None else pfile_class - pfile = pfconstructor(None, None, pset_info=pset_info, tempwritedir=tempwritedir_base, convert_at_end=False) - pfile.close(delete_tempfiles) - - -def main(tempwritedir_base=None, delete_tempfiles=False): - if tempwritedir_base is None: - p = ArgumentParser(description="""Script to convert temporary npy output files to NetCDF""") - p.add_argument('tempwritedir', help='Name of directory where temporary npy files are stored ' - '(not including numbered subdirectories)') - p.add_argument('-d', '--delete_tempfiles', default=False, - help='Flag to delete temporary files at 
end of call (default False)') - p.add_argument('-c', '--pfclass_name', default='ParticleFileSOA', - help='Class name of the stored particle file (default ParticleFileSOA)') - args = p.parse_args() - tempwritedir_base = args.tempwritedir - pfclass = ParticleFile - if hasattr(args, 'delete_tempfiles'): - delete_tempfiles = args.delete_tempfiles - if hasattr(args, 'pfclass_name'): - try: - pfclass = locals()[args.pfclass_name] - except: - pfclass = ParticleFile - - convert_npydir_to_netcdf(tempwritedir_base, delete_tempfiles, pfile_class=pfclass) - - -if __name__ == "__main__": - main() diff --git a/parcels/scripts/plottrajectoriesfile.py b/parcels/scripts/plottrajectoriesfile.py index 0d181c4e4..f215c43db 100644 --- a/parcels/scripts/plottrajectoriesfile.py +++ b/parcels/scripts/plottrajectoriesfile.py @@ -38,9 +38,9 @@ def plotTrajectoriesFile(filename, mode='2d', tracerfile=None, tracerfield='P', environ["HDF5_USE_FILE_LOCKING"] = "FALSE" try: - pfile = xr.open_dataset(str(filename), decode_cf=True) + pfile = xr.open_zarr(str(filename), decode_cf=True) except: - pfile = xr.open_dataset(str(filename), decode_cf=False) + pfile = xr.open_zarr(str(filename), decode_cf=False) lon = np.ma.filled(pfile.variables['lon'], np.nan) lat = np.ma.filled(pfile.variables['lat'], np.nan) time = np.ma.filled(pfile.variables['time'], np.nan) @@ -71,7 +71,7 @@ def plotTrajectoriesFile(filename, mode='2d', tracerfile=None, tracerfield='P', if mode == '3d': from mpl_toolkits.mplot3d import Axes3D # noqa plt.clf() # clear the figure - ax = fig.gca(projection='3d') + ax = plt.axes(projection='3d') for p in range(len(lon)): ax.plot(lon[p, :], lat[p, :], z[p, :], '.-') ax.set_xlabel('Longitude') diff --git a/tests/test_advection.py b/tests/test_advection.py index 32c63aabd..f981c17f2 100644 --- a/tests/test_advection.py +++ b/tests/test_advection.py @@ -6,9 +6,9 @@ import numpy as np import pytest import math -from netCDF4 import Dataset from datetime import timedelta as delta from parcels import logger +import xarray as xr pset_modes = ['soa', 'aos'] ptype = {'scipy': ScipyParticle, 'jit': JITParticle} @@ -489,21 +489,18 @@ def test_uniform_analytical(pset_mode, mode, u, v, w, direction, tmpdir): x0, y0, z0 = 6.1, 6.2, 20 pset = pset_type[pset_mode]['pset'](fieldset, pclass=ptype[mode], lon=x0, lat=y0, depth=z0) - outfile_path = tmpdir.join("uniformanalytical.nc") - outfile = pset.ParticleFile(name=outfile_path, outputdt=1) + outfile_path = tmpdir.join("uniformanalytical.zarr") + outfile = pset.ParticleFile(name=outfile_path, outputdt=1, chunks=(1, 1)) pset.execute(AdvectionAnalytical, runtime=4, dt=direction, output_file=outfile) - outfile.close() assert np.abs(pset.lon - x0 - 4 * u * direction) < 1e-6 assert np.abs(pset.lat - y0 - 4 * v * direction) < 1e-6 if w: assert np.abs(pset.depth - z0 - 4 * w * direction) < 1e-4 - dataset = Dataset(outfile_path, 'r', 'NETCDF4') - times = dataset.variables['time'][:] - timeref = direction * np.arange(0, 5) - logger.info("analytical - time: {}".format(times)) - logger.info("analytical - reference: {}".format(timeref)) - assert np.allclose(times, timeref) - lons = dataset.variables['lon'][:] + ds = xr.open_zarr(outfile_path, mask_and_scale=False) + times = ds['time'][:].values.astype('timedelta64[s]')[0] + timeref = direction * np.arange(0, 5).astype('timedelta64[s]') + assert np.allclose(times, timeref, atol=np.timedelta64(1, 'ms')) + lons = ds['lon'][:].values assert np.allclose(lons, x0+direction*u*np.arange(0, 5)) diff --git a/tests/test_fieldset.py 
b/tests/test_fieldset.py index e0fd94f16..6a48593be 100644 --- a/tests/test_fieldset.py +++ b/tests/test_fieldset.py @@ -580,7 +580,7 @@ def SampleUV2(particle, fieldset, time): @pytest.mark.parametrize('pset_mode', pset_modes) def test_fieldset_write(pset_mode, tmpdir): - filepath = tmpdir.join("fieldset_write.nc") + filepath = tmpdir.join("fieldset_write.zarr") xdim, ydim = 3, 4 lon = np.linspace(0., 10., xdim, dtype=np.float32) lat = np.linspace(0., 10., ydim, dtype=np.float32) @@ -604,7 +604,7 @@ def UpdateU(particle, fieldset, time): assert fieldset.U.data[0, 1, 0] == 11 - da = xr.open_dataset(str(filepath).replace('.nc', '_0005U.nc')) + da = xr.open_dataset(str(filepath).replace('.zarr', '_0005U.nc')) assert np.allclose(fieldset.U.data, da['U'].values) diff --git a/tests/test_mpirun.py b/tests/test_mpirun.py index 9dece91f8..589fe51c9 100644 --- a/tests/test_mpirun.py +++ b/tests/test_mpirun.py @@ -1,8 +1,9 @@ from os import path, system -from netCDF4 import Dataset +from glob import glob import numpy as np import pytest import sys +import xarray as xr try: from mpi4py import MPI except: @@ -11,27 +12,32 @@ @pytest.mark.skipif(sys.platform.startswith("darwin"), reason="skipping macOS test as problem with file in pytest") @pytest.mark.parametrize('pset_mode', ['soa', 'aos']) -@pytest.mark.parametrize('repeatdt', [200*86400, 10*86400]) -@pytest.mark.parametrize('maxage', [600*86400, 10*86400]) -def test_mpi_run(pset_mode, tmpdir, repeatdt, maxage): +@pytest.mark.parametrize('repeatdt, maxage', [(20*86400, 600*86400), (10*86400, 10*86400)]) +@pytest.mark.parametrize('nump', [4, 8]) +def test_mpi_run(pset_mode, tmpdir, repeatdt, maxage, nump): if MPI: stommel_file = path.join(path.dirname(__file__), '..', 'parcels', 'examples', 'example_stommel.py') - outputMPI = tmpdir.join('StommelMPI.nc') - outputNoMPI = tmpdir.join('StommelNoMPI.nc') + outputMPI = tmpdir.join('StommelMPI') + outputNoMPI = tmpdir.join('StommelNoMPI.zarr') - system('mpirun -np 2 python %s -p 4 -o %s -r %d -a %d -psm %s' % (stommel_file, outputMPI, repeatdt, maxage, pset_mode)) - system('python %s -p 4 -o %s -r %d -a %d -psm %s' % (stommel_file, outputNoMPI, repeatdt, maxage, pset_mode)) + system('mpirun -np 2 python %s -p %d -o %s -r %d -a %d -psm %s' % (stommel_file, nump, outputMPI, repeatdt, maxage, pset_mode)) + system('python %s -p %d -o %s -r %d -a %d -psm %s' % (stommel_file, nump, outputNoMPI, repeatdt, maxage, pset_mode)) - ncfile1 = Dataset(outputMPI, 'r', 'NETCDF4') - ncfile2 = Dataset(outputNoMPI, 'r', 'NETCDF4') + files = glob(path.join(outputMPI, "proc*")) + ds1 = xr.concat([xr.open_zarr(f) for f in files], dim='trajectory', + compat='no_conflicts', coords='minimal').sortby(['trajectory']) - for v in ncfile2.variables.keys(): - assert np.allclose(ncfile1.variables[v][:], ncfile2.variables[v][:]) + ds2 = xr.open_zarr(outputNoMPI) - for a in ncfile2.ncattrs(): + for v in ds2.variables.keys(): + if v == 'time': + continue # skip because np.allclose does not work well on np.datetime64 + assert np.allclose(ds1.variables[v][:], ds2.variables[v][:], equal_nan=True) + + for a in ds2.attrs: if a != 'parcels_version': - assert getattr(ncfile1, a) == getattr(ncfile2, a) + assert ds1.attrs[a] == ds2.attrs[a] - ncfile1.close() - ncfile2.close() + ds1.close() + ds2.close() diff --git a/tests/test_particle_file.py b/tests/test_particle_file.py index 8ab28b576..c57c637a1 100644 --- a/tests/test_particle_file.py +++ b/tests/test_particle_file.py @@ -6,9 +6,7 @@ import numpy as np import pytest import os -from netCDF4 
import Dataset import cftime -import random as py_random import xarray as xr pset_modes = ['soa', 'aos'] @@ -33,43 +31,10 @@ def fieldset_ficture(xdim=40, ydim=100): return fieldset(xdim=xdim, ydim=ydim) -def close_and_compare_netcdffiles(filepath, ofile, assystemcall=False): - if assystemcall: - os.system('parcels_convert_npydir_to_netcdf %s' % ofile.tempwritedir_base) - else: - import parcels.scripts.convert_npydir_to_netcdf as convert - convert.convert_npydir_to_netcdf(ofile.tempwritedir_base, pfile_class=ofile.__class__) - - engine = 'zarr' if 'zarr' in str(filepath) else 'netcdf4' - ncfile1 = xr.open_dataset(filepath, engine=engine) - - ofile.name = filepath + 'b.nc' - ofile.export() - - if engine == 'zarr': - assert os.path.getsize(filepath) < os.path.getsize(ofile.name) # zarr expected to be smaller filesize - else: - assert os.path.getsize(filepath) == os.path.getsize(ofile.name) - - ncfile2 = xr.open_dataset(filepath + 'b.nc') - for v in ncfile2.keys(): - if v == 'time': - assert np.allclose(ncfile1[v].values, ncfile2[v].values, atol=np.timedelta64(1, 's'), equal_nan=True) - else: - assert np.allclose(ncfile1[v].values, ncfile2[v].values, equal_nan=True) - - for a in ncfile2.attrs: - if a != 'parcels_version': - assert getattr(ncfile1, a) == getattr(ncfile2, a) - - ncfile2.close() - return ncfile1 - - @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) def test_pfile_array_remove_particles(fieldset, pset_mode, mode, tmpdir, npart=10): - filepath = tmpdir.join("pfile_array_remove_particles.nc") + filepath = tmpdir.join("pfile_array_remove_particles.zarr") pset = pset_type[pset_mode]['pset'](fieldset, pclass=ptype[mode], lon=np.linspace(0, 1, npart), lat=0.5*np.ones(npart), time=0) @@ -79,16 +44,17 @@ def test_pfile_array_remove_particles(fieldset, pset_mode, mode, tmpdir, npart=1 for p in pset: p.time = 1 pfile.write(pset, 1) - ncfile = close_and_compare_netcdffiles(filepath, pfile) - timearr = ncfile.variables['time'][:] + + ds = xr.open_zarr(filepath) + timearr = ds['time'][:] assert (np.isnat(timearr[3, 1])) and (np.isfinite(timearr[3, 0])) - ncfile.close() + ds.close() @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) def test_pfile_set_towrite_False(fieldset, pset_mode, mode, tmpdir, npart=10): - filepath = tmpdir.join("pfile_set_towrite_False.nc") + filepath = tmpdir.join("pfile_set_towrite_False.zarr") pset = pset_type[pset_mode]['pset'](fieldset, pclass=ptype[mode], lon=np.linspace(0, 1, npart), lat=0.5*np.ones(npart)) @@ -100,11 +66,12 @@ def Update_lon(particle, fieldset, time): particle.lon += 0.1 pset.execute(Update_lon, runtime=10, output_file=pfile) - ncfile = close_and_compare_netcdffiles(filepath, pfile) - assert 'time' in ncfile.variables - assert 'depth' not in ncfile.variables - assert 'lat' not in ncfile.variables - ncfile.close() + + ds = xr.open_zarr(filepath) + assert 'time' in ds + assert 'depth' not in ds + assert 'lat' not in ds + ds.close() # For pytest purposes, we need to reset to original status pset.set_variable_write_status('depth', True) @@ -113,28 +80,35 @@ def Update_lon(particle, fieldset, time): @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) -def test_pfile_array_remove_all_particles(fieldset, pset_mode, mode, tmpdir, npart=10): +@pytest.mark.parametrize('chunks_obs', [1, None]) +def test_pfile_array_remove_all_particles(fieldset, pset_mode, mode, chunks_obs, tmpdir, npart=10): - filepath = 
tmpdir.join("pfile_array_remove_particles.nc") + filepath = tmpdir.join("pfile_array_remove_particles.zarr") pset = pset_type[pset_mode]['pset'](fieldset, pclass=ptype[mode], lon=np.linspace(0, 1, npart), lat=0.5*np.ones(npart), time=0) - pfile = pset.ParticleFile(filepath) + chunks = (npart, chunks_obs) if chunks_obs else None + pfile = pset.ParticleFile(filepath, chunks=chunks) pfile.write(pset, 0) for _ in range(npart): pset.remove_indices(-1) pfile.write(pset, 1) pfile.write(pset, 2) - ncfile = close_and_compare_netcdffiles(filepath, pfile) - assert ncfile.variables['time'][:].shape == (npart, 1) - ncfile.close() + + ds = xr.open_zarr(filepath) + assert np.allclose(ds['time'][:, 0], np.timedelta64(0, 's'), atol=np.timedelta64(1, 'ms')) + if chunks_obs is not None: + assert ds['time'][:].shape == chunks + else: + assert ds['time'][:].shape[0] == npart + assert np.all(np.isnan(ds['time'][:, 1:])) + ds.close() @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) -@pytest.mark.parametrize('assystemcall', [True, False]) -def test_variable_written_ondelete(fieldset, pset_mode, mode, tmpdir, assystemcall, npart=3): - filepath = tmpdir.join("pfile_on_delete_written_variables.nc") +def test_variable_written_ondelete(fieldset, pset_mode, mode, tmpdir, npart=3): + filepath = tmpdir.join("pfile_on_delete_written_variables.zarr") def move_west(particle, fieldset, time): tmp1, tmp2 = fieldset.UV[time, particle.depth, particle.lat, particle.lon] # to trigger out-of-bounds error @@ -152,22 +126,22 @@ def DeleteP(particle, fieldset, time): pset = pset_type[pset_mode]['pset'](fieldset, pclass=ptype[mode], lon=lon, lat=lat) - outfile = pset.ParticleFile(name=filepath, write_ondelete=True) + outfile = pset.ParticleFile(name=filepath, write_ondelete=True, chunks=(len(pset), 1)) outfile.add_metadata('runtime', runtime) pset.execute(move_west, runtime=runtime, dt=dt, output_file=outfile, recovery={ErrorCode.ErrorOutOfBounds: DeleteP}) - ncfile = close_and_compare_netcdffiles(filepath, outfile, assystemcall=assystemcall) - assert ncfile.runtime == runtime - lon = ncfile.variables['lon'][:] - assert (lon.size == noutside) - ncfile.close() + ds = xr.open_zarr(filepath) + assert ds.runtime == runtime + lon = ds['lon'][:] + assert (sum(np.isfinite(lon)) == noutside) + ds.close() @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) def test_variable_write_double(fieldset, pset_mode, mode, tmpdir): - filepath = tmpdir.join("pfile_variable_write_double.nc") + filepath = tmpdir.join("pfile_variable_write_double.zarr") def Update_lon(particle, fieldset, time): particle.lon += 0.1 @@ -176,19 +150,19 @@ def Update_lon(particle, fieldset, time): ofile = pset.ParticleFile(name=filepath, outputdt=0.00001) pset.execute(pset.Kernel(Update_lon), endtime=0.001, dt=0.00001, output_file=ofile) - ncfile = close_and_compare_netcdffiles(filepath, ofile) - lons = ncfile.variables['lon'][:] + ds = xr.open_zarr(filepath) + lons = ds['lon'][:] assert (isinstance(lons.values[0, 0], np.float64)) - ncfile.close() + ds.close() @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) -def test_write_dtypes_pfile(fieldset, mode, pset_mode, tmpdir): - filepath = tmpdir.join("pfile_dtypes.nc") +def test_write_dtypes_pfile(fieldset, pset_mode, mode, tmpdir): + filepath = tmpdir.join("pfile_dtypes.zarr") dtypes = ['float32', 'float64', 'int32', 'uint32', 'int64', 'uint64'] - if mode == 'scipy' or pset_mode 
== 'soa': + if mode == 'scipy': dtypes.extend(['bool_', 'int8', 'uint8', 'int16', 'uint16']) # Not implemented in AoS JIT class MyParticle(ptype[mode]): @@ -196,21 +170,20 @@ class MyParticle(ptype[mode]): # need an exec() here because we need to dynamically set the variable name exec(f'v_{d} = Variable("v_{d}", dtype=np.{d}, initial=0.)') - pset = pset_type[pset_mode]['pset'](fieldset, pclass=MyParticle, lon=0, lat=0) + pset = pset_type[pset_mode]['pset'](fieldset, pclass=MyParticle, lon=0, lat=0, time=0) pfile = pset.ParticleFile(name=filepath, outputdt=1) pfile.write(pset, 0) - pfile.close() - ncfile = Dataset(filepath, 'r', 'NETCDF4') # using netCDF4.Dataset here because xarray does not observe all dtypes correctly + + ds = xr.open_zarr(filepath, mask_and_scale=False) # Note masking issue at https://stackoverflow.com/questions/68460507/xarray-loading-int-data-as-float for d in dtypes: - nc_fmt = d if d != 'bool_' else 'i1' - assert ncfile.variables[f'v_{d}'].dtype == nc_fmt + assert ds[f'v_{d}'].dtype == d @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) @pytest.mark.parametrize('npart', [1, 2, 5]) def test_variable_written_once(fieldset, pset_mode, mode, tmpdir, npart): - filepath = tmpdir.join("pfile_once_written_variables.nc") + filepath = tmpdir.join("pfile_once_written_variables.zarr") def Update_v(particle, fieldset, time): particle.v_once += 1. @@ -227,11 +200,11 @@ class MyParticle(ptype[mode]): pset.execute(pset.Kernel(Update_v), endtime=1, dt=0.1, output_file=ofile) assert np.allclose(pset.v_once - time - pset.age*10, 0, atol=1e-5) - ncfile = close_and_compare_netcdffiles(filepath, ofile) - vfile = np.ma.filled(ncfile.variables['v_once'][:], np.nan) + ds = xr.open_zarr(filepath) + vfile = np.ma.filled(ds['v_once'][:], np.nan) assert (vfile.shape == (npart, )) assert np.allclose(vfile, time) - ncfile.close() + ds.close() @pytest.mark.parametrize('type', ['repeatdt', 'timearr']) @@ -253,7 +226,7 @@ class MyParticle(ptype[mode]): elif type == 'timearr': pset = pset_type[pset_mode]['pset'](fieldset, lon=np.zeros(runtime), lat=np.zeros(runtime), pclass=MyParticle, time=list(range(runtime))) outfilepath = tmpdir.join("pfile_repeated_release.zarr") - pfile = pset.ParticleFile(outfilepath, outputdt=abs(dt)) + pfile = pset.ParticleFile(outfilepath, outputdt=abs(dt), chunks=(1, 1)) def IncrLon(particle, fieldset, time): particle.sample_var += 1. 
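Reviewer note: the updated tests in this file all read the output back with xarray's zarr reader instead of netCDF4.Dataset. For anyone trying the branch locally, a minimal sketch of that read pattern is below (the store name and the variables picked out are illustrative, not taken from this diff):

    import numpy as np
    import xarray as xr

    # Open a ParticleFile zarr store directly with xarray.
    # mask_and_scale=False keeps integer variables (e.g. trajectory ids)
    # from being promoted to float, as noted in test_write_dtypes_pfile.
    ds = xr.open_zarr("my_particles.zarr", mask_and_scale=False)

    lon = ds["lon"].values          # shaped (traj, obs)
    traj = ds["trajectory"].values  # int64 particle ids

    # Observations that were never written are padded with NaN (NaT for time),
    # so reductions should skip them explicitly.
    mean_lon = np.nanmean(lon, axis=1)
    ds.close()
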
@@ -262,8 +235,8 @@ def IncrLon(particle, fieldset, time): for i in range(runtime): pset.execute(IncrLon, dt=dt, runtime=1., output_file=pfile) - ncfile = close_and_compare_netcdffiles(outfilepath, pfile) - samplevar = ncfile.variables['sample_var'][:] + ds = xr.open_zarr(outfilepath) + samplevar = ds['sample_var'][:] if type == 'repeatdt': assert samplevar.shape == (runtime // repeatdt+1, min(maxvar+1, runtime)+1) assert np.allclose(pset.sample_var, np.arange(maxvar, -1, -repeatdt)) @@ -274,13 +247,13 @@ def IncrLon(particle, fieldset, time): assert np.allclose([p for p in samplevar[:, k] if np.isfinite(p)], k) filesize = os.path.getsize(str(outfilepath)) assert filesize < 1024 * 65 # test that chunking leads to filesize less than 65KB - ncfile.close() + ds.close() @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) def test_write_timebackward(fieldset, pset_mode, mode, tmpdir): - outfilepath = tmpdir.join("pfile_write_timebackward.nc") + outfilepath = tmpdir.join("pfile_write_timebackward.zarr") def Update_lon(particle, fieldset, time): particle.lon -= 0.1 * particle.dt @@ -290,9 +263,11 @@ def Update_lon(particle, fieldset, time): pfile = pset.ParticleFile(name=outfilepath, outputdt=1.) pset.execute(pset.Kernel(Update_lon), runtime=4, dt=-1., output_file=pfile) - ncfile = close_and_compare_netcdffiles(outfilepath, pfile) - trajs = ncfile.variables['trajectory'][:, 0] - assert np.all(np.diff(trajs) > 0) # all particles written in order of traj ID + ds = xr.open_zarr(outfilepath) + trajs = ds['trajectory'][:] + assert trajs.values.dtype == 'int64' + assert np.all(np.diff(trajs.values) < 0) # all particles written in order of release + ds.close() def test_set_calendar(): @@ -302,32 +277,12 @@ def test_set_calendar(): assert _set_calendar('np_datetime64') == 'standard' -@pytest.mark.parametrize('pset_mode', pset_modes) -def test_error_duplicate_outputdir(fieldset, tmpdir, pset_mode): - outfilepath = tmpdir.join("error_duplicate_outputdir.nc") - pset1 = pset_type[pset_mode]['pset'](fieldset, pclass=JITParticle, lat=0, lon=0) - pset2 = pset_type[pset_mode]['pset'](fieldset, pclass=JITParticle, lat=0, lon=0) - - py_random.seed(1234) - pfile1 = pset1.ParticleFile(name=outfilepath, outputdt=1., convert_at_end=False) - - py_random.seed(1234) - error_thrown = False - try: - pset2.ParticleFile(name=outfilepath, outputdt=1., convert_at_end=False) - except IOError: - error_thrown = True - assert error_thrown - - pfile1.delete_tempwritedir() - - @pytest.mark.parametrize('pset_mode', pset_modes) @pytest.mark.parametrize('mode', ['scipy', 'jit']) def test_reset_dt(fieldset, pset_mode, mode, tmpdir): # Assert that p.dt gets reset when a write_time is not a multiple of dt # for p.dt=0.02 to reach outputdt=0.05 and endtime=0.1, the steps should be [0.2, 0.2, 0.1, 0.2, 0.2, 0.1], resulting in 6 kernel executions - filepath = tmpdir.join("pfile_reset_dt.nc") + filepath = tmpdir.join("pfile_reset_dt.zarr") def Update_lon(particle, fieldset, time): particle.lon += 0.1 diff --git a/tests/test_particle_sets.py b/tests/test_particle_sets.py index c23bcb3d1..e6648a955 100644 --- a/tests/test_particle_sets.py +++ b/tests/test_particle_sets.py @@ -70,7 +70,7 @@ class MyParticle(ptype[mode]): @pytest.mark.parametrize('mode', ['scipy', 'jit']) @pytest.mark.parametrize('restart', [True, False]) def test_pset_create_fromparticlefile(fieldset, pset_mode, mode, restart, tmpdir): - filename = tmpdir.join("pset_fromparticlefile.nc") + filename = 
tmpdir.join("pset_fromparticlefile.zarr") lon = np.linspace(0, 1, 10, dtype=np.float32) lat = np.linspace(1, 0, 10, dtype=np.float32) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 62533380c..757a104a0 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -25,7 +25,7 @@ def create_outputfiles(dir, pset_mode): y = (fieldset.U.lat[0] + x, fieldset.U.lat[-1] - x) lat = np.linspace(y[0], y[1], npart) - fp = dir.join("DelayParticle.nc") + fp = dir.join("DelayParticle.zarr") output_file = pset.ParticleFile(name=fp, outputdt=delaytime) for t in range(npart):