
Improve append (#67)
* Substantial improvement to append

* Add data and test notebooks for append

* Get data_vars_mapping from data model

* Add another test notebook

* Update readme to reflect new functionality

* Rebase, and post-rebase tidy
DPeterK authored May 26, 2021
1 parent 19f5dd4 commit a16ee09
Showing 22 changed files with 1,200 additions and 117 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -79,4 +79,7 @@ venv.bak/
# PyCharm
/shelf/
/workspace.xml
.idea
.idea

# VSCode
.vscode/
28 changes: 25 additions & 3 deletions README.md
@@ -161,14 +161,14 @@ append_files = ['file1.nc', 'file2.nc', 'file3.nc', 'file4.nc', 'file5.nc']
data_array_name = 'data'

writer.append(append_files, unlimited_dims, data_array_name,
baseline=append_files[0])
baselines={unlimited_dims: append_files[0]})
```

**Note:** The baseline file is only used to calculate the offset; it is not appended automatically.
If you want its data included, you must also list it in the append files. In the example above,
`append_files[0]` is both the baseline and the first file appended.

**Note:** All such appends with a scalar append dimension must be supplied with a `baseline`
file to calculate the offset, even if an append has already successfully been carried out.
**Note:** All appends with a scalar append dimension must be supplied with a `baseline`
file to calculate the offset, even if an append has already successfully been carried out.
You must also specify one baseline file per scalar append dimension, in a dictionary of
`{"append_dim": baseline_file}`.
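
For instance, here is a hypothetical sketch with two scalar append dimensions. The dimension
and file names are purely illustrative, and it assumes `append` accepts a list of append
dimension names:

```python
append_files = ['run1.nc', 'run2.nc', 'run3.nc']
data_array_name = 'data'

# Hypothetical: two scalar append dimensions, each keyed to its own baseline file,
# which is used only to calculate that dimension's offset. As noted above, 'run1.nc'
# is also included in append_files so that its data is appended too.
writer.append(append_files, ['time', 'forecast_reference_time'], data_array_name,
              baselines={'time': 'run1.nc',
                         'forecast_reference_time': 'run1.nc'})
```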

If you try to perform an append along a scalar dimension without providing a `baseline`
file to calculate the offset, you will encounter an error message:
@@ -177,6 +177,28 @@ file to calculate the offset, you will encounter an error message:
ValueError: Cannot determine scalar step without a baseline dataset.
```

##### 3b. Custom offsets between files being appended

You may occasionally need to override the offset between successive files being appended, for example to introduce some padding between files, or to handle unexpectedly short files. This can be done using
the `override_offsets` kwarg to `append`. As with `baselines`, you pass a dictionary mapping each named append dimension to the offset override you wish to apply to that dimension. For example:

```python
append_files = ['file1.nc', 'file2.nc', 'file3_short.nc', 'file4.nc', 'file5.nc']
expected_dim_len = 10
data_array_name = 'data'

writer.append(append_files, unlimited_dims, data_array_name,
override_offsets={unlimited_dims: expected_dim_len})
```

In this case, the third file is shorter than expected (as helpfully indicated in its filename), and the override offset allows us to pad the append dimension with missing data where the file falls short. Once the append has completed, we can use the `fill_missing_points` method to fill in the resulting gap in the associated dimension coordinate:

```python
writer.fill_missing_points(unlimited_dims)
```

**Note:** You do not have to provide an override offset for every append dimension; only the dimensions named in `override_offsets` are overridden.
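
For instance, a hypothetical sketch in which only the `time` dimension receives an override
(again assuming `append` accepts a list of append dimension names; `model_level` is illustrative):

```python
# Only 'time' receives an override offset; any other append dimension falls back
# to the offset calculated from the files themselves.
writer.append(append_files, ['time', 'model_level'], data_array_name,
              override_offsets={'time': expected_dim_len})
```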

#### 4. Read Converted Arrays

We can use the `Reader` classes to read our TileDB array with Iris or Xarray:
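
The reader example itself is collapsed in this diff. As a minimal sketch, based on the reader
usage in the test notebook added by this commit (`TileDBReader` and `to_iris` appear below; the
Xarray reader is mentioned but not shown here):

```python
reader = nctotdb.TileDBReader(tiledb_name,
                              array_filepath=array_filepath)
cubes = reader.to_iris()  # load the converted TileDB arrays as Iris cubes
```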
27 changes: 17 additions & 10 deletions nctotdb/data_model.py
@@ -30,6 +30,7 @@ def __init__(self, netcdf_filename):
self.shape = None
self.chunks = None

self._data_vars_mapping = None
self._nc_loaded = False
self._classified = False

@@ -65,7 +66,13 @@ def dataset_open(self):

@property
def data_vars_mapping(self):
return None
if self._data_vars_mapping is None:
self.data_vars_mapping = {metadata_hash(self, n): [self, n] for n in self.data_var_names}
return self._data_vars_mapping

@data_vars_mapping.setter
def data_vars_mapping(self, value):
self._data_vars_mapping = value

def populate(self):
with self.open_netcdf():
@@ -346,7 +353,6 @@ def __init__(self, data_models):
self._data_models = data_models

self._load()
self.verify()

self._primary_data_model = None
self._data_var_names = None
@@ -412,14 +418,19 @@ def data_vars_mapping(self):
def data_vars_mapping(self, value):
self._data_vars_mapping = value

@property
def scalar_coord_names(self):
dm_scalar_coords = []
for dm in self.data_models:
dm_scalar_coords += dm.scalar_coord_names
return list(set(dm_scalar_coords))

def _map_data_vars(self):
"""Create a mapping of data variable names to the data model supplying that data variable."""
dv_mapping = {}
for dm in self.data_models:
if dm is not None:
for name in dm.data_var_names:
hashed_name = metadata_hash(dm, name)
dv_mapping[hashed_name] = [dm, name]
dv_mapping.update(dm.data_vars_mapping)
return dv_mapping

@contextmanager
@@ -461,8 +472,4 @@ def dataset_open(self):
'open' if all the datasets that comprise it are open.
"""
return all([dm.dataset_open() for dm in self.data_models if dm is not None])

def verify(self):
"""Not implemented!"""
pass
return all([dm.dataset_open() for dm in self.data_models if dm is not None])
2 changes: 1 addition & 1 deletion nctotdb/readers/tiledb.py
@@ -481,7 +481,7 @@ def to_iris(self, names=None, handle_nan=None):
iter_groups = self.groups

cubes = []
for group_path, group_array_paths in iter_groups.items():
for _, group_array_paths in iter_groups.items():
dim_paths, data_paths = self._get_arrays_and_dims(group_array_paths)
grid_mapping = self._get_grid_mapping(data_paths[0])
group_coords = self._load_group_dims(dim_paths, grid_mapping)
158 changes: 158 additions & 0 deletions nctotdb/tests/integration/append/test_basic_append.ipynb
@@ -0,0 +1,158 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "68f0e08e-ad4b-4eac-8f1d-66faeb2db68e",
"metadata": {},
"source": [
"# Test basic append\n",
"\n",
"Test basic append operation between two NetCDF datasets with a single append dimension and no scalar coordinates."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eec8e236-439d-43d5-89aa-bd19b668824b",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import tempfile\n",
"\n",
"import nctotdb"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36bcfde3-6c7a-4d15-b8af-f994fbde6744",
"metadata": {},
"outputs": [],
"source": [
"data_path = os.path.join(os.path.dirname(os.path.abspath(\".\")), \"data\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71d1134b-760a-494d-abb7-ee9076014814",
"metadata": {},
"outputs": [],
"source": [
"base_dataset = os.path.join(data_path, \"xy_t0.nc\")\n",
"append_dataset = os.path.join(data_path, \"xy_t1.nc\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad501500-1114-455e-b18c-85c65cc05b7c",
"metadata": {},
"outputs": [],
"source": [
"ncdm = nctotdb.NCDataModel(base_dataset)\n",
"ncdm.populate()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "69fcd77e-cae3-48ca-a669-47bd2c8dc5e1",
"metadata": {},
"outputs": [],
"source": [
"fp = tempfile.TemporaryDirectory()\n",
"array_filepath = fp.name\n",
"array_filepath"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eb3c3ef9-c354-47de-b8a7-28a48d1aed67",
"metadata": {},
"outputs": [],
"source": [
"tiledb_name = \"basic_append\"\n",
"append_dim = \"time\"\n",
"data_array_name = \"data\"\n",
"\n",
"writer = nctotdb.TileDBWriter(ncdm,\n",
" array_filepath=array_filepath,\n",
" array_name=tiledb_name,\n",
" unlimited_dims=append_dim)\n",
"writer.create_domains()"
]
},
{
"cell_type": "markdown",
"id": "0054013c-bbd1-4550-9bca-51395e19d239",
"metadata": {},
"source": [
"## Test 1. Append"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "68e69f75-4621-4c46-ac46-56ec6a0d4dbc",
"metadata": {},
"outputs": [],
"source": [
"writer.append([append_dataset], append_dim, data_array_name, verbose=False)"
]
},
{
"cell_type": "markdown",
"id": "ea593a04-b1b9-4881-85e4-32b461bb045a",
"metadata": {},
"source": [
"## Test 2. Load appended array as Iris cube"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "adb1d462-9fef-44c9-a370-c03cef3715ad",
"metadata": {},
"outputs": [],
"source": [
"reader = nctotdb.TileDBReader(tiledb_name,\n",
" array_filepath=array_filepath)\n",
"reader.to_iris()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "364d71de-7c35-40e8-8df1-6aa93d258740",
"metadata": {},
"outputs": [],
"source": [
"fp.cleanup()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

