
Improve append (#67)
* Substantial improvement to append

* Add data and test notebooks for append

* Get data_vars_mapping from data model

* Add another test notebook

* Update readme to reflect new functionality

* Rebase, and post-rebase tidy
DPeterK authored May 26, 2021
1 parent 19f5dd4 commit a16ee09
Showing 22 changed files with 1,200 additions and 117 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -79,4 +79,7 @@ venv.bak/
# PyCharm
/shelf/
/workspace.xml
.idea
.idea

# VSCode
.vscode/
28 changes: 25 additions & 3 deletions README.md
@@ -161,14 +161,14 @@ append_files = ['file1.nc', 'file2.nc', 'file3.nc', 'file4.nc', 'file5.nc']
data_array_name = 'data'

writer.append(append_files, unlimited_dims, data_array_name,
baseline=append_files[0])
baselines={unlimited_dims: append_files[0]})
```

**Note:** The baseline file is only used to calculate the offset; it is not appended automatically.
If you want its data included, you must also list it in the append files. In the example above,
`append_files[0]` is both the baseline and the first file appended.

**Note:** All such appends with a scalar append dimension must be supplied with a `baseline`
file to calculate the offset, even if an append has already successfully been carried out.
**Note:** All appends with a scalar append dimension must be supplied with a `baseline`
file to calculate the offset, even if an append has already successfully been carried out.
You must also specify one baseline file per scalar append dimension, in a dictionary of
`{"append_dim": baseline_file}`.
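
For instance, here is a hypothetical sketch with two scalar append dimensions. The dimension
and file names are purely illustrative, and it assumes `append` accepts a list of append
dimension names:

```python
append_files = ['run1.nc', 'run2.nc', 'run3.nc']
data_array_name = 'data'

# Hypothetical: two scalar append dimensions, each keyed to its own baseline file,
# which is used only to calculate that dimension's offset. As noted above, 'run1.nc'
# is also included in append_files so that its data is appended too.
writer.append(append_files, ['time', 'forecast_reference_time'], data_array_name,
              baselines={'time': 'run1.nc',
                         'forecast_reference_time': 'run1.nc'})
```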

If you try to perform an append along a scalar dimension without providing a `baseline`
file to calculate the offset, you will encounter an error message:
@@ -177,6 +177,28 @@ file to calculate the offset, you will encounter an error message:
ValueError: Cannot determine scalar step without a baseline dataset.
```

##### 3b. Custom offsets between files being appended

You may occasionally need to override the offset between successive files being appended, for example to introduce some padding between files, or to handle unexpectedly short files. This can be done using
the `override_offsets` kwarg to `append`. As with `baselines`, you pass a dictionary mapping each named append dimension to the offset override you wish to apply to that dimension. For example:

```python
append_files = ['file1.nc', 'file2.nc', 'file3_short.nc', 'file4.nc', 'file5.nc']
expected_dim_len = 10
data_array_name = 'data'

writer.append(append_files, unlimited_dims, data_array_name,
override_offsets={unlimited_dims: expected_dim_len})
```

In this case, the third file is shorter than expected (as helpfully indicated in its filename), and the override offset allows us to pad the append dimension with missing data where the file falls short. Once the append has completed, we can use the `fill_missing_points` method to fill in the resulting gap in the associated dimension coordinate:

```python
writer.fill_missing_points(unlimited_dims)
```

**Note:** You do not have to provide an override offset for every append dimension; only the dimensions named in `override_offsets` are overridden.
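
For instance, a hypothetical sketch in which only the `time` dimension receives an override
(again assuming `append` accepts a list of append dimension names; `model_level` is illustrative):

```python
# Only 'time' receives an override offset; any other append dimension falls back
# to the offset calculated from the files themselves.
writer.append(append_files, ['time', 'model_level'], data_array_name,
              override_offsets={'time': expected_dim_len})
```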

#### 4. Read Converted Arrays

We can use the `Reader` classes to read our TileDB array with Iris or Xarray:
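
The reader example itself is collapsed in this diff. As a minimal sketch, based on the reader
usage in the test notebook added by this commit (`TileDBReader` and `to_iris` appear below; the
Xarray reader is mentioned but not shown here):

```python
reader = nctotdb.TileDBReader(tiledb_name,
                              array_filepath=array_filepath)
cubes = reader.to_iris()  # load the converted TileDB arrays as Iris cubes
```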
27 changes: 17 additions & 10 deletions nctotdb/data_model.py
@@ -30,6 +30,7 @@ def __init__(self, netcdf_filename):
self.shape = None
self.chunks = None

self._data_vars_mapping = None
self._nc_loaded = False
self._classified = False

@@ -65,7 +66,13 @@ def dataset_open(self):

@property
def data_vars_mapping(self):
return None
if self._data_vars_mapping is None:
self.data_vars_mapping = {metadata_hash(self, n): [self, n] for n in self.data_var_names}
return self._data_vars_mapping

@data_vars_mapping.setter
def data_vars_mapping(self, value):
self._data_vars_mapping = value

def populate(self):
with self.open_netcdf():
@@ -346,7 +353,6 @@ def __init__(self, data_models):
self._data_models = data_models

self._load()
self.verify()

self._primary_data_model = None
self._data_var_names = None
@@ -412,14 +418,19 @@ def data_vars_mapping(self):
def data_vars_mapping(self, value):
self._data_vars_mapping = value

@property
def scalar_coord_names(self):
dm_scalar_coords = []
for dm in self.data_models:
dm_scalar_coords += dm.scalar_coord_names
return list(set(dm_scalar_coords))

def _map_data_vars(self):
"""Create a mapping of data variable names to the data model supplying that data variable."""
dv_mapping = {}
for dm in self.data_models:
if dm is not None:
for name in dm.data_var_names:
hashed_name = metadata_hash(dm, name)
dv_mapping[hashed_name] = [dm, name]
dv_mapping.update(dm.data_vars_mapping)
return dv_mapping

@contextmanager
@@ -461,8 +472,4 @@ def dataset_open(self):
'open' if all the datasets that comprise it are open.
"""
return all([dm.dataset_open() for dm in self.data_models if dm is not None])

def verify(self):
"""Not implemented!"""
pass
return all([dm.dataset_open() for dm in self.data_models if dm is not None])
2 changes: 1 addition & 1 deletion nctotdb/readers/tiledb.py
@@ -481,7 +481,7 @@ def to_iris(self, names=None, handle_nan=None):
iter_groups = self.groups

cubes = []
for group_path, group_array_paths in iter_groups.items():
for _, group_array_paths in iter_groups.items():
dim_paths, data_paths = self._get_arrays_and_dims(group_array_paths)
grid_mapping = self._get_grid_mapping(data_paths[0])
group_coords = self._load_group_dims(dim_paths, grid_mapping)
158 changes: 158 additions & 0 deletions nctotdb/tests/integration/append/test_basic_append.ipynb
@@ -0,0 +1,158 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "68f0e08e-ad4b-4eac-8f1d-66faeb2db68e",
"metadata": {},
"source": [
"# Test basic append\n",
"\n",
"Test basic append operation between two NetCDF datasets with a single append dimension and no scalar coordinates."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eec8e236-439d-43d5-89aa-bd19b668824b",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import tempfile\n",
"\n",
"import nctotdb"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "36bcfde3-6c7a-4d15-b8af-f994fbde6744",
"metadata": {},
"outputs": [],
"source": [
"data_path = os.path.join(os.path.dirname(os.path.abspath(\".\")), \"data\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71d1134b-760a-494d-abb7-ee9076014814",
"metadata": {},
"outputs": [],
"source": [
"base_dataset = os.path.join(data_path, \"xy_t0.nc\")\n",
"append_dataset = os.path.join(data_path, \"xy_t1.nc\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad501500-1114-455e-b18c-85c65cc05b7c",
"metadata": {},
"outputs": [],
"source": [
"ncdm = nctotdb.NCDataModel(base_dataset)\n",
"ncdm.populate()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "69fcd77e-cae3-48ca-a669-47bd2c8dc5e1",
"metadata": {},
"outputs": [],
"source": [
"fp = tempfile.TemporaryDirectory()\n",
"array_filepath = fp.name\n",
"array_filepath"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eb3c3ef9-c354-47de-b8a7-28a48d1aed67",
"metadata": {},
"outputs": [],
"source": [
"tiledb_name = \"basic_append\"\n",
"append_dim = \"time\"\n",
"data_array_name = \"data\"\n",
"\n",
"writer = nctotdb.TileDBWriter(ncdm,\n",
" array_filepath=array_filepath,\n",
" array_name=tiledb_name,\n",
" unlimited_dims=append_dim)\n",
"writer.create_domains()"
]
},
{
"cell_type": "markdown",
"id": "0054013c-bbd1-4550-9bca-51395e19d239",
"metadata": {},
"source": [
"## Test 1. Append"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "68e69f75-4621-4c46-ac46-56ec6a0d4dbc",
"metadata": {},
"outputs": [],
"source": [
"writer.append([append_dataset], append_dim, data_array_name, verbose=False)"
]
},
{
"cell_type": "markdown",
"id": "ea593a04-b1b9-4881-85e4-32b461bb045a",
"metadata": {},
"source": [
"## Test 2. Load appended array as Iris cube"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "adb1d462-9fef-44c9-a370-c03cef3715ad",
"metadata": {},
"outputs": [],
"source": [
"reader = nctotdb.TileDBReader(tiledb_name,\n",
" array_filepath=array_filepath)\n",
"reader.to_iris()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "364d71de-7c35-40e8-8df1-6aa93d258740",
"metadata": {},
"outputs": [],
"source": [
"fp.cleanup()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

