From 20c4304d246f6f148f77e13ca99bd58b0cea8d24 Mon Sep 17 00:00:00 2001
From: Jayaram Kancherla <jayaram.kancherla@gmail.com>
Date: Tue, 2 Jan 2024 14:28:07 -0500
Subject: [PATCH] Split `BiocFrame` by a column (#91).

---
 src/biocframe/BiocFrame.py | 46 ++++++++++++++++++++++++++++++++++++++
 tests/test_methods.py      | 30 +++++++++++++++++++++++++
 2 files changed, 76 insertions(+)

diff --git a/src/biocframe/BiocFrame.py b/src/biocframe/BiocFrame.py
index 17c9b28..596f8d6 100644
--- a/src/biocframe/BiocFrame.py
+++ b/src/biocframe/BiocFrame.py
@@ -1088,6 +1088,52 @@ def copy(self):
         """Alias for :py:meth:`~__copy__`."""
         return self.__copy__()
 
+    ##########################
+    ######>> split by <<######
+    ##########################
+
+    def split(
+        self, name: str, only_indices: bool = False
+    ) -> Dict[str, Union["BiocFrame", List[int]]]:
+        """Split the object by a column.
+
+        Args:
+            name:
+                Name of the column to split by.
+
+            only_indices:
+                Whether to only return row indices. Defaults to False.
+
+        Returns:
+            A dictionary keyed by each distinct value found in the column. The
+            values are the frames sliced to the rows holding that value or, if
+            ``only_indices`` is True, the lists of those row indices instead.
+
+        Raises:
+            ValueError: If ``name`` is not one of the column names.
+        """
+        if name not in self._column_names:
+            raise ValueError(f"'{name}' is not a valid column name.")
+
+        _column = self.get_column(name)
+
+        _grps = {}  # group value -> row indices, in order of first appearance
+        for i in range(len(self)):
+            _key = _column[i]  # used as a dict key, so values must be hashable
+            if _key not in _grps:
+                _grps[_key] = []
+
+            _grps[_key].append(i)
+
+        if only_indices is True:  # identity check: only the literal True matches
+            return _grps
+
+        _sliced_grps = {}
+        for k, v in _grps.items():
+            _sliced_grps[k] = self[v,]  # 1-tuple subscript; presumably row-only subsetting — confirm in __getitem__
+
+        return _sliced_grps
+
     ################################
     ######>> pandas interop <<######
     ################################
diff --git a/tests/test_methods.py b/tests/test_methods.py
index 3408b95..e9da191 100644
--- a/tests/test_methods.py
+++ b/tests/test_methods.py
@@ -623,3 +623,33 @@ def test_set_names():
     with pytest.raises(ValueError) as ex:
         obj.set_column_names(["A", "A"])
     assert str(ex.value).find("duplicate column name") >= 0
+
+
+def test_bframe_split():  # split() on "column2" should yield one entry per distinct value
+    obj = {
+        "column1": [1, 2, 3],
+        "nested": [  # three entries, one per row, of differing shapes
+            {
+                "ncol1": [4, 5, 6],
+                "ncol2": ["a", "b", "c"],
+                "deep": {"dcol1": ["j", "k", "l"], "dcol2": ["a", "s", "l"]},
+            },
+            {
+                "ncol2": ["a"],
+                "deep": {"dcol1": ["j"], "dcol2": ["a"]},
+            },
+            {
+                "ncol1": [5, 6],
+                "ncol2": ["b", "c"],
+            },
+        ],
+        "column2": ["b", "n", "b"],  # split key: "b" at rows 0 and 2, "n" at row 1
+    }
+
+    bframe = BiocFrame(obj)
+    split_frame = bframe.split("column2")
+
+    assert split_frame is not None
+    assert isinstance(split_frame, dict)
+    assert len(split_frame) == 2  # two distinct values: "b" and "n"
+    assert len(split_frame["b"]) == 2  # "b" occurs in two rows