From 20c4304d246f6f148f77e13ca99bd58b0cea8d24 Mon Sep 17 00:00:00 2001
From: Jayaram Kancherla
Date: Tue, 2 Jan 2024 14:28:07 -0500
Subject: [PATCH] Split `BiocFrame` by a column (#91).

---
 src/biocframe/BiocFrame.py | 51 ++++++++++++++++++++++++++++++++++++++
 tests/test_methods.py      | 37 +++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)

diff --git a/src/biocframe/BiocFrame.py b/src/biocframe/BiocFrame.py
index 17c9b28..596f8d6 100644
--- a/src/biocframe/BiocFrame.py
+++ b/src/biocframe/BiocFrame.py
@@ -1088,6 +1088,57 @@ def copy(self):
         """Alias for :py:meth:`~__copy__`."""
         return self.__copy__()
 
+    ##########################
+    ######>> split by <<######
+    ##########################
+
+    def split(
+        self, name: str, only_indices: bool = False
+    ) -> Dict[str, Union["BiocFrame", List[int]]]:
+        """Split the object by the values of a column.
+
+        Rows holding the same value in column ``name`` form one group.
+        Groups are keyed in order of first occurrence and the row indices
+        within a group are in ascending order.
+
+        Args:
+            name:
+                Name of the column to split by. Its values become the
+                dictionary keys, so they must be hashable.
+
+            only_indices:
+                Whether to only return row indices instead of sliced
+                frames. Defaults to False.
+
+        Returns:
+            A dictionary whose keys are the distinct values of the column
+            and whose values are the sliced frames for each group.
+
+            If ``only_indices`` is truthy, the values are instead the
+            lists of row indices that map to the same group.
+
+        Raises:
+            ValueError:
+                If ``name`` is not a column name of this object.
+        """
+        if name not in self._column_names:
+            raise ValueError(f"'{name}' is not a valid column name.")
+
+        _column = self.get_column(name)
+
+        # Look values up by position instead of iterating the column, so
+        # the only requirement on a column is integer ``__getitem__``.
+        _grps = {}
+        for i in range(len(self)):
+            _grps.setdefault(_column[i], []).append(i)
+
+        # Truthiness check: a strict ``is True`` would silently ignore
+        # truthy flags that are not the ``True`` singleton.
+        if only_indices:
+            return _grps
+
+        return {k: self[v,] for k, v in _grps.items()}
+
     ################################
     ######>> pandas interop <<######
     ################################
diff --git a/tests/test_methods.py b/tests/test_methods.py
index 3408b95..e9da191 100644
--- a/tests/test_methods.py
+++ b/tests/test_methods.py
@@ -623,3 +623,40 @@ def test_set_names():
     with pytest.raises(ValueError) as ex:
         obj.set_column_names(["A", "A"])
     assert str(ex.value).find("duplicate column name") >= 0
+
+
+def test_bframe_split():
+    obj = {
+        "column1": [1, 2, 3],
+        "nested": [
+            {
+                "ncol1": [4, 5, 6],
+                "ncol2": ["a", "b", "c"],
+                "deep": {"dcol1": ["j", "k", "l"], "dcol2": ["a", "s", "l"]},
+            },
+            {
+                "ncol2": ["a"],
+                "deep": {"dcol1": ["j"], "dcol2": ["a"]},
+            },
+            {
+                "ncol1": [5, 6],
+                "ncol2": ["b", "c"],
+            },
+        ],
+        "column2": ["b", "n", "b"],
+    }
+
+    bframe = BiocFrame(obj)
+    split_frame = bframe.split("column2")
+
+    assert split_frame is not None
+    assert isinstance(split_frame, dict)
+    assert len(split_frame) == 2
+    assert len(split_frame["b"]) == 2
+    assert len(split_frame["n"]) == 1
+
+    indices = bframe.split("column2", only_indices=True)
+    assert indices == {"b": [0, 2], "n": [1]}
+
+    with pytest.raises(ValueError):
+        bframe.split("not_a_column")